From e6d1592492a3a379186bfb02bd0f4eda0669c0d5 Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Tue, 20 Aug 2019 20:50:12 +0000
Subject: Vendor import of stripped llvm trunk r366426 (just before the
 release_90 branch point):
 https://llvm.org/svn/llvm-project/llvm/trunk@366426
---
 lib/Target/AArch64/AArch64.h | 9 +-
 lib/Target/AArch64/AArch64.td | 65 +-
 lib/Target/AArch64/AArch64A53Fix835769.cpp | 7 +-
 lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 7 +-
 lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 7 +-
 lib/Target/AArch64/AArch64AsmPrinter.cpp | 281 +-
 lib/Target/AArch64/AArch64BranchTargets.cpp | 7 +-
 lib/Target/AArch64/AArch64CallLowering.cpp | 205 +-
 lib/Target/AArch64/AArch64CallLowering.h | 28 +-
 lib/Target/AArch64/AArch64CallingConvention.cpp | 134 +
 lib/Target/AArch64/AArch64CallingConvention.h | 156 +-
 lib/Target/AArch64/AArch64CallingConvention.td | 33 +-
 .../AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 7 +-
 lib/Target/AArch64/AArch64CollectLOH.cpp | 7 +-
 lib/Target/AArch64/AArch64CompressJumpTables.cpp | 10 +-
 lib/Target/AArch64/AArch64CondBrTuning.cpp | 7 +-
 lib/Target/AArch64/AArch64ConditionOptimizer.cpp | 7 +-
 lib/Target/AArch64/AArch64ConditionalCompares.cpp | 9 +-
 .../AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 108 +-
 lib/Target/AArch64/AArch64ExpandImm.cpp | 411 +
 lib/Target/AArch64/AArch64ExpandImm.h | 35 +
 lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 619 +-
 lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 13 +-
 lib/Target/AArch64/AArch64FastISel.cpp | 34 +-
 lib/Target/AArch64/AArch64FrameLowering.cpp | 215 +-
 lib/Target/AArch64/AArch64FrameLowering.h | 17 +-
 lib/Target/AArch64/AArch64GenRegisterBankInfo.def | 11 +-
 lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 140 +-
 lib/Target/AArch64/AArch64ISelLowering.cpp | 583 +-
 lib/Target/AArch64/AArch64ISelLowering.h | 42 +-
 lib/Target/AArch64/AArch64InstrAtomics.td | 7 +-
 lib/Target/AArch64/AArch64InstrFormats.td | 50 +-
 lib/Target/AArch64/AArch64InstrInfo.cpp | 472 +-
 lib/Target/AArch64/AArch64InstrInfo.h | 51 +-
 lib/Target/AArch64/AArch64InstrInfo.td | 172 +-
 lib/Target/AArch64/AArch64InstructionSelector.cpp | 2803 +++++-
 lib/Target/AArch64/AArch64LegalizerInfo.cpp | 388 +-
 lib/Target/AArch64/AArch64LegalizerInfo.h | 13 +-
 lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 13 +-
 lib/Target/AArch64/AArch64MCInstLower.cpp | 7 +-
 lib/Target/AArch64/AArch64MCInstLower.h | 7 +-
 lib/Target/AArch64/AArch64MachineFunctionInfo.h | 28 +-
 lib/Target/AArch64/AArch64MacroFusion.cpp | 7 +-
 lib/Target/AArch64/AArch64MacroFusion.h | 7 +-
 lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 7 +-
 lib/Target/AArch64/AArch64PBQPRegAlloc.h | 7 +-
 lib/Target/AArch64/AArch64PerfectShuffle.h | 7 +-
 lib/Target/AArch64/AArch64PfmCounters.td | 7 +-
 lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp | 11 +-
 lib/Target/AArch64/AArch64PromoteConstant.cpp | 10 +-
 .../AArch64/AArch64RedundantCopyElimination.cpp | 11 +-
 lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 238 +-
 lib/Target/AArch64/AArch64RegisterBankInfo.h | 20 +-
 lib/Target/AArch64/AArch64RegisterBanks.td | 7 +-
 lib/Target/AArch64/AArch64RegisterInfo.cpp | 49 +-
 lib/Target/AArch64/AArch64RegisterInfo.h | 11 +-
 lib/Target/AArch64/AArch64RegisterInfo.td | 26 +-
 lib/Target/AArch64/AArch64SIMDInstrOpt.cpp | 7 +-
 lib/Target/AArch64/AArch64SVEInstrInfo.td | 426 +-
 lib/Target/AArch64/AArch64SchedA53.td | 9 +-
 lib/Target/AArch64/AArch64SchedA57.td | 9 +-
 lib/Target/AArch64/AArch64SchedA57WriteRes.td | 7 +-
 lib/Target/AArch64/AArch64SchedCyclone.td | 9 +-
 lib/Target/AArch64/AArch64SchedExynosM1.td | 9 +-
 lib/Target/AArch64/AArch64SchedExynosM3.td | 9 +-
 lib/Target/AArch64/AArch64SchedExynosM4.td | 45 +-
 lib/Target/AArch64/AArch64SchedFalkor.td | 9 +-
 lib/Target/AArch64/AArch64SchedFalkorDetails.td | 7 +-
 lib/Target/AArch64/AArch64SchedKryo.td | 9 +-
 lib/Target/AArch64/AArch64SchedKryoDetails.td | 7 +-
 lib/Target/AArch64/AArch64SchedPredExynos.td | 18 +-
 lib/Target/AArch64/AArch64SchedPredicates.td | 60 +-
 lib/Target/AArch64/AArch64SchedThunderX.td | 9 +-
 lib/Target/AArch64/AArch64SchedThunderX2T99.td | 9 +-
 lib/Target/AArch64/AArch64Schedule.td | 7 +-
 lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 95 +-
 lib/Target/AArch64/AArch64SelectionDAGInfo.h | 11 +-
 lib/Target/AArch64/AArch64SpeculationHardening.cpp | 182 +-
 lib/Target/AArch64/AArch64StackTagging.cpp | 345 +
 lib/Target/AArch64/AArch64StorePairSuppress.cpp | 9 +-
 lib/Target/AArch64/AArch64Subtarget.cpp | 8 +-
 lib/Target/AArch64/AArch64Subtarget.h | 40 +-
 lib/Target/AArch64/AArch64SystemOperands.td | 8 +-
 lib/Target/AArch64/AArch64TargetMachine.cpp | 37 +-
 lib/Target/AArch64/AArch64TargetMachine.h | 7 +-
 lib/Target/AArch64/AArch64TargetObjectFile.cpp | 7 +-
 lib/Target/AArch64/AArch64TargetObjectFile.h | 7 +-
 lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 15 +-
 lib/Target/AArch64/AArch64TargetTransformInfo.h | 11 +-
 lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 102 +-
 .../AArch64/Disassembler/AArch64Disassembler.cpp | 49 +-
 .../AArch64/Disassembler/AArch64Disassembler.h | 7 +-
 .../Disassembler/AArch64ExternalSymbolizer.cpp | 7 +-
 .../Disassembler/AArch64ExternalSymbolizer.h | 7 +-
 .../AArch64/InstPrinter/AArch64InstPrinter.cpp | 1582 ----
 .../AArch64/InstPrinter/AArch64InstPrinter.h | 223 -
 .../AArch64/MCTargetDesc/AArch64AddressingModes.h | 7 +-
 .../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 54 +-
 .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 9 +-
 .../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 11 +-
 .../AArch64/MCTargetDesc/AArch64ELFStreamer.h | 7 +-
 .../AArch64/MCTargetDesc/AArch64FixupKinds.h | 7 +-
 .../AArch64/MCTargetDesc/AArch64InstPrinter.cpp | 1587 ++++
 .../AArch64/MCTargetDesc/AArch64InstPrinter.h | 222 +
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 11 +-
 lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 7 +-
 .../AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 14 +-
 lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 10 +-
 lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 9 +-
 .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 203 +-
 .../AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 14 +-
 .../MCTargetDesc/AArch64MachObjectWriter.cpp | 17 +-
 .../AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 8 +-
 .../AArch64/MCTargetDesc/AArch64TargetStreamer.h | 7 +-
 .../MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 7 +-
 .../MCTargetDesc/AArch64WinCOFFStreamer.cpp | 7 +-
 .../AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h | 7 +-
 lib/Target/AArch64/SVEInstrFormats.td | 1340 ++-
 .../AArch64/TargetInfo/AArch64TargetInfo.cpp | 33 +-
 lib/Target/AArch64/TargetInfo/AArch64TargetInfo.h | 24 +
 lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 7 +-
 lib/Target/AArch64/Utils/AArch64BaseInfo.h | 50 +-
 lib/Target/AMDGPU/AMDGPU.h | 52 +-
 lib/Target/AMDGPU/AMDGPU.td | 570 +-
 lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 41 +-
 lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 13 +-
 lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 75 +-
 lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 8 +-
 lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 19 +-
 lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 43 +-
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 339 +-
 lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 17 +-
 lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 314 +-
 lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 362 +-
 lib/Target/AMDGPU/AMDGPUCallLowering.h | 20 +-
 lib/Target/AMDGPU/AMDGPUCallingConv.td | 49 +-
 lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 136 +-
 lib/Target/AMDGPU/AMDGPUFeatures.td | 18 +-
 lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPUFrameLowering.h | 7 +-
 lib/Target/AMDGPU/AMDGPUGISel.td | 55 +-
 lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 113 +-
 lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 220 +-
 lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 41 +-
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 802 +-
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 363 +-
 lib/Target/AMDGPU/AMDGPUISelLowering.h | 73 +-
 lib/Target/AMDGPU/AMDGPUInline.cpp | 45 +-
 lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPUInstrInfo.h | 7 +-
 lib/Target/AMDGPU/AMDGPUInstrInfo.td | 46 +-
 lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 1469 ++-
 lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 55 +-
 lib/Target/AMDGPU/AMDGPUInstructions.td | 267 +-
 lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp | 103 -
 lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h | 58 -
 lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1357 ++-
 lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 50 +-
 lib/Target/AMDGPU/AMDGPULibCalls.cpp | 151 +-
 lib/Target/AMDGPU/AMDGPULibFunc.cpp | 62 +-
 lib/Target/AMDGPU/AMDGPULibFunc.h | 11 +-
 lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 38 +-
 lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 48 +-
 lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 21 +-
 lib/Target/AMDGPU/AMDGPUMachineFunction.h | 7 +-
 lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp | 17 +-
 lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h | 80 +-
 lib/Target/AMDGPU/AMDGPUMacroFusion.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPUMacroFusion.h | 7 +-
 .../AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 11 +-
 lib/Target/AMDGPU/AMDGPUPTNote.h | 7 +-
 lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 77 +-
 lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h | 17 +-
 lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 36 +-
 lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp | 336 +
 lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp | 353 -
 lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1782 +++-
 lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 52 +-
 lib/Target/AMDGPU/AMDGPURegisterBanks.td | 9 +-
 lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 27 +-
 lib/Target/AMDGPU/AMDGPURegisterInfo.h | 7 +-
 lib/Target/AMDGPU/AMDGPURegisterInfo.td | 9 +-
 lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPUSearchableTables.td | 60 +-
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 263 +-
 lib/Target/AMDGPU/AMDGPUSubtarget.h | 311 +-
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 307 +-
 lib/Target/AMDGPU/AMDGPUTargetMachine.h | 21 +-
 lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 7 +-
 lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 7 +-
 lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 38 +-
 lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 21 +-
 .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 18 +-
 lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 7 +-
 lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 7 +-
 lib/Target/AMDGPU/AMDKernelCodeT.h | 15 +-
 lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2828 ++--
 lib/Target/AMDGPU/BUFInstructions.td | 957 +-
 lib/Target/AMDGPU/CaymanInstructions.td | 7 +-
 lib/Target/AMDGPU/DSInstructions.td | 566 +-
 .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 485 +-
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 32 +-
 lib/Target/AMDGPU/EvergreenInstructions.td | 7 +-
 lib/Target/AMDGPU/FLATInstructions.td | 527 +-
 lib/Target/AMDGPU/GCNDPPCombine.cpp | 259 +-
 lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 826 +-
 lib/Target/AMDGPU/GCNHazardRecognizer.h | 41 +-
 lib/Target/AMDGPU/GCNILPSched.cpp | 7 +-
 lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 7 +-
 lib/Target/AMDGPU/GCNIterativeScheduler.h | 7 +-
 lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 7 +-
 lib/Target/AMDGPU/GCNNSAReassign.cpp | 343 +
 lib/Target/AMDGPU/GCNProcessors.td | 114 +-
 lib/Target/AMDGPU/GCNRegBankReassign.cpp | 800 ++
 lib/Target/AMDGPU/GCNRegPressure.cpp | 22 +-
 lib/Target/AMDGPU/GCNRegPressure.h | 61 +-
 lib/Target/AMDGPU/GCNSchedStrategy.cpp | 35 +-
 lib/Target/AMDGPU/GCNSchedStrategy.h | 16 +-
 .../AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 1413 ---
 lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 250 -
 .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 65 +-
 .../AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 21 +-
 .../AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp | 7 +-
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h | 7 +-
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h | 7 +-
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 1568 ++++
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 268 +
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 29 +-
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h | 8 +-
 .../AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 +-
 .../AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 20 +-
 .../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 41 +-
 .../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 12 +-
 .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 218 +-
 .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 40 +-
 .../AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 14 +-
 .../AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp | 7 +-
 lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 84 +-
 lib/Target/AMDGPU/MIMGInstructions.td | 484 +-
 lib/Target/AMDGPU/R600.td | 7 +-
 lib/Target/AMDGPU/R600AsmPrinter.cpp | 7 +-
 lib/Target/AMDGPU/R600AsmPrinter.h | 7 +-
 lib/Target/AMDGPU/R600ClauseMergePass.cpp | 7 +-
 lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 7 +-
 lib/Target/AMDGPU/R600Defines.h | 7 +-
 lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 7 +-
 lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 7 +-
 lib/Target/AMDGPU/R600FrameLowering.cpp | 7 +-
 lib/Target/AMDGPU/R600FrameLowering.h | 7 +-
 lib/Target/AMDGPU/R600ISelLowering.cpp | 37 +-
 lib/Target/AMDGPU/R600ISelLowering.h | 14 +-
 lib/Target/AMDGPU/R600InstrFormats.td | 7 +-
 lib/Target/AMDGPU/R600InstrInfo.cpp | 8 +-
 lib/Target/AMDGPU/R600InstrInfo.h | 7 +-
 lib/Target/AMDGPU/R600Instructions.td | 35 +-
 lib/Target/AMDGPU/R600MachineFunctionInfo.cpp | 7 +-
 lib/Target/AMDGPU/R600MachineFunctionInfo.h | 7 +-
 lib/Target/AMDGPU/R600MachineScheduler.cpp | 7 +-
 lib/Target/AMDGPU/R600MachineScheduler.h | 7 +-
 .../AMDGPU/R600OpenCLImageTypeLoweringPass.cpp | 7 +-
 lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 22 +-
 lib/Target/AMDGPU/R600Packetizer.cpp | 11 +-
 lib/Target/AMDGPU/R600Processors.td | 18 +-
 lib/Target/AMDGPU/R600RegisterInfo.cpp | 9 +-
 lib/Target/AMDGPU/R600RegisterInfo.h | 9 +-
 lib/Target/AMDGPU/R600Schedule.td | 7 +-
 lib/Target/AMDGPU/R700Instructions.td | 7 +-
 lib/Target/AMDGPU/SIAddIMGInit.cpp | 7 +-
 lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 64 +-
 lib/Target/AMDGPU/SIDebuggerInsertNops.cpp | 97 -
 lib/Target/AMDGPU/SIDefines.h | 178 +-
 lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 83 +-
 lib/Target/AMDGPU/SIFixVGPRCopies.cpp | 7 +-
 lib/Target/AMDGPU/SIFixWWMLiveness.cpp | 418 -
 lib/Target/AMDGPU/SIFixupVectorISel.cpp | 12 +-
 lib/Target/AMDGPU/SIFoldOperands.cpp | 363 +-
 lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 22 +-
 lib/Target/AMDGPU/SIFrameLowering.cpp | 810 +-
 lib/Target/AMDGPU/SIFrameLowering.h | 28 +-
 lib/Target/AMDGPU/SIISelLowering.cpp | 1918 +++-
 lib/Target/AMDGPU/SIISelLowering.h | 49 +-
 lib/Target/AMDGPU/SIInsertSkips.cpp | 76 +-
 lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 417 +-
 lib/Target/AMDGPU/SIInstrFormats.td | 68 +-
 lib/Target/AMDGPU/SIInstrInfo.cpp | 1415 ++-
 lib/Target/AMDGPU/SIInstrInfo.h | 125 +-
 lib/Target/AMDGPU/SIInstrInfo.td | 654 +-
 lib/Target/AMDGPU/SIInstructions.td | 425 +-
 lib/Target/AMDGPU/SIIntrinsics.td | 19 -
 lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 60 +-
 lib/Target/AMDGPU/SILowerControlFlow.cpp | 104 +-
 lib/Target/AMDGPU/SILowerI1Copies.cpp | 107 +-
 lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 323 +
 lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 271 +-
 lib/Target/AMDGPU/SIMachineFunctionInfo.h | 377 +-
 lib/Target/AMDGPU/SIMachineScheduler.cpp | 11 +-
 lib/Target/AMDGPU/SIMachineScheduler.h | 7 +-
 lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 322 +-
 lib/Target/AMDGPU/SIModeRegister.cpp | 9 +-
 lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 98 +-
 lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 155 +-
 lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 36 +-
 lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 221 +
 lib/Target/AMDGPU/SIProgramInfo.h | 21 +-
 lib/Target/AMDGPU/SIRegisterInfo.cpp | 660 +-
 lib/Target/AMDGPU/SIRegisterInfo.h | 78 +-
 lib/Target/AMDGPU/SIRegisterInfo.td | 633 +-
 lib/Target/AMDGPU/SISchedule.td | 71 +-
 lib/Target/AMDGPU/SIShrinkInstructions.cpp | 140 +-
 lib/Target/AMDGPU/SIWholeQuadMode.cpp | 82 +-
 lib/Target/AMDGPU/SMInstructions.td | 359 +-
 lib/Target/AMDGPU/SOPInstructions.td | 666 +-
 lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp | 9 +-
 lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h | 29 +
 lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 36 +-
 lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 14 +-
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 410 +-
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 203 +-
 lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 723 ++
 lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h | 135 +
 lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h | 11 +-
 lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp | 7 +-
 lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h | 7 +-
 lib/Target/AMDGPU/VIInstrFormats.td | 7 +-
 lib/Target/AMDGPU/VIInstructions.td | 7 +-
 lib/Target/AMDGPU/VOP1Instructions.td | 487 +-
 lib/Target/AMDGPU/VOP2Instructions.td | 889 +-
 lib/Target/AMDGPU/VOP3Instructions.td | 501 +-
 lib/Target/AMDGPU/VOP3PInstructions.td | 220 +-
 lib/Target/AMDGPU/VOPCInstructions.td | 972 +-
 lib/Target/AMDGPU/VOPInstructions.td | 182 +-
 lib/Target/ARC/ARC.h | 8 +-
 lib/Target/ARC/ARC.td | 7 +-
 lib/Target/ARC/ARCAsmPrinter.cpp | 26 +-
 lib/Target/ARC/ARCBranchFinalize.cpp | 7 +-
 lib/Target/ARC/ARCCallingConv.td | 7 +-
 lib/Target/ARC/ARCExpandPseudos.cpp | 7 +-
 lib/Target/ARC/ARCFrameLowering.cpp | 59 +-
 lib/Target/ARC/ARCFrameLowering.h | 7 +-
 lib/Target/ARC/ARCISelDAGToDAG.cpp | 7 +-
 lib/Target/ARC/ARCISelLowering.cpp | 7 +-
 lib/Target/ARC/ARCISelLowering.h | 7 +-
 lib/Target/ARC/ARCInstrFormats.td | 71 +-
 lib/Target/ARC/ARCInstrInfo.cpp | 54 +-
 lib/Target/ARC/ARCInstrInfo.h | 17 +-
 lib/Target/ARC/ARCInstrInfo.td | 122 +-
 lib/Target/ARC/ARCMCInstLower.cpp | 7 +-
 lib/Target/ARC/ARCMCInstLower.h | 7 +-
 lib/Target/ARC/ARCMachineFunctionInfo.cpp | 7 +-
 lib/Target/ARC/ARCMachineFunctionInfo.h | 7 +-
 lib/Target/ARC/ARCOptAddrMode.cpp | 507 ++
 lib/Target/ARC/ARCRegisterInfo.cpp | 15 +-
 lib/Target/ARC/ARCRegisterInfo.h | 9 +-
 lib/Target/ARC/ARCRegisterInfo.td | 7 +-
 lib/Target/ARC/ARCSubtarget.cpp | 7 +-
 lib/Target/ARC/ARCSubtarget.h | 7 +-
 lib/Target/ARC/ARCTargetMachine.cpp | 13 +-
 lib/Target/ARC/ARCTargetMachine.h | 7 +-
 lib/Target/ARC/ARCTargetStreamer.h | 7 +-
 lib/Target/ARC/ARCTargetTransformInfo.h | 7 +-
 lib/Target/ARC/Disassembler/ARCDisassembler.cpp | 8 +-
 lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp | 180 -
 lib/Target/ARC/InstPrinter/ARCInstPrinter.h | 46 -
 lib/Target/ARC/MCTargetDesc/ARCInfo.h | 7 +-
 lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp | 179 +
 lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h | 45 +
 lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp | 7 +-
 lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h | 7 +-
 lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp | 11 +-
 lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h | 9 +-
 lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp | 9 +-
 lib/Target/ARC/TargetInfo/ARCTargetInfo.h | 20 +
 lib/Target/ARM/A15SDOptimizer.cpp | 7 +-
 lib/Target/ARM/ARM.h | 18 +-
 lib/Target/ARM/ARM.td | 185 +-
 lib/Target/ARM/ARMAsmPrinter.cpp | 153 +-
 lib/Target/ARM/ARMAsmPrinter.h | 14 +-
 lib/Target/ARM/ARMBaseInstrInfo.cpp | 412 +-
 lib/Target/ARM/ARMBaseInstrInfo.h | 72 +-
 lib/Target/ARM/ARMBaseRegisterInfo.cpp | 51 +-
 lib/Target/ARM/ARMBaseRegisterInfo.h | 9 +-
 lib/Target/ARM/ARMBasicBlockInfo.cpp | 146 +
 lib/Target/ARM/ARMBasicBlockInfo.h | 59 +-
 lib/Target/ARM/ARMCallLowering.cpp | 176 +-
 lib/Target/ARM/ARMCallLowering.h | 20 +-
 lib/Target/ARM/ARMCallingConv.cpp | 284 +
 lib/Target/ARM/ARMCallingConv.h | 308 +-
 lib/Target/ARM/ARMCallingConv.td | 52 +-
 lib/Target/ARM/ARMCodeGenPrepare.cpp | 205 +-
 lib/Target/ARM/ARMComputeBlockSize.cpp | 81 -
 lib/Target/ARM/ARMConstantIslandPass.cpp | 246 +-
 lib/Target/ARM/ARMConstantPoolValue.cpp | 7 +-
 lib/Target/ARM/ARMConstantPoolValue.h | 7 +-
 lib/Target/ARM/ARMExpandPseudoInsts.cpp | 28 +-
 lib/Target/ARM/ARMFastISel.cpp | 53 +-
 lib/Target/ARM/ARMFeatures.h | 7 +-
 lib/Target/ARM/ARMFrameLowering.cpp | 117 +-
 lib/Target/ARM/ARMFrameLowering.h | 7 +-
 lib/Target/ARM/ARMHazardRecognizer.cpp | 7 +-
 lib/Target/ARM/ARMHazardRecognizer.h | 7 +-
 lib/Target/ARM/ARMISelDAGToDAG.cpp | 213 +-
 lib/Target/ARM/ARMISelLowering.cpp | 1556 +++-
 lib/Target/ARM/ARMISelLowering.h | 101 +-
 lib/Target/ARM/ARMInstrFormats.td | 115 +-
 lib/Target/ARM/ARMInstrInfo.cpp | 9 +-
 lib/Target/ARM/ARMInstrInfo.h | 7 +-
 lib/Target/ARM/ARMInstrInfo.td | 380 +-
 lib/Target/ARM/ARMInstrMVE.td | 4591 ++++++++++
 lib/Target/ARM/ARMInstrNEON.td | 1093 ++-
 lib/Target/ARM/ARMInstrThumb.td | 75 +-
 lib/Target/ARM/ARMInstrThumb2.td | 487 +-
 lib/Target/ARM/ARMInstrVFP.td | 367 +-
 lib/Target/ARM/ARMInstructionSelector.cpp | 268 +-
 lib/Target/ARM/ARMLegalizerInfo.cpp | 161 +-
 lib/Target/ARM/ARMLegalizerInfo.h | 7 +-
 lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 149 +-
 lib/Target/ARM/ARMLowOverheadLoops.cpp | 384 +
 lib/Target/ARM/ARMMCInstLower.cpp | 7 +-
 lib/Target/ARM/ARMMachineFunctionInfo.cpp | 7 +-
 lib/Target/ARM/ARMMachineFunctionInfo.h | 16 +-
 lib/Target/ARM/ARMMacroFusion.cpp | 7 +-
 lib/Target/ARM/ARMMacroFusion.h | 7 +-
 lib/Target/ARM/ARMOptimizeBarriersPass.cpp | 7 +-
 lib/Target/ARM/ARMParallelDSP.cpp | 889 +-
 lib/Target/ARM/ARMPerfectShuffle.h | 7 +-
 lib/Target/ARM/ARMPredicates.td | 211 +
 lib/Target/ARM/ARMRegisterBankInfo.cpp | 51 +-
 lib/Target/ARM/ARMRegisterBankInfo.h | 7 +-
 lib/Target/ARM/ARMRegisterBanks.td | 7 +-
 lib/Target/ARM/ARMRegisterInfo.cpp | 7 +-
 lib/Target/ARM/ARMRegisterInfo.h | 7 +-
 lib/Target/ARM/ARMRegisterInfo.td | 132 +-
 lib/Target/ARM/ARMSchedule.td | 9 +-
 lib/Target/ARM/ARMScheduleA57.td | 13 +-
 lib/Target/ARM/ARMScheduleA57WriteRes.td | 7 +-
 lib/Target/ARM/ARMScheduleA8.td | 7 +-
 lib/Target/ARM/ARMScheduleA9.td | 7 +-
 lib/Target/ARM/ARMScheduleM3.td | 21 -
 lib/Target/ARM/ARMScheduleM4.td | 119 +
 lib/Target/ARM/ARMScheduleR52.td | 7 +-
 lib/Target/ARM/ARMScheduleSwift.td | 7 +-
 lib/Target/ARM/ARMScheduleV6.td | 7 +-
 lib/Target/ARM/ARMSelectionDAGInfo.cpp | 9 +-
 lib/Target/ARM/ARMSelectionDAGInfo.h | 7 +-
 lib/Target/ARM/ARMSubtarget.cpp | 73 +-
 lib/Target/ARM/ARMSubtarget.h | 78 +-
 lib/Target/ARM/ARMSystemRegister.td | 7 +-
 lib/Target/ARM/ARMTargetMachine.cpp | 43 +-
 lib/Target/ARM/ARMTargetMachine.h | 7 +-
 lib/Target/ARM/ARMTargetObjectFile.cpp | 7 +-
 lib/Target/ARM/ARMTargetObjectFile.h | 7 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp | 275 +-
 lib/Target/ARM/ARMTargetTransformInfo.h | 23 +-
 lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 1739 +++-
 lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 1391 ++-
 lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp | 1571 ----
 lib/Target/ARM/InstPrinter/ARMInstPrinter.h | 243 -
 lib/Target/ARM/LICENSE.TXT | 47 -
 lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h | 11 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 142 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h | 9 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 18 +-
 lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 15 +-
 lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 11 +-
 lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h | 16 +-
 lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp | 1678 ++++
 lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h | 272 +
 lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 459 +-
 lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMMCExpr.h | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 35 +-
 lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 27 +-
 .../ARM/MCTargetDesc/ARMMachORelocationInfo.cpp | 7 +-
 .../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp | 62 +-
 lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h | 7 +-
 .../ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp | 7 +-
 lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp | 7 +-
 lib/Target/ARM/MLxExpansionPass.cpp | 7 +-
 lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp | 9 +-
 lib/Target/ARM/TargetInfo/ARMTargetInfo.h | 23 +
 lib/Target/ARM/Thumb1FrameLowering.cpp | 120 +-
 lib/Target/ARM/Thumb1FrameLowering.h | 7 +-
 lib/Target/ARM/Thumb1InstrInfo.cpp | 7 +-
 lib/Target/ARM/Thumb1InstrInfo.h | 7 +-
 lib/Target/ARM/Thumb2ITBlockPass.cpp | 221 +-
 lib/Target/ARM/Thumb2InstrInfo.cpp | 58 +-
 lib/Target/ARM/Thumb2InstrInfo.h | 13 +-
 lib/Target/ARM/Thumb2SizeReduction.cpp | 13 +-
 lib/Target/ARM/ThumbRegisterInfo.cpp | 75 +-
 lib/Target/ARM/ThumbRegisterInfo.h | 13 +-
 lib/Target/ARM/Utils/ARMBaseInfo.cpp | 7 +-
 lib/Target/ARM/Utils/ARMBaseInfo.h | 31 +-
 lib/Target/AVR/AVR.h | 7 +-
 lib/Target/AVR/AVR.td | 7 +-
 lib/Target/AVR/AVRAsmPrinter.cpp | 29 +-
 lib/Target/AVR/AVRCallingConv.td | 7 +-
 lib/Target/AVR/AVRExpandPseudoInsts.cpp | 17 +-
 lib/Target/AVR/AVRFrameLowering.cpp | 12 +-
 lib/Target/AVR/AVRFrameLowering.h | 7 +-
 lib/Target/AVR/AVRISelDAGToDAG.cpp | 7 +-
 lib/Target/AVR/AVRISelLowering.cpp | 55 +-
 lib/Target/AVR/AVRISelLowering.h | 20 +-
 lib/Target/AVR/AVRInstrFormats.td | 7 +-
 lib/Target/AVR/AVRInstrInfo.cpp | 10 +-
 lib/Target/AVR/AVRInstrInfo.h | 7 +-
 lib/Target/AVR/AVRInstrInfo.td | 53 +-
 lib/Target/AVR/AVRMCInstLower.cpp | 7 +-
 lib/Target/AVR/AVRMCInstLower.h | 7 +-
 lib/Target/AVR/AVRMachineFunctionInfo.h | 7 +-
 lib/Target/AVR/AVRRegisterInfo.cpp | 30 +-
 lib/Target/AVR/AVRRegisterInfo.h | 16 +-
 lib/Target/AVR/AVRRegisterInfo.td | 11 +-
 lib/Target/AVR/AVRRelaxMemOperations.cpp | 7 +-
 lib/Target/AVR/AVRSelectionDAGInfo.h | 7 +-
 lib/Target/AVR/AVRSubtarget.cpp | 19 +-
 lib/Target/AVR/AVRSubtarget.h | 12 +-
 lib/Target/AVR/AVRTargetMachine.cpp | 8 +-
 lib/Target/AVR/AVRTargetMachine.h | 7 +-
 lib/Target/AVR/AVRTargetObjectFile.cpp | 7 +-
 lib/Target/AVR/AVRTargetObjectFile.h | 7 +-
 lib/Target/AVR/AsmParser/AVRAsmParser.cpp | 24 +-
 lib/Target/AVR/Disassembler/AVRDisassembler.cpp | 8 +-
 lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp | 171 -
 lib/Target/AVR/InstPrinter/AVRInstPrinter.h | 54 -
 lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp | 170 +
 lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h | 53 +
 lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp | 8 +-
 lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRMCExpr.h | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp | 10 +-
 lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h | 9 +-
 lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp | 7 +-
 lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h | 7 +-
 lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp | 9 +-
 lib/Target/AVR/TargetInfo/AVRTargetInfo.h | 18 +
 lib/Target/BPF/AsmParser/BPFAsmParser.cpp | 10 +-
 lib/Target/BPF/BPF.h | 12 +-
 lib/Target/BPF/BPF.td | 8 +-
 lib/Target/BPF/BPFAbstractMemberAccess.cpp | 482 +
 lib/Target/BPF/BPFAsmPrinter.cpp | 42 +-
 lib/Target/BPF/BPFCORE.h | 24 +
 lib/Target/BPF/BPFCallingConv.td | 7 +-
 lib/Target/BPF/BPFFrameLowering.cpp | 7 +-
 lib/Target/BPF/BPFFrameLowering.h | 7 +-
 lib/Target/BPF/BPFISelDAGToDAG.cpp | 7 +-
 lib/Target/BPF/BPFISelLowering.cpp | 64 +-
 lib/Target/BPF/BPFISelLowering.h | 11 +-
 lib/Target/BPF/BPFInstrFormats.td | 8 +-
 lib/Target/BPF/BPFInstrInfo.cpp | 7 +-
 lib/Target/BPF/BPFInstrInfo.h | 7 +-
 lib/Target/BPF/BPFInstrInfo.td | 111 +-
 lib/Target/BPF/BPFMCInstLower.cpp | 7 +-
 lib/Target/BPF/BPFMCInstLower.h | 7 +-
 lib/Target/BPF/BPFMIChecking.cpp | 104 +-
 lib/Target/BPF/BPFMIPeephole.cpp | 7 +-
 lib/Target/BPF/BPFMISimplifyPatchable.cpp | 163 +
 lib/Target/BPF/BPFRegisterInfo.cpp | 9 +-
 lib/Target/BPF/BPFRegisterInfo.h | 9 +-
 lib/Target/BPF/BPFRegisterInfo.td | 7 +-
 lib/Target/BPF/BPFSelectionDAGInfo.cpp | 7 +-
 lib/Target/BPF/BPFSelectionDAGInfo.h | 7 +-
 lib/Target/BPF/BPFSubtarget.cpp | 13 +-
 lib/Target/BPF/BPFSubtarget.h | 12 +-
 lib/Target/BPF/BPFTargetMachine.cpp | 20 +-
 lib/Target/BPF/BPFTargetMachine.h | 7 +-
 lib/Target/BPF/BTF.def | 9 +-
 lib/Target/BPF/BTF.h | 98 +-
 lib/Target/BPF/BTFDebug.cpp | 727 +-
 lib/Target/BPF/BTFDebug.h | 120 +-
 lib/Target/BPF/Disassembler/BPFDisassembler.cpp | 13 +-
 lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp | 108 -
 lib/Target/BPF/InstPrinter/BPFInstPrinter.h | 41 -
 lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp | 19 +-
 lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp | 39 +-
 lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp | 107 +
 lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h | 40 +
 lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h | 7 +-
 lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp | 14 +-
 lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp | 11 +-
 lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h | 11 +-
 lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp | 18 +-
 lib/Target/BPF/TargetInfo/BPFTargetInfo.h | 22 +
 lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 29 +-
 lib/Target/Hexagon/BitTracker.cpp | 7 +-
 lib/Target/Hexagon/BitTracker.h | 7 +-
 .../Hexagon/Disassembler/HexagonDisassembler.cpp | 10 +-
 lib/Target/Hexagon/Hexagon.h | 7 +-
 lib/Target/Hexagon/Hexagon.td | 7 +-
 lib/Target/Hexagon/HexagonAsmPrinter.cpp | 20 +-
 lib/Target/Hexagon/HexagonAsmPrinter.h | 14 +-
 lib/Target/Hexagon/HexagonBitSimplify.cpp | 7 +-
 lib/Target/Hexagon/HexagonBitTracker.cpp | 7 +-
 lib/Target/Hexagon/HexagonBitTracker.h | 7 +-
 lib/Target/Hexagon/HexagonBlockRanges.cpp | 7 +-
 lib/Target/Hexagon/HexagonBlockRanges.h | 7 +-
 lib/Target/Hexagon/HexagonBranchRelaxation.cpp | 7 +-
 lib/Target/Hexagon/HexagonCFGOptimizer.cpp | 7 +-
 lib/Target/Hexagon/HexagonCallingConv.td | 7 +-
 lib/Target/Hexagon/HexagonCommonGEP.cpp | 24 +-
 lib/Target/Hexagon/HexagonConstExtenders.cpp | 7 +-
 lib/Target/Hexagon/HexagonConstPropagation.cpp | 186 +-
 lib/Target/Hexagon/HexagonCopyToCombine.cpp | 11 +-
 lib/Target/Hexagon/HexagonDepArch.h | 7 +-
 lib/Target/Hexagon/HexagonDepArch.td | 7 +-
 lib/Target/Hexagon/HexagonDepDecoders.h | 79 -
 lib/Target/Hexagon/HexagonDepDecoders.inc | 78 +
 lib/Target/Hexagon/HexagonDepIICHVX.td | 7 +-
 lib/Target/Hexagon/HexagonDepIICScalar.td | 7 +-
 lib/Target/Hexagon/HexagonDepITypes.h | 7 +-
 lib/Target/Hexagon/HexagonDepITypes.td | 7 +-
 lib/Target/Hexagon/HexagonDepInstrFormats.td | 7 +-
 lib/Target/Hexagon/HexagonDepInstrInfo.td | 7 +-
 lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td | 7 +-
 lib/Target/Hexagon/HexagonDepMappings.td | 7 +-
 lib/Target/Hexagon/HexagonDepOperands.td | 7 +-
 lib/Target/Hexagon/HexagonDepTimingClasses.h | 7 +-
 lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 7 +-
 lib/Target/Hexagon/HexagonExpandCondsets.cpp | 9 +-
 lib/Target/Hexagon/HexagonFixupHwLoops.cpp | 7 +-
 lib/Target/Hexagon/HexagonFrameLowering.cpp | 15 +-
 lib/Target/Hexagon/HexagonFrameLowering.h | 7 +-
 lib/Target/Hexagon/HexagonGenExtract.cpp | 9 +-
 lib/Target/Hexagon/HexagonGenInsert.cpp | 11 +-
 lib/Target/Hexagon/HexagonGenMux.cpp | 11 +-
 lib/Target/Hexagon/HexagonGenPredicate.cpp | 73 +-
 lib/Target/Hexagon/HexagonHardwareLoops.cpp | 7 +-
 lib/Target/Hexagon/HexagonHazardRecognizer.cpp | 7 +-
 lib/Target/Hexagon/HexagonHazardRecognizer.h | 7 +-
 lib/Target/Hexagon/HexagonIICHVX.td | 19 +-
 lib/Target/Hexagon/HexagonIICScalar.td | 7 +-
 lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 12 +-
 lib/Target/Hexagon/HexagonISelDAGToDAG.h | 7 +-
 lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 7 +-
 lib/Target/Hexagon/HexagonISelLowering.cpp | 100 +-
 lib/Target/Hexagon/HexagonISelLowering.h | 15 +-
 lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 9 +-
 lib/Target/Hexagon/HexagonInstrFormats.td | 7 +-
 lib/Target/Hexagon/HexagonInstrFormatsV5.td | 7 +-
 lib/Target/Hexagon/HexagonInstrFormatsV60.td | 7 +-
 lib/Target/Hexagon/HexagonInstrFormatsV65.td | 7 +-
 lib/Target/Hexagon/HexagonInstrInfo.cpp | 62 +-
 lib/Target/Hexagon/HexagonInstrInfo.h | 21 +-
 lib/Target/Hexagon/HexagonIntrinsics.td | 7 +-
 lib/Target/Hexagon/HexagonIntrinsicsV5.td | 7 +-
 lib/Target/Hexagon/HexagonIntrinsicsV60.td | 7 +-
 lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 19 +-
 lib/Target/Hexagon/HexagonMCInstLower.cpp | 7 +-
 lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp | 7 +-
 lib/Target/Hexagon/HexagonMachineFunctionInfo.h | 7 +-
 lib/Target/Hexagon/HexagonMachineScheduler.cpp | 9 +-
 lib/Target/Hexagon/HexagonMachineScheduler.h | 7 +-
 lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td | 7 +-
 lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td | 7 +-
 lib/Target/Hexagon/HexagonNewValueJump.cpp | 7 +-
 lib/Target/Hexagon/HexagonOperands.td | 7 +-
 lib/Target/Hexagon/HexagonOptAddrMode.cpp | 7 +-
 lib/Target/Hexagon/HexagonOptimizeSZextends.cpp | 7 +-
 lib/Target/Hexagon/HexagonPatterns.td | 11 +-
 lib/Target/Hexagon/HexagonPatternsV65.td | 7 +-
 lib/Target/Hexagon/HexagonPeephole.cpp | 7 +-
 lib/Target/Hexagon/HexagonPseudo.td | 12 +-
 lib/Target/Hexagon/HexagonRDFOpt.cpp | 7 +-
 lib/Target/Hexagon/HexagonRegisterInfo.cpp | 9 +-
 lib/Target/Hexagon/HexagonRegisterInfo.h | 9 +-
 lib/Target/Hexagon/HexagonRegisterInfo.td | 7 +-
 lib/Target/Hexagon/HexagonSchedule.td | 7 +-
 lib/Target/Hexagon/HexagonScheduleV5.td | 7 +-
 lib/Target/Hexagon/HexagonScheduleV55.td | 7 +-
 lib/Target/Hexagon/HexagonScheduleV60.td | 7 +-
 lib/Target/Hexagon/HexagonScheduleV62.td | 7 +-
 lib/Target/Hexagon/HexagonScheduleV65.td | 7 +-
 lib/Target/Hexagon/HexagonScheduleV66.td | 7 +-
 lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp | 7 +-
 lib/Target/Hexagon/HexagonSelectionDAGInfo.h | 7 +-
 .../Hexagon/HexagonSplitConst32AndConst64.cpp | 7 +-
 lib/Target/Hexagon/HexagonSplitDouble.cpp | 11 +-
 lib/Target/Hexagon/HexagonStoreWidening.cpp | 15 +-
 lib/Target/Hexagon/HexagonSubtarget.cpp | 7 +-
 lib/Target/Hexagon/HexagonSubtarget.h | 7 +-
 lib/Target/Hexagon/HexagonTargetMachine.cpp | 8 +-
 lib/Target/Hexagon/HexagonTargetMachine.h | 7 +-
 lib/Target/Hexagon/HexagonTargetObjectFile.cpp | 14 +-
 lib/Target/Hexagon/HexagonTargetObjectFile.h | 7 +-
 lib/Target/Hexagon/HexagonTargetStreamer.h | 7 +-
 lib/Target/Hexagon/HexagonTargetTransformInfo.cpp | 12 +-
 lib/Target/Hexagon/HexagonTargetTransformInfo.h | 7 +-
 lib/Target/Hexagon/HexagonVExtract.cpp | 7 +-
 lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 7 +-
 lib/Target/Hexagon/HexagonVLIWPacketizer.h | 7 +-
 .../Hexagon/HexagonVectorLoopCarriedReuse.cpp | 222 +-
 lib/Target/Hexagon/HexagonVectorPrint.cpp | 7 +-
 .../Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 8 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h | 7 +-
 .../MCTargetDesc/HexagonELFObjectWriter.cpp | 9 +-
 .../Hexagon/MCTargetDesc/HexagonFixupKinds.h | 7 +-
 .../Hexagon/MCTargetDesc/HexagonInstPrinter.cpp | 8 +-
 .../Hexagon/MCTargetDesc/HexagonInstPrinter.h | 7 +-
 .../Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp | 7 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h | 7 +-
 .../Hexagon/MCTargetDesc/HexagonMCChecker.cpp | 8 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h | 7 +-
 .../Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp | 10 +-
 .../Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h | 14 +-
 .../Hexagon/MCTargetDesc/HexagonMCCompound.cpp | 8 +-
 .../Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp | 7 +-
 .../Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp | 9 +-
 .../Hexagon/MCTargetDesc/HexagonMCELFStreamer.h | 10 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp | 7 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h | 7 +-
 .../Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp | 8 +-
 .../Hexagon/MCTargetDesc/HexagonMCInstrInfo.h | 7 +-
 .../Hexagon/MCTargetDesc/HexagonMCShuffler.cpp | 8 +-
 .../Hexagon/MCTargetDesc/HexagonMCShuffler.h | 7 +-
 .../Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 9 +-
 .../Hexagon/MCTargetDesc/HexagonMCTargetDesc.h | 8 +-
 .../Hexagon/MCTargetDesc/HexagonShuffler.cpp | 9 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h | 9 +-
 lib/Target/Hexagon/RDFCopy.cpp | 7 +-
 lib/Target/Hexagon/RDFCopy.h | 7 +-
 lib/Target/Hexagon/RDFDeadCode.cpp | 7 +-
 lib/Target/Hexagon/RDFDeadCode.h | 7 +-
 lib/Target/Hexagon/RDFGraph.cpp | 29 +-
 lib/Target/Hexagon/RDFGraph.h | 34 +-
 lib/Target/Hexagon/RDFLiveness.cpp | 8 +-
 lib/Target/Hexagon/RDFLiveness.h | 9 +-
 lib/Target/Hexagon/RDFRegisters.cpp | 7 +-
 lib/Target/Hexagon/RDFRegisters.h | 7 +-
 .../Hexagon/TargetInfo/HexagonTargetInfo.cpp | 10 +-
 lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.h | 20 +
 lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp | 10 +-
 .../Lanai/Disassembler/LanaiDisassembler.cpp | 13 +-
 lib/Target/Lanai/Disassembler/LanaiDisassembler.h | 7 +-
 lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp | 305 -
 lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h | 66 -
 lib/Target/Lanai/Lanai.h | 15 +-
 lib/Target/Lanai/Lanai.td | 7 +-
 lib/Target/Lanai/LanaiAluCode.h | 7 +-
 lib/Target/Lanai/LanaiAsmPrinter.cpp | 19 +-
 lib/Target/Lanai/LanaiCallingConv.td | 7 +-
 lib/Target/Lanai/LanaiDelaySlotFiller.cpp | 7 +-
 lib/Target/Lanai/LanaiFrameLowering.cpp | 9 +-
 lib/Target/Lanai/LanaiFrameLowering.h | 8 +-
 lib/Target/Lanai/LanaiISelDAGToDAG.cpp | 9 +-
 lib/Target/Lanai/LanaiISelLowering.cpp | 7 +-
 lib/Target/Lanai/LanaiISelLowering.h | 7 +-
 lib/Target/Lanai/LanaiInstrFormats.td | 7 +-
 lib/Target/Lanai/LanaiInstrInfo.cpp | 24 +-
 lib/Target/Lanai/LanaiInstrInfo.h | 16 +-
 lib/Target/Lanai/LanaiInstrInfo.td | 7 +-
 lib/Target/Lanai/LanaiMCInstLower.cpp | 7 +-
 lib/Target/Lanai/LanaiMCInstLower.h | 7 +-
 lib/Target/Lanai/LanaiMachineFunctionInfo.cpp | 7 +-
 lib/Target/Lanai/LanaiMachineFunctionInfo.h | 7 +-
 lib/Target/Lanai/LanaiMemAluCombiner.cpp | 12 +-
 lib/Target/Lanai/LanaiRegisterInfo.cpp | 17 +-
 lib/Target/Lanai/LanaiRegisterInfo.h | 11 +-
 lib/Target/Lanai/LanaiRegisterInfo.td | 7 +-
 lib/Target/Lanai/LanaiSchedule.td | 7 +-
 lib/Target/Lanai/LanaiSelectionDAGInfo.cpp | 7 +-
 lib/Target/Lanai/LanaiSelectionDAGInfo.h | 7 +-
 lib/Target/Lanai/LanaiSubtarget.cpp | 7 +-
 lib/Target/Lanai/LanaiSubtarget.h | 7 +-
 lib/Target/Lanai/LanaiTargetMachine.cpp | 8 +-
 lib/Target/Lanai/LanaiTargetMachine.h | 7 +-
 lib/Target/Lanai/LanaiTargetObjectFile.cpp | 7 +-
 lib/Target/Lanai/LanaiTargetObjectFile.h | 7 +-
 lib/Target/Lanai/LanaiTargetTransformInfo.h | 7 +-
 lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp | 7 +-
 lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h | 7 +-
 .../Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp | 9 +-
 lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h | 7 +-
 lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp | 307 +
 lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h | 65 +
 lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp | 7 +-
 lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h | 7 +-
 .../Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp | 9 +-
 lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp | 7 +-
 lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h | 7 +-
 .../Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp | 10 +-
 lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h | 9 +-
 lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp | 13 +-
 lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h | 20 +
 lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp | 8 +-
 .../MSP430/Disassembler/MSP430Disassembler.cpp | 8 +-
 .../MSP430/InstPrinter/MSP430InstPrinter.cpp | 138 -
 lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h | 50 -
 .../MSP430/MCTargetDesc/MSP430AsmBackend.cpp | 7 +-
 .../MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp | 7 +-
 .../MSP430/MCTargetDesc/MSP430ELFStreamer.cpp | 7 +-
 lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h | 7 +-
 .../MSP430/MCTargetDesc/MSP430InstPrinter.cpp | 137 +
 lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h | 49 +
 lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp | 8 +-
 lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h | 7 +-
 .../MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp | 7 +-
 .../MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp | 10 +-
 .../MSP430/MCTargetDesc/MSP430MCTargetDesc.h | 9 +-
 lib/Target/MSP430/MSP430.h | 7 +-
 lib/Target/MSP430/MSP430.td | 7 +-
 lib/Target/MSP430/MSP430AsmPrinter.cpp | 85 +-
 lib/Target/MSP430/MSP430BranchSelector.cpp | 7 +-
 lib/Target/MSP430/MSP430CallingConv.td | 7 +-
 lib/Target/MSP430/MSP430FrameLowering.cpp | 7 +-
 lib/Target/MSP430/MSP430FrameLowering.h | 7 +-
 lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 7 +-
 lib/Target/MSP430/MSP430ISelLowering.cpp | 7 +-
 lib/Target/MSP430/MSP430ISelLowering.h | 7 +-
 lib/Target/MSP430/MSP430InstrFormats.td | 7 +-
 lib/Target/MSP430/MSP430InstrInfo.cpp | 10 +-
 lib/Target/MSP430/MSP430InstrInfo.h | 7 +-
 lib/Target/MSP430/MSP430InstrInfo.td | 7 +-
 lib/Target/MSP430/MSP430MCInstLower.cpp | 7 +-
 lib/Target/MSP430/MSP430MCInstLower.h | 7 +-
 lib/Target/MSP430/MSP430MachineFunctionInfo.cpp | 7 +-
 lib/Target/MSP430/MSP430MachineFunctionInfo.h | 7 +-
 lib/Target/MSP430/MSP430RegisterInfo.cpp | 9 +-
 lib/Target/MSP430/MSP430RegisterInfo.h | 9 +-
 lib/Target/MSP430/MSP430RegisterInfo.td | 7 +-
 lib/Target/MSP430/MSP430Subtarget.cpp | 7 +-
 lib/Target/MSP430/MSP430Subtarget.h | 7 +-
 lib/Target/MSP430/MSP430TargetMachine.cpp | 8 +-
 lib/Target/MSP430/MSP430TargetMachine.h | 7 +-
 lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp | 10 +-
 lib/Target/MSP430/TargetInfo/MSP430TargetInfo.h | 20 +
 lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 409 +-
 lib/Target/Mips/Disassembler/MipsDisassembler.cpp | 17 +-
 lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp | 288 -
 lib/Target/Mips/InstPrinter/MipsInstPrinter.h | 113 -
 .../Mips/MCTargetDesc/MipsABIFlagsSection.cpp | 7 +-
 lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h | 7 +-
 lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp | 14 +-
 lib/Target/Mips/MCTargetDesc/MipsABIInfo.h | 7 +-
 lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 11 +-
 lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 7 +-
 lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h | 12 +-
 .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 9 +-
 lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp | 9 +-
 lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h | 10 +-
 lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h | 12 +-
 lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp | 287 +
 lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h | 112 +
 lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp | 7 +-
 lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h | 7 +-
 lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 15 +-
 lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h | 7 +-
 lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 21 +-
 lib/Target/Mips/MCTargetDesc/MipsMCExpr.h | 7 +-
 lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h | 7 +-
 lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 12 +-
 lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h | 12 +-
 .../Mips/MCTargetDesc/MipsNaClELFStreamer.cpp | 11 +-
 lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp | 7 +-
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 83 +-
 lib/Target/Mips/MicroMips32r6InstrFormats.td | 7 +-
 lib/Target/Mips/MicroMips32r6InstrInfo.td | 32 +-
 lib/Target/Mips/MicroMipsDSPInstrFormats.td | 7 +-
 lib/Target/Mips/MicroMipsDSPInstrInfo.td | 7 +-
 lib/Target/Mips/MicroMipsInstrFPU.td | 19 +-
 lib/Target/Mips/MicroMipsInstrFormats.td | 7 +-
 lib/Target/Mips/MicroMipsInstrInfo.td | 36 +-
 lib/Target/Mips/MicroMipsSizeReduction.cpp | 7 +-
 lib/Target/Mips/Mips.h | 7 +-
 lib/Target/Mips/Mips.td | 13 +-
 lib/Target/Mips/Mips16FrameLowering.cpp | 7 +-
 lib/Target/Mips/Mips16FrameLowering.h | 7 +-
 lib/Target/Mips/Mips16HardFloat.cpp | 9 +-
 lib/Target/Mips/Mips16HardFloatInfo.cpp | 7 +-
 lib/Target/Mips/Mips16HardFloatInfo.h | 7 +-
 lib/Target/Mips/Mips16ISelDAGToDAG.cpp | 7 +-
 lib/Target/Mips/Mips16ISelDAGToDAG.h | 7 +-
 lib/Target/Mips/Mips16ISelLowering.cpp | 17 +-
 lib/Target/Mips/Mips16ISelLowering.h | 8 +-
 lib/Target/Mips/Mips16InstrFormats.td | 7 +-
 lib/Target/Mips/Mips16InstrInfo.cpp | 7 +-
 lib/Target/Mips/Mips16InstrInfo.h | 7 +-
 lib/Target/Mips/Mips16InstrInfo.td | 15 +-
 lib/Target/Mips/Mips16RegisterInfo.cpp | 7 +-
 lib/Target/Mips/Mips16RegisterInfo.h | 7 +-
 lib/Target/Mips/Mips32r6InstrFormats.td | 7 +-
 lib/Target/Mips/Mips32r6InstrInfo.td | 12 +-
 lib/Target/Mips/Mips64InstrInfo.td | 92 +-
 lib/Target/Mips/Mips64r6InstrInfo.td | 10 +-
 lib/Target/Mips/MipsAnalyzeImmediate.cpp | 7 +-
 lib/Target/Mips/MipsAnalyzeImmediate.h | 7 +-
 lib/Target/Mips/MipsAsmPrinter.cpp | 63 +-
 lib/Target/Mips/MipsAsmPrinter.h | 13 +-
 lib/Target/Mips/MipsBranchExpansion.cpp | 7 +-
 lib/Target/Mips/MipsCCState.cpp | 7 +-
 lib/Target/Mips/MipsCCState.h | 7 +-
 lib/Target/Mips/MipsCallLowering.cpp | 265 +-
 lib/Target/Mips/MipsCallLowering.h | 31 +-
 lib/Target/Mips/MipsCallingConv.td | 7 +-
 lib/Target/Mips/MipsCondMov.td | 29 +-
 lib/Target/Mips/MipsConstantIslandPass.cpp | 15 +-
 lib/Target/Mips/MipsDSPInstrFormats.td | 7 +-
 lib/Target/Mips/MipsDSPInstrInfo.td | 12 +-
 lib/Target/Mips/MipsDelaySlotFiller.cpp | 45 +-
 lib/Target/Mips/MipsEVAInstrFormats.td | 7 +-
 lib/Target/Mips/MipsEVAInstrInfo.td | 7 +-
 lib/Target/Mips/MipsExpandPseudo.cpp | 7 +-
 lib/Target/Mips/MipsFastISel.cpp | 55 +-
 lib/Target/Mips/MipsFrameLowering.cpp | 7 +-
 lib/Target/Mips/MipsFrameLowering.h | 7 +-
 lib/Target/Mips/MipsISelDAGToDAG.cpp | 7 +-
 lib/Target/Mips/MipsISelDAGToDAG.h | 7 +-
 lib/Target/Mips/MipsISelLowering.cpp | 175 +-
 lib/Target/Mips/MipsISelLowering.h | 21 +-
 lib/Target/Mips/MipsInstrFPU.td | 26 +-
 lib/Target/Mips/MipsInstrFormats.td | 8 +-
 lib/Target/Mips/MipsInstrInfo.cpp | 23 +-
 lib/Target/Mips/MipsInstrInfo.h | 7 +-
 lib/Target/Mips/MipsInstrInfo.td | 114 +-
 lib/Target/Mips/MipsInstructionSelector.cpp | 447 +-
 lib/Target/Mips/MipsLegalizerInfo.cpp | 93 +-
 lib/Target/Mips/MipsLegalizerInfo.h | 7 +-
 lib/Target/Mips/MipsMCInstLower.cpp | 9 +-
 lib/Target/Mips/MipsMCInstLower.h | 7 +-
 lib/Target/Mips/MipsMSAInstrFormats.td | 7 +-
 lib/Target/Mips/MipsMSAInstrInfo.td | 90 +-
 lib/Target/Mips/MipsMTInstrFormats.td | 7 +-
 lib/Target/Mips/MipsMTInstrInfo.td | 7 +-
 lib/Target/Mips/MipsMachineFunction.cpp | 105 +-
 lib/Target/Mips/MipsMachineFunction.h | 14 +-
 lib/Target/Mips/MipsOptimizePICCall.cpp | 7 +-
 lib/Target/Mips/MipsOptionRecord.h | 7 +-
 lib/Target/Mips/MipsOs16.cpp | 7 +-
 lib/Target/Mips/MipsPreLegalizerCombiner.cpp | 18 +-
 lib/Target/Mips/MipsRegisterBankInfo.cpp | 598 +-
 lib/Target/Mips/MipsRegisterBankInfo.h | 132 +-
 lib/Target/Mips/MipsRegisterBanks.td | 9 +-
 lib/Target/Mips/MipsRegisterInfo.cpp | 40 +-
 lib/Target/Mips/MipsRegisterInfo.h | 9 +-
 lib/Target/Mips/MipsRegisterInfo.td | 54 +-
 lib/Target/Mips/MipsSEFrameLowering.cpp | 7 +-
 lib/Target/Mips/MipsSEFrameLowering.h | 7 +-
 lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 113 +-
 lib/Target/Mips/MipsSEISelDAGToDAG.h | 11 +-
 lib/Target/Mips/MipsSEISelLowering.cpp | 126 +-
 lib/Target/Mips/MipsSEISelLowering.h | 15 +-
 lib/Target/Mips/MipsSEInstrInfo.cpp | 12 +-
 lib/Target/Mips/MipsSEInstrInfo.h | 7 +-
 lib/Target/Mips/MipsSERegisterInfo.cpp | 7 +-
 lib/Target/Mips/MipsSERegisterInfo.h | 7 +-
 lib/Target/Mips/MipsSchedule.td | 7 +-
 lib/Target/Mips/MipsScheduleGeneric.td | 934 +-
 lib/Target/Mips/MipsScheduleP5600.td | 67 +-
 lib/Target/Mips/MipsSubtarget.cpp | 21 +-
 lib/Target/Mips/MipsSubtarget.h | 11 +-
 lib/Target/Mips/MipsTargetMachine.cpp | 17 +-
 lib/Target/Mips/MipsTargetMachine.h | 13 +-
 lib/Target/Mips/MipsTargetObjectFile.cpp | 7 +-
 lib/Target/Mips/MipsTargetObjectFile.h | 7 +-
 lib/Target/Mips/MipsTargetStreamer.h | 11 +-
 lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp | 10 +-
 lib/Target/Mips/TargetInfo/MipsTargetInfo.h | 23 +
 lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp | 296 -
 lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h | 52 -
 lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h | 7 +-
 lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 309 +
 lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 53 +
 lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 16 +-
 lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h | 7 +-
 .../NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp | 10 +-
 lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h | 10 +-
 .../NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp | 26 +-
 .../NVPTX/MCTargetDesc/NVPTXTargetStreamer.h | 10 +-
 lib/Target/NVPTX/ManagedStringPool.h | 7 +-
 lib/Target/NVPTX/NVPTX.h | 20 +-
 lib/Target/NVPTX/NVPTX.td | 9 +-
 lib/Target/NVPTX/NVPTXAllocaHoisting.cpp | 7 +-
 lib/Target/NVPTX/NVPTXAllocaHoisting.h | 7 +-
 lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 83 +-
 lib/Target/NVPTX/NVPTXAsmPrinter.h | 18 +-
 lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp | 7 +-
 lib/Target/NVPTX/NVPTXFrameLowering.cpp | 7 +-
 lib/Target/NVPTX/NVPTXFrameLowering.h | 7 +-
 lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 7 +-
 lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 14 +-
 lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 8 +-
 lib/Target/NVPTX/NVPTXISelLowering.cpp | 283 +-
 lib/Target/NVPTX/NVPTXISelLowering.h | 11 +-
 lib/Target/NVPTX/NVPTXImageOptimizer.cpp | 7 +-
 lib/Target/NVPTX/NVPTXInstrFormats.td | 7 +-
 lib/Target/NVPTX/NVPTXInstrInfo.cpp | 7 +-
 lib/Target/NVPTX/NVPTXInstrInfo.h | 7 +-
 lib/Target/NVPTX/NVPTXInstrInfo.td | 23 +-
 lib/Target/NVPTX/NVPTXIntrinsics.td | 658 +-
 lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp | 7 +-
 lib/Target/NVPTX/NVPTXLowerAggrCopies.h | 7 +-
 lib/Target/NVPTX/NVPTXLowerAlloca.cpp | 8 +-
 lib/Target/NVPTX/NVPTXLowerArgs.cpp | 11 +-
 lib/Target/NVPTX/NVPTXMCExpr.cpp | 7 +-
 lib/Target/NVPTX/NVPTXMCExpr.h | 7 +-
 lib/Target/NVPTX/NVPTXMachineFunctionInfo.h | 7 +-
 lib/Target/NVPTX/NVPTXPeephole.cpp | 7 +-
 lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 11 +-
 lib/Target/NVPTX/NVPTXProxyRegErasure.cpp | 7 +-
 lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 9 +-
 lib/Target/NVPTX/NVPTXRegisterInfo.h | 9 +-
 lib/Target/NVPTX/NVPTXRegisterInfo.td | 7 +-
 lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp | 8 +-
 lib/Target/NVPTX/NVPTXSubtarget.cpp | 7 +-
 lib/Target/NVPTX/NVPTXSubtarget.h | 7 +-
 lib/Target/NVPTX/NVPTXTargetMachine.cpp | 27 +-
 lib/Target/NVPTX/NVPTXTargetMachine.h | 7 +-
 lib/Target/NVPTX/NVPTXTargetObjectFile.h | 7 +-
 lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 8 +-
 lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 9 +-
 lib/Target/NVPTX/NVPTXUtilities.cpp | 8 +-
 lib/Target/NVPTX/NVPTXUtilities.h | 7 +-
 lib/Target/NVPTX/NVVMIntrRange.cpp | 7 +-
 lib/Target/NVPTX/NVVMReflect.cpp | 7 +-
 lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp | 10 +-
 lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h | 21 +
 lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 15 +-
 .../PowerPC/Disassembler/PPCDisassembler.cpp | 22 +-
 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 532 --
 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h | 77 -
 lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 117 +-
 .../PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 10 +-
 lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h | 7 +-
 lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp | 543 ++
 lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h | 76 +
 lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 13 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h | 17 +-
 .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 9 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h | 14 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp | 7 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h | 7 +-
 .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 67 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 14 +-
 .../PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp | 7 +-
 lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp | 7 +-
 lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h | 7 +-
 .../PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp | 29 +
 lib/Target/PowerPC/P9InstrResources.td | 371 +-
 lib/Target/PowerPC/PPC.h | 22 +-
 lib/Target/PowerPC/PPC.td | 38 +-
 lib/Target/PowerPC/PPCAsmPrinter.cpp | 223 +-
 lib/Target/PowerPC/PPCBoolRetToInt.cpp | 7 +-
 lib/Target/PowerPC/PPCBranchCoalescing.cpp | 11 +-
 lib/Target/PowerPC/PPCBranchSelector.cpp | 262 +-
 lib/Target/PowerPC/PPCCCState.cpp | 7 +-
 lib/Target/PowerPC/PPCCCState.h | 7 +-
 lib/Target/PowerPC/PPCCTRLoops.cpp | 585 +-
 lib/Target/PowerPC/PPCCallingConv.cpp | 162 +
 lib/Target/PowerPC/PPCCallingConv.h | 36 +-
 lib/Target/PowerPC/PPCCallingConv.td | 50 +-
 lib/Target/PowerPC/PPCEarlyReturn.cpp | 19 +-
 lib/Target/PowerPC/PPCExpandISEL.cpp | 7 +-
 lib/Target/PowerPC/PPCFastISel.cpp | 108 +-
 lib/Target/PowerPC/PPCFrameLowering.cpp | 211 +-
 lib/Target/PowerPC/PPCFrameLowering.h | 31 +-
 lib/Target/PowerPC/PPCHazardRecognizers.cpp | 10 +-
 lib/Target/PowerPC/PPCHazardRecognizers.h | 7 +-
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 94 +-
 lib/Target/PowerPC/PPCISelLowering.cpp | 1087 ++-
 lib/Target/PowerPC/PPCISelLowering.h | 117 +-
 lib/Target/PowerPC/PPCInstr64Bit.td | 66 +-
 lib/Target/PowerPC/PPCInstrAltivec.td | 37 +-
 lib/Target/PowerPC/PPCInstrBuilder.h | 7 +-
 lib/Target/PowerPC/PPCInstrFormats.td | 21 +-
 lib/Target/PowerPC/PPCInstrHTM.td | 49 +-
 lib/Target/PowerPC/PPCInstrInfo.cpp | 388 +-
 lib/Target/PowerPC/PPCInstrInfo.h | 100 +-
 lib/Target/PowerPC/PPCInstrInfo.td | 84 +-
 lib/Target/PowerPC/PPCInstrQPX.td | 7 +-
 lib/Target/PowerPC/PPCInstrSPE.td | 19 +-
 lib/Target/PowerPC/PPCInstrVSX.td | 531 +-
 lib/Target/PowerPC/PPCLoopPreIncPrep.cpp | 15 +-
 lib/Target/PowerPC/PPCMCInstLower.cpp | 17 +-
 lib/Target/PowerPC/PPCMIPeephole.cpp | 186 +-
 lib/Target/PowerPC/PPCMachineFunctionInfo.cpp | 7 +-
 lib/Target/PowerPC/PPCMachineFunctionInfo.h | 16 +-
 lib/Target/PowerPC/PPCMachineScheduler.cpp | 83 +
 lib/Target/PowerPC/PPCMachineScheduler.h | 49 +
 lib/Target/PowerPC/PPCPerfectShuffle.h | 7 +-
 lib/Target/PowerPC/PPCPfmCounters.td | 7 +-
 lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 7 +-
 lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 11 +-
 lib/Target/PowerPC/PPCReduceCRLogicals.cpp | 52 +-
 lib/Target/PowerPC/PPCRegisterInfo.cpp | 217 +-
 lib/Target/PowerPC/PPCRegisterInfo.h | 18 +-
 lib/Target/PowerPC/PPCRegisterInfo.td | 9 +-
 lib/Target/PowerPC/PPCSchedule.td | 8 +-
 lib/Target/PowerPC/PPCSchedule440.td | 7 +-
 lib/Target/PowerPC/PPCScheduleA2.td | 7 +-
 lib/Target/PowerPC/PPCScheduleE500.td | 7 +-
 lib/Target/PowerPC/PPCScheduleE500mc.td | 7 +-
 lib/Target/PowerPC/PPCScheduleE5500.td | 7 +-
 lib/Target/PowerPC/PPCScheduleG3.td | 7 +-
 lib/Target/PowerPC/PPCScheduleG4.td | 7 +-
 lib/Target/PowerPC/PPCScheduleG4Plus.td | 7 +-
 lib/Target/PowerPC/PPCScheduleG5.td | 7 +-
 lib/Target/PowerPC/PPCScheduleP7.td | 7 +-
 lib/Target/PowerPC/PPCScheduleP8.td | 7 +-
 lib/Target/PowerPC/PPCScheduleP9.td | 77 +-
 lib/Target/PowerPC/PPCSubtarget.cpp | 29 +-
 lib/Target/PowerPC/PPCSubtarget.h | 28 +-
 lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 11 +-
 lib/Target/PowerPC/PPCTOCRegDeps.cpp | 11 +-
 lib/Target/PowerPC/PPCTargetMachine.cpp | 74 +-
 lib/Target/PowerPC/PPCTargetMachine.h | 11 +-
 lib/Target/PowerPC/PPCTargetObjectFile.cpp | 7 +-
 lib/Target/PowerPC/PPCTargetObjectFile.h | 7 +-
 lib/Target/PowerPC/PPCTargetStreamer.h | 7 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 449 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.h | 21 +-
 lib/Target/PowerPC/PPCVSXCopy.cpp | 11 +-
 lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 7 +-
 lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 12 +-
 lib/Target/PowerPC/README_P9.txt | 8 +-
 .../PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 10 +-
 lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h | 22 +
 lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 393 +-
 .../RISCV/Disassembler/RISCVDisassembler.cpp | 20 +-
 lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp | 115 -
 lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h | 55 -
 lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 93 +-
 lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h | 54 +-
 .../RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp | 70 +-
 lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp | 32 +-
 lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h | 7 +-
 lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h | 36 +-
 lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp | 114 +
 lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h | 54 +
 lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp | 8 +-
 lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h | 7 +-
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 150 +-
 lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 120 +-
 lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h | 23 +-
 .../RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 18 +-
 lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h | 10 +-
 .../RISCV/MCTargetDesc/RISCVTargetStreamer.cpp | 7 +-
 .../RISCV/MCTargetDesc/RISCVTargetStreamer.h | 7 +-
 lib/Target/RISCV/RISCV.h | 7 +-
 lib/Target/RISCV/RISCV.td | 25 +-
 lib/Target/RISCV/RISCVAsmPrinter.cpp | 65 +-
 lib/Target/RISCV/RISCVCallingConv.td | 18 +-
 lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 196 +-
 lib/Target/RISCV/RISCVFrameLowering.cpp | 80 +-
 lib/Target/RISCV/RISCVFrameLowering.h | 7 +-
 lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 15 +-
 lib/Target/RISCV/RISCVISelLowering.cpp | 1185 ++-
 lib/Target/RISCV/RISCVISelLowering.h | 86 +-
 lib/Target/RISCV/RISCVInstrFormats.td | 36 +-
 lib/Target/RISCV/RISCVInstrFormatsC.td | 7 +-
 lib/Target/RISCV/RISCVInstrInfo.cpp | 36 +-
 lib/Target/RISCV/RISCVInstrInfo.h | 9 +-
 lib/Target/RISCV/RISCVInstrInfo.td | 320 +-
 lib/Target/RISCV/RISCVInstrInfoA.td | 89 +-
 lib/Target/RISCV/RISCVInstrInfoC.td | 57 +-
 lib/Target/RISCV/RISCVInstrInfoD.td | 41 +-
 lib/Target/RISCV/RISCVInstrInfoF.td | 97 +-
 lib/Target/RISCV/RISCVInstrInfoM.td | 46 +-
 lib/Target/RISCV/RISCVMCInstLower.cpp | 37 +-
 lib/Target/RISCV/RISCVMachineFunctionInfo.h | 9 +-
 lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 7 +-
 lib/Target/RISCV/RISCVRegisterInfo.cpp | 53 +-
 lib/Target/RISCV/RISCVRegisterInfo.h | 9 +-
 lib/Target/RISCV/RISCVRegisterInfo.td | 9 +-
 lib/Target/RISCV/RISCVSubtarget.cpp | 22 +-
 lib/Target/RISCV/RISCVSubtarget.h | 21 +-
 lib/Target/RISCV/RISCVSystemOperands.td | 27 +-
 lib/Target/RISCV/RISCVTargetMachine.cpp | 21 +-
 lib/Target/RISCV/RISCVTargetMachine.h | 9 +-
 lib/Target/RISCV/RISCVTargetObjectFile.cpp | 103 +-
 lib/Target/RISCV/RISCVTargetObjectFile.h | 31 +-
 lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 92 +
 lib/Target/RISCV/RISCVTargetTransformInfo.h | 52 +
 lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp | 14 +-
 lib/Target/RISCV/TargetInfo/RISCVTargetInfo.h | 21 +
 lib/Target/RISCV/Utils/RISCVBaseInfo.cpp | 71 +
 lib/Target/RISCV/Utils/RISCVBaseInfo.h | 44 +-
 lib/Target/RISCV/Utils/RISCVMatInt.cpp | 32 +-
 lib/Target/RISCV/Utils/RISCVMatInt.h | 16 +-
 lib/Target/Sparc/AsmParser/SparcAsmParser.cpp | 11 +-
 lib/Target/Sparc/DelaySlotFiller.cpp | 7 +-
 .../Sparc/Disassembler/SparcDisassembler.cpp | 14 +-
 lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp | 220 -
 lib/Target/Sparc/InstPrinter/SparcInstPrinter.h | 57 -
 lib/Target/Sparc/LeonFeatures.td | 7 +-
 lib/Target/Sparc/LeonPasses.cpp | 7 +-
 lib/Target/Sparc/LeonPasses.h | 7 +-
 lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 7 +-
 .../Sparc/MCTargetDesc/SparcELFObjectWriter.cpp | 7 +-
 lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h | 7 +-
 lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp | 219 +
 lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h | 56 +
 lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp | 7 +-
 lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h | 7 +-
 .../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 14 +-
 lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 7 +-
 lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h | 7 +-
 .../Sparc/MCTargetDesc/SparcMCTargetDesc.cpp | 10 +-
 lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h | 11 +-
 .../Sparc/MCTargetDesc/SparcTargetStreamer.cpp | 9 +-
 .../Sparc/MCTargetDesc/SparcTargetStreamer.h | 7 +-
 lib/Target/Sparc/Sparc.h | 7 +-
 lib/Target/Sparc/Sparc.td | 7 +-
 lib/Target/Sparc/SparcAsmPrinter.cpp | 23 +-
 lib/Target/Sparc/SparcCallingConv.td | 7 +-
 lib/Target/Sparc/SparcFrameLowering.cpp | 7 +-
 lib/Target/Sparc/SparcFrameLowering.h | 7 +-
 lib/Target/Sparc/SparcISelDAGToDAG.cpp | 12 +-
 lib/Target/Sparc/SparcISelLowering.cpp | 10 +-
 lib/Target/Sparc/SparcISelLowering.h | 7 +-
 lib/Target/Sparc/SparcInstr64Bit.td | 7 +-
 lib/Target/Sparc/SparcInstrAliases.td | 7 +-
 lib/Target/Sparc/SparcInstrFormats.td | 7 +-
 lib/Target/Sparc/SparcInstrInfo.cpp | 7 +-
 lib/Target/Sparc/SparcInstrInfo.h | 7 +-
 lib/Target/Sparc/SparcInstrInfo.td | 7 +-
 lib/Target/Sparc/SparcInstrVIS.td | 7 +-
 lib/Target/Sparc/SparcMCInstLower.cpp | 7 +-
 lib/Target/Sparc/SparcMachineFunctionInfo.cpp | 7 +-
 lib/Target/Sparc/SparcMachineFunctionInfo.h | 7 +-
lib/Target/Sparc/SparcRegisterInfo.cpp | 15 +- lib/Target/Sparc/SparcRegisterInfo.h | 9 +- lib/Target/Sparc/SparcRegisterInfo.td | 7 +- lib/Target/Sparc/SparcSchedule.td | 7 +- lib/Target/Sparc/SparcSubtarget.cpp | 7 +- lib/Target/Sparc/SparcSubtarget.h | 7 +- lib/Target/Sparc/SparcTargetMachine.cpp | 12 +- lib/Target/Sparc/SparcTargetMachine.h | 7 +- lib/Target/Sparc/SparcTargetObjectFile.cpp | 7 +- lib/Target/Sparc/SparcTargetObjectFile.h | 7 +- lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp | 10 +- lib/Target/Sparc/TargetInfo/SparcTargetInfo.h | 22 + lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 35 +- .../SystemZ/Disassembler/SystemZDisassembler.cpp | 8 +- .../SystemZ/InstPrinter/SystemZInstPrinter.cpp | 234 - .../SystemZ/InstPrinter/SystemZInstPrinter.h | 78 - .../SystemZ/MCTargetDesc/SystemZInstPrinter.cpp | 233 + .../SystemZ/MCTargetDesc/SystemZInstPrinter.h | 77 + .../SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp | 7 +- .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 7 +- lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h | 7 +- .../SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 14 +- lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h | 7 +- .../SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp | 11 +- .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp | 11 +- .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.h | 9 +- lib/Target/SystemZ/SystemZ.h | 8 +- lib/Target/SystemZ/SystemZ.td | 7 +- lib/Target/SystemZ/SystemZAsmPrinter.cpp | 70 +- lib/Target/SystemZ/SystemZAsmPrinter.h | 13 +- lib/Target/SystemZ/SystemZCallingConv.cpp | 7 +- lib/Target/SystemZ/SystemZCallingConv.h | 7 +- lib/Target/SystemZ/SystemZCallingConv.td | 7 +- lib/Target/SystemZ/SystemZConstantPoolValue.cpp | 7 +- lib/Target/SystemZ/SystemZConstantPoolValue.h | 7 +- lib/Target/SystemZ/SystemZElimCompare.cpp | 16 +- lib/Target/SystemZ/SystemZExpandPseudo.cpp | 7 +- lib/Target/SystemZ/SystemZFeatures.td | 58 +- lib/Target/SystemZ/SystemZFrameLowering.cpp | 7 +- lib/Target/SystemZ/SystemZFrameLowering.h | 7 +- lib/Target/SystemZ/SystemZHazardRecognizer.cpp | 7 +- lib/Target/SystemZ/SystemZHazardRecognizer.h | 7 +- lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 109 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 816 +- lib/Target/SystemZ/SystemZISelLowering.h | 44 +- lib/Target/SystemZ/SystemZInstrBuilder.h | 7 +- lib/Target/SystemZ/SystemZInstrDFP.td | 99 +- lib/Target/SystemZ/SystemZInstrFP.td | 302 +- lib/Target/SystemZ/SystemZInstrFormats.td | 378 +- lib/Target/SystemZ/SystemZInstrHFP.td | 7 +- lib/Target/SystemZ/SystemZInstrInfo.cpp | 306 +- lib/Target/SystemZ/SystemZInstrInfo.h | 23 +- lib/Target/SystemZ/SystemZInstrInfo.td | 150 +- lib/Target/SystemZ/SystemZInstrSystem.td | 7 +- lib/Target/SystemZ/SystemZInstrVector.td | 555 +- lib/Target/SystemZ/SystemZLDCleanup.cpp | 7 +- lib/Target/SystemZ/SystemZLongBranch.cpp | 7 +- lib/Target/SystemZ/SystemZMCInstLower.cpp | 7 +- lib/Target/SystemZ/SystemZMCInstLower.h | 7 +- lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp | 7 +- lib/Target/SystemZ/SystemZMachineFunctionInfo.h | 7 +- lib/Target/SystemZ/SystemZMachineScheduler.cpp | 7 +- lib/Target/SystemZ/SystemZMachineScheduler.h | 7 +- lib/Target/SystemZ/SystemZOperands.td | 27 +- lib/Target/SystemZ/SystemZOperators.td | 105 +- lib/Target/SystemZ/SystemZPatterns.td | 7 +- lib/Target/SystemZ/SystemZPostRewrite.cpp | 124 + lib/Target/SystemZ/SystemZProcessors.td | 9 +- lib/Target/SystemZ/SystemZRegisterInfo.cpp | 123 +- lib/Target/SystemZ/SystemZRegisterInfo.h | 9 +- lib/Target/SystemZ/SystemZRegisterInfo.td | 14 +- 
lib/Target/SystemZ/SystemZSchedule.td | 8 +- lib/Target/SystemZ/SystemZScheduleArch13.td | 1695 ++++ lib/Target/SystemZ/SystemZScheduleZ13.td | 18 +- lib/Target/SystemZ/SystemZScheduleZ14.td | 18 +- lib/Target/SystemZ/SystemZScheduleZ196.td | 7 +- lib/Target/SystemZ/SystemZScheduleZEC12.td | 7 +- lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 25 +- lib/Target/SystemZ/SystemZSelectionDAGInfo.h | 7 +- lib/Target/SystemZ/SystemZShortenInst.cpp | 62 +- lib/Target/SystemZ/SystemZSubtarget.cpp | 10 +- lib/Target/SystemZ/SystemZSubtarget.h | 37 +- lib/Target/SystemZ/SystemZTDC.cpp | 11 +- lib/Target/SystemZ/SystemZTargetMachine.cpp | 22 +- lib/Target/SystemZ/SystemZTargetMachine.h | 7 +- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 39 +- lib/Target/SystemZ/SystemZTargetTransformInfo.h | 7 +- .../SystemZ/TargetInfo/SystemZTargetInfo.cpp | 9 +- lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h | 20 + lib/Target/Target.cpp | 7 +- lib/Target/TargetIntrinsicInfo.cpp | 7 +- lib/Target/TargetLoweringObjectFile.cpp | 8 +- lib/Target/TargetMachine.cpp | 22 +- lib/Target/TargetMachineC.cpp | 7 +- .../WebAssembly/AsmParser/WebAssemblyAsmParser.cpp | 278 +- .../Disassembler/WebAssemblyDisassembler.cpp | 58 +- .../InstPrinter/WebAssemblyInstPrinter.cpp | 310 - .../InstPrinter/WebAssemblyInstPrinter.h | 66 - .../MCTargetDesc/WebAssemblyAsmBackend.cpp | 20 +- .../MCTargetDesc/WebAssemblyFixupKinds.h | 13 +- .../MCTargetDesc/WebAssemblyInstPrinter.cpp | 296 + .../MCTargetDesc/WebAssemblyInstPrinter.h | 65 + .../MCTargetDesc/WebAssemblyMCAsmInfo.cpp | 9 +- .../MCTargetDesc/WebAssemblyMCAsmInfo.h | 7 +- .../MCTargetDesc/WebAssemblyMCCodeEmitter.cpp | 35 +- .../MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 24 +- .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 302 +- .../MCTargetDesc/WebAssemblyTargetStreamer.cpp | 24 +- .../MCTargetDesc/WebAssemblyTargetStreamer.h | 20 +- .../MCTargetDesc/WebAssemblyWasmObjectWriter.cpp | 109 +- lib/Target/WebAssembly/README.txt | 2 +- .../TargetInfo/WebAssemblyTargetInfo.cpp | 10 +- .../WebAssembly/TargetInfo/WebAssemblyTargetInfo.h | 26 + lib/Target/WebAssembly/WebAssembly.h | 13 +- lib/Target/WebAssembly/WebAssembly.td | 29 +- .../WebAssemblyAddMissingPrototypes.cpp | 89 +- lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp | 11 +- lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 186 +- lib/Target/WebAssembly/WebAssemblyAsmPrinter.h | 16 +- lib/Target/WebAssembly/WebAssemblyCFGSort.cpp | 54 +- lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp | 931 +- .../WebAssembly/WebAssemblyCallIndirectFixup.cpp | 37 +- .../WebAssembly/WebAssemblyDebugValueManager.cpp | 7 +- .../WebAssembly/WebAssemblyDebugValueManager.h | 7 +- .../WebAssemblyEHRestoreStackPointer.cpp | 87 - .../WebAssembly/WebAssemblyExceptionInfo.cpp | 21 +- lib/Target/WebAssembly/WebAssemblyExceptionInfo.h | 7 +- .../WebAssembly/WebAssemblyExplicitLocals.cpp | 55 +- lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 183 +- .../WebAssembly/WebAssemblyFixFunctionBitcasts.cpp | 79 +- .../WebAssemblyFixIrreducibleControlFlow.cpp | 616 +- .../WebAssembly/WebAssemblyFrameLowering.cpp | 14 +- lib/Target/WebAssembly/WebAssemblyFrameLowering.h | 7 +- lib/Target/WebAssembly/WebAssemblyISD.def | 14 +- lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp | 168 +- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 556 +- lib/Target/WebAssembly/WebAssemblyISelLowering.h | 21 +- lib/Target/WebAssembly/WebAssemblyInstrAtomics.td | 546 +- .../WebAssembly/WebAssemblyInstrBulkMemory.td | 71 + 
lib/Target/WebAssembly/WebAssemblyInstrCall.td | 202 +- lib/Target/WebAssembly/WebAssemblyInstrControl.td | 93 +- lib/Target/WebAssembly/WebAssemblyInstrConv.td | 7 +- .../WebAssembly/WebAssemblyInstrExceptRef.td | 27 - lib/Target/WebAssembly/WebAssemblyInstrFloat.td | 7 +- lib/Target/WebAssembly/WebAssemblyInstrFormats.td | 10 +- lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp | 62 +- lib/Target/WebAssembly/WebAssemblyInstrInfo.h | 16 +- lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 129 +- lib/Target/WebAssembly/WebAssemblyInstrInteger.td | 14 +- lib/Target/WebAssembly/WebAssemblyInstrMemory.td | 95 +- lib/Target/WebAssembly/WebAssemblyInstrRef.td | 25 + lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 215 +- .../WebAssembly/WebAssemblyLateEHPrepare.cpp | 467 +- .../WebAssembly/WebAssemblyLowerBrUnless.cpp | 7 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 95 +- .../WebAssembly/WebAssemblyLowerGlobalDtors.cpp | 30 +- lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp | 118 +- lib/Target/WebAssembly/WebAssemblyMCInstLower.h | 12 +- .../WebAssembly/WebAssemblyMachineFunctionInfo.cpp | 40 +- .../WebAssembly/WebAssemblyMachineFunctionInfo.h | 47 +- .../WebAssembly/WebAssemblyMemIntrinsicResults.cpp | 23 +- .../WebAssemblyOptimizeLiveIntervals.cpp | 13 +- .../WebAssembly/WebAssemblyOptimizeReturned.cpp | 17 +- lib/Target/WebAssembly/WebAssemblyPeephole.cpp | 39 +- .../WebAssemblyPrepareForLiveIntervals.cpp | 19 +- lib/Target/WebAssembly/WebAssemblyRegColoring.cpp | 31 +- lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp | 9 +- lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 173 +- lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp | 30 +- lib/Target/WebAssembly/WebAssemblyRegisterInfo.h | 9 +- lib/Target/WebAssembly/WebAssemblyRegisterInfo.td | 11 +- .../WebAssembly/WebAssemblyReplacePhysRegs.cpp | 7 +- .../WebAssemblyRuntimeLibcallSignatures.cpp | 143 +- .../WebAssemblyRuntimeLibcallSignatures.h | 11 +- .../WebAssembly/WebAssemblySelectionDAGInfo.cpp | 49 +- .../WebAssembly/WebAssemblySelectionDAGInfo.h | 22 +- .../WebAssembly/WebAssemblySetP2AlignOperands.cpp | 123 +- lib/Target/WebAssembly/WebAssemblySubtarget.cpp | 12 +- lib/Target/WebAssembly/WebAssemblySubtarget.h | 22 +- .../WebAssembly/WebAssemblyTargetMachine.cpp | 250 +- lib/Target/WebAssembly/WebAssemblyTargetMachine.h | 18 +- .../WebAssembly/WebAssemblyTargetObjectFile.cpp | 7 +- .../WebAssembly/WebAssemblyTargetObjectFile.h | 7 +- .../WebAssembly/WebAssemblyTargetTransformInfo.cpp | 9 +- .../WebAssembly/WebAssemblyTargetTransformInfo.h | 7 +- lib/Target/WebAssembly/WebAssemblyUtilities.cpp | 301 +- lib/Target/WebAssembly/WebAssemblyUtilities.h | 27 +- lib/Target/WebAssembly/known_gcc_test_failures.txt | 27 +- lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp | 1089 --- lib/Target/X86/AsmParser/X86AsmInstrumentation.h | 68 - lib/Target/X86/AsmParser/X86AsmParser.cpp | 447 +- lib/Target/X86/AsmParser/X86AsmParserCommon.h | 7 +- lib/Target/X86/AsmParser/X86Operand.h | 58 +- lib/Target/X86/Disassembler/X86Disassembler.cpp | 217 +- .../X86/Disassembler/X86DisassemblerDecoder.cpp | 19 +- .../X86/Disassembler/X86DisassemblerDecoder.h | 14 +- lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 202 - lib/Target/X86/InstPrinter/X86ATTInstPrinter.h | 138 - lib/Target/X86/InstPrinter/X86InstComments.cpp | 1310 --- lib/Target/X86/InstPrinter/X86InstComments.h | 27 - .../X86/InstPrinter/X86InstPrinterCommon.cpp | 142 - lib/Target/X86/InstPrinter/X86InstPrinterCommon.h | 38 - lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 162 - 
lib/Target/X86/InstPrinter/X86IntelInstPrinter.h | 157 - lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp | 487 + lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h | 124 + lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 82 +- lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 94 +- lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 38 +- lib/Target/X86/MCTargetDesc/X86FixupKinds.h | 7 +- lib/Target/X86/MCTargetDesc/X86InstComments.cpp | 1322 +++ lib/Target/X86/MCTargetDesc/X86InstComments.h | 26 + .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 362 + lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h | 41 + .../X86/MCTargetDesc/X86IntelInstPrinter.cpp | 445 + lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h | 144 + lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 7 +- lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h | 7 +- lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 97 +- lib/Target/X86/MCTargetDesc/X86MCExpr.h | 9 +- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 22 +- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 10 +- .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 7 +- lib/Target/X86/MCTargetDesc/X86TargetStreamer.h | 7 +- .../X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 7 +- lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 7 +- .../X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp | 7 +- lib/Target/X86/ShadowCallStack.cpp | 322 - lib/Target/X86/TargetInfo/X86TargetInfo.cpp | 9 +- lib/Target/X86/TargetInfo/X86TargetInfo.h | 21 + lib/Target/X86/Utils/X86ShuffleDecode.cpp | 14 +- lib/Target/X86/Utils/X86ShuffleDecode.h | 9 +- lib/Target/X86/X86.h | 15 +- lib/Target/X86/X86.td | 1226 ++- lib/Target/X86/X86AsmPrinter.cpp | 274 +- lib/Target/X86/X86AsmPrinter.h | 25 +- lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp | 29 +- lib/Target/X86/X86CallFrameOptimization.cpp | 12 +- lib/Target/X86/X86CallLowering.cpp | 78 +- lib/Target/X86/X86CallLowering.h | 13 +- lib/Target/X86/X86CallingConv.cpp | 162 +- lib/Target/X86/X86CallingConv.h | 104 +- lib/Target/X86/X86CallingConv.td | 28 +- lib/Target/X86/X86CmovConversion.cpp | 35 +- lib/Target/X86/X86CondBrFolding.cpp | 26 +- lib/Target/X86/X86DiscriminateMemOps.cpp | 42 +- lib/Target/X86/X86DomainReassignment.cpp | 12 +- lib/Target/X86/X86EvexToVex.cpp | 21 +- lib/Target/X86/X86ExpandPseudo.cpp | 41 +- lib/Target/X86/X86FastISel.cpp | 264 +- lib/Target/X86/X86FixupBWInsts.cpp | 13 +- lib/Target/X86/X86FixupLEAs.cpp | 393 +- lib/Target/X86/X86FixupSetCC.cpp | 37 +- lib/Target/X86/X86FlagsCopyLowering.cpp | 56 +- lib/Target/X86/X86FloatingPoint.cpp | 28 +- lib/Target/X86/X86FrameLowering.cpp | 80 +- lib/Target/X86/X86FrameLowering.h | 11 +- lib/Target/X86/X86GenRegisterBankInfo.def | 7 +- lib/Target/X86/X86ISelDAGToDAG.cpp | 1590 +++- lib/Target/X86/X86ISelLowering.cpp | 9548 +++++++++++++------- lib/Target/X86/X86ISelLowering.h | 216 +- lib/Target/X86/X86IndirectBranchTracking.cpp | 49 +- lib/Target/X86/X86InsertPrefetch.cpp | 10 +- lib/Target/X86/X86Instr3DNow.td | 11 +- lib/Target/X86/X86InstrAVX512.td | 3488 +++---- lib/Target/X86/X86InstrArithmetic.td | 101 +- lib/Target/X86/X86InstrBuilder.h | 7 +- lib/Target/X86/X86InstrCMovSetCC.td | 176 +- lib/Target/X86/X86InstrCompiler.td | 323 +- lib/Target/X86/X86InstrControl.td | 64 +- lib/Target/X86/X86InstrExtension.td | 11 +- lib/Target/X86/X86InstrFMA.td | 13 +- lib/Target/X86/X86InstrFMA3Info.cpp | 17 +- lib/Target/X86/X86InstrFMA3Info.h | 7 +- lib/Target/X86/X86InstrFPStack.td | 341 +- lib/Target/X86/X86InstrFoldTables.cpp | 186 +- lib/Target/X86/X86InstrFoldTables.h | 7 +- 
lib/Target/X86/X86InstrFormats.td | 33 +- lib/Target/X86/X86InstrFragmentsSIMD.td | 368 +- lib/Target/X86/X86InstrInfo.cpp | 1116 +-- lib/Target/X86/X86InstrInfo.h | 79 +- lib/Target/X86/X86InstrInfo.td | 439 +- lib/Target/X86/X86InstrMMX.td | 13 +- lib/Target/X86/X86InstrMPX.td | 7 +- lib/Target/X86/X86InstrSGX.td | 7 +- lib/Target/X86/X86InstrSSE.td | 1917 ++-- lib/Target/X86/X86InstrSVM.td | 7 +- lib/Target/X86/X86InstrShiftRotate.td | 98 +- lib/Target/X86/X86InstrSystem.td | 26 +- lib/Target/X86/X86InstrTSX.td | 7 +- lib/Target/X86/X86InstrVMX.td | 7 +- lib/Target/X86/X86InstrVecCompiler.td | 104 +- lib/Target/X86/X86InstrXOP.td | 33 +- lib/Target/X86/X86InstructionSelector.cpp | 92 +- lib/Target/X86/X86InterleavedAccess.cpp | 27 +- lib/Target/X86/X86IntrinsicsInfo.h | 781 +- lib/Target/X86/X86LegalizerInfo.cpp | 30 +- lib/Target/X86/X86LegalizerInfo.h | 7 +- lib/Target/X86/X86MCInstLower.cpp | 274 +- lib/Target/X86/X86MachineFunctionInfo.cpp | 7 +- lib/Target/X86/X86MachineFunctionInfo.h | 7 +- lib/Target/X86/X86MacroFusion.cpp | 164 +- lib/Target/X86/X86MacroFusion.h | 7 +- lib/Target/X86/X86OptimizeLEAs.cpp | 14 +- lib/Target/X86/X86PadShortFunction.cpp | 16 +- lib/Target/X86/X86PfmCounters.td | 7 +- lib/Target/X86/X86RegisterBankInfo.cpp | 24 +- lib/Target/X86/X86RegisterBankInfo.h | 7 +- lib/Target/X86/X86RegisterBanks.td | 7 +- lib/Target/X86/X86RegisterInfo.cpp | 37 +- lib/Target/X86/X86RegisterInfo.h | 23 +- lib/Target/X86/X86RegisterInfo.td | 44 +- lib/Target/X86/X86RetpolineThunks.cpp | 7 +- lib/Target/X86/X86SchedBroadwell.td | 169 +- lib/Target/X86/X86SchedHaswell.td | 195 +- lib/Target/X86/X86SchedPredicates.td | 31 +- lib/Target/X86/X86SchedSandyBridge.td | 96 +- lib/Target/X86/X86SchedSkylakeClient.td | 193 +- lib/Target/X86/X86SchedSkylakeServer.td | 212 +- lib/Target/X86/X86Schedule.td | 14 +- lib/Target/X86/X86ScheduleAtom.td | 12 +- lib/Target/X86/X86ScheduleBdVer2.td | 599 +- lib/Target/X86/X86ScheduleBtVer2.td | 45 +- lib/Target/X86/X86ScheduleSLM.td | 10 +- lib/Target/X86/X86ScheduleZnver1.td | 10 +- lib/Target/X86/X86SelectionDAGInfo.cpp | 222 +- lib/Target/X86/X86SelectionDAGInfo.h | 7 +- lib/Target/X86/X86ShuffleDecodeConstantPool.cpp | 7 +- lib/Target/X86/X86ShuffleDecodeConstantPool.h | 7 +- lib/Target/X86/X86SpeculativeLoadHardening.cpp | 41 +- lib/Target/X86/X86Subtarget.cpp | 22 +- lib/Target/X86/X86Subtarget.h | 47 +- lib/Target/X86/X86TargetMachine.cpp | 33 +- lib/Target/X86/X86TargetMachine.h | 7 +- lib/Target/X86/X86TargetObjectFile.cpp | 7 +- lib/Target/X86/X86TargetObjectFile.h | 7 +- lib/Target/X86/X86TargetTransformInfo.cpp | 529 +- lib/Target/X86/X86TargetTransformInfo.h | 76 +- lib/Target/X86/X86VZeroUpper.cpp | 7 +- lib/Target/X86/X86WinAllocaExpander.cpp | 46 +- lib/Target/X86/X86WinEHState.cpp | 45 +- .../XCore/Disassembler/XCoreDisassembler.cpp | 12 +- lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp | 90 - lib/Target/XCore/InstPrinter/XCoreInstPrinter.h | 47 - lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.cpp | 89 + lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h | 46 + lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp | 7 +- lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h | 7 +- .../XCore/MCTargetDesc/XCoreMCTargetDesc.cpp | 10 +- lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h | 9 +- lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp | 10 +- lib/Target/XCore/TargetInfo/XCoreTargetInfo.h | 20 + lib/Target/XCore/XCore.h | 7 +- lib/Target/XCore/XCore.td | 7 +- lib/Target/XCore/XCoreAsmPrinter.cpp | 31 +- lib/Target/XCore/XCoreCallingConv.td | 
7 +- lib/Target/XCore/XCoreFrameLowering.cpp | 7 +- lib/Target/XCore/XCoreFrameLowering.h | 7 +- lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp | 7 +- lib/Target/XCore/XCoreISelDAGToDAG.cpp | 7 +- lib/Target/XCore/XCoreISelLowering.cpp | 82 +- lib/Target/XCore/XCoreISelLowering.h | 9 +- lib/Target/XCore/XCoreInstrFormats.td | 7 +- lib/Target/XCore/XCoreInstrInfo.cpp | 7 +- lib/Target/XCore/XCoreInstrInfo.h | 7 +- lib/Target/XCore/XCoreInstrInfo.td | 7 +- lib/Target/XCore/XCoreLowerThreadLocal.cpp | 7 +- lib/Target/XCore/XCoreMCInstLower.cpp | 7 +- lib/Target/XCore/XCoreMCInstLower.h | 7 +- lib/Target/XCore/XCoreMachineFunctionInfo.cpp | 7 +- lib/Target/XCore/XCoreMachineFunctionInfo.h | 7 +- lib/Target/XCore/XCoreRegisterInfo.cpp | 11 +- lib/Target/XCore/XCoreRegisterInfo.h | 9 +- lib/Target/XCore/XCoreRegisterInfo.td | 7 +- lib/Target/XCore/XCoreSelectionDAGInfo.cpp | 7 +- lib/Target/XCore/XCoreSelectionDAGInfo.h | 7 +- lib/Target/XCore/XCoreSubtarget.cpp | 7 +- lib/Target/XCore/XCoreSubtarget.h | 7 +- lib/Target/XCore/XCoreTargetMachine.cpp | 8 +- lib/Target/XCore/XCoreTargetMachine.h | 7 +- lib/Target/XCore/XCoreTargetObjectFile.cpp | 7 +- lib/Target/XCore/XCoreTargetObjectFile.h | 7 +- lib/Target/XCore/XCoreTargetStreamer.h | 7 +- lib/Target/XCore/XCoreTargetTransformInfo.h | 7 +- 1623 files changed, 108382 insertions(+), 52991 deletions(-) create mode 100644 lib/Target/AArch64/AArch64CallingConvention.cpp create mode 100644 lib/Target/AArch64/AArch64ExpandImm.cpp create mode 100644 lib/Target/AArch64/AArch64ExpandImm.h create mode 100644 lib/Target/AArch64/AArch64StackTagging.cpp delete mode 100644 lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp delete mode 100644 lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h create mode 100644 lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp create mode 100644 lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h create mode 100644 lib/Target/AArch64/TargetInfo/AArch64TargetInfo.h delete mode 100644 lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp delete mode 100644 lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h create mode 100644 lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp delete mode 100644 lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp create mode 100644 lib/Target/AMDGPU/GCNNSAReassign.cpp create mode 100644 lib/Target/AMDGPU/GCNRegBankReassign.cpp delete mode 100644 lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp delete mode 100644 lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h create mode 100644 lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp create mode 100644 lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h delete mode 100644 lib/Target/AMDGPU/SIDebuggerInsertNops.cpp delete mode 100644 lib/Target/AMDGPU/SIFixWWMLiveness.cpp delete mode 100644 lib/Target/AMDGPU/SIIntrinsics.td create mode 100644 lib/Target/AMDGPU/SILowerSGPRSpills.cpp create mode 100644 lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp create mode 100644 lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h create mode 100644 lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp create mode 100644 lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h create mode 100644 lib/Target/ARC/ARCOptAddrMode.cpp delete mode 100644 lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp delete mode 100644 lib/Target/ARC/InstPrinter/ARCInstPrinter.h create mode 100644 lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp create mode 100644 lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h create mode 100644 lib/Target/ARC/TargetInfo/ARCTargetInfo.h create mode 100644 lib/Target/ARM/ARMBasicBlockInfo.cpp create mode 
100644 lib/Target/ARM/ARMCallingConv.cpp delete mode 100644 lib/Target/ARM/ARMComputeBlockSize.cpp create mode 100644 lib/Target/ARM/ARMInstrMVE.td create mode 100644 lib/Target/ARM/ARMLowOverheadLoops.cpp create mode 100644 lib/Target/ARM/ARMPredicates.td delete mode 100644 lib/Target/ARM/ARMScheduleM3.td create mode 100644 lib/Target/ARM/ARMScheduleM4.td delete mode 100644 lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp delete mode 100644 lib/Target/ARM/InstPrinter/ARMInstPrinter.h delete mode 100755 lib/Target/ARM/LICENSE.TXT create mode 100644 lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp create mode 100644 lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h create mode 100644 lib/Target/ARM/TargetInfo/ARMTargetInfo.h delete mode 100644 lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp delete mode 100644 lib/Target/AVR/InstPrinter/AVRInstPrinter.h create mode 100644 lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp create mode 100644 lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h create mode 100644 lib/Target/AVR/TargetInfo/AVRTargetInfo.h create mode 100644 lib/Target/BPF/BPFAbstractMemberAccess.cpp create mode 100644 lib/Target/BPF/BPFCORE.h create mode 100644 lib/Target/BPF/BPFMISimplifyPatchable.cpp delete mode 100644 lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp delete mode 100644 lib/Target/BPF/InstPrinter/BPFInstPrinter.h create mode 100644 lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp create mode 100644 lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h create mode 100644 lib/Target/BPF/TargetInfo/BPFTargetInfo.h delete mode 100644 lib/Target/Hexagon/HexagonDepDecoders.h create mode 100644 lib/Target/Hexagon/HexagonDepDecoders.inc create mode 100644 lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.h delete mode 100644 lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp delete mode 100644 lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h create mode 100644 lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h delete mode 100644 lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp delete mode 100644 lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h create mode 100644 lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp create mode 100644 lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h create mode 100644 lib/Target/MSP430/TargetInfo/MSP430TargetInfo.h delete mode 100644 lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp delete mode 100644 lib/Target/Mips/InstPrinter/MipsInstPrinter.h create mode 100644 lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp create mode 100644 lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h create mode 100644 lib/Target/Mips/TargetInfo/MipsTargetInfo.h delete mode 100644 lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp delete mode 100644 lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h create mode 100644 lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp create mode 100644 lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h create mode 100644 lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h delete mode 100644 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp delete mode 100644 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp create mode 100644 lib/Target/PowerPC/PPCCallingConv.cpp create mode 100644 lib/Target/PowerPC/PPCMachineScheduler.cpp create mode 
100644 lib/Target/PowerPC/PPCMachineScheduler.h create mode 100644 lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h delete mode 100644 lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp delete mode 100644 lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h create mode 100644 lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp create mode 100644 lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h create mode 100644 lib/Target/RISCV/RISCVTargetTransformInfo.cpp create mode 100644 lib/Target/RISCV/RISCVTargetTransformInfo.h create mode 100644 lib/Target/RISCV/TargetInfo/RISCVTargetInfo.h delete mode 100644 lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp delete mode 100644 lib/Target/Sparc/InstPrinter/SparcInstPrinter.h create mode 100644 lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp create mode 100644 lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h create mode 100644 lib/Target/Sparc/TargetInfo/SparcTargetInfo.h delete mode 100644 lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp delete mode 100644 lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h create mode 100644 lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp create mode 100644 lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h create mode 100644 lib/Target/SystemZ/SystemZPostRewrite.cpp create mode 100644 lib/Target/SystemZ/SystemZScheduleArch13.td create mode 100644 lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h delete mode 100644 lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp delete mode 100644 lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h create mode 100644 lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp create mode 100644 lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h create mode 100644 lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h delete mode 100644 lib/Target/WebAssembly/WebAssemblyEHRestoreStackPointer.cpp create mode 100644 lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td delete mode 100644 lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td create mode 100644 lib/Target/WebAssembly/WebAssemblyInstrRef.td delete mode 100644 lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp delete mode 100644 lib/Target/X86/AsmParser/X86AsmInstrumentation.h delete mode 100644 lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp delete mode 100644 lib/Target/X86/InstPrinter/X86ATTInstPrinter.h delete mode 100644 lib/Target/X86/InstPrinter/X86InstComments.cpp delete mode 100644 lib/Target/X86/InstPrinter/X86InstComments.h delete mode 100644 lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp delete mode 100644 lib/Target/X86/InstPrinter/X86InstPrinterCommon.h delete mode 100644 lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp delete mode 100644 lib/Target/X86/InstPrinter/X86IntelInstPrinter.h create mode 100644 lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp create mode 100644 lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h create mode 100644 lib/Target/X86/MCTargetDesc/X86InstComments.cpp create mode 100644 lib/Target/X86/MCTargetDesc/X86InstComments.h create mode 100644 lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp create mode 100644 lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h create mode 100644 lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp create mode 100644 lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h delete mode 100644 lib/Target/X86/ShadowCallStack.cpp create mode 100644 lib/Target/X86/TargetInfo/X86TargetInfo.h delete mode 100644 lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp delete mode 100644 
lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
 create mode 100644 lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.cpp
 create mode 100644 lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h
 create mode 100644 lib/Target/XCore/TargetInfo/XCoreTargetInfo.h
(limited to 'lib/Target')

diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index c36d9354f3ba..6965403a25ab 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -1,9 +1,8 @@
 //==-- AArch64.h - Top-level interface for AArch64 --------------*- C++ -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -57,6 +56,7 @@ InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
                                  AArch64Subtarget &, AArch64RegisterBankInfo &);
 FunctionPass *createAArch64PreLegalizeCombiner();
+FunctionPass *createAArch64StackTaggingPass();
 
 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -79,6 +79,7 @@ void initializeAArch64StorePairSuppressPass(PassRegistry&);
 void initializeFalkorHWPFFixPass(PassRegistry&);
 void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
+void initializeAArch64StackTaggingPass(PassRegistry&);
 } // end namespace llvm
 
 #endif
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 8f79140cba64..e39c6995e367 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -1,9 +1,8 @@
 //=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -104,6 +103,21 @@ def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
 def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
   "Enable Scalable Vector Extension (SVE) instructions">;
 
+def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true",
+  "Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>;
+
+def FeatureSVE2AES : SubtargetFeature<"sve2-aes", "HasSVE2AES", "true",
+  "Enable AES SVE2 instructions", [FeatureSVE2, FeatureAES]>;
+
+def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
+  "Enable SM4 SVE2 instructions", [FeatureSVE2, FeatureSM4]>;
+
+def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
+  "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>;
+
+def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true",
+  "Enable bit permutation SVE2 instructions", [FeatureSVE2]>;
+
 def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
                                         "Has zero-cycle register moves">;
 def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
@@ -127,7 +141,7 @@ def FeatureStrictAlign : SubtargetFeature<"strict-align",
                                           "Disallow all unaligned memory "
                                           "access">;
 
-foreach i = {1-7,18,20} in
+foreach i = {1-7,9-15,18,20-28} in
   def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true",
                                            "Reserve X"#i#", making it unavailable "
                                            "as a GPR">;
@@ -385,9 +399,29 @@ def AArch64InstrInfo : InstrInfo;
 
 include "AArch64SystemOperands.td"
 
+//===----------------------------------------------------------------------===//
+// Access to privileged registers
+//===----------------------------------------------------------------------===//
+
+foreach i = 1-3 in
+def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP",
+    "true", "Permit use of TPIDR_EL"#i#" for the TLS base">;
+
 //===----------------------------------------------------------------------===//
 // AArch64 Processors supported.
 //
+
+//===----------------------------------------------------------------------===//
+// Unsupported features to disable for scheduling models
+//===----------------------------------------------------------------------===//
+
+class AArch64Unsupported { list<Predicate> F; }
+
+def SVEUnsupported : AArch64Unsupported {
+  let F = [HasSVE, HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3,
+           HasSVE2BitPerm];
+}
+
 include "AArch64SchedA53.td"
 include "AArch64SchedA57.td"
 include "AArch64SchedCyclone.td"
@@ -483,6 +517,18 @@ def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
                                  FeaturePerfMon
                                  ]>;
 
+def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
+                               "Cortex-A76 ARM processors", [
+                               HasV8_2aOps,
+                               FeatureFPARMv8,
+                               FeatureNEON,
+                               FeatureRCPC,
+                               FeatureCrypto,
+                               FeatureFullFP16,
+                               FeatureDotProd,
+                               FeatureSSBS
+                               ]>;
+
 // Note that cyclone does not fuse AES instructions, but newer apple chips do
 // perform the fusion and cyclone is used by default when targetting apple OSes.
 def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
@@ -554,7 +600,7 @@ def ProcExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
                                      FeatureDotProd,
                                      FeatureExynosCheapAsMoveHandling,
                                      FeatureForce32BitJumpTables,
-                                     FeatureFP16FML,
+                                     FeatureFullFP16,
                                      FeatureFuseAddress,
                                      FeatureFuseAES,
                                      FeatureFuseArithmeticLogic,
@@ -694,15 +740,17 @@ def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
 def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
 def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
 def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
-// FIXME: Cortex-A72, Cortex-A73 and Cortex-A75 are currently modeled as a Cortex-A57.
 def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
 def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
 def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
+def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
+def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
 def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
 def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
 def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
 def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>;
+def : ProcessorModel<"exynos-m5", ExynosM4Model, [ProcExynosM4]>;
 def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
 def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>;
 def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
@@ -716,6 +764,9 @@ def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
 // FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57.
 def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>;
 
+// Alias for the latest Apple processor model supported by LLVM.
+def : ProcessorModel<"apple-latest", CycloneModel, [ProcCyclone]>;
+
 //===----------------------------------------------------------------------===//
 // Assembly parser
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
index 30232afaf024..e80fe2cada09 100644
--- a/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -1,9 +1,8 @@
 //===-- AArch64A53Fix835769.cpp -------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // This pass changes code to work around Cortex-A53 erratum 835769.
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 452fbd3488b0..92c8c4955d50 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -1,9 +1,8 @@
 //===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // For best-case performance on Cortex-A57, we should try to use a balanced
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 22b0c1e3b471..89404463e1f0 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -1,9 +1,8 @@
 //===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // When profitable, replace GPR targeting i64 instructions with their
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 0442076992e2..094fbd999523 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1,9 +1,8 @@
 //===- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer ---------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -18,10 +17,12 @@
 #include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetObjectFile.h"
-#include "InstPrinter/AArch64InstPrinter.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "MCTargetDesc/AArch64TargetStreamer.h"
+#include "TargetInfo/AArch64TargetInfo.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
@@ -29,6 +30,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -44,6 +46,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Casting.h"
@@ -96,6 +99,10 @@ public:
   void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
   void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
 
+  std::map<std::pair<unsigned, uint32_t>, MCSymbol *> HwasanMemaccessSymbols;
+  void LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI);
+  void EmitHwasanMemaccessSymbols(Module &M);
+
   void EmitSled(const MachineInstr &MI, SledKind Kind);
 
   /// tblgen'erated driver function for lowering simple MI->MC
@@ -147,11 +154,9 @@ private:
                           raw_ostream &O);
 
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
-                       unsigned AsmVariant, const char *ExtraCode,
-                       raw_ostream &O) override;
+                       const char *ExtraCode, raw_ostream &O) override;
   bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
-                             unsigned AsmVariant, const char *ExtraCode,
-                             raw_ostream &O) override;
+                             const char *ExtraCode, raw_ostream &O) override;
 
   void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
 
@@ -230,7 +235,204 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
   recordSled(CurSled, MI, Kind);
 }
 
+void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
+  unsigned Reg = MI.getOperand(0).getReg();
+  uint32_t AccessInfo = MI.getOperand(1).getImm();
+  MCSymbol *&Sym = HwasanMemaccessSymbols[{Reg, AccessInfo}];
+  if (!Sym) {
+    // FIXME: Make this work on non-ELF.
+    if (!TM.getTargetTriple().isOSBinFormatELF())
+      report_fatal_error("llvm.hwasan.check.memaccess only supported on ELF");
+
+    std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) + "_" +
+                          utostr(AccessInfo);
+    Sym = OutContext.getOrCreateSymbol(SymName);
+  }
+
+  EmitToStreamer(*OutStreamer,
+                 MCInstBuilder(AArch64::BL)
+                     .addExpr(MCSymbolRefExpr::create(Sym, OutContext)));
+}
+
+void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
+  if (HwasanMemaccessSymbols.empty())
+    return;
+
+  const Triple &TT = TM.getTargetTriple();
+  assert(TT.isOSBinFormatELF());
+  std::unique_ptr<MCSubtargetInfo> STI(
+      TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));
+
+  MCSymbol *HwasanTagMismatchSym =
+      OutContext.getOrCreateSymbol("__hwasan_tag_mismatch");
+
+  const MCSymbolRefExpr *HwasanTagMismatchRef =
+      MCSymbolRefExpr::create(HwasanTagMismatchSym, OutContext);
+
+  for (auto &P : HwasanMemaccessSymbols) {
+    unsigned Reg = P.first.first;
+    uint32_t AccessInfo = P.first.second;
+    MCSymbol *Sym = P.second;
+
+    OutStreamer->SwitchSection(OutContext.getELFSection(
+        ".text.hot", ELF::SHT_PROGBITS,
+        ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+        Sym->getName()));
+
+    OutStreamer->EmitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
+    OutStreamer->EmitSymbolAttribute(Sym, MCSA_Weak);
+    OutStreamer->EmitSymbolAttribute(Sym, MCSA_Hidden);
+    OutStreamer->EmitLabel(Sym);
+
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::UBFMXri)
+                                     .addReg(AArch64::X16)
+                                     .addReg(Reg)
+                                     .addImm(4)
+                                     .addImm(55),
+                                 *STI);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBroX)
+                                     .addReg(AArch64::W16)
+                                     .addReg(AArch64::X9)
+                                     .addReg(AArch64::X16)
+                                     .addImm(0)
+                                     .addImm(0),
+                                 *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::SUBSXrs)
+            .addReg(AArch64::XZR)
+            .addReg(AArch64::X16)
+            .addReg(Reg)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+        *STI);
+    MCSymbol *HandlePartialSym = OutContext.createTempSymbol();
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::NE)
+            .addExpr(MCSymbolRefExpr::create(HandlePartialSym, OutContext)),
+        *STI);
+    MCSymbol *ReturnSym = OutContext.createTempSymbol();
+    OutStreamer->EmitLabel(ReturnSym);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
+
+    OutStreamer->EmitLabel(HandlePartialSym);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+                                     .addReg(AArch64::WZR)
+                                     .addReg(AArch64::W16)
+                                     .addImm(15)
+                                     .addImm(0),
+                                 *STI);
+    MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::HI)
+            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+        *STI);
+
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::ANDXri)
+            .addReg(AArch64::X17)
+            .addReg(Reg)
+            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+        *STI);
+    unsigned Size = 1 << (AccessInfo & 0xf);
+    if (Size != 1)
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+                                       .addReg(AArch64::X17)
+                                       .addReg(AArch64::X17)
+                                       .addImm(Size - 1)
+                                       .addImm(0),
+                                   *STI);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+                                     .addReg(AArch64::WZR)
+                                     .addReg(AArch64::W16)
+                                     .addReg(AArch64::W17)
+                                     .addImm(0),
+                                 *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::LS)
+            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+        *STI);
+
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::ORRXri)
+            .addReg(AArch64::X16)
+            .addReg(Reg)
+            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+        *STI);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+                                     .addReg(AArch64::W16)
+                                     .addReg(AArch64::X16)
+                                     .addImm(0),
+                                 *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::SUBSXrs)
+            .addReg(AArch64::XZR)
+            .addReg(AArch64::X16)
+            .addReg(Reg)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+        *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::EQ)
+            .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
+        *STI);
+
+    OutStreamer->EmitLabel(HandleMismatchSym);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
+                                     .addReg(AArch64::SP)
+                                     .addReg(AArch64::X0)
+                                     .addReg(AArch64::X1)
+                                     .addReg(AArch64::SP)
+                                     .addImm(-32),
+                                 *STI);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXi)
+                                     .addReg(AArch64::FP)
+                                     .addReg(AArch64::LR)
+                                     .addReg(AArch64::SP)
+                                     .addImm(29),
+                                 *STI);
+
+    if (Reg != AArch64::X0)
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ORRXrs)
+                                       .addReg(AArch64::X0)
+                                       .addReg(AArch64::XZR)
+                                       .addReg(Reg)
+                                       .addImm(0),
+                                   *STI);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::MOVZXi)
+                                     .addReg(AArch64::X1)
+                                     .addImm(AccessInfo)
+                                     .addImm(0),
+                                 *STI);
+
+    // Intentionally load the GOT entry and branch to it, rather than possibly
+    // late binding the function, which may clobber the registers before we have
+    // a chance to save them.
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::ADRP)
+            .addReg(AArch64::X16)
+            .addExpr(AArch64MCExpr::create(
+                HwasanTagMismatchRef,
+                AArch64MCExpr::VariantKind::VK_GOT_PAGE, OutContext)),
+        *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::LDRXui)
+            .addReg(AArch64::X16)
+            .addReg(AArch64::X16)
+            .addExpr(AArch64MCExpr::create(
+                HwasanTagMismatchRef,
+                AArch64MCExpr::VariantKind::VK_GOT_LO12, OutContext)),
+        *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
+  }
+}
+
 void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  EmitHwasanMemaccessSymbols(M);
+
   const Triple &TT = TM.getTargetTriple();
   if (TT.isOSBinFormatMachO()) {
     // Funny Darwin hack: This flag tells the linker that no global symbols
@@ -295,14 +497,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
     break;
   }
   case MachineOperand::MO_GlobalAddress: {
-    const GlobalValue *GV = MO.getGlobal();
-    MCSymbol *Sym = getSymbol(GV);
-
-    // FIXME: Can we get anything other than a plain symbol here?
-    assert(!MO.getTargetFlags() && "Unknown operand target flag!");
-
-    Sym->print(O, MAI);
-    printOffset(MO.getOffset(), O);
+    PrintSymbolOperand(MO, O);
     break;
   }
   case MachineOperand::MO_BlockAddress: {
@@ -348,12 +543,11 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
 }
 
 bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
-                                        unsigned AsmVariant,
                                         const char *ExtraCode, raw_ostream &O) {
   const MachineOperand &MO = MI->getOperand(OpNum);
 
   // First try the generic code, which knows about modifiers like 'c' and 'n'.
-  if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+  if (!AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O))
     return false;
 
   // Does this asm operand have a single letter operand modifier?
@@ -364,9 +558,6 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     switch (ExtraCode[0]) {
     default:
       return true; // Unknown modifier.
-    case 'a':      // Print 'a' modifier
-      PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
-      return false;
     case 'w':      // Print W register
     case 'x':      // Print X register
       if (MO.isReg())
@@ -432,7 +623,6 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
 
 bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
                                               unsigned OpNum,
-                                              unsigned AsmVariant,
                                               const char *ExtraCode,
                                               raw_ostream &O) {
   if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a')
@@ -471,9 +661,18 @@ void AArch64AsmPrinter::EmitJumpTableInfo() {
   const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
   if (JT.empty()) return;
 
+  const Function &F = MF->getFunction();
   const TargetLoweringObjectFile &TLOF = getObjFileLowering();
-  MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM);
-  OutStreamer->SwitchSection(ReadOnlySec);
+  bool JTInDiffSection =
+      !STI->isTargetCOFF() ||
+      !TLOF.shouldPutJumpTableInFunctionSection(
+          MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32,
+          F);
+  if (JTInDiffSection) {
+    // Drop it in the readonly section.
+    MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(F, TM);
+    OutStreamer->SwitchSection(ReadOnlySec);
+  }
 
   auto AFI = MF->getInfo<AArch64FunctionInfo>();
   for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
@@ -694,6 +893,34 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default:
     break;
+  case AArch64::MOVMCSym: {
+    unsigned DestReg = MI->getOperand(0).getReg();
+    const MachineOperand &MO_Sym = MI->getOperand(1);
+    MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
+    MCOperand Hi_MCSym, Lo_MCSym;
+
+    Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
+    Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
+
+    MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
+    MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
+
+    MCInst MovZ;
+    MovZ.setOpcode(AArch64::MOVZXi);
+    MovZ.addOperand(MCOperand::createReg(DestReg));
+    MovZ.addOperand(Hi_MCSym);
+    MovZ.addOperand(MCOperand::createImm(16));
+    EmitToStreamer(*OutStreamer, MovZ);
+
+    MCInst MovK;
+    MovK.setOpcode(AArch64::MOVKXi);
+    MovK.addOperand(MCOperand::createReg(DestReg));
+    MovK.addOperand(MCOperand::createReg(DestReg));
+    MovK.addOperand(Lo_MCSym);
+    MovK.addOperand(MCOperand::createImm(0));
+    EmitToStreamer(*OutStreamer, MovK);
+    return;
+  }
   case AArch64::MOVIv2d_ns:
     // If the target has , lower this
     // instruction to movi.16b instead.
@@ -856,6 +1083,10 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     LowerPATCHABLE_TAIL_CALL(*MI);
     return;
 
+  case AArch64::HWASAN_CHECK_MEMACCESS:
+    LowerHWASAN_CHECK_MEMACCESS(*MI);
+    return;
+
   case AArch64::SEH_StackAlloc:
     TS->EmitARM64WinCFIAllocStack(MI->getOperand(0).getImm());
     return;
diff --git a/lib/Target/AArch64/AArch64BranchTargets.cpp b/lib/Target/AArch64/AArch64BranchTargets.cpp
index da70a624c5be..6fa3a462bc71 100644
--- a/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -1,9 +1,8 @@
 //===-- AArch64BranchTargets.cpp -- Harden code using v8.5-A BTI extension -==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index 5980e5684e89..59757769c89a 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -1,9 +1,8 @@
 //===--- AArch64CallLowering.cpp - Call lowering --------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -45,6 +44,8 @@
 #include
 #include
 
+#define DEBUG_TYPE "aarch64-call-lowering"
+
 using namespace llvm;
 
 AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
@@ -56,18 +57,18 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
       CCAssignFn *AssignFn)
       : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}
 
-  unsigned getStackAddress(uint64_t Size, int64_t Offset,
+  Register getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
     auto &MFI = MIRBuilder.getMF().getFrameInfo();
     int FI = MFI.CreateFixedObject(Size, Offset, true);
     MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
-    unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
+    Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
     MIRBuilder.buildFrameIndex(AddrReg, FI);
     StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg;
   }
 
-  void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+  void assignValueToReg(Register ValVReg, Register PhysReg,
                         CCValAssign &VA) override {
     markPhysRegUsed(PhysReg);
     switch (VA.getLocInfo()) {
@@ -84,11 +85,12 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
     }
   }
 
-  void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                             MachinePointerInfo &MPO, CCValAssign &VA) override {
+    // FIXME: Get alignment
     auto MMO = MIRBuilder.getMF().getMachineMemOperand(
         MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
-        0);
+        1);
     MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
   }
 
@@ -97,6 +99,8 @@
   /// (it's an implicit-def of the BL).
   virtual void markPhysRegUsed(unsigned PhysReg) = 0;
 
+  bool isArgumentHandler() const override { return true; }
+
   uint64_t StackUsed;
 };
 
@@ -129,31 +133,31 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
       : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
         AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
 
-  unsigned getStackAddress(uint64_t Size, int64_t Offset,
+  Register getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
     LLT p0 = LLT::pointer(0, 64);
     LLT s64 = LLT::scalar(64);
-    unsigned SPReg = MRI.createGenericVirtualRegister(p0);
-    MIRBuilder.buildCopy(SPReg, AArch64::SP);
+    Register SPReg = MRI.createGenericVirtualRegister(p0);
+    MIRBuilder.buildCopy(SPReg, Register(AArch64::SP));
 
-    unsigned OffsetReg = MRI.createGenericVirtualRegister(s64);
+    Register OffsetReg = MRI.createGenericVirtualRegister(s64);
     MIRBuilder.buildConstant(OffsetReg, Offset);
 
-    unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+    Register AddrReg = MRI.createGenericVirtualRegister(p0);
     MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
 
     MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
     return AddrReg;
   }
 
-  void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+  void assignValueToReg(Register ValVReg, Register PhysReg,
                         CCValAssign &VA) override {
     MIB.addUse(PhysReg, RegState::Implicit);
-    unsigned ExtReg = extendRegister(ValVReg, VA);
+    Register ExtReg = extendRegister(ValVReg, VA);
     MIRBuilder.buildCopy(PhysReg, ExtReg);
   }
 
-  void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                             MachinePointerInfo &MPO, CCValAssign &VA) override {
     if (VA.getLocInfo() == CCValAssign::LocInfo::AExt) {
       Size = VA.getLocVT().getSizeInBits() / 8;
@@ -162,7 +166,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
           .getReg();
     }
     auto MMO = MIRBuilder.getMF().getMachineMemOperand(
-        MPO, MachineMemOperand::MOStore, Size, 0);
+        MPO, MachineMemOperand::MOStore, Size, 1);
     MIRBuilder.buildStore(ValVReg, Addr, *MMO);
   }
 
@@ -188,8 +192,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
 
 void AArch64CallLowering::splitToValueTypes(
     const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
-    const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
-    const SplitArgTy &PerformArgSplit) const {
+    const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const {
   const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
   LLVMContext &Ctx = OrigArg.Ty->getContext();
 
@@ -203,32 +206,31 @@
   if (SplitVTs.size() == 1) {
     // No splitting to do, but we want to replace the original type (e.g. [1 x
     // double] -> double).
-    SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
+    SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
                            OrigArg.Flags, OrigArg.IsFixed);
     return;
   }
 
-  unsigned FirstRegIdx = SplitArgs.size();
+  // Create one ArgInfo for each virtual register in the original ArgInfo.
+  assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
+
   bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
       OrigArg.Ty, CallConv, false);
-  for (auto SplitVT : SplitVTs) {
-    Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
-    SplitArgs.push_back(
-        ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
-                SplitTy, OrigArg.Flags, OrigArg.IsFixed});
+  for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
+    Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
+    SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags,
+                           OrigArg.IsFixed);
     if (NeedsRegBlock)
       SplitArgs.back().Flags.setInConsecutiveRegs();
   }
 
   SplitArgs.back().Flags.setInConsecutiveRegsLast();
-
-  for (unsigned i = 0; i < Offsets.size(); ++i)
-    PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
 }
 
 bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
                                       const Value *Val,
-                                      ArrayRef<Register> VRegs) const {
+                                      ArrayRef<Register> VRegs,
+                                      Register SwiftErrorVReg) const {
   auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR);
   assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
          "Return value without a vreg");
@@ -250,34 +252,101 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
            "For each split Type there should be exactly one VReg.");
 
     SmallVector<ArgInfo, 8> SplitArgs;
+    CallingConv::ID CC = F.getCallingConv();
+
     for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
-      // We zero-extend i1s to i8.
-      unsigned CurVReg = VRegs[i];
-      if (MRI.getType(VRegs[i]).getSizeInBits() == 1) {
-        CurVReg = MIRBuilder.buildZExt(LLT::scalar(8), CurVReg)
-                      ->getOperand(0)
-                      .getReg();
+      if (TLI.getNumRegistersForCallingConv(Ctx, CC, SplitEVTs[i]) > 1) {
+        LLVM_DEBUG(dbgs() << "Can't handle extended arg types which need split");
+        return false;
       }
 
+      Register CurVReg = VRegs[i];
       ArgInfo CurArgInfo = ArgInfo{CurVReg, SplitEVTs[i].getTypeForEVT(Ctx)};
       setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
-      splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, F.getCallingConv(),
-                        [&](unsigned Reg, uint64_t Offset) {
-                          MIRBuilder.buildExtract(Reg, CurVReg, Offset);
-                        });
+
+      // i1 is a special case because SDAG i1 true is naturally zero extended
+      // when widened using ANYEXT. We need to do it explicitly here.
+      if (MRI.getType(CurVReg).getSizeInBits() == 1) {
+        CurVReg = MIRBuilder.buildZExt(LLT::scalar(8), CurVReg).getReg(0);
+      } else {
+        // Some types will need extending as specified by the CC.
+        MVT NewVT = TLI.getRegisterTypeForCallingConv(Ctx, CC, SplitEVTs[i]);
+        if (EVT(NewVT) != SplitEVTs[i]) {
+          unsigned ExtendOp = TargetOpcode::G_ANYEXT;
+          if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex,
+                                             Attribute::SExt))
+            ExtendOp = TargetOpcode::G_SEXT;
+          else if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex,
+                                                  Attribute::ZExt))
+            ExtendOp = TargetOpcode::G_ZEXT;
+
+          LLT NewLLT(NewVT);
+          LLT OldLLT(MVT::getVT(CurArgInfo.Ty));
+          CurArgInfo.Ty = EVT(NewVT).getTypeForEVT(Ctx);
+          // Instead of an extend, we might have a vector type which needs
+          // padding with more elements, e.g. <2 x half> -> <4 x half>.
+          if (NewVT.isVector()) {
+            if (OldLLT.isVector()) {
+              if (NewLLT.getNumElements() > OldLLT.getNumElements()) {
+                // We don't handle VA types which are not exactly twice the
+                // size, but can easily be done in future.
+                if (NewLLT.getNumElements() != OldLLT.getNumElements() * 2) {
+                  LLVM_DEBUG(dbgs() << "Outgoing vector ret has too many elts");
+                  return false;
+                }
+                auto Undef = MIRBuilder.buildUndef({OldLLT});
+                CurVReg =
+                    MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef.getReg(0)})
+                        .getReg(0);
+              } else {
+                // Just do a vector extend.
+                CurVReg = MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg})
+                              .getReg(0);
+              }
+            } else if (NewLLT.getNumElements() == 2) {
+              // We need to pad a <1 x S> type to <2 x S>. Since we don't have
+              // <1 x S> vector types in GISel we use a build_vector instead
+              // of a vector merge/concat.
+              auto Undef = MIRBuilder.buildUndef({OldLLT});
+              CurVReg =
+                  MIRBuilder
+                      .buildBuildVector({NewLLT}, {CurVReg, Undef.getReg(0)})
+                      .getReg(0);
+            } else {
+              LLVM_DEBUG(dbgs() << "Could not handle ret ty");
+              return false;
+            }
+          } else {
+            // A scalar extend.
+            CurVReg =
+                MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}).getReg(0);
+          }
+        }
+      }
+      if (CurVReg != CurArgInfo.Regs[0]) {
+        CurArgInfo.Regs[0] = CurVReg;
+        // Reset the arg flags after modifying CurVReg.
+        setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
+      }
+      splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, CC);
     }
 
     OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
     Success = handleAssignments(MIRBuilder, SplitArgs, Handler);
   }
 
+  if (SwiftErrorVReg) {
+    MIB.addUse(AArch64::X21, RegState::Implicit);
+    MIRBuilder.buildCopy(AArch64::X21, SwiftErrorVReg);
+  }
+
   MIRBuilder.insertInstr(MIB);
   return Success;
 }
 
-bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
-                                               const Function &F,
-                                               ArrayRef<unsigned> VRegs) const {
+bool AArch64CallLowering::lowerFormalArguments(
    MachineIRBuilder &MIRBuilder, const Function &F,
+    ArrayRef<ArrayRef<Register>> VRegs) const {
   MachineFunction &MF = MIRBuilder.getMF();
   MachineBasicBlock &MBB = MIRBuilder.getMBB();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -288,26 +357,11 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   for (auto &Arg : F.args()) {
     if (DL.getTypeStoreSize(Arg.getType()) == 0)
       continue;
+
     ArgInfo OrigArg{VRegs[i], Arg.getType()};
     setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F);
-    bool Split = false;
-    LLT Ty = MRI.getType(VRegs[i]);
-    unsigned Dst = VRegs[i];
-
-    splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv(),
-                      [&](unsigned Reg, uint64_t Offset) {
-                        if (!Split) {
-                          Split = true;
-                          Dst = MRI.createGenericVirtualRegister(Ty);
-                          MIRBuilder.buildUndef(Dst);
-                        }
-                        unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
-                        MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset);
-                        Dst = Tmp;
-                      });
-
-    if (Dst != VRegs[i])
-      MIRBuilder.buildCopy(VRegs[i], Dst);
+
+    splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv());
     ++i;
   }
 
@@ -351,7 +405,8 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                     CallingConv::ID CallConv,
                                     const MachineOperand &Callee,
                                     const ArgInfo &OrigRet,
-                                    ArrayRef<ArgInfo> OrigArgs) const {
+                                    ArrayRef<ArgInfo> OrigArgs,
+                                    Register SwiftErrorVReg) const {
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &F = MF.getFunction();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -359,10 +414,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   SmallVector<ArgInfo, 8> SplitArgs;
   for (auto &OrigArg : OrigArgs) {
-    splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv,
-                      [&](unsigned Reg, uint64_t Offset) {
-                        MIRBuilder.buildExtract(Reg, OrigArg.Reg, Offset);
-                      });
+    splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv);
+    // AAPCS requires that we zero-extend i1 to 8
bits by the caller. + if (OrigArg.Ty->isIntegerTy(1)) + SplitArgs.back().Flags.setZExt(); } // Find out which ABI gets to decide where things go. @@ -412,23 +467,19 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv()); - if (OrigRet.Reg) { + if (!OrigRet.Ty->isVoidTy()) { SplitArgs.clear(); - SmallVector RegOffsets; - SmallVector SplitRegs; - splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv(), - [&](unsigned Reg, uint64_t Offset) { - RegOffsets.push_back(Offset); - SplitRegs.push_back(Reg); - }); + splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv()); CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; + } - if (!RegOffsets.empty()) - MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets); + if (SwiftErrorVReg) { + MIB.addDef(AArch64::X21, RegState::Implicit); + MIRBuilder.buildCopy(SwiftErrorVReg, Register(AArch64::X21)); } CallSeqStart.addImm(Handler.StackSize).addImm(0); diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h index 1c2bd6a4de5d..4f428f254537 100644 --- a/lib/Target/AArch64/AArch64CallLowering.h +++ b/lib/Target/AArch64/AArch64CallLowering.h @@ -1,9 +1,8 @@ //===- AArch64CallLowering.h - Call lowering --------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -35,14 +34,24 @@ public: AArch64CallLowering(const AArch64TargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const override; + ArrayRef VRegs, + Register SwiftErrorVReg) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef VRegs) const override; + ArrayRef> VRegs) const override; + + bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, + const MachineOperand &Callee, const ArgInfo &OrigRet, + ArrayRef OrigArgs, + Register SwiftErrorVReg) const override; bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef OrigArgs) const override; + ArrayRef OrigArgs) const override { + return lowerCall(MIRBuilder, CallConv, Callee, OrigRet, OrigArgs, 0); + } + + bool supportSwiftError() const override { return true; } private: using RegHandler = std::function; - using SplitArgTy = std::function; - void splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, - CallingConv::ID CallConv, - const SplitArgTy &SplitArg) const; + CallingConv::ID CallConv) const; }; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64CallingConvention.cpp b/lib/Target/AArch64/AArch64CallingConvention.cpp new file mode 100644 index 000000000000..02538a187611 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -0,0 +1,134 @@ +//=== AArch64CallingConvention.cpp - AArch64 CC impl ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the table-generated and custom routines for the AArch64 +// Calling Convention. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64CallingConvention.h" +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/IR/CallingConv.h" +using namespace llvm; + +static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; + +static bool finishStackBlock(SmallVectorImpl &PendingMembers, + MVT LocVT, ISD::ArgFlagsTy &ArgFlags, + CCState &State, unsigned SlotAlign) { + unsigned Size = LocVT.getSizeInBits() / 8; + unsigned StackAlign = + State.getMachineFunction().getDataLayout().getStackAlignment(); + unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); + + for (auto &It : PendingMembers) { + It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign))); + State.addLoc(It); + SlotAlign = 1; + } + + // All pending members have now been allocated + PendingMembers.clear(); + return true; +} + +/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An +/// [N x Ty] type must still be contiguous in memory though. +static bool CC_AArch64_Custom_Stack_Block( + unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + SmallVectorImpl &PendingMembers = State.getPendingLocs(); + + // Add the argument to the list to be allocated once we know the size of the + // block. + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8); +} + +/// Given an [N x Ty] block, it should be passed in a consecutive sequence of +/// registers. If no such sequence is available, mark the rest of the registers +/// of that type as used and place the argument on the stack. +static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + // Try to allocate a contiguous block of registers, each of the correct + // size to hold one member. + ArrayRef RegList; + if (LocVT.SimpleTy == MVT::i64) + RegList = XRegList; + else if (LocVT.SimpleTy == MVT::f16) + RegList = HRegList; + else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector()) + RegList = SRegList; + else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector()) + RegList = DRegList; + else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector()) + RegList = QRegList; + else { + // Not an array we want to split up after all. + return false; + } + + SmallVectorImpl &PendingMembers = State.getPendingLocs(); + + // Add the argument to the list to be allocated once we know the size of the + // block. 
+ PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); + if (RegResult) { + for (auto &It : PendingMembers) { + It.convertToReg(RegResult); + State.addLoc(It); + ++RegResult; + } + PendingMembers.clear(); + return true; + } + + // Mark all regs in the class as unavailable + for (auto Reg : RegList) + State.AllocateReg(Reg); + + const AArch64Subtarget &Subtarget = static_cast( + State.getMachineFunction().getSubtarget()); + unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; + + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); +} + +// TableGen provides definitions of the calling convention analysis entry +// points. +#include "AArch64GenCallingConv.inc" diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h index 461c01318d4e..13cc0c583fd2 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.h +++ b/lib/Target/AArch64/AArch64CallingConvention.h @@ -1,139 +1,45 @@ -//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// +//=== AArch64CallingConvention.h - AArch64 CC entry points ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // -// This file contains the custom routines for the AArch64 Calling Convention -// that aren't done by tablegen. +// This file declares the entry points for AArch64 calling convention analysis. 
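For reference, a standalone toy model of the contiguous-block search that CC_AArch64_Custom_Block above relies on (CCState::AllocateRegBlock in the real code); the register names are real, but the occupancy data and the linear scan are invented for illustration:

    #include <cstdio>

    static const char *XRegs[] = {"x0", "x1", "x2", "x3",
                                  "x4", "x5", "x6", "x7"};

    int main() {
      bool Used[8] = {false, false, true, false, false, false, false, false};
      const unsigned N = 3; // e.g. a [3 x i64] block needs 3 consecutive regs
      for (unsigned Start = 0; Start + N <= 8; ++Start) {
        bool Free = true;
        for (unsigned I = 0; I < N; ++I)
          Free = Free && !Used[Start + I];
        if (Free) { // finds x3..x5 here, since x2 is already taken
          printf("block in %s..%s\n", XRegs[Start], XRegs[Start + N - 1]);
          return 0;
        }
      }
      // No contiguous run: burn the whole class and fall back to the stack,
      // as the routine above does with State.AllocateReg over RegList.
      puts("no contiguous block: mark x0-x7 used, pass on the stack");
    }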
// //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H #define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H -#include "AArch64.h" -#include "AArch64InstrInfo.h" -#include "AArch64Subtarget.h" #include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/IR/CallingConv.h" - -namespace { -using namespace llvm; - -static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, - AArch64::X3, AArch64::X4, AArch64::X5, - AArch64::X6, AArch64::X7}; -static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, - AArch64::H3, AArch64::H4, AArch64::H5, - AArch64::H6, AArch64::H7}; -static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, - AArch64::S3, AArch64::S4, AArch64::S5, - AArch64::S6, AArch64::S7}; -static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, - AArch64::D3, AArch64::D4, AArch64::D5, - AArch64::D6, AArch64::D7}; -static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, - AArch64::Q3, AArch64::Q4, AArch64::Q5, - AArch64::Q6, AArch64::Q7}; - -static bool finishStackBlock(SmallVectorImpl &PendingMembers, - MVT LocVT, ISD::ArgFlagsTy &ArgFlags, - CCState &State, unsigned SlotAlign) { - unsigned Size = LocVT.getSizeInBits() / 8; - unsigned StackAlign = - State.getMachineFunction().getDataLayout().getStackAlignment(); - unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); - - for (auto &It : PendingMembers) { - It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign))); - State.addLoc(It); - SlotAlign = 1; - } - - // All pending members have now been allocated - PendingMembers.clear(); - return true; -} - -/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An -/// [N x Ty] type must still be contiguous in memory though. -static bool CC_AArch64_Custom_Stack_Block( - unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { - SmallVectorImpl &PendingMembers = State.getPendingLocs(); - - // Add the argument to the list to be allocated once we know the size of the - // block. - PendingMembers.push_back( - CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); - - if (!ArgFlags.isInConsecutiveRegsLast()) - return true; - - return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8); -} - -/// Given an [N x Ty] block, it should be passed in a consecutive sequence of -/// registers. If no such sequence is available, mark the rest of the registers -/// of that type as used and place the argument on the stack. -static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { - // Try to allocate a contiguous block of registers, each of the correct - // size to hold one member. - ArrayRef RegList; - if (LocVT.SimpleTy == MVT::i64) - RegList = XRegList; - else if (LocVT.SimpleTy == MVT::f16) - RegList = HRegList; - else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector()) - RegList = SRegList; - else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector()) - RegList = DRegList; - else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector()) - RegList = QRegList; - else { - // Not an array we want to split up after all. 
- return false; - } - - SmallVectorImpl &PendingMembers = State.getPendingLocs(); - - // Add the argument to the list to be allocated once we know the size of the - // block. - PendingMembers.push_back( - CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); - - if (!ArgFlags.isInConsecutiveRegsLast()) - return true; - - unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); - if (RegResult) { - for (auto &It : PendingMembers) { - It.convertToReg(RegResult); - State.addLoc(It); - ++RegResult; - } - PendingMembers.clear(); - return true; - } - - // Mark all regs in the class as unavailable - for (auto Reg : RegList) - State.AllocateReg(Reg); - - const AArch64Subtarget &Subtarget = static_cast( - State.getMachineFunction().getSubtarget()); - unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; - - return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); -} -} +namespace llvm { +bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool RetCC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); +} // namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 5db941e9dac7..d969a9e1ab3a 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -1,9 +1,8 @@ //=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -22,6 +21,7 @@ class CCIfBigEndian : // ARM AAPCS64 Calling Convention //===----------------------------------------------------------------------===// +let Entry = 1 in def CC_AArch64_AAPCS : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, @@ -34,7 +34,23 @@ def CC_AArch64_AAPCS : CallingConv<[ CCIfBigEndian>>, - // An SRet is passed in X8, not X0 like a normal pointer parameter. + // In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter. + // However, on windows, in some circumstances, the SRet is passed in X0 or X1 + // instead. 
The presence of the inreg attribute indicates that SRet is + // passed in the alternative register (X0 or X1), not X8: + // - X0 for non-instance methods. + // - X1 for instance methods. + + // The "sret" attribute identifies indirect returns. + // The "inreg" attribute identifies non-aggregate types. + // The position of the "sret" attribute identifies instance/non-instance + // methods. + // "sret" on argument 0 means non-instance methods. + // "sret" on argument 1 means instance methods. + + CCIfInReg>>>>, + CCIfSRet>>, // Put ByVal arguments directly on the stack. Minimum size and alignment of a @@ -89,6 +105,7 @@ def CC_AArch64_AAPCS : CallingConv<[ CCAssignToStack<16, 16>> ]>; +let Entry = 1 in def RetCC_AArch64_AAPCS : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, @@ -122,6 +139,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[ ]>; // Vararg functions on windows pass floats in integer registers +let Entry = 1 in def CC_AArch64_Win64_VarArg : CallingConv<[ CCIfType<[f16, f32], CCPromoteToType>, CCIfType<[f64], CCBitConvertToType>, @@ -133,6 +151,7 @@ def CC_AArch64_Win64_VarArg : CallingConv<[ // from the standard one at this level: // + i128s (i.e. split i64s) don't need even registers. // + Stack slots are sized as needed rather than being at least 64-bit. +let Entry = 1 in def CC_AArch64_DarwinPCS : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, @@ -189,6 +208,7 @@ def CC_AArch64_DarwinPCS : CallingConv<[ CCAssignToStack<16, 16>> ]>; +let Entry = 1 in def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, @@ -213,6 +233,7 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ // in register and the remaining arguments on stack. We allow 32bit stack slots, // so that WebKit can write partial values in the stack and define the other // 32bit quantity as undef. +let Entry = 1 in def CC_AArch64_WebKit_JS : CallingConv<[ // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). CCIfType<[i1, i8, i16], CCPromoteToType>, @@ -224,6 +245,7 @@ def CC_AArch64_WebKit_JS : CallingConv<[ CCIfType<[i64, f64], CCAssignToStack<8, 8>> ]>; +let Entry = 1 in def RetCC_AArch64_WebKit_JS : CallingConv<[ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], [X0, X1, X2, X3, X4, X5, X6, X7]>>, @@ -257,6 +279,7 @@ def RetCC_AArch64_WebKit_JS : CallingConv<[ // The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI // register mapping". +let Entry = 1 in def CC_AArch64_GHC : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index b88fba4452a1..688bd1b28e85 100644 --- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -1,9 +1,8 @@ //===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 720323f81d29..9f324b433209 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -1,9 +1,8 @@ //===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/lib/Target/AArch64/AArch64CompressJumpTables.cpp index 0924a27e2586..48dab79b32d3 100644 --- a/lib/Target/AArch64/AArch64CompressJumpTables.cpp +++ b/lib/Target/AArch64/AArch64CompressJumpTables.cpp @@ -1,9 +1,8 @@ //==-- AArch64CompressJumpTables.cpp - Compress jump tables for AArch64 --====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // This pass looks at the basic blocks each jump-table refers to and works out // whether they can be emitted in a compressed form (with 8 or 16-bit @@ -108,6 +107,7 @@ bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI, MinBlock = Block; } } + assert(MinBlock && "Failed to find minimum offset block"); // The ADR instruction needed to calculate the address of the first reachable // basic block can address +/-1MB. @@ -141,7 +141,7 @@ bool AArch64CompressJumpTables::runOnMachineFunction(MachineFunction &MFIn) { const auto &ST = MF->getSubtarget(); TII = ST.getInstrInfo(); - if (ST.force32BitJumpTables() && !MF->getFunction().optForMinSize()) + if (ST.force32BitJumpTables() && !MF->getFunction().hasMinSize()) return false; scanFunction(); diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp index 5ae787409ae8..453132e09669 100644 --- a/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -1,9 +1,8 @@ //===-- AArch64CondBrTuning.cpp --- Conditional branch tuning for AArch64 -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 5064762b9f77..a6efb115ed44 100644 --- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -1,9 +1,8 @@ //=- AArch64ConditionOptimizer.cpp - Remove useless comparisons for AArch64 -=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 8176b6fb269d..2cfbcc592d6a 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -1,9 +1,8 @@ //===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -941,7 +940,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { MBPI = &getAnalysis(); Traces = &getAnalysis(); MinInstr = nullptr; - MinSize = MF.getFunction().optForMinSize(); + MinSize = MF.getFunction().hasMinSize(); bool Changed = false; CmpConv.runOnMachineFunction(MF, MBPI); diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 2ba10d25e939..a43077cb88ec 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -1,9 +1,8 @@ //==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file When allowed by the instruction, replace a dead definition of a GPR @@ -55,8 +54,6 @@ public: AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } - - bool shouldSkip(const MachineInstr &MI, const MachineFunction &MF) const; }; char AArch64DeadRegisterDefinitions::ID = 0; } // end anonymous namespace @@ -71,60 +68,48 @@ static bool usesFrameIndex(const MachineInstr &MI) { return false; } -bool -AArch64DeadRegisterDefinitions::shouldSkip(const MachineInstr &MI, - const MachineFunction &MF) const { - if (!MF.getSubtarget().hasLSE()) - return false; - -#define CASE_AARCH64_ATOMIC_(PREFIX) \ - case AArch64::PREFIX##X: \ - case AArch64::PREFIX##W: \ - case AArch64::PREFIX##H: \ - case AArch64::PREFIX##B - - for (const MachineMemOperand *MMO : MI.memoperands()) { - if (MMO->isAtomic()) { - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - default: - return false; - break; - - CASE_AARCH64_ATOMIC_(LDADDA): - CASE_AARCH64_ATOMIC_(LDADDAL): - - CASE_AARCH64_ATOMIC_(LDCLRA): - CASE_AARCH64_ATOMIC_(LDCLRAL): - - CASE_AARCH64_ATOMIC_(LDEORA): - CASE_AARCH64_ATOMIC_(LDEORAL): - - CASE_AARCH64_ATOMIC_(LDSETA): - CASE_AARCH64_ATOMIC_(LDSETAL): - - CASE_AARCH64_ATOMIC_(LDSMAXA): - CASE_AARCH64_ATOMIC_(LDSMAXAL): - - CASE_AARCH64_ATOMIC_(LDSMINA): - CASE_AARCH64_ATOMIC_(LDSMINAL): - - CASE_AARCH64_ATOMIC_(LDUMAXA): - CASE_AARCH64_ATOMIC_(LDUMAXAL): - - CASE_AARCH64_ATOMIC_(LDUMINA): - CASE_AARCH64_ATOMIC_(LDUMINAL): - - CASE_AARCH64_ATOMIC_(SWPA): - CASE_AARCH64_ATOMIC_(SWPAL): - return true; - break; - } - } +// Instructions that lose their 'read' operation for a subsequent fence acquire +// (DMB LD) once the zero register is used. +// +// WARNING: The acquire variants of the instructions are also affected, but they +// are split out into `atomicBarrierDroppedOnZero()` to support annotations on +// assembly.
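A minimal sketch of the guard the pass now applies before substituting XZR/WZR, with stand-in predicates (the real ones classify AArch64 opcodes, as in the function that follows; the opcode values here are placeholders):

    #include <cassert>

    // Stand-ins for the two predicates named in this patch.
    static bool atomicBarrierDroppedOnZero(unsigned Op) { return Op == 1; }
    static bool atomicReadDroppedOnZero(unsigned Op) { return Op == 2; }

    // A dead def may be rewritten to XZR/WZR only when neither predicate
    // fires; otherwise the 'read' half of the atomic (and any DMB LD
    // ordering built on it) would silently disappear.
    static bool mayUseZeroReg(unsigned Op) {
      return !atomicBarrierDroppedOnZero(Op) && !atomicReadDroppedOnZero(Op);
    }

    int main() { assert(mayUseZeroReg(0) && !mayUseZeroReg(2)); }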
+static bool atomicReadDroppedOnZero(unsigned Opcode) { + switch (Opcode) { + case AArch64::LDADDB: case AArch64::LDADDH: + case AArch64::LDADDW: case AArch64::LDADDX: + case AArch64::LDADDLB: case AArch64::LDADDLH: + case AArch64::LDADDLW: case AArch64::LDADDLX: + case AArch64::LDCLRB: case AArch64::LDCLRH: + case AArch64::LDCLRW: case AArch64::LDCLRX: + case AArch64::LDCLRLB: case AArch64::LDCLRLH: + case AArch64::LDCLRLW: case AArch64::LDCLRLX: + case AArch64::LDEORB: case AArch64::LDEORH: + case AArch64::LDEORW: case AArch64::LDEORX: + case AArch64::LDEORLB: case AArch64::LDEORLH: + case AArch64::LDEORLW: case AArch64::LDEORLX: + case AArch64::LDSETB: case AArch64::LDSETH: + case AArch64::LDSETW: case AArch64::LDSETX: + case AArch64::LDSETLB: case AArch64::LDSETLH: + case AArch64::LDSETLW: case AArch64::LDSETLX: + case AArch64::LDSMAXB: case AArch64::LDSMAXH: + case AArch64::LDSMAXW: case AArch64::LDSMAXX: + case AArch64::LDSMAXLB: case AArch64::LDSMAXLH: + case AArch64::LDSMAXLW: case AArch64::LDSMAXLX: + case AArch64::LDSMINB: case AArch64::LDSMINH: + case AArch64::LDSMINW: case AArch64::LDSMINX: + case AArch64::LDSMINLB: case AArch64::LDSMINLH: + case AArch64::LDSMINLW: case AArch64::LDSMINLX: + case AArch64::LDUMAXB: case AArch64::LDUMAXH: + case AArch64::LDUMAXW: case AArch64::LDUMAXX: + case AArch64::LDUMAXLB: case AArch64::LDUMAXLH: + case AArch64::LDUMAXLW: case AArch64::LDUMAXLX: + case AArch64::LDUMINB: case AArch64::LDUMINH: + case AArch64::LDUMINW: case AArch64::LDUMINX: + case AArch64::LDUMINLB: case AArch64::LDUMINLH: + case AArch64::LDUMINLW: case AArch64::LDUMINLX: + return true; } - -#undef CASE_AARCH64_ATOMIC_ - return false; } @@ -148,9 +133,8 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( continue; } - if (shouldSkip(MI, MF)) { - LLVM_DEBUG(dbgs() << " Ignoring, Atomic instruction with acquire " - "semantics using WZR/XZR\n"); + if (atomicBarrierDroppedOnZero(MI.getOpcode()) || atomicReadDroppedOnZero(MI.getOpcode())) { + LLVM_DEBUG(dbgs() << " Ignoring, semantics change with xzr/wzr.\n"); continue; } diff --git a/lib/Target/AArch64/AArch64ExpandImm.cpp b/lib/Target/AArch64/AArch64ExpandImm.cpp new file mode 100644 index 000000000000..c764af80eb86 --- /dev/null +++ b/lib/Target/AArch64/AArch64ExpandImm.cpp @@ -0,0 +1,411 @@ +//===- AArch64ExpandImm.h - AArch64 Immediate Expansion -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64ExpandImm stuff. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64ExpandImm.h" +#include "MCTargetDesc/AArch64AddressingModes.h" + +namespace llvm { + +namespace AArch64_IMM { + +/// Helper function which extracts the specified 16-bit chunk from a +/// 64-bit value. +static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) { + assert(ChunkIdx < 4 && "Out of range chunk index specified!"); + + return (Imm >> (ChunkIdx * 16)) & 0xFFFF; +} + +/// Check whether the given 16-bit chunk replicated to full 64-bit width +/// can be materialized with an ORR instruction. 
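A standalone illustration of the chunk decomposition these helpers build on; the constant is invented so that three of its four chunks match (the |A|B|A|A| shape described below), with A = 0xFF00 chosen so the replicated value is a valid logical immediate:

    #include <cassert>
    #include <cstdint>

    static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
      assert(ChunkIdx < 4 && "Out of range chunk index specified!");
      return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
    }

    int main() {
      const uint64_t Imm = 0xFF001234FF00FF00ULL; // chunks |A|B|A|A|
      assert(getChunk(Imm, 0) == 0xFF00);
      assert(getChunk(Imm, 1) == 0xFF00);
      assert(getChunk(Imm, 2) == 0x1234);
      assert(getChunk(Imm, 3) == 0xFF00);
      // 0xFF00 replicated four times (0xFF00FF00FF00FF00) encodes as a
      // logical immediate, so ORR materializes it and a single MOVK of
      // 0x1234 at LSL #32 patches in the odd chunk out.
    }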
+static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { + Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; + + return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding); +} + +/// Check for identical 16-bit chunks within the constant and if so +/// materialize them with a single ORR instruction. The remaining one or two +/// 16-bit chunks will be materialized with MOVK instructions. +/// +/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order +/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with +/// an ORR instruction. +static bool tryToreplicateChunks(uint64_t UImm, + SmallVectorImpl &Insn) { + using CountMap = DenseMap; + + CountMap Counts; + + // Scan the constant and count how often every chunk occurs. + for (unsigned Idx = 0; Idx < 4; ++Idx) + ++Counts[getChunk(UImm, Idx)]; + + // Traverse the chunks to find one which occurs more than once. + for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); + Chunk != End; ++Chunk) { + const uint64_t ChunkVal = Chunk->first; + const unsigned Count = Chunk->second; + + uint64_t Encoding = 0; + + // We are looking for chunks which have two or three instances and can be + // materialized with an ORR instruction. + if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding)) + continue; + + const bool CountThree = Count == 3; + + Insn.push_back({ AArch64::ORRXri, 0, Encoding }); + + unsigned ShiftAmt = 0; + uint64_t Imm16 = 0; + // Find the first chunk not materialized with the ORR instruction. + for (; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + + // Create the first MOVK instruction. + Insn.push_back({ AArch64::MOVKXi, Imm16, + AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt) }); + + // In case we have three instances the whole constant is now materialized + // and we can exit. + if (CountThree) + return true; + + // Find the remaining chunk which needs to be materialized. + for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + Insn.push_back({ AArch64::MOVKXi, Imm16, + AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt) }); + return true; + } + + return false; +} + +/// Check whether this chunk matches the pattern '1...0...'. This pattern +/// starts a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isStartChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == std::numeric_limits::max()) + return false; + + return isMask_64(~Chunk); +} + +/// Check whether this chunk matches the pattern '0...1...' This pattern +/// ends a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isEndChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == std::numeric_limits::max()) + return false; + + return isMask_64(Chunk); +} + +/// Clear or set all bits in the chunk at the given index. +static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { + const uint64_t Mask = 0xFFFF; + + if (Clear) + // Clear chunk in the immediate. + Imm &= ~(Mask << (Idx * 16)); + else + // Set all bits in the immediate for the particular chunk. + Imm |= Mask << (Idx * 16); + + return Imm; +} + +/// Check whether the constant contains a sequence of contiguous ones, +/// which might be interrupted by one or two chunks. If so, materialize the +/// sequence of contiguous ones with an ORR instruction. 
+/// Materialize the chunks which are either interrupting the sequence or outside +/// of the sequence with a MOVK instruction. +/// +/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk +/// which ends the sequence (0...1...). Then we are looking for constants which +/// contain at least one S and E chunk. +/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. +/// +/// We are also looking for constants like |S|A|B|E| where the contiguous +/// sequence of ones wraps around the MSB into the LSB. +static bool trySequenceOfOnes(uint64_t UImm, + SmallVectorImpl &Insn) { + const int NotSet = -1; + const uint64_t Mask = 0xFFFF; + + int StartIdx = NotSet; + int EndIdx = NotSet; + // Try to find the chunks which start/end a contiguous sequence of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + int64_t Chunk = getChunk(UImm, Idx); + // Sign extend the 16-bit chunk to 64-bit. + Chunk = (Chunk << 48) >> 48; + + if (isStartChunk(Chunk)) + StartIdx = Idx; + else if (isEndChunk(Chunk)) + EndIdx = Idx; + } + + // Early exit in case we can't find a start/end chunk. + if (StartIdx == NotSet || EndIdx == NotSet) + return false; + + // Outside of the contiguous sequence of ones everything needs to be zero. + uint64_t Outside = 0; + // Chunks between the start and end chunk need to have all their bits set. + uint64_t Inside = Mask; + + // If our contiguous sequence of ones wraps around from the MSB into the LSB, + // just swap indices and pretend we are materializing a contiguous sequence + // of zeros surrounded by a contiguous sequence of ones. + if (StartIdx > EndIdx) { + std::swap(StartIdx, EndIdx); + std::swap(Outside, Inside); + } + + uint64_t OrrImm = UImm; + int FirstMovkIdx = NotSet; + int SecondMovkIdx = NotSet; + + // Find out which chunks we need to patch up to obtain a contiguous sequence + // of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + const uint64_t Chunk = getChunk(UImm, Idx); + + // Check whether we are looking at a chunk which is not part of the + // contiguous sequence of ones. + if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) { + OrrImm = updateImm(OrrImm, Idx, Outside == 0); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + + // Check whether we are looking at a chunk which is part of the contiguous + // sequence of ones. + } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) { + OrrImm = updateImm(OrrImm, Idx, Inside != Mask); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + } + } + assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); + + // Create the ORR-immediate instruction. + uint64_t Encoding = 0; + AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding); + Insn.push_back({ AArch64::ORRXri, 0, Encoding }); + + const bool SingleMovk = SecondMovkIdx == NotSet; + Insn.push_back({ AArch64::MOVKXi, getChunk(UImm, FirstMovkIdx), + AArch64_AM::getShifterImm(AArch64_AM::LSL, + FirstMovkIdx * 16) }); + + // Early exit in case we only need to emit a single MOVK instruction. + if (SingleMovk) + return true; + + // Create the second MOVK instruction. + Insn.push_back({ AArch64::MOVKXi, getChunk(UImm, SecondMovkIdx), + AArch64_AM::getShifterImm(AArch64_AM::LSL, + SecondMovkIdx * 16) }); + + return true; +} + +/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a +/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
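A self-contained worked example of the simple expansion implemented below. For brevity it anchors the first move at chunk 0, whereas the function below computes the start position from countTrailingZeros:

    #include <cstdint>
    #include <cstdio>

    // Expected output for Imm = 0x0000DEAD0000BEEF (two all-zero chunks):
    //   movz x0, #0xbeef
    //   movk x0, #0xdead, lsl #32
    int main() {
      const uint64_t Imm = 0x0000DEAD0000BEEFULL;
      unsigned OneChunks = 0, ZeroChunks = 0;
      for (unsigned Shift = 0; Shift < 64; Shift += 16) {
        const unsigned Chunk = (Imm >> Shift) & 0xFFFF;
        if (Chunk == 0xFFFF)
          ++OneChunks;
        else if (Chunk == 0)
          ++ZeroChunks;
      }
      const bool UseMovn = OneChunks > ZeroChunks; // MOVN for mostly-ones
      const uint64_t Work = UseMovn ? ~Imm : Imm;
      printf("%s x0, #0x%04llx\n", UseMovn ? "movn" : "movz",
             (unsigned long long)(Work & 0xFFFF));
      // MOVK patches each remaining chunk the first move got wrong.
      for (unsigned Shift = 16; Shift < 64; Shift += 16) {
        const unsigned Chunk = (unsigned)((Imm >> Shift) & 0xFFFF);
        if (Chunk == (UseMovn ? 0xFFFFu : 0u))
          continue; // already correct after the MOVZ/MOVN
        printf("movk x0, #0x%04x, lsl #%u\n", Chunk, Shift);
      }
    }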
+static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize, + unsigned OneChunks, unsigned ZeroChunks, + SmallVectorImpl &Insn) { + const unsigned Mask = 0xFFFF; + + // Use a MOVZ or MOVN instruction to set the high bits, followed by one or + // more MOVK instructions to insert additional 16-bit portions into the + // lower bits. + bool isNeg = false; + + // Use MOVN to materialize the high bits if we have more all one chunks + // than all zero chunks. + if (OneChunks > ZeroChunks) { + isNeg = true; + Imm = ~Imm; + } + + unsigned FirstOpc; + if (BitSize == 32) { + Imm &= (1LL << 32) - 1; + FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi); + } else { + FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi); + } + unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN + unsigned LastShift = 0; // LSL amount for last MOVK + if (Imm != 0) { + unsigned LZ = countLeadingZeros(Imm); + unsigned TZ = countTrailingZeros(Imm); + Shift = (TZ / 16) * 16; + LastShift = ((63 - LZ) / 16) * 16; + } + unsigned Imm16 = (Imm >> Shift) & Mask; + + Insn.push_back({ FirstOpc, Imm16, + AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) }); + + if (Shift == LastShift) + return; + + // If a MOVN was used for the high bits of a negative value, flip the rest + // of the bits back for use with MOVK. + if (isNeg) + Imm = ~Imm; + + unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi); + while (Shift < LastShift) { + Shift += 16; + Imm16 = (Imm >> Shift) & Mask; + if (Imm16 == (isNeg ? Mask : 0)) + continue; // This 16-bit portion is already set correctly. + + Insn.push_back({ Opc, Imm16, + AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) }); + } +} + +/// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more +/// real move-immediate instructions to synthesize the immediate. +void expandMOVImm(uint64_t Imm, unsigned BitSize, + SmallVectorImpl &Insn) { + const unsigned Mask = 0xFFFF; + + // Scan the immediate and count the number of 16-bit chunks which are either + // all ones or all zeros. + unsigned OneChunks = 0; + unsigned ZeroChunks = 0; + for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { + const unsigned Chunk = (Imm >> Shift) & Mask; + if (Chunk == Mask) + OneChunks++; + else if (Chunk == 0) + ZeroChunks++; + } + + // Prefer MOVZ/MOVN over ORR because of the rules for the "mov" alias. + if ((BitSize / 16) - OneChunks <= 1 || (BitSize / 16) - ZeroChunks <= 1) { + expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn); + return; + } + + // Try a single ORR. + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri); + Insn.push_back({ Opc, 0, Encoding }); + return; + } + + // Two instruction sequences. + // + // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the + // fastest sequence with fast literal generation. + if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2) { + expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn); + return; + } + + assert(BitSize == 64 && "All 32-bit immediates can be expanded with a " + "MOVZ/MOVK pair"); + + // Try other two-instruction sequences. + + // 64-bit ORR followed by MOVK.
+ // We try to construct the ORR immediate in three different ways: either we + // zero out the chunk which will be replaced, we fill the chunk which will + // be replaced with ones, or we take the bit pattern from the other half of + // the 64-bit immediate. This is comprehensive because of the way ORR + // immediates are constructed. + for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { + uint64_t ShiftedMask = (0xFFFFULL << Shift); + uint64_t ZeroChunk = UImm & ~ShiftedMask; + uint64_t OneChunk = UImm | ShiftedMask; + uint64_t RotatedImm = (UImm << 32) | (UImm >> 32); + uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask); + if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) || + AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) || + AArch64_AM::processLogicalImmediate(ReplicateChunk, BitSize, + Encoding)) { + // Create the ORR-immediate instruction. + Insn.push_back({ AArch64::ORRXri, 0, Encoding }); + + // Create the MOVK instruction. + const unsigned Imm16 = getChunk(UImm, Shift / 16); + Insn.push_back({ AArch64::MOVKXi, Imm16, + AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) }); + return; + } + } + + // FIXME: Add more two-instruction sequences. + + // Three instruction sequences. + // + // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly + // the fastest sequence with fast literal generation. (If neither MOVK is + // part of a fast literal generation pair, it could be slower than the + // four-instruction sequence, but we won't worry about that for now.) + if (OneChunks || ZeroChunks) { + expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn); + return; + } + + // Check for identical 16-bit chunks within the constant and if so materialize + // them with a single ORR instruction. The remaining one or two 16-bit chunks + // will be materialized with MOVK instructions. + if (BitSize == 64 && tryToreplicateChunks(UImm, Insn)) + return; + + // Check whether the constant contains a sequence of contiguous ones, which + // might be interrupted by one or two chunks. If so, materialize the sequence + // of contiguous ones with an ORR instruction. Materialize the chunks which + // are either interrupting the sequence or outside of the sequence with a + // MOVK instruction. + if (BitSize == 64 && trySequenceOfOnes(UImm, Insn)) + return; + + // We found no possible two or three instruction sequence; use the general + // four-instruction sequence. + expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn); +} + +} // end namespace AArch64_AM + +} // end namespace llvm diff --git a/lib/Target/AArch64/AArch64ExpandImm.h b/lib/Target/AArch64/AArch64ExpandImm.h new file mode 100644 index 000000000000..42c97d2c3e9b --- /dev/null +++ b/lib/Target/AArch64/AArch64ExpandImm.h @@ -0,0 +1,35 @@ +//===- AArch64ExpandImm.h - AArch64 Immediate Expansion ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 immediate expansion stuff. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64EXPANDIMM_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64EXPANDIMM_H + +#include "llvm/ADT/SmallVector.h" + +namespace llvm { + +namespace AArch64_IMM { + +struct ImmInsnModel { + unsigned Opcode; + uint64_t Op1; + uint64_t Op2; +}; + +void expandMOVImm(uint64_t Imm, unsigned BitSize, + SmallVectorImpl &Insn); + +} // end namespace AArch64_IMM + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index f7190d58fbf9..210c10eb1842 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1,9 +1,8 @@ //===- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,7 +13,9 @@ // //===----------------------------------------------------------------------===// +#include "AArch64ExpandImm.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" @@ -66,11 +67,6 @@ private: MachineBasicBlock::iterator &NextMBBI); bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); - bool expandMOVImmSimple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned BitSize, - unsigned OneChunks, - unsigned ZeroChunks); bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, @@ -79,6 +75,9 @@ private: bool expandCMP_SWAP_128(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); + bool expandSetTagLoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); }; } // end anonymous namespace @@ -104,279 +103,6 @@ static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, } } -/// Helper function which extracts the specified 16-bit chunk from a -/// 64-bit value. -static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) { - assert(ChunkIdx < 4 && "Out of range chunk index specified!"); - - return (Imm >> (ChunkIdx * 16)) & 0xFFFF; -} - -/// Check whether the given 16-bit chunk replicated to full 64-bit width -/// can be materialized with an ORR instruction. -static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { - Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; - - return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding); -} - -/// Check for identical 16-bit chunks within the constant and if so -/// materialize them with a single ORR instruction. The remaining one or two -/// 16-bit chunks will be materialized with MOVK instructions. -/// -/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order -/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with -/// an ORR instruction. 
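The net effect of the refactor: the hunks below delete the BuildMI-heavy expansion helpers from this pass, which now asks AArch64_IMM::expandMOVImm for an opcode/operand recipe and emits one MachineInstr per entry. A usage sketch, assuming an LLVM tree with this patch applied (not standalone):

      SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
      AArch64_IMM::expandMOVImm(0x0000DEAD0000BEEFULL, 64, Insn);
      // Insn now holds { MOVZXi, 0xBEEF, LSL #0 } followed by
      // { MOVKXi, 0xDEAD, LSL #32 }; the rewritten expandMOVImm() in this
      // pass walks the list and BuildMI()s each entry in turn.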
-static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const AArch64InstrInfo *TII) { - using CountMap = DenseMap; - - CountMap Counts; - - // Scan the constant and count how often every chunk occurs. - for (unsigned Idx = 0; Idx < 4; ++Idx) - ++Counts[getChunk(UImm, Idx)]; - - // Traverse the chunks to find one which occurs more than once. - for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); - Chunk != End; ++Chunk) { - const uint64_t ChunkVal = Chunk->first; - const unsigned Count = Chunk->second; - - uint64_t Encoding = 0; - - // We are looking for chunks which have two or three instances and can be - // materialized with an ORR instruction. - if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding)) - continue; - - const bool CountThree = Count == 3; - // Create the ORR-immediate instruction. - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) - .add(MI.getOperand(0)) - .addReg(AArch64::XZR) - .addImm(Encoding); - - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - - unsigned ShiftAmt = 0; - uint64_t Imm16 = 0; - // Find the first chunk not materialized with the ORR instruction. - for (; ShiftAmt < 64; ShiftAmt += 16) { - Imm16 = (UImm >> ShiftAmt) & 0xFFFF; - - if (Imm16 != ChunkVal) - break; - } - - // Create the first MOVK instruction. - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) - .addReg(DstReg, - RegState::Define | getDeadRegState(DstIsDead && CountThree)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); - - // In case we have three instances the whole constant is now materialized - // and we can exit. - if (CountThree) { - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - // Find the remaining chunk which needs to be materialized. - for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) { - Imm16 = (UImm >> ShiftAmt) & 0xFFFF; - - if (Imm16 != ChunkVal) - break; - } - - // Create the second MOVK instruction. - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); - - transferImpOps(MI, MIB, MIB2); - MI.eraseFromParent(); - return true; - } - - return false; -} - -/// Check whether this chunk matches the pattern '1...0...'. This pattern -/// starts a contiguous sequence of ones if we look at the bits from the LSB -/// towards the MSB. -static bool isStartChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == std::numeric_limits::max()) - return false; - - return isMask_64(~Chunk); -} - -/// Check whether this chunk matches the pattern '0...1...' This pattern -/// ends a contiguous sequence of ones if we look at the bits from the LSB -/// towards the MSB. -static bool isEndChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == std::numeric_limits::max()) - return false; - - return isMask_64(Chunk); -} - -/// Clear or set all bits in the chunk at the given index. -static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { - const uint64_t Mask = 0xFFFF; - - if (Clear) - // Clear chunk in the immediate. - Imm &= ~(Mask << (Idx * 16)); - else - // Set all bits in the immediate for the particular chunk. 
- Imm |= Mask << (Idx * 16); - - return Imm; -} - -/// Check whether the constant contains a sequence of contiguous ones, -/// which might be interrupted by one or two chunks. If so, materialize the -/// sequence of contiguous ones with an ORR instruction. -/// Materialize the chunks which are either interrupting the sequence or outside -/// of the sequence with a MOVK instruction. -/// -/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk -/// which ends the sequence (0...1...). Then we are looking for constants which -/// contain at least one S and E chunk. -/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. -/// -/// We are also looking for constants like |S|A|B|E| where the contiguous -/// sequence of ones wraps around the MSB into the LSB. -static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const AArch64InstrInfo *TII) { - const int NotSet = -1; - const uint64_t Mask = 0xFFFF; - - int StartIdx = NotSet; - int EndIdx = NotSet; - // Try to find the chunks which start/end a contiguous sequence of ones. - for (int Idx = 0; Idx < 4; ++Idx) { - int64_t Chunk = getChunk(UImm, Idx); - // Sign extend the 16-bit chunk to 64-bit. - Chunk = (Chunk << 48) >> 48; - - if (isStartChunk(Chunk)) - StartIdx = Idx; - else if (isEndChunk(Chunk)) - EndIdx = Idx; - } - - // Early exit in case we can't find a start/end chunk. - if (StartIdx == NotSet || EndIdx == NotSet) - return false; - - // Outside of the contiguous sequence of ones everything needs to be zero. - uint64_t Outside = 0; - // Chunks between the start and end chunk need to have all their bits set. - uint64_t Inside = Mask; - - // If our contiguous sequence of ones wraps around from the MSB into the LSB, - // just swap indices and pretend we are materializing a contiguous sequence - // of zeros surrounded by a contiguous sequence of ones. - if (StartIdx > EndIdx) { - std::swap(StartIdx, EndIdx); - std::swap(Outside, Inside); - } - - uint64_t OrrImm = UImm; - int FirstMovkIdx = NotSet; - int SecondMovkIdx = NotSet; - - // Find out which chunks we need to patch up to obtain a contiguous sequence - // of ones. - for (int Idx = 0; Idx < 4; ++Idx) { - const uint64_t Chunk = getChunk(UImm, Idx); - - // Check whether we are looking at a chunk which is not part of the - // contiguous sequence of ones. - if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) { - OrrImm = updateImm(OrrImm, Idx, Outside == 0); - - // Remember the index we need to patch. - if (FirstMovkIdx == NotSet) - FirstMovkIdx = Idx; - else - SecondMovkIdx = Idx; - - // Check whether we are looking a chunk which is part of the contiguous - // sequence of ones. - } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) { - OrrImm = updateImm(OrrImm, Idx, Inside != Mask); - - // Remember the index we need to patch. - if (FirstMovkIdx == NotSet) - FirstMovkIdx = Idx; - else - SecondMovkIdx = Idx; - } - } - assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); - - // Create the ORR-immediate instruction. - uint64_t Encoding = 0; - AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding); - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) - .add(MI.getOperand(0)) - .addReg(AArch64::XZR) - .addImm(Encoding); - - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - - const bool SingleMovk = SecondMovkIdx == NotSet; - // Create the first MOVK instruction. 
-  MachineInstrBuilder MIB1 =
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
-          .addReg(DstReg,
-                  RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
-          .addReg(DstReg)
-          .addImm(getChunk(UImm, FirstMovkIdx))
-          .addImm(
-              AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));
-
-  // Early exit in case we only need to emit a single MOVK instruction.
-  if (SingleMovk) {
-    transferImpOps(MI, MIB, MIB1);
-    MI.eraseFromParent();
-    return true;
-  }
-
-  // Create the second MOVK instruction.
-  MachineInstrBuilder MIB2 =
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
-          .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
-          .addReg(DstReg)
-          .addImm(getChunk(UImm, SecondMovkIdx))
-          .addImm(
-              AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));
-
-  transferImpOps(MI, MIB, MIB2);
-  MI.eraseFromParent();
-  return true;
-}
-
 /// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
 /// real move-immediate instructions to synthesize the immediate.
 bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
@@ -385,7 +111,6 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
   MachineInstr &MI = *MBBI;
   unsigned DstReg = MI.getOperand(0).getReg();
   uint64_t Imm = MI.getOperand(1).getImm();
-  const unsigned Mask = 0xFFFF;
 
   if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
     // Useless def, and we don't want to risk creating an invalid ORR (which
@@ -394,194 +119,50 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
     return true;
   }
 
-  // Scan the immediate and count the number of 16-bit chunks which are either
-  // all ones or all zeros.
-  unsigned OneChunks = 0;
-  unsigned ZeroChunks = 0;
-  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
-    const unsigned Chunk = (Imm >> Shift) & Mask;
-    if (Chunk == Mask)
-      OneChunks++;
-    else if (Chunk == 0)
-      ZeroChunks++;
-  }
+  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+  AArch64_IMM::expandMOVImm(Imm, BitSize, Insn);
+  assert(Insn.size() != 0);
 
-  // FIXME: Prefer MOVZ/MOVN over ORR because of the rules for the "mov"
-  // alias.
+  SmallVector<MachineInstrBuilder, 4> MIBS;
+  for (auto I = Insn.begin(), E = Insn.end(); I != E; ++I) {
+    bool LastItem = std::next(I) == E;
+    switch (I->Opcode)
+    {
+    default: llvm_unreachable("unhandled!"); break;
+
-  // Try a single ORR.
-  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
-  uint64_t Encoding;
-  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
-    unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
-    MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
-            .add(MI.getOperand(0))
-            .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
-            .addImm(Encoding);
-    transferImpOps(MI, MIB, MIB);
-    MI.eraseFromParent();
-    return true;
-  }
-
-  // Two instruction sequences.
-  //
-  // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the
-  // fastest sequence with fast literal generation.
-  if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2)
-    return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
-
-  assert(BitSize == 64 && "All 32-bit immediates can be expanded with a"
-                          "MOVZ/MOVK pair");
-
-  // Try other two-instruction sequences.
-
-  // 64-bit ORR followed by MOVK.
-  // We try to construct the ORR immediate in three different ways: either we
-  // zero out the chunk which will be replaced, we fill the chunk which will
-  // be replaced with ones, or we take the bit pattern from the other half of
-  // the 64-bit immediate.
This is comprehensive because of the way ORR - // immediates are constructed. - for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { - uint64_t ShiftedMask = (0xFFFFULL << Shift); - uint64_t ZeroChunk = UImm & ~ShiftedMask; - uint64_t OneChunk = UImm | ShiftedMask; - uint64_t RotatedImm = (UImm << 32) | (UImm >> 32); - uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask); - if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) || - AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) || - AArch64_AM::processLogicalImmediate(ReplicateChunk, - BitSize, Encoding)) { - // Create the ORR-immediate instruction. - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) - .add(MI.getOperand(0)) - .addReg(AArch64::XZR) - .addImm(Encoding); - - // Create the MOVK instruction. - const unsigned Imm16 = getChunk(UImm, Shift / 16); - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); - - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; + case AArch64::ORRWri: + case AArch64::ORRXri: + MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) + .add(MI.getOperand(0)) + .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR) + .addImm(I->Op2)); + break; + case AArch64::MOVNWi: + case AArch64::MOVNXi: + case AArch64::MOVZWi: + case AArch64::MOVZXi: { + bool DstIsDead = MI.getOperand(0).isDead(); + MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) + .addReg(DstReg, RegState::Define | + getDeadRegState(DstIsDead && LastItem)) + .addImm(I->Op1) + .addImm(I->Op2)); + } break; + case AArch64::MOVKWi: + case AArch64::MOVKXi: { + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(DstIsDead && LastItem)) + .addReg(DstReg) + .addImm(I->Op1) + .addImm(I->Op2)); + } break; } } - - // FIXME: Add more two-instruction sequences. - - // Three instruction sequences. - // - // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly - // the fastest sequence with fast literal generation. (If neither MOVK is - // part of a fast literal generation pair, it could be slower than the - // four-instruction sequence, but we won't worry about that for now.) - if (OneChunks || ZeroChunks) - return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks); - - // Check for identical 16-bit chunks within the constant and if so materialize - // them with a single ORR instruction. The remaining one or two 16-bit chunks - // will be materialized with MOVK instructions. - if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII)) - return true; - - // Check whether the constant contains a sequence of contiguous ones, which - // might be interrupted by one or two chunks. If so, materialize the sequence - // of contiguous ones with an ORR instruction. Materialize the chunks which - // are either interrupting the sequence or outside of the sequence with a - // MOVK instruction. 
- if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII)) - return true; - - // We found no possible two or three instruction sequence; use the general - // four-instruction sequence. - return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks); -} - -/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a -/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions. -bool AArch64ExpandPseudo::expandMOVImmSimple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned BitSize, - unsigned OneChunks, - unsigned ZeroChunks) { - MachineInstr &MI = *MBBI; - unsigned DstReg = MI.getOperand(0).getReg(); - uint64_t Imm = MI.getOperand(1).getImm(); - const unsigned Mask = 0xFFFF; - - // Use a MOVZ or MOVN instruction to set the high bits, followed by one or - // more MOVK instructions to insert additional 16-bit portions into the - // lower bits. - bool isNeg = false; - - // Use MOVN to materialize the high bits if we have more all one chunks - // than all zero chunks. - if (OneChunks > ZeroChunks) { - isNeg = true; - Imm = ~Imm; - } - - unsigned FirstOpc; - if (BitSize == 32) { - Imm &= (1LL << 32) - 1; - FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi); - } else { - FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi); - } - unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN - unsigned LastShift = 0; // LSL amount for last MOVK - if (Imm != 0) { - unsigned LZ = countLeadingZeros(Imm); - unsigned TZ = countTrailingZeros(Imm); - Shift = (TZ / 16) * 16; - LastShift = ((63 - LZ) / 16) * 16; - } - unsigned Imm16 = (Imm >> Shift) & Mask; - bool DstIsDead = MI.getOperand(0).isDead(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) - .addReg(DstReg, RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) - .addImm(Imm16) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); - - // If a MOVN was used for the high bits of a negative value, flip the rest - // of the bits back for use with MOVK. - if (isNeg) - Imm = ~Imm; - - if (Shift == LastShift) { - transferImpOps(MI, MIB1, MIB1); - MI.eraseFromParent(); - return true; - } - - MachineInstrBuilder MIB2; - unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi); - while (Shift < LastShift) { - Shift += 16; - Imm16 = (Imm >> Shift) & Mask; - if (Imm16 == (isNeg ? Mask : 0)) - continue; // This 16-bit portion is already set correctly. - MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); - } - - transferImpOps(MI, MIB1, MIB2); + transferImpOps(MI, MIBS.front(), MIBS.back()); MI.eraseFromParent(); return true; } @@ -759,6 +340,64 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( return true; } +bool AArch64ExpandPseudo::expandSetTagLoop( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + Register SizeReg = MI.getOperand(2).getReg(); + Register AddressReg = MI.getOperand(3).getReg(); + + MachineFunction *MF = MBB.getParent(); + + bool ZeroData = MI.getOpcode() == AArch64::STZGloop; + const unsigned OpCode = + ZeroData ? 
AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex;
+
+  auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  MF->insert(++MBB.getIterator(), LoopBB);
+  MF->insert(++LoopBB->getIterator(), DoneBB);
+
+  BuildMI(LoopBB, DL, TII->get(OpCode))
+      .addDef(AddressReg)
+      .addReg(AddressReg)
+      .addReg(AddressReg)
+      .addImm(2)
+      .cloneMemRefs(MI)
+      .setMIFlags(MI.getFlags());
+  BuildMI(LoopBB, DL, TII->get(AArch64::SUBXri))
+      .addDef(SizeReg)
+      .addReg(SizeReg)
+      .addImm(16 * 2)
+      .addImm(0);
+  BuildMI(LoopBB, DL, TII->get(AArch64::CBNZX)).addUse(SizeReg).addMBB(LoopBB);
+
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(DoneBB);
+
+  DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+  DoneBB->transferSuccessors(&MBB);
+
+  MBB.addSuccessor(LoopBB);
+
+  NextMBBI = MBB.end();
+  MI.eraseFromParent();
+  // Recompute liveness bottom up.
+  LivePhysRegs LiveRegs;
+  computeAndAddLiveIns(LiveRegs, *DoneBB);
+  computeAndAddLiveIns(LiveRegs, *LoopBB);
+  // Do an extra pass in the loop to get the loop carried dependencies right.
+  // FIXME: is this necessary?
+  LoopBB->clearLiveIns();
+  computeAndAddLiveIns(LiveRegs, *LoopBB);
+  DoneBB->clearLiveIns();
+  computeAndAddLiveIns(LiveRegs, *DoneBB);
+
+  return true;
+}
+
 /// If MBBI references a pseudo instruction that should be expanded here,
 /// do the expansion and return true. Otherwise return false.
 bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -928,6 +567,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
         MF->getTarget().getCodeModel() == CodeModel::Kernel)
       SysReg = AArch64SysReg::TPIDR_EL1;
+    else if (MF->getSubtarget<AArch64Subtarget>().useEL3ForTP())
+      SysReg = AArch64SysReg::TPIDR_EL3;
+    else if (MF->getSubtarget<AArch64Subtarget>().useEL2ForTP())
+      SysReg = AArch64SysReg::TPIDR_EL2;
+    else if (MF->getSubtarget<AArch64Subtarget>().useEL1ForTP())
+      SysReg = AArch64SysReg::TPIDR_EL1;
     BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
         .addImm(SysReg);
     MI.eraseFromParent();
@@ -986,6 +631,46 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     MI.eraseFromParent();
     return true;
   }
+  case AArch64::IRGstack: {
+    MachineFunction &MF = *MBB.getParent();
+    const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+    const AArch64FrameLowering *TFI =
+        MF.getSubtarget<AArch64Subtarget>().getFrameLowering();
+
+    // IRG does not allow immediate offset. getTaggedBasePointerOffset should
+    // almost always point to SP-after-prologue; if not, emit a longer
+    // instruction sequence.
+    int BaseOffset = -AFI->getTaggedBasePointerOffset();
+    unsigned FrameReg;
+    int FrameRegOffset = TFI->resolveFrameOffsetReference(
+        MF, BaseOffset, false /*isFixed*/, FrameReg, /*PreferFP=*/false,
+        /*ForSimm=*/true);
+    Register SrcReg = FrameReg;
+    if (FrameRegOffset != 0) {
+      // Use output register as temporary.
+ SrcReg = MI.getOperand(0).getReg(); + emitFrameOffset(MBB, &MI, MI.getDebugLoc(), SrcReg, FrameReg, + FrameRegOffset, TII); + } + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::IRG)) + .add(MI.getOperand(0)) + .addUse(SrcReg) + .add(MI.getOperand(2)); + MI.eraseFromParent(); + return true; + } + case AArch64::TAGPstack: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDG)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(4)); + MI.eraseFromParent(); + return true; + } + case AArch64::STGloop: + case AArch64::STZGloop: + return expandSetTagLoop(MBB, MBBI, NextMBBI); } return false; } diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index bc9a5ca97fea..3b3182128c4c 100644 --- a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -1,9 +1,8 @@ //===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions @@ -213,8 +212,8 @@ private: struct LoadInfo { LoadInfo() = default; - unsigned DestReg = 0; - unsigned BaseReg = 0; + Register DestReg; + Register BaseReg; int BaseRegIdx = -1; const MachineOperand *OffsetOpnd = nullptr; bool IsPrePost = false; @@ -648,7 +647,7 @@ static Optional getLoadInfo(const MachineInstr &MI) { return None; LoadInfo LI; - LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg(); + LI.DestReg = DestRegIdx == -1 ? Register() : MI.getOperand(DestRegIdx).getReg(); LI.BaseReg = BaseReg; LI.BaseRegIdx = BaseRegIdx; LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx); diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 47550cabb9f0..8dc2768b9597 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -1,9 +1,8 @@ //===- AArch6464FastISel.cpp - AArch64 FastISel implementation ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -305,8 +304,6 @@ public: } // end anonymous namespace -#include "AArch64GenCallingConv.inc" - /// Check if the sign-/zero-extend will be a noop. static bool isIntExtFree(const Instruction *I) { assert((isa(I) || isa(I)) && @@ -408,10 +405,9 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { bool Is64Bit = (VT == MVT::f64); // This checks to see if we can use FMOV instructions to materialize // a constant, otherwise we have to materialize via the constant pool. - if (TLI.isFPImmLegal(Val, VT)) { - int Imm = - Is64Bit ? 
AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val); - assert((Imm != -1) && "Cannot encode floating-point constant."); + int Imm = + Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val); + if (Imm != -1) { unsigned Opc = Is64Bit ? AArch64::FMOVDi : AArch64::FMOVSi; return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); } @@ -2369,7 +2365,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { AArch64::sub_32); if ((BW < 32) && !IsBitTest) - SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true); + SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*isZExt=*/true); // Emit the combined compare and branch instruction. SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); @@ -3608,6 +3604,14 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) .addImm(1); return true; + case Intrinsic::debugtrap: { + if (Subtarget->isTargetWindows()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) + .addImm(0xF000); + return true; + } + break; + } case Intrinsic::sqrt: { Type *RetTy = II->getCalledFunction()->getReturnType(); @@ -4268,7 +4272,7 @@ unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, const TargetRegisterClass *RC = (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; if (NeedTrunc) { - Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false); + Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*isZExt=*/false); Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); Op0IsKill = Op1IsKill = true; } @@ -4948,7 +4952,7 @@ std::pair AArch64FastISel::getRegForGEPIndex(const Value *Idx) { MVT PtrVT = TLI.getPointerTy(DL); EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); if (IdxVT.bitsLT(PtrVT)) { - IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false); + IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*isZExt=*/false); IdxNIsKill = true; } else if (IdxVT.bitsGT(PtrVT)) llvm_unreachable("AArch64 FastISel doesn't support types larger than i64"); @@ -5172,10 +5176,6 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { return selectAtomicCmpXchg(cast(I)); } - // Silence warnings. - (void)&CC_AArch64_DarwinPCS_VarArg; - (void)&CC_AArch64_Win64_VarArg; - // fall-back to target-independent instruction selection. return selectOperator(I, I->getOpcode()); } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 538a8d7e8fbc..8c6e5cbd5c13 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1,9 +1,8 @@ //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -251,8 +250,7 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); uint64_t CalleePopAmount = IsDestroy ? 
I->getOperand(1).getImm() : 0; - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - if (!TFI->hasReservedCallFrame(MF)) { + if (!hasReservedCallFrame(MF)) { unsigned Align = getStackAlignment(); int64_t Amount = I->getOperand(0).getImm(); @@ -588,7 +586,7 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc, - bool NeedsWinCFI, bool InProlog = true) { + bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) { // Ignore instructions that do not operate on SP, i.e. shadow call stack // instructions and associated CFI instruction. while (MBBI->getOpcode() == AArch64::STRXpost || @@ -674,9 +672,11 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( MIB.setMemRefs(MBBI->memoperands()); // Generate a new SEH code that corresponds to the new instruction. - if (NeedsWinCFI) + if (NeedsWinCFI) { + *HasWinCFI = true; InsertSEH(*MIB, *TII, InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy); + } return std::prev(MBB.erase(MBBI)); } @@ -685,7 +685,8 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // combined SP bump by adding the local stack size to the stack offsets. static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, unsigned LocalStackSize, - bool NeedsWinCFI) { + bool NeedsWinCFI, + bool *HasWinCFI) { if (AArch64InstrInfo::isSEHInstruction(MI)) return; @@ -732,6 +733,7 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale); if (NeedsWinCFI) { + *HasWinCFI = true; auto MBBI = std::next(MachineBasicBlock::iterator(MI)); assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction"); assert(AArch64InstrInfo::isSEHInstruction(*MBBI) && @@ -803,7 +805,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, !MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool HasFP = hasFP(MF); bool NeedsWinCFI = needsWinCFI(MF); - MF.setHasWinCFI(NeedsWinCFI); + bool HasWinCFI = false; + auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); }); + bool IsFunclet = MBB.isEHFuncletEntry(); // At this point, we're going to decide whether or not the function uses a @@ -838,6 +842,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; + // Set tagged base pointer to the bottom of the stack frame. + // Ideally it should match SP value after prologue. + AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + // getStackSize() includes all the locals in its size calculation. We don't // include these locals when computing the stack size of a funclet, as they // are allocated in the parent's stack frame and accessed via the frame @@ -859,7 +867,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++NumRedZoneFunctions; } else { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI); + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); if (!NeedsWinCFI) { // Label used to tie together the PROLOG_LABEL and the MachineMoves. 
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); @@ -872,9 +880,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } } - if (NeedsWinCFI) + if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) .setMIFlag(MachineInstr::FrameSetup); + } return; } @@ -892,11 +902,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI); + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); NumBytes = 0; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( - MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI); + MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI); NumBytes -= PrologueSaveSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); @@ -908,7 +918,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) { if (CombineSPBump) fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), - NeedsWinCFI); + NeedsWinCFI, &HasWinCFI); ++MBBI; } @@ -916,9 +926,24 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // opcodes that we needed to emit. The FP and BP belong to the containing // function. if (IsFunclet) { - if (NeedsWinCFI) + if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) .setMIFlag(MachineInstr::FrameSetup); + } + + // SEH funclets are passed the frame pointer in X1. If the parent + // function uses the base register, then the base register is used + // directly, and is not retrieved from X1. + if (F.hasPersonalityFn()) { + EHPersonality Per = classifyEHPersonality(F.getPersonalityFn()); + if (isAsynchronousEHPersonality(Per)) { + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP) + .addReg(AArch64::X1).setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(AArch64::X1); + } + } + return; } @@ -934,12 +959,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Note: All stores of callee-saved registers are marked as "FrameSetup". // This code marks the instruction(s) that set the FP also. emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI); + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); } if (windowsRequiresStackProbe(MF, NumBytes)) { uint32_t NumWords = NumBytes >> 4; if (NeedsWinCFI) { + HasWinCFI = true; // alloc_l can hold at most 256MB, so assume that NumBytes doesn't // exceed this amount. We need to move at most 2^24 - 1 into x15. // This is at most two instructions, MOVZ follwed by MOVK. 
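For reference, the MOVZ/MOVK split mentioned in the stack-probe comment above can be checked with a minimal standalone sketch (it assumes a word count below 2^24, which the comment guarantees; the values are invented):

#include <cassert>
#include <cstdint>

// A value up to 2^24 - 1 fits in MOVZ plus one MOVK: MOVZ sets bits [15:0],
// MOVK patches bits [31:16] (here only [23:16] can be nonzero).
int main() {
  const uint32_t NumWords = 0x123456;            // < 2^24
  uint16_t MovzImm = NumWords & 0xFFFF;          // MOVZ x15, #0x3456
  uint16_t MovkImm = (NumWords >> 16) & 0xFFFF;  // MOVK x15, #0x12, lsl #16
  assert((((uint32_t)MovkImm << 16) | MovzImm) == NumWords);
  return 0;
}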
@@ -983,9 +1009,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead) .setMIFlags(MachineInstr::FrameSetup); - if (NeedsWinCFI) + if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); + } break; case CodeModel::Large: BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT)) @@ -993,9 +1021,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .addExternalSymbol("__chkstk") .addExternalSymbol("__chkstk") .setMIFlags(MachineInstr::FrameSetup); - if (NeedsWinCFI) + if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); + } BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR)) .addReg(AArch64::X16, RegState::Kill) @@ -1004,9 +1034,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead) .setMIFlags(MachineInstr::FrameSetup); - if (NeedsWinCFI) + if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); + } break; } @@ -1015,10 +1047,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .addReg(AArch64::X15, RegState::Kill) .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4)) .setMIFlags(MachineInstr::FrameSetup); - if (NeedsWinCFI) - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) - .addImm(NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + if (NeedsWinCFI) { + HasWinCFI = true; + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } NumBytes = 0; } @@ -1038,7 +1072,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI); + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); if (NeedsRealignment) { const unsigned Alignment = MFI.getMaxAlignment(); @@ -1061,10 +1095,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .addReg(scratchSPReg, RegState::Kill) .addImm(andMaskEncoded); AFI->setStackRealigned(true); - if (NeedsWinCFI) + if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) .addImm(NumBytes & andMaskEncoded) .setMIFlag(MachineInstr::FrameSetup); + } } } @@ -1078,16 +1114,19 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (RegInfo->hasBasePointer(MF)) { TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP, false); - if (NeedsWinCFI) + if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); + } } // The very last FrameSetup instruction indicates the end of prologue. Emit a // SEH opcode indicating the prologue end. 
- if (NeedsWinCFI) + if (NeedsWinCFI && HasWinCFI) { BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) .setMIFlag(MachineInstr::FrameSetup); + } if (needsFrameMoves) { const DataLayout &TD = MF.getDataLayout(); @@ -1231,7 +1270,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, DebugLoc DL; bool IsTailCallReturn = false; bool NeedsWinCFI = needsWinCFI(MF); + bool HasWinCFI = false; bool IsFunclet = false; + auto WinCFI = make_scope_exit([&]() { + if (!MF.hasWinCFI()) + MF.setHasWinCFI(HasWinCFI); + }); if (MBB.end() != MBBI) { DL = MBBI->getDebugLoc(); @@ -1326,7 +1370,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // If the offset is 0, convert it to a post-index ldp. if (OffsetOp.getImm() == 0) convertCalleeSaveRestoreToSPPrePostIncDec( - MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, false); + MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false); else { // If not, make sure to emit an add after the last ldp. // We're doing this by transfering the size to be restored from the @@ -1348,19 +1392,21 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, break; } else if (CombineSPBump) fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(), - NeedsWinCFI); + NeedsWinCFI, &HasWinCFI); } - if (NeedsWinCFI) + if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart)) .setMIFlag(MachineInstr::FrameDestroy); + } // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI); - if (NeedsWinCFI) + false, NeedsWinCFI, &HasWinCFI); + if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); @@ -1392,12 +1438,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, StackRestoreBytes, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI); + NeedsWinCFI, &HasWinCFI); if (Done) { - if (NeedsWinCFI) + if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); + } return; } @@ -1436,11 +1484,13 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI); + NeedsWinCFI, &HasWinCFI); } - if (NeedsWinCFI) + if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); + + MF.setHasWinCFI(HasWinCFI); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -1450,25 +1500,66 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { - return resolveFrameIndexReference(MF, FI, FrameReg); + return resolveFrameIndexReference( + MF, FI, FrameReg, + /*PreferFP=*/ + MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), + /*ForSimm=*/false); } -int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg, - bool PreferFP) const { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - const AArch64RegisterInfo *RegInfo = static_cast( 
- MF.getSubtarget().getRegisterInfo()); - const AArch64FunctionInfo *AFI = MF.getInfo(); - const AArch64Subtarget &Subtarget = MF.getSubtarget(); +int AArch64FrameLowering::getNonLocalFrameIndexReference( + const MachineFunction &MF, int FI) const { + return getSEHFrameIndexOffset(MF, FI); +} + +static int getFPOffset(const MachineFunction &MF, int ObjectOffset) { + const auto *AFI = MF.getInfo(); + const auto &Subtarget = MF.getSubtarget(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; - int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16; - int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize(); + return ObjectOffset + FixedObject + 16; +} + +static int getStackOffset(const MachineFunction &MF, int ObjectOffset) { + const auto &MFI = MF.getFrameInfo(); + return ObjectOffset + MFI.getStackSize(); +} + +int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const auto *RegInfo = static_cast( + MF.getSubtarget().getRegisterInfo()); + int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); + return RegInfo->getLocalAddressRegister(MF) == AArch64::FP + ? getFPOffset(MF, ObjectOffset) + : getStackOffset(MF, ObjectOffset); +} + +int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, + bool PreferFP, + bool ForSimm) const { + const auto &MFI = MF.getFrameInfo(); + int ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - bool isCSR = !isFixed && MFI.getObjectOffset(FI) >= - -((int)AFI->getCalleeSavedStackSize()); + return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, FrameReg, + PreferFP, ForSimm); +} + +int AArch64FrameLowering::resolveFrameOffsetReference( + const MachineFunction &MF, int ObjectOffset, bool isFixed, + unsigned &FrameReg, bool PreferFP, bool ForSimm) const { + const auto &MFI = MF.getFrameInfo(); + const auto *RegInfo = static_cast( + MF.getSubtarget().getRegisterInfo()); + const auto *AFI = MF.getInfo(); + const auto &Subtarget = MF.getSubtarget(); + + int FPOffset = getFPOffset(MF, ObjectOffset); + int Offset = getStackOffset(MF, ObjectOffset); + bool isCSR = + !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize()); // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't @@ -1489,11 +1580,11 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, assert(hasFP(MF) && "Re-aligned stack must have frame pointer"); UseFP = true; } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) { - // If the FPOffset is negative, we have to keep in mind that the - // available offset range for negative offsets is smaller than for - // positive ones. If an offset is - // available via the FP and the SP, use whichever is closest. - bool FPOffsetFits = FPOffset >= -256; + // If the FPOffset is negative and we're producing a signed immediate, we + // have to keep in mind that the available offset range for negative + // offsets is smaller than for positive ones. If an offset is available + // via the FP and the SP, use whichever is closest. 
+ bool FPOffsetFits = !ForSimm || FPOffset >= -256; PreferFP |= Offset > -FPOffset; if (MFI.hasVarSizedObjects()) { @@ -1517,6 +1608,7 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, // Funclets access the locals contained in the parent's stack frame // via the frame pointer, so we have to use the FP in the parent // function. + (void) Subtarget; assert( Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) && "Funclets should only be present on Win64"); @@ -1759,8 +1851,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( static_cast(unsigned(dwarf::DW_OP_breg18)), static_cast(-8) & 0x7f, // addend (sleb128) }; - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createEscape(nullptr, CFIInst)); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape( + nullptr, StringRef(CFIInst, sizeof(CFIInst)))); BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlag(MachineInstr::FrameSetup); @@ -2104,9 +2196,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) ++MBBI; - if (MBBI->isTerminator()) - return; - // Create an UnwindHelp object. int UnwindHelpFI = MFI.CreateStackObject(/*size*/8, /*alignment*/16, false); @@ -2114,8 +2203,10 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( // We need to store -2 into the UnwindHelp object at the start of the // function. DebugLoc DL; - RS->enterBasicBlock(MBB); - unsigned DstReg = RS->scavengeRegister(&AArch64::GPR64RegClass, MBBI, 0); + RS->enterBasicBlockEnd(MBB); + RS->backward(std::prev(MBBI)); + unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass); + assert(DstReg && "There must be a free register after frame setup"); BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2); BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi)) .addReg(DstReg, getKillRegState(true)) diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 0d0385acf46e..6dbd34b2189f 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -1,9 +1,8 @@ //==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -41,8 +40,11 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; int resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, - bool PreferFP = false) const; + unsigned &FrameReg, bool PreferFP, + bool ForSimm) const; + int resolveFrameOffsetReference(const MachineFunction &MF, int ObjectOffset, + bool isFixed, unsigned &FrameReg, + bool PreferFP, bool ForSimm) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, @@ -79,6 +81,9 @@ public: int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const override; + int getNonLocalFrameIndexReference(const MachineFunction &MF, + int FI) const override; + int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const; private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index 37720cbd32bb..528756b34856 100644 --- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -1,9 +1,8 @@ //===- AArch64GenRegisterBankInfo.def ----------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -111,6 +110,10 @@ RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{ // 47: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + // 49: Shift scalar with 64 bit shift imm + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, }; bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx, diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index fc9855f6a0da..cd7e927ac80c 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -53,7 +52,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - ForCodeSize = MF.getFunction().optForSize(); + ForCodeSize = MF.getFunction().hasOptSize(); Subtarget = &MF.getSubtarget(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -92,6 +91,12 @@ public: bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) { return SelectAddrModeIndexed7S(N, 16, Base, OffImm); } + bool SelectAddrModeIndexedS9S128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexedBitWidth(N, true, 9, 16, Base, OffImm); + } + bool SelectAddrModeIndexedU6S128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexedBitWidth(N, false, 6, 16, Base, OffImm); + } bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { return SelectAddrModeIndexed(N, 1, Base, OffImm); } @@ -152,6 +157,9 @@ public: bool tryIndexedLoad(SDNode *N); + bool trySelectStackSlotTagP(SDNode *N); + void SelectTagP(SDNode *N); + void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, unsigned SubRegIdx); void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, @@ -180,7 +188,12 @@ private: bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, SDValue &Shift); bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base, - SDValue &OffImm); + SDValue &OffImm) { + return SelectAddrModeIndexedBitWidth(N, true, 7, Size, Base, OffImm); + } + bool SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, unsigned BW, + unsigned Size, SDValue &Base, + SDValue &OffImm); bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, SDValue &OffImm); bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, @@ -676,12 +689,13 @@ static bool isWorthFoldingADDlow(SDValue N) { return true; } -/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit +/// SelectAddrModeIndexedBitWidth - Select a "register plus scaled (un)signed BW-bit /// immediate" address. The "Size" argument is the size in bytes of the memory /// reference, which determines the scale. -bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size, - SDValue &Base, - SDValue &OffImm) { +bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, + unsigned BW, unsigned Size, + SDValue &Base, + SDValue &OffImm) { SDLoc dl(N); const DataLayout &DL = CurDAG->getDataLayout(); const TargetLowering *TLI = getTargetLowering(); @@ -692,26 +706,43 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size, return true; } - // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed + // As opposed to the (12-bit) Indexed addressing mode below, the 7/9-bit signed // selected here doesn't support labels/immediates, only base+offset. 
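A quick standalone check of the scaled offset ranges the addressing-mode code below accepts; the numbers assume BW = 9 and 16-byte accesses, matching SelectAddrModeIndexedS9S128 above:

#include <cstdint>

// BW bits of signed immediate, scaled by the access size: with Size = 16
// (Scale = 4), a 9-bit signed offset covers [-4096, 4080] in steps of 16.
int main() {
  const unsigned BW = 9, Scale = 4;          // Scale = Log2_32(16)
  int64_t Range = 1LL << (BW - 1);           // 256
  int64_t Lo = -(Range << Scale);            // -4096
  int64_t Hi = (Range << Scale) - (1 << Scale); // 4080, last aligned offset
  return (Lo == -4096 && Hi == 4080) ? 0 : 1;
}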
-
   if (CurDAG->isBaseWithConstantOffset(N)) {
     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-      int64_t RHSC = RHS->getSExtValue();
-      unsigned Scale = Log2_32(Size);
-      if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) &&
-          RHSC < (0x40 << Scale)) {
-        Base = N.getOperand(0);
-        if (Base.getOpcode() == ISD::FrameIndex) {
-          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+      if (IsSignedImm) {
+        int64_t RHSC = RHS->getSExtValue();
+        unsigned Scale = Log2_32(Size);
+        int64_t Range = 0x1LL << (BW - 1);
+
+        if ((RHSC & (Size - 1)) == 0 && RHSC >= -(Range << Scale) &&
+            RHSC < (Range << Scale)) {
+          Base = N.getOperand(0);
+          if (Base.getOpcode() == ISD::FrameIndex) {
+            int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+            Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+          }
+          OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+          return true;
+        }
+      } else {
+        // unsigned Immediate
+        uint64_t RHSC = RHS->getZExtValue();
+        unsigned Scale = Log2_32(Size);
+        uint64_t Range = 0x1ULL << BW;
+
+        if ((RHSC & (Size - 1)) == 0 && RHSC < (Range << Scale)) {
+          Base = N.getOperand(0);
+          if (Base.getOpcode() == ISD::FrameIndex) {
+            int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+            Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+          }
+          OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+          return true;
         }
-        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
-        return true;
       }
     }
   }
-
   // Base only. The address will be materialized into a register before
   // the memory is accessed.
   //    add x0, Xbase, #offset
@@ -2650,6 +2681,14 @@ bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
     return true;
   }
 
+  if (RegString->getString() == "pc") {
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       AArch64::ADR, DL, N->getSimpleValueType(0), MVT::Other,
+                       CurDAG->getTargetConstant(0, DL, MVT::i32),
+                       N->getOperand(0)));
+    return true;
+  }
+
   return false;
 }
 
@@ -2754,6 +2793,58 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
   return true;
 }
 
+bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
+  // tagp(FrameIndex, IRGstack, tag_offset):
+  // since the offset between FrameIndex and IRGstack is a compile-time
+  // constant, this can be lowered to a single ADDG instruction.
+  if (!(isa<FrameIndexSDNode>(N->getOperand(1)))) {
+    return false;
+  }
+
+  SDValue IRG_SP = N->getOperand(2);
+  if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
+      cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
+          Intrinsic::aarch64_irg_sp) {
+    return false;
+  }
+
+  const TargetLowering *TLI = getTargetLowering();
+  SDLoc DL(N);
+  int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
+  SDValue FiOp = CurDAG->getTargetFrameIndex(
+      FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+  int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+
+  SDNode *Out = CurDAG->getMachineNode(
+      AArch64::TAGPstack, DL, MVT::i64,
+      {FiOp, CurDAG->getTargetConstant(0, DL, MVT::i64), N->getOperand(2),
+       CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
+  ReplaceNode(N, Out);
+  return true;
+}
+
+void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
+  assert(isa<ConstantSDNode>(N->getOperand(3)) &&
+         "llvm.aarch64.tagp third argument must be an immediate");
+  if (trySelectStackSlotTagP(N))
+    return;
+  // FIXME: above applies in any case when offset between Op1 and Op2 is a
+  // compile-time constant, not just for stack allocations.
+
+  // General case for unrelated pointers in Op1 and Op2.
+ SDLoc DL(N); + int TagOffset = cast(N->getOperand(3))->getZExtValue(); + SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64, + {N->getOperand(1), N->getOperand(2)}); + SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64, + {SDValue(N1, 0), N->getOperand(2)}); + SDNode *N3 = CurDAG->getMachineNode( + AArch64::ADDG, DL, MVT::i64, + {SDValue(N2, 0), CurDAG->getTargetConstant(0, DL, MVT::i64), + CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)}); + ReplaceNode(N, N3); +} + void AArch64DAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we already have selected! if (Node->isMachineOpcode()) { @@ -3247,6 +3338,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { switch (IntNo) { default: break; + case Intrinsic::aarch64_tagp: + SelectTagP(Node); + return; case Intrinsic::aarch64_neon_tbl2: SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two, diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index e01ca14d7f63..7becc99fb5c7 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1,9 +1,8 @@ //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "AArch64ExpandImm.h" #include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" @@ -55,9 +55,11 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" @@ -87,6 +89,7 @@ #include using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-lower" @@ -454,6 +457,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAXNUM, Ty, Legal); setOperationAction(ISD::FMINIMUM, Ty, Legal); setOperationAction(ISD::FMAXIMUM, Ty, Legal); + setOperationAction(ISD::LROUND, Ty, Legal); + setOperationAction(ISD::LLROUND, Ty, Legal); + setOperationAction(ISD::LRINT, Ty, Legal); + setOperationAction(ISD::LLRINT, Ty, Legal); } if (Subtarget->hasFullFP16()) { @@ -544,9 +551,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Trap. setOperationAction(ISD::TRAP, MVT::Other, Legal); + if (Subtarget->isTargetWindows()) + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // We combine OR nodes for bitfield operations. setTargetDAGCombine(ISD::OR); + // Try to create BICs for vector ANDs. + setTargetDAGCombine(ISD::AND); // Vector add and sub nodes may conceal a high-half opportunity. // Also, try to fold ADD into CSINC/CSINV.. 
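A rough integer model of what the SUBP/ADD sequence in SelectTagP above computes. This is only a sketch: it ignores SUBP's sign-extension from bit 55 and does not model the final ADDG tag adjustment; the constants are invented:

#include <cassert>
#include <cstdint>

// The address bits of Op1 are rebuilt relative to Op2, so the intermediate
// result carries Op1's address with Op2's tag (bits [59:56] here); ADDG then
// adjusts that tag by the immediate offset.
int main() {
  uint64_t Op1 = 0x0A00000000001000ULL; // tag 0x0A
  uint64_t Op2 = 0x0B00000000002000ULL; // tag 0x0B
  const uint64_t AddrMask = 0x00FFFFFFFFFFFFFFULL;
  uint64_t Diff = (Op1 & AddrMask) - (Op2 & AddrMask); // SUBP (modeled)
  uint64_t Result = Op2 + Diff;                        // ADDXrr
  assert((Result & AddrMask) == (Op1 & AddrMask));     // Op1's address bits
  assert((Result >> 56) == (Op2 >> 56));               // Op2's tag bits
  return 0;
}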
@@ -608,9 +619,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPrefLoopAlignment(STI.getPrefLoopAlignment()); // Only change the limit for entries in a jump table if specified by - // the subtarget, but not at the command line. + // the sub target, but not at the command line. unsigned MaxJT = STI.getMaximumJumpTableSize(); - if (MaxJT && getMaximumJumpTableSize() == 0) + if (MaxJT && getMaximumJumpTableSize() == UINT_MAX) setMaximumJumpTableSize(MaxJT); setHasExtractBitsInsn(true); @@ -658,14 +669,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // elements smaller than i32, so promote the input to i32 first. setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); - setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); - setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); - // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16 - // -> v8f16 conversions. + // i8 vector elements also need promotion to i32 for v8i8 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32); - setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); - setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); // Similarly, there is no direct i32 -> f64 vector conversion instruction. setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); @@ -676,18 +682,23 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); + } else { + // when AArch64 doesn't have fullfp16 support, promote the input + // to i32 first. + setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); + setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); + setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); + } + setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); - setOperationAction(ISD::CTTZ, MVT::v2i8, Expand); - setOperationAction(ISD::CTTZ, MVT::v4i16, Expand); - setOperationAction(ISD::CTTZ, MVT::v2i32, Expand); - setOperationAction(ISD::CTTZ, MVT::v1i64, Expand); - setOperationAction(ISD::CTTZ, MVT::v16i8, Expand); - setOperationAction(ISD::CTTZ, MVT::v8i16, Expand); - setOperationAction(ISD::CTTZ, MVT::v4i32, Expand); - setOperationAction(ISD::CTTZ, MVT::v2i64, Expand); - // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); // Custom handling for some quad-vector types to detect MULL. 
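The jump-table change in the hunk above swaps the "no limit set" sentinel from 0 to UINT_MAX. A toy model of that decision (not the LLVM API; names are invented):

#include <climits>

// A subtarget-provided maximum only applies when the user has not already
// set a limit on the command line, i.e. while the value is still "unset".
static unsigned MaxJumpTableSize = UINT_MAX; // "unset"

void applySubtargetLimit(unsigned SubtargetMax) {
  if (SubtargetMax && MaxJumpTableSize == UINT_MAX)
    MaxJumpTableSize = SubtargetMax;
}

int main() {
  applySubtargetLimit(32);
  return MaxJumpTableSize == 32 ? 0 : 1;
}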
@@ -696,14 +707,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v2i64, Custom); // Vector reductions - for (MVT VT : MVT::integer_valuetypes()) { + for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, + MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); } - for (MVT VT : MVT::fp_valuetypes()) { + for (MVT VT : { MVT::v4f16, MVT::v2f32, + MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); } @@ -726,6 +739,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -745,6 +759,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FROUND, Ty, Legal); } + if (Subtarget->hasFullFP16()) { + for (MVT Ty : {MVT::v4f16, MVT::v8f16}) { + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } + } + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } @@ -783,7 +808,6 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::AND, VT, Custom); setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); @@ -1052,10 +1076,9 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, return MVT::i64; } -bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - unsigned Align, - bool *Fast) const { +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { if (Subtarget->requiresStrictAlign()) return false; @@ -1211,6 +1234,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS"; case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS"; + case AArch64ISD::STG: return "AArch64ISD::STG"; + case AArch64ISD::STZG: return "AArch64ISD::STZG"; + case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; + case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; } return nullptr; } @@ -2326,7 +2353,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SDLoc(Op)).first; } -static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { +SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, + SelectionDAG &DAG) const { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. 
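Restricting the two VECREDUCE loops above to explicit type lists is deliberate: the old loops ranged over every integer and floating-point MVT, including scalar and illegal types, while the new lists name exactly the 64- and 128-bit NEON vector types the target can select. The semantics being custom-lowered is a horizontal reduction across lanes; a scalar sketch of VECREDUCE_ADD for orientation (the real lowering selects a single across-lanes ADDV-style instruction, not a loop):

#include <cstdint>

// Semantics of ISD::VECREDUCE_ADD over an N-lane integer vector.
int32_t vecreduceAdd(const int32_t *Lanes, unsigned NumLanes) {
  int32_t Acc = 0;
  for (unsigned I = 0; I != NumLanes; ++I)
    Acc += Lanes[I];
  return Acc;
}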
@@ -2334,8 +2362,9 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); unsigned NumElts = InVT.getVectorNumElements(); - // f16 vectors are promoted to f32 before a conversion. - if (InVT.getVectorElementType() == MVT::f16) { + // f16 conversions are promoted to f32 when full fp16 is not supported. + if (InVT.getVectorElementType() == MVT::f16 && + !Subtarget->hasFullFP16()) { MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); SDLoc dl(Op); return DAG.getNode( @@ -2743,6 +2772,28 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_neon_umin: return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::localaddress: { + const auto &MF = DAG.getMachineFunction(); + const auto *RegInfo = Subtarget->getRegisterInfo(); + unsigned Reg = RegInfo->getLocalAddressRegister(MF); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, + Op.getSimpleValueType()); + } + + case Intrinsic::eh_recoverfp: { + // FIXME: This needs to be implemented to correctly handle highly aligned + // stack objects. For now we simply return the incoming FP. Refer to D53541 + // for more details. + SDValue FnOp = Op.getOperand(1); + SDValue IncomingFPOp = Op.getOperand(2); + GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); + auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); + if (!Fn) + report_fatal_error( + "llvm.eh.recoverfp must take a function as the first argument"); + return IncomingFPOp; + } } } @@ -2797,7 +2848,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, unsigned AS = StoreNode->getAddressSpace(); unsigned Align = StoreNode->getAlignment(); if (Align < MemVT.getStoreSize() && - !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) { + !allowsMisalignedMemoryAccesses( + MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) { return scalarizeVectorStore(StoreNode, DAG); } @@ -2900,8 +2952,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerCTPOP(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); - case ISD::AND: - return LowerVectorAND(Op, DAG); case ISD::OR: return LowerVectorOR(Op, DAG); case ISD::XOR: @@ -2945,8 +2995,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, // Calling Convention Implementation //===----------------------------------------------------------------------===// -#include "AArch64GenCallingConv.inc" - /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { @@ -3167,6 +3215,32 @@ SDValue AArch64TargetLowering::LowerFormalArguments( FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_AArch64_AAPCS); + + // Conservatively forward X8, since it might be used for aggregate return. + if (!CCInfo.isAllocated(AArch64::X8)) { + unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); + Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); + } + } + } + + // On Windows, InReg pointers must be returned, so record the pointer in a + // virtual register at the start of the function so it can be returned in the + // epilogue.
+ if (IsWin64) { + for (unsigned I = 0, E = Ins.size(); I != E; ++I) { + if (Ins[I].Flags.isInReg()) { + assert(!FuncInfo->getSRetReturnReg()); + + MVT PtrTy = getPointerTy(DAG.getDataLayout()); + unsigned Reg = + MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + FuncInfo->setSRetReturnReg(Reg); + + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); + break; + } } } @@ -3365,10 +3439,20 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // X86) but less efficient and uglier in LowerCall. for (Function::const_arg_iterator i = CallerF.arg_begin(), e = CallerF.arg_end(); - i != e; ++i) + i != e; ++i) { if (i->hasByValAttr()) return false; + // On Windows, "inreg" attributes signify non-aggregate indirect returns. + // In this case, it is necessary to save/restore X0 in the callee. Tail + // call opt interferes with this. So we disable tail call opt when the + // caller has an argument with "inreg" attribute. + + // FIXME: Check whether the callee also has an "inreg" argument. + if (i->hasInRegAttr()) + return false; + } + if (getTargetMachine().Options.GuaranteedTailCallOpt) return canGuaranteeTCO(CalleeCC) && CCMatch; @@ -3886,6 +3970,9 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { + auto &MF = DAG.getMachineFunction(); + auto *FuncInfo = MF.getInfo(); + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; @@ -3924,6 +4011,23 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + + // Windows AArch64 ABIs require that for returning structs by value we copy + // the sret argument into X0 for the return. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into X0. + if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { + SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg, + getPointerTy(MF.getDataLayout())); + + unsigned RetValReg = AArch64::X0; + Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag); + Flag = Chain.getValue(1); + + RetOps.push_back( + DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); + } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); @@ -5197,50 +5301,20 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, return DAG.getFrameIndex(FI, VT); } +#define GET_REGISTER_MATCHER +#include "AArch64GenAsmMatcher.inc" + // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { - unsigned Reg = StringSwitch(RegName) - .Case("sp", AArch64::SP) - .Case("x1", AArch64::X1) - .Case("w1", AArch64::W1) - .Case("x2", AArch64::X2) - .Case("w2", AArch64::W2) - .Case("x3", AArch64::X3) - .Case("w3", AArch64::W3) - .Case("x4", AArch64::X4) - .Case("w4", AArch64::W4) - .Case("x5", AArch64::X5) - .Case("w5", AArch64::W5) - .Case("x6", AArch64::X6) - .Case("w6", AArch64::W6) - .Case("x7", AArch64::X7) - .Case("w7", AArch64::W7) - .Case("x18", AArch64::X18) - .Case("w18", AArch64::W18) - .Case("x20", AArch64::X20) - .Case("w20", AArch64::W20) - .Default(0); - if (((Reg == AArch64::X1 || Reg == AArch64::W1) && - !Subtarget->isXRegisterReserved(1)) || - ((Reg == AArch64::X2 || Reg == AArch64::W2) && - !Subtarget->isXRegisterReserved(2)) || - ((Reg == AArch64::X3 || Reg == AArch64::W3) && - !Subtarget->isXRegisterReserved(3)) || - ((Reg == AArch64::X4 || Reg == AArch64::W4) && - !Subtarget->isXRegisterReserved(4)) || - ((Reg == AArch64::X5 || Reg == AArch64::W5) && - !Subtarget->isXRegisterReserved(5)) || - ((Reg == AArch64::X6 || Reg == AArch64::W6) && - !Subtarget->isXRegisterReserved(6)) || - ((Reg == AArch64::X7 || Reg == AArch64::W7) && - !Subtarget->isXRegisterReserved(7)) || - ((Reg == AArch64::X18 || Reg == AArch64::W18) && - !Subtarget->isXRegisterReserved(18)) || - ((Reg == AArch64::X20 || Reg == AArch64::W20) && - !Subtarget->isXRegisterReserved(20))) - Reg = 0; + unsigned Reg = MatchRegisterName(RegName); + if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { + const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); + unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); + if (!Subtarget->isXRegisterReserved(DwarfRegNum)) + Reg = 0; + } if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" @@ -5398,35 +5472,41 @@ bool AArch64TargetLowering::isOffsetFoldingLegal( return false; } -bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. - // FIXME: We should be able to handle f128 as well with a clever lowering. - if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 || - (VT == MVT::f16 && Subtarget->hasFullFP16()))) { - LLVM_DEBUG(dbgs() << "Legal " << VT.getEVTString() << " imm value: 0\n"); - return true; - } - +bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool OptForSize) const { bool IsLegal = false; - SmallString<128> ImmStrVal; - Imm.toString(ImmStrVal); - + // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and + // 16-bit case when target has full fp16 support. + // FIXME: We should be able to handle f128 as well with a clever lowering. 
+ const APInt ImmInt = Imm.bitcastToAPInt(); if (VT == MVT::f64) - IsLegal = AArch64_AM::getFP64Imm(Imm) != -1; + IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero(); else if (VT == MVT::f32) - IsLegal = AArch64_AM::getFP32Imm(Imm) != -1; + IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero(); else if (VT == MVT::f16 && Subtarget->hasFullFP16()) - IsLegal = AArch64_AM::getFP16Imm(Imm) != -1; - - if (IsLegal) { - LLVM_DEBUG(dbgs() << "Legal " << VT.getEVTString() - << " imm value: " << ImmStrVal << "\n"); - return true; - } - - LLVM_DEBUG(dbgs() << "Illegal " << VT.getEVTString() - << " imm value: " << ImmStrVal << "\n"); - return false; + IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero(); + // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to + // generate that fmov. + + // If we cannot materialize the value in the immediate field for fmov, check + // if it can be encoded as the immediate operand of a logical instruction. + // The immediate value will be created with either MOVZ, MOVN, or ORR. + if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) { + // The cost is actually exactly the same for mov+fmov vs. adrp+ldr; + // however the mov+fmov sequence is always better because of the reduced + // cache pressure. The timings are still the same if you consider + // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the + // movw+movk is fused). So we limit up to 2 instructions at most. + SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; + AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), + Insn); + unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2)); + IsLegal = Insn.size() <= Limit; + } + + LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString() << " imm value: "; Imm.dump();); + return IsLegal; } //===----------------------------------------------------------------------===// @@ -6226,6 +6306,8 @@ static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); + if (NumElts % 2 != 0) + return false; WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || @@ -6240,6 +6322,8 @@ static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); + if (NumElts % 2 != 0) + return false; WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { @@ -6276,6 +6360,8 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); + if (NumElts % 2 != 0) + return false; WhichResult = (M[0] == 0 ?
0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || @@ -6918,46 +7004,6 @@ static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, return SDValue(); } -SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, - SelectionDAG &DAG) const { - SDValue LHS = Op.getOperand(0); - EVT VT = Op.getValueType(); - - BuildVectorSDNode *BVN = - dyn_cast(Op.getOperand(1).getNode()); - if (!BVN) { - // AND commutes, so try swapping the operands. - LHS = Op.getOperand(1); - BVN = dyn_cast(Op.getOperand(0).getNode()); - } - if (!BVN) - return Op; - - APInt DefBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, DefBits, UndefBits)) { - SDValue NewOp; - - // We only have BIC vector immediate instruction, which is and-not. - DefBits = ~DefBits; - if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG, - DefBits, &LHS)) || - (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG, - DefBits, &LHS))) - return NewOp; - - UndefBits = ~UndefBits; - if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG, - UndefBits, &LHS)) || - (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG, - UndefBits, &LHS))) - return NewOp; - } - - // We can always fall back to a non-immediate AND. - return Op; -} - // Specialized code to quickly find if PotentialBVec is a BuildVector that // consists of only the same constant int value, returned in reference arg // ConstVal @@ -7799,8 +7845,8 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, // Make v4f16 (only) fcmp operations utilise vector instructions // v8f16 support will be a litle more complicated - if (LHS.getValueType().getVectorElementType() == MVT::f16) { - if (!FullFP16 && LHS.getValueType().getVectorNumElements() == 4) { + if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) { + if (LHS.getValueType().getVectorNumElements() == 4) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS); SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC); @@ -7810,8 +7856,8 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return SDValue(); } - assert(LHS.getValueType().getVectorElementType() == MVT::f32 || - LHS.getValueType().getVectorElementType() == MVT::f64); + assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) || + LHS.getValueType().getVectorElementType() != MVT::f128); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. @@ -8255,6 +8301,110 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { return true; } +/// Check if both Op1 and Op2 are shufflevector extracts of either the lower +/// or upper half of the vector elements. 
+static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { + auto areTypesHalfed = [](Value *FullV, Value *HalfV) { + auto *FullVT = cast<VectorType>(FullV->getType()); + auto *HalfVT = cast<VectorType>(HalfV->getType()); + return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth(); + }; + + auto extractHalf = [](Value *FullV, Value *HalfV) { + auto *FullVT = cast<VectorType>(FullV->getType()); + auto *HalfVT = cast<VectorType>(HalfV->getType()); + return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); + }; + + Constant *M1, *M2; + Value *S1Op1, *S2Op1; + if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) || + !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2)))) + return false; + + // Check that the operands are half as wide as the result and we extract + // half of the elements of the input vectors. + if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) || + !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2)) + return false; + + // Check the mask extracts either the lower or upper half of vector + // elements. + int M1Start = -1; + int M2Start = -1; + int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2; + if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) || + !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) || + M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2))) + return false; + + return true; +} + +/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth +/// of the vector elements. +static bool areExtractExts(Value *Ext1, Value *Ext2) { + auto areExtDoubled = [](Instruction *Ext) { + return Ext->getType()->getScalarSizeInBits() == + 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); + }; + + if (!match(Ext1, m_ZExtOrSExt(m_Value())) || + !match(Ext2, m_ZExtOrSExt(m_Value())) || + !areExtDoubled(cast<Instruction>(Ext1)) || + !areExtDoubled(cast<Instruction>(Ext2))) + return false; + + return true; +} + +/// Check if sinking \p I's operands to I's basic block is profitable, because +/// the operands can be folded into a target instruction, e.g. +/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). +bool AArch64TargetLowering::shouldSinkOperands( + Instruction *I, SmallVectorImpl<Use *> &Ops) const { + if (!I->getType()->isVectorTy()) + return false; + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::aarch64_neon_umull: + if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) + return false; + Ops.push_back(&II->getOperandUse(0)); + Ops.push_back(&II->getOperandUse(1)); + return true; + default: + return false; + } + } + + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + + // If the exts' operands extract either the lower or upper elements, we + // can sink them too. + auto Ext1 = cast<Instruction>(I->getOperand(0)); + auto Ext2 = cast<Instruction>(I->getOperand(1)); + if (areExtractShuffleVectors(Ext1, Ext2)) { + Ops.push_back(&Ext1->getOperandUse(0)); + Ops.push_back(&Ext2->getOperandUse(0)); + } + + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + + return true; + } + default: + return false; + } + return false; +} + bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const { if (!LoadedType.isSimple() || @@ -8377,8 +8527,9 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) - BaseAddr = Builder.CreateConstGEP1_32( - BaseAddr, VecTy->getVectorNumElements() * Factor); + BaseAddr = + Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, + VecTy->getVectorNumElements() * Factor); CallInst *LdN = Builder.CreateCall( LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); @@ -8540,7 +8691,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // If we're generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); + BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), + BaseAddr, LaneLen * Factor); Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); Builder.CreateCall(StNFunc, Ops); @@ -8554,13 +8706,12 @@ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, (DstAlign == 0 || DstAlign % AlignCheck == 0)); } -EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { - const Function &F = MF.getFunction(); - bool CanImplicitFloat = !F.hasFnAttribute(Attribute::NoImplicitFloat); +EVT AArch64TargetLowering::getOptimalMemOpType( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + bool CanImplicitFloat = + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; // Only use AdvSIMD to implement memset of 32-byte and above.
It would have @@ -8571,7 +8722,9 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) return true; bool Fast; - return allowsMisalignedMemoryAccesses(VT, 0, 1, &Fast) && Fast; + return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, + &Fast) && + Fast; }; if (CanUseNEON && IsMemset && !IsSmallMemset && @@ -9061,6 +9214,9 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, if (!Subtarget->hasNEON()) return SDValue(); + if (!N->getValueType(0).isSimple()) + return SDValue(); + SDValue Op = N->getOperand(0); if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) @@ -9323,6 +9479,46 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +static SDValue performANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + EVT VT = N->getValueType(0); + if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + BuildVectorSDNode *BVN = + dyn_cast(N->getOperand(1).getNode()); + if (!BVN) + return SDValue(); + + // AND does not accept an immediate, so check if we can use a BIC immediate + // instruction instead. We do this here instead of using a (and x, (mvni imm)) + // pattern in isel, because some immediates may be lowered to the preferred + // (and x, (movi imm)) form, even though an mvni representation also exists. + APInt DefBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, DefBits, UndefBits)) { + SDValue NewOp; + + DefBits = ~DefBits; + if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, + DefBits, &LHS)) || + (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, + DefBits, &LHS))) + return NewOp; + + UndefBits = ~UndefBits; + if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, + UndefBits, &LHS)) || + (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, + UndefBits, &LHS))) + return NewOp; + } + + return SDValue(); +} + static SDValue performSRLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; @@ -9598,12 +9794,13 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { DAG.getConstant(NumElems, dl, MVT::i64)); } -static bool isEssentiallyExtractSubvector(SDValue N) { - if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) - return true; - - return N.getOpcode() == ISD::BITCAST && - N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; +static bool isEssentiallyExtractHighSubvector(SDValue N) { + if (N.getOpcode() == ISD::BITCAST) + N = N.getOperand(0); + if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return false; + return cast(N.getOperand(1))->getAPIntValue() == + N.getOperand(0).getValueType().getVectorNumElements() / 2; } /// Helper structure to keep track of ISD::SET_CC operands. @@ -9770,13 +9967,13 @@ static SDValue performAddSubLongCombine(SDNode *N, // It's not worth doing if at least one of the inputs isn't already an // extract, but we don't know which it'll be so we have to try both. 
- if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { + if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) { RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); if (!RHS.getNode()) return SDValue(); RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); - } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { + } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) { LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); if (!LHS.getNode()) return SDValue(); @@ -9809,11 +10006,11 @@ static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, // Either node could be a DUP, but it's not worth doing both of them (you'd // just as well use the non-high version) so look for a corresponding extract // operation on the other "wing". - if (isEssentiallyExtractSubvector(LHS)) { + if (isEssentiallyExtractHighSubvector(LHS)) { RHS = tryExtendDUPToExtractHigh(RHS, DAG); if (!RHS.getNode()) return SDValue(); - } else if (isEssentiallyExtractSubvector(RHS)) { + } else if (isEssentiallyExtractHighSubvector(RHS)) { LHS = tryExtendDUPToExtractHigh(LHS, DAG); if (!LHS.getNode()) return SDValue(); @@ -10261,7 +10458,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); // Don't split at -Oz. - if (DAG.getMachineFunction().getFunction().optForMinSize()) + if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); // Don't split v2i64 vectors. Memcpy lowering produces those and splitting @@ -10917,6 +11114,12 @@ static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } + // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. + if (Op->getOpcode() == ISD::ANY_EXTEND && + Bit < Op->getOperand(0).getValueSizeInBits()) { + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + if (Op->getNumOperands() != 2) return Op; @@ -11172,6 +11375,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performFDivCombine(N, DAG, DCI, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); + case ISD::AND: + return performANDCombine(N, DCI); case ISD::SRL: return performSRLCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: @@ -11573,6 +11778,9 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // For the real atomic operations, we have ldxr/stxr up to 128 bits, TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + if (AI->isFloatingPointOperation()) + return AtomicExpansionKind::CmpXChg; + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); if (Size > 128) return AtomicExpansionKind::None; // Nand not supported in LSE. @@ -11627,9 +11835,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); - return Builder.CreateTruncOrBitCast( - Builder.CreateCall(Ldxr, Addr), - cast(Addr->getType())->getElementType()); + Type *EltTy = cast(Addr->getType())->getElementType(); + + const DataLayout &DL = M->getDataLayout(); + IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy)); + Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy); + + return Builder.CreateBitCast(Trunc, EltTy); } void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( @@ -11664,6 +11876,10 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Type *Tys[] = { Addr->getType() }; Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); + const DataLayout &DL = M->getDataLayout(); + IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); + Val = Builder.CreateBitCast(Val, IntValTy); + return Builder.CreateCall(Stxr, {Builder.CreateZExtOrBitCast( Val, Stxr->getFunctionType()->getParamType(0)), @@ -11685,8 +11901,9 @@ static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) { Function *ThreadPointerFunc = Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( - IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset), - Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); + IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), + Offset), + IRB.getInt8PtrTy()->getPointerTo(0)); } Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { @@ -11712,12 +11929,13 @@ void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { Type::getInt8PtrTy(M.getContext())); // MSVC CRT has a function to validate security cookie. - auto *SecurityCheckCookie = cast( - M.getOrInsertFunction("__security_check_cookie", - Type::getVoidTy(M.getContext()), - Type::getInt8PtrTy(M.getContext()))); - SecurityCheckCookie->setCallingConv(CallingConv::Win64); - SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); + FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( + "__security_check_cookie", Type::getVoidTy(M.getContext()), + Type::getInt8PtrTy(M.getContext())); + if (Function *F = dyn_cast(SecurityCheckCookie.getCallee())) { + F->setCallingConv(CallingConv::Win64); + F->addAttribute(1, Attribute::AttrKind::InReg); + } return; } TargetLowering::insertSSPDeclarations(M); @@ -11730,7 +11948,7 @@ Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const { return TargetLowering::getSDagStackGuard(M); } -Value *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { +Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) return M.getFunction("__security_check_cookie"); @@ -11825,6 +12043,11 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { return OptSize && !VT.isVector(); } +bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { + // We want inc-of-add for scalars and sub-of-not for vectors. 
+ return VT.isScalarInteger(); +} + bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint(); } diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index ffc4cc3ef534..4421c31f65c9 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -1,9 +1,8 @@ //==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -215,7 +214,13 @@ enum NodeType : unsigned { LD4LANEpost, ST2LANEpost, ST3LANEpost, - ST4LANEpost + ST4LANEpost, + + STG, + STZG, + ST2G, + STZ2G + }; } // end namespace AArch64ISD @@ -263,9 +268,10 @@ public: /// Returns true if the target allows unaligned memory accesses of the /// specified type. - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, - unsigned Align = 1, - bool *Fast = nullptr) const override; + bool allowsMisalignedMemoryAccesses( + EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -287,7 +293,8 @@ public: bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; /// Return true if the given shuffle mask can be codegen'd directly, or if it /// should be stack expanded. @@ -328,6 +335,9 @@ public: bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + bool shouldSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const override; + bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } @@ -346,7 +356,7 @@ public: EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; + const AttributeList &FuncAttributes) const override; /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. @@ -409,7 +419,7 @@ public: void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; - Value *getSSPStackGuardCheck(const Module &M) const override; + Function *getSSPStackGuardCheck(const Module &M) const override; /// If the target has a standard location for the unsafe stack pointer, /// returns the address of that location. Otherwise, returns nullptr. 
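Context for the emitLoadLinked/emitStoreConditional changes a few hunks above: shouldExpandAtomicRMWInIR now routes floating-point atomicrmw through a cmpxchg loop, and the ldxr/ldaxr/stxr/stlxr intrinsics only move integer values, so an FP payload has to make a trunc-then-bitcast round trip instead of the old single CreateTruncOrBitCast. A standalone sketch of that round trip for f32 (memcpy stands in for the IR-level bitcast; the real code builds IR with IRBuilder, and the function names here are illustrative):

#include <cstdint>
#include <cstring>

// Load side: ldxr yields an i64; truncate to the element width, then bitcast.
float ldxrResultAsFloat(uint64_t Raw) {
  uint32_t Narrow = static_cast<uint32_t>(Raw); // models CreateTrunc
  float F;
  std::memcpy(&F, &Narrow, sizeof F);           // models CreateBitCast
  return F;
}

// Store side: bitcast the FP value to an integer, then zero-extend it into
// the i64 operand stxr expects.
uint64_t floatForStxr(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof Bits);          // models CreateBitCast
  return Bits;                                  // models CreateZExtOrBitCast
}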
@@ -470,6 +480,12 @@ public: return VT.getSizeInBits() >= 64; // vector 'bic' } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { + if (DAG.getMachineFunction().getFunction().hasMinSize()) + return false; + return true; + } + bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override { // For vectors, we don't have a preference.. @@ -487,6 +503,8 @@ public: return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } + bool preferIncOfAddToSubOfNot(EVT VT) const override; + bool hasBitPreservingFPLogic(EVT VT) const override { // FIXME: Is this always true? It should be true for vectors at least. return VT == MVT::f32 || VT == MVT::f64; @@ -648,9 +666,9 @@ private: SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 35cd7735ceb7..e22cb44d81ae 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -1,9 +1,8 @@ //=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 9061ed4f9f54..d619137b55c5 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -1,9 +1,8 @@ //===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -356,6 +355,9 @@ def am_indexed7s32 : ComplexPattern; def am_indexed7s64 : ComplexPattern; def am_indexed7s128 : ComplexPattern; +def am_indexedu6s128 : ComplexPattern; +def am_indexeds9s128 : ComplexPattern; + // uimm5sN predicate - True if the immediate is a multiple of N in the range // [0 * N, 32 * N]. 
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>; @@ -1818,6 +1820,14 @@ multiclass Shift shift_type, string asm, SDNode OpNode> { def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (sext GPR32:$Rm)))), + (!cast(NAME # "Xr") GPR64:$Rn, + (SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>; + + def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (zext GPR32:$Rm)))), + (!cast(NAME # "Xr") GPR64:$Rn, + (SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>; } class ShiftAlias @@ -2332,7 +2342,7 @@ class AddSubG } class SUBP - : BaseTwoOperand<0b0000, GPR64, asm_instr, null_frag, GPR64sp, GPR64sp> { + : BaseTwoOperand<0b0000, GPR64, asm_instr, OpNode, GPR64sp, GPR64sp> { let Inst{31} = 1; let Inst{29} = setsFlags; } @@ -4017,7 +4027,7 @@ class BaseMemTag opc1, bits<2> opc2, string asm_insn, class MemTagVector : BaseMemTag<{0b1, Load}, 0b00, asm_insn, asm_opnds, - "$Rn = $wback,@earlyclobber $wback", oops, iops> { + "", oops, iops> { bits<5> Rt; let Inst{20-12} = 0b000000000; @@ -4027,8 +4037,9 @@ class MemTagVector - : BaseMemTag<0b01, 0b00, asm_insn, asm_opnds, "", (outs GPR64:$Rt), - (ins GPR64sp:$Rn, simm9s16:$offset)> { + : BaseMemTag<0b01, 0b00, asm_insn, asm_opnds, "$Rt = $wback", + (outs GPR64:$wback), + (ins GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)> { bits<5> Rt; bits<9> offset; @@ -4045,29 +4056,28 @@ class BaseMemTagStore opc1, bits<2> opc2, string asm_insn, bits<9> offset; let Inst{20-12} = offset; - let Inst{4-0} = 0b11111; - let Unpredictable{4-0} = 0b11111; + let Inst{4-0} = Rt; let mayStore = 1; } multiclass MemTagStore opc1, string insn> { def Offset : - BaseMemTagStore; + BaseMemTagStore; def PreIndex : - BaseMemTagStore; + (ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>; def PostIndex : - BaseMemTagStore; + (ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>; - def : InstAlias(NAME # "Offset") GPR64sp:$Rn, 0)>; + def : InstAlias(NAME # "Offset") GPR64sp:$Rt, GPR64sp:$Rn, 0)>; } //--- diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index ada067888572..215e96a82d0e 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1,9 +1,8 @@ //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -77,8 +76,11 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { const MachineFunction *MF = MBB.getParent(); const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - if (MI.getOpcode() == AArch64::INLINEASM) - return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); + { + auto Op = MI.getOpcode(); + if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) + return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); + } // FIXME: We currently only handle pseudoinstructions that don't get expanded // before the assembly printer. 
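A note on the new Shift patterns in the AArch64InstrFormats.td hunk above: folding (sext GPR32) and (zext GPR32) shift amounts straight into the 64-bit register-shift instructions is sound because AArch64 variable shifts read the amount modulo the data size, so only the low 6 bits of Rm matter and the kind of extension cannot change the result; SUBREG_TO_REG merely places the 32-bit amount into an X register. A minimal model of that fact (a sketch, shown for LSL; LSR, ASR, and ROR treat the amount the same way):

#include <cstdint>

// LSLV-style semantics: a 64-bit shift uses amount % 64, so any bits that a
// sext or zext of a 32-bit amount would add above bit 5 are never read.
uint64_t lslXr(uint64_t Rn, uint64_t Rm) {
  return Rn << (Rm & 63);
}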
@@ -928,9 +930,9 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( - MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const { + const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); - MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; + const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; int64_t OffsetA = 0, OffsetB = 0; unsigned WidthA = 0, WidthB = 0; @@ -1715,6 +1717,69 @@ bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) { } } +Optional AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { + switch (Opc) { + default: return {}; + case AArch64::PRFMui: return AArch64::PRFUMi; + case AArch64::LDRXui: return AArch64::LDURXi; + case AArch64::LDRWui: return AArch64::LDURWi; + case AArch64::LDRBui: return AArch64::LDURBi; + case AArch64::LDRHui: return AArch64::LDURHi; + case AArch64::LDRSui: return AArch64::LDURSi; + case AArch64::LDRDui: return AArch64::LDURDi; + case AArch64::LDRQui: return AArch64::LDURQi; + case AArch64::LDRBBui: return AArch64::LDURBBi; + case AArch64::LDRHHui: return AArch64::LDURHHi; + case AArch64::LDRSBXui: return AArch64::LDURSBXi; + case AArch64::LDRSBWui: return AArch64::LDURSBWi; + case AArch64::LDRSHXui: return AArch64::LDURSHXi; + case AArch64::LDRSHWui: return AArch64::LDURSHWi; + case AArch64::LDRSWui: return AArch64::LDURSWi; + case AArch64::STRXui: return AArch64::STURXi; + case AArch64::STRWui: return AArch64::STURWi; + case AArch64::STRBui: return AArch64::STURBi; + case AArch64::STRHui: return AArch64::STURHi; + case AArch64::STRSui: return AArch64::STURSi; + case AArch64::STRDui: return AArch64::STURDi; + case AArch64::STRQui: return AArch64::STURQi; + case AArch64::STRBBui: return AArch64::STURBBi; + case AArch64::STRHHui: return AArch64::STURHHi; + } +} + +unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { + switch (Opc) { + default: + return 2; + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::STPXi: + case AArch64::STPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + case AArch64::LDPQi: + case AArch64::STPQi: + case AArch64::LDNPQi: + case AArch64::STNPQi: + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::STPWi: + case AArch64::STPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + case AArch64::LDG: + case AArch64::STGPi: + return 3; + case AArch64::ADDG: + case AArch64::STGOffset: + return 2; + } +} + bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { switch (MI.getOpcode()) { default: @@ -1837,7 +1902,7 @@ unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc, // Is this a candidate for ld/st merging or pairing? For example, we don't // touch volatiles or load/stores that have a hint to avoid pair formation. -bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const { +bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { // If this is a volatile load/store, don't mess with it. 
if (MI.hasOrderedMemoryRef()) return false; @@ -1879,8 +1944,8 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const { return true; } -bool AArch64InstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, - MachineOperand *&BaseOp, +bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const { unsigned Width; @@ -1888,7 +1953,7 @@ bool AArch64InstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, } bool AArch64InstrInfo::getMemOperandWithOffsetWidth( - MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset, + const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const { assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. @@ -1944,7 +2009,7 @@ AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, int64_t &MinOffset, - int64_t &MaxOffset) const { + int64_t &MaxOffset) { switch (Opcode) { // Not a memory operation or something we want to handle. default: @@ -1965,6 +2030,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, MinOffset = -256; MaxOffset = 255; break; + case AArch64::PRFUMi: case AArch64::LDURXi: case AArch64::LDURDi: case AArch64::STURXi: @@ -2034,6 +2100,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, MinOffset = -64; MaxOffset = 63; break; + case AArch64::PRFMui: case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: @@ -2066,6 +2133,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, break; case AArch64::LDRHui: case AArch64::LDRHHui: + case AArch64::LDRSHWui: + case AArch64::LDRSHXui: case AArch64::STRHui: case AArch64::STRHHui: Scale = Width = 2; @@ -2074,12 +2143,40 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, break; case AArch64::LDRBui: case AArch64::LDRBBui: + case AArch64::LDRSBWui: + case AArch64::LDRSBXui: case AArch64::STRBui: case AArch64::STRBBui: Scale = Width = 1; MinOffset = 0; MaxOffset = 4095; break; + case AArch64::ADDG: + case AArch64::TAGPstack: + Scale = 16; + Width = 0; + MinOffset = 0; + MaxOffset = 63; + break; + case AArch64::LDG: + case AArch64::STGOffset: + case AArch64::STZGOffset: + Scale = Width = 16; + MinOffset = -256; + MaxOffset = 255; + break; + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + Scale = 16; + Width = 32; + MinOffset = -256; + MaxOffset = 255; + break; + case AArch64::STGPi: + Scale = Width = 16; + MinOffset = -64; + MaxOffset = 63; + break; } return true; @@ -2181,11 +2278,11 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOperandWithOffset returns true. 
-bool AArch64InstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, - MachineOperand &BaseOp2, +bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, + const MachineOperand &BaseOp2, unsigned NumLoads) const { - MachineInstr &FirstLdSt = *BaseOp1.getParent(); - MachineInstr &SecondLdSt = *BaseOp2.getParent(); + const MachineInstr &FirstLdSt = *BaseOp1.getParent(); + const MachineInstr &SecondLdSt = *BaseOp2.getParent(); if (BaseOp1.getType() != BaseOp2.getType()) return false; @@ -2292,6 +2389,31 @@ void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, } } +void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc, + unsigned Opcode, unsigned ZeroReg, + llvm::ArrayRef Indices) const { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + unsigned NumRegs = Indices.size(); + +#ifndef NDEBUG + uint16_t DestEncoding = TRI->getEncodingValue(DestReg); + uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); + assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && + "GPR reg sequences should not be able to overlap"); +#endif + + for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { + const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); + AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); + MIB.addReg(ZeroReg); + AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); + MIB.addImm(0); + } +} + void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, @@ -2431,6 +2553,22 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && + AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { + static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; + copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, + AArch64::XZR, Indices); + return; + } + + if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && + AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { + static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; + copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, + AArch64::WZR, Indices); + return; + } + if (AArch64::FPR128RegClass.contains(DestReg) && AArch64::FPR128RegClass.contains(SrcReg)) { if (Subtarget.hasNEON()) { @@ -2839,7 +2977,7 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV, - bool NeedsWinCFI) { + bool NeedsWinCFI, bool *HasWinCFI) { if (DestReg == SrcReg && Offset == 0) return; @@ -2884,10 +3022,13 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize)) .setMIFlag(Flag); - if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) - .addImm(ThisVal) - .setMIFlag(Flag); + if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) { + if (HasWinCFI) + *HasWinCFI = true; + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) + .addImm(ThisVal) + .setMIFlag(Flag); + } SrcReg = DestReg; Offset -= ThisVal; @@ -2903,6 +3044,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, if (NeedsWinCFI) { if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { + if (HasWinCFI) + *HasWinCFI = true; 
if (Offset == 0) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)). setMIFlag(Flag); @@ -2910,6 +3053,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)). addImm(Offset).setMIFlag(Flag); } else if (DestReg == AArch64::SP) { + if (HasWinCFI) + *HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)). addImm(Offset).setMIFlag(Flag); } @@ -2919,7 +3064,7 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, - LiveIntervals *LIS) const { + LiveIntervals *LIS, VirtRegMap *VRM) const { // This is a bit of a hack. Consider this instruction: // // %0 = COPY %sp; GPR64all:%0 @@ -3102,11 +3247,6 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, bool *OutUseUnscaledOp, unsigned *OutUnscaledOp, int *EmittableOffset) { - int Scale = 1; - bool IsSigned = false; - // The ImmIdx should be changed case by case if it is not 2. - unsigned ImmIdx = 2; - unsigned UnscaledOp = 0; // Set output values in case of early exit. if (EmittableOffset) *EmittableOffset = 0; @@ -3114,10 +3254,12 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, *OutUseUnscaledOp = false; if (OutUnscaledOp) *OutUnscaledOp = 0; + + // Exit early for structured vector spills/fills as they can't take an + // immediate offset. switch (MI.getOpcode()) { default: - llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex"); - // Vector spills/fills can't take an immediate offset. + break; case AArch64::LD1Twov2d: case AArch64::LD1Threev2d: case AArch64::LD1Fourv2d: @@ -3130,208 +3272,53 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, case AArch64::ST1Twov1d: case AArch64::ST1Threev1d: case AArch64::ST1Fourv1d: + case AArch64::IRG: + case AArch64::IRGstack: return AArch64FrameOffsetCannotUpdate; - case AArch64::PRFMui: - Scale = 8; - UnscaledOp = AArch64::PRFUMi; - break; - case AArch64::LDRXui: - Scale = 8; - UnscaledOp = AArch64::LDURXi; - break; - case AArch64::LDRWui: - Scale = 4; - UnscaledOp = AArch64::LDURWi; - break; - case AArch64::LDRBui: - Scale = 1; - UnscaledOp = AArch64::LDURBi; - break; - case AArch64::LDRHui: - Scale = 2; - UnscaledOp = AArch64::LDURHi; - break; - case AArch64::LDRSui: - Scale = 4; - UnscaledOp = AArch64::LDURSi; - break; - case AArch64::LDRDui: - Scale = 8; - UnscaledOp = AArch64::LDURDi; - break; - case AArch64::LDRQui: - Scale = 16; - UnscaledOp = AArch64::LDURQi; - break; - case AArch64::LDRBBui: - Scale = 1; - UnscaledOp = AArch64::LDURBBi; - break; - case AArch64::LDRHHui: - Scale = 2; - UnscaledOp = AArch64::LDURHHi; - break; - case AArch64::LDRSBXui: - Scale = 1; - UnscaledOp = AArch64::LDURSBXi; - break; - case AArch64::LDRSBWui: - Scale = 1; - UnscaledOp = AArch64::LDURSBWi; - break; - case AArch64::LDRSHXui: - Scale = 2; - UnscaledOp = AArch64::LDURSHXi; - break; - case AArch64::LDRSHWui: - Scale = 2; - UnscaledOp = AArch64::LDURSHWi; - break; - case AArch64::LDRSWui: - Scale = 4; - UnscaledOp = AArch64::LDURSWi; - break; - - case AArch64::STRXui: - Scale = 8; - UnscaledOp = AArch64::STURXi; - break; - case AArch64::STRWui: - Scale = 4; - UnscaledOp = AArch64::STURWi; - break; - case AArch64::STRBui: - Scale = 1; - UnscaledOp = AArch64::STURBi; - break; - case AArch64::STRHui: - Scale = 2; - UnscaledOp = AArch64::STURHi; - break; - case AArch64::STRSui: - Scale = 4; - UnscaledOp = 
AArch64::STURSi; - break; - case AArch64::STRDui: - Scale = 8; - UnscaledOp = AArch64::STURDi; - break; - case AArch64::STRQui: - Scale = 16; - UnscaledOp = AArch64::STURQi; - break; - case AArch64::STRBBui: - Scale = 1; - UnscaledOp = AArch64::STURBBi; - break; - case AArch64::STRHHui: - Scale = 2; - UnscaledOp = AArch64::STURHHi; - break; - - case AArch64::LDPXi: - case AArch64::LDPDi: - case AArch64::STPXi: - case AArch64::STPDi: - case AArch64::LDNPXi: - case AArch64::LDNPDi: - case AArch64::STNPXi: - case AArch64::STNPDi: - ImmIdx = 3; - IsSigned = true; - Scale = 8; - break; - case AArch64::LDPQi: - case AArch64::STPQi: - case AArch64::LDNPQi: - case AArch64::STNPQi: - ImmIdx = 3; - IsSigned = true; - Scale = 16; - break; - case AArch64::LDPWi: - case AArch64::LDPSi: - case AArch64::STPWi: - case AArch64::STPSi: - case AArch64::LDNPWi: - case AArch64::LDNPSi: - case AArch64::STNPWi: - case AArch64::STNPSi: - ImmIdx = 3; - IsSigned = true; - Scale = 4; - break; - - case AArch64::LDURXi: - case AArch64::LDURWi: - case AArch64::LDURBi: - case AArch64::LDURHi: - case AArch64::LDURSi: - case AArch64::LDURDi: - case AArch64::LDURQi: - case AArch64::LDURHHi: - case AArch64::LDURBBi: - case AArch64::LDURSBXi: - case AArch64::LDURSBWi: - case AArch64::LDURSHXi: - case AArch64::LDURSHWi: - case AArch64::LDURSWi: - case AArch64::STURXi: - case AArch64::STURWi: - case AArch64::STURBi: - case AArch64::STURHi: - case AArch64::STURSi: - case AArch64::STURDi: - case AArch64::STURQi: - case AArch64::STURBBi: - case AArch64::STURHHi: - Scale = 1; - break; } - Offset += MI.getOperand(ImmIdx).getImm() * Scale; + // Get the min/max offset and the scale. + unsigned Scale, Width; + int64_t MinOff, MaxOff; + if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff, + MaxOff)) + llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); + + // Construct the complete offset. + const MachineOperand &ImmOpnd = + MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); + Offset += ImmOpnd.getImm() * Scale; - bool useUnscaledOp = false; // If the offset doesn't match the scale, we rewrite the instruction to // use the unscaled instruction instead. Likewise, if we have a negative - // offset (and have an unscaled op to use). - if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) - useUnscaledOp = true; - - // Use an unscaled addressing mode if the instruction has a negative offset - // (or if the instruction is already using an unscaled addressing mode). - unsigned MaskBits; - if (IsSigned) { - // ldp/stp instructions. - MaskBits = 7; - Offset /= Scale; - } else if (UnscaledOp == 0 || useUnscaledOp) { - MaskBits = 9; - IsSigned = true; - Scale = 1; - } else { - MaskBits = 12; - IsSigned = false; - Offset /= Scale; + // offset and there is an unscaled op to use. + Optional UnscaledOp = + AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); + bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); + if (useUnscaledOp && + !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff)) + llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); + + int64_t Remainder = Offset % Scale; + assert(!(Remainder && useUnscaledOp) && + "Cannot have remainder when using unscaled op"); + + assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); + int NewOffset = Offset / Scale; + if (MinOff <= NewOffset && NewOffset <= MaxOff) + Offset = Remainder; + else { + NewOffset = NewOffset < 0 ? 
MinOff : MaxOff; + Offset = Offset - NewOffset * Scale + Remainder; } - // Attempt to fold address computation. - int MaxOff = (1 << (MaskBits - IsSigned)) - 1; - int MinOff = (IsSigned ? (-MaxOff - 1) : 0); - if (Offset >= MinOff && Offset <= MaxOff) { - if (EmittableOffset) - *EmittableOffset = Offset; - Offset = 0; - } else { - int NewOff = Offset < 0 ? MinOff : MaxOff; - if (EmittableOffset) - *EmittableOffset = NewOff; - Offset = (Offset - NewOff) * Scale; - } + if (EmittableOffset) + *EmittableOffset = NewOffset; if (OutUseUnscaledOp) *OutUseUnscaledOp = useUnscaledOp; - if (OutUnscaledOp) - *OutUnscaledOp = UnscaledOp; + if (OutUnscaledOp && UnscaledOp) + *OutUnscaledOp = *UnscaledOp; + return AArch64FrameOffsetCanUpdate | (Offset == 0 ? AArch64FrameOffsetIsLegal : 0); } @@ -4974,8 +4961,8 @@ AArch64InstrInfo::getOutliningCandidateInfo( // At this point, we have a stack instruction that we might need to // fix up. We'll handle it if it's a load or store. if (MI.mayLoadOrStore()) { - MachineOperand *Base; // Filled with the base operand of MI. - int64_t Offset; // Filled with the offset of MI. + const MachineOperand *Base; // Filled with the base operand of MI. + int64_t Offset; // Filled with the offset of MI. // Does it allow us to offset the base operand and is the base the // register SP? @@ -5331,12 +5318,20 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) return outliner::InstrType::Illegal; + // Don't outline BTI instructions, because that will prevent the outlining + // site from being indirectly callable. + if (MI.getOpcode() == AArch64::HINT) { + int64_t Imm = MI.getOperand(0).getImm(); + if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) + return outliner::InstrType::Illegal; + } + return outliner::InstrType::Legal; } void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { for (MachineInstr &MI : MBB) { - MachineOperand *Base; + const MachineOperand *Base; unsigned Width; int64_t Offset; @@ -5534,7 +5529,32 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( MachineFunction &MF) const { - return MF.getFunction().optForMinSize(); + return MF.getFunction().hasMinSize(); +} + +bool AArch64InstrInfo::isCopyInstrImpl( + const MachineInstr &MI, const MachineOperand *&Source, + const MachineOperand *&Destination) const { + + // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg + // and zero immediate operands used as an alias for mov instruction. + if (MI.getOpcode() == AArch64::ORRWrs && + MI.getOperand(1).getReg() == AArch64::WZR && + MI.getOperand(3).getImm() == 0x0) { + Destination = &MI.getOperand(0); + Source = &MI.getOperand(2); + return true; + } + + if (MI.getOpcode() == AArch64::ORRXrs && + MI.getOperand(1).getReg() == AArch64::XZR && + MI.getOperand(3).getImm() == 0x0) { + Destination = &MI.getOperand(0); + Source = &MI.getOperand(2); + return true; + } + + return false; } #define GET_INSTRINFO_HELPERS diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 9954669d5675..7be4daba7dc4 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -1,9 +1,8 @@ //===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
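
Aside on the isCopyInstrImpl hunk above: the two patterns it recognizes are the canonical mov aliases, since ORing with the zero register under a zero shift is the identity. A minimal standalone C++ check of that identity (illustrative values; not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // "mov x0, x1" assembles as "orr x0, xzr, x1, lsl #0" (ORRXrs).
      uint64_t xzr = 0;                     // zero register
      uint64_t rm = 0x123456789abcdef0ULL;  // source register value
      uint64_t rd = xzr | (rm << 0);        // ORR with shift immediate 0
      assert(rd == rm);                     // behaves as a plain register copy
      return 0;
    }
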
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,6 +15,7 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" +#include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -54,7 +54,8 @@ public: unsigned &DstReg, unsigned &SubIdx) const override; bool - areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, + areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, @@ -84,6 +85,14 @@ public: return isUnscaledLdSt(MI.getOpcode()); } + /// Returns the unscaled load/store for the scaled load/store opcode, + /// if there is a corresponding unscaled variant available. + static Optional getUnscaledLdSt(unsigned Opc); + + + /// Returns the index for the immediate for a given instruction. + static unsigned getLoadStoreImmIdx(unsigned Opc); + /// Return true if pairing the given load or store may be paired with another. static bool isPairableLdStInst(const MachineInstr &MI); @@ -92,16 +101,18 @@ public: static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit); /// Return true if this is a load/store that can be potentially paired/merged. - bool isCandidateToMergeOrPair(MachineInstr &MI) const; + bool isCandidateToMergeOrPair(const MachineInstr &MI) const; /// Hint that pairing the given load or store is unprofitable. static void suppressLdStPair(MachineInstr &MI); - bool getMemOperandWithOffset(MachineInstr &MI, MachineOperand *&BaseOp, + bool getMemOperandWithOffset(const MachineInstr &MI, + const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const override; - bool getMemOperandWithOffsetWidth(MachineInstr &MI, MachineOperand *&BaseOp, + bool getMemOperandWithOffsetWidth(const MachineInstr &MI, + const MachineOperand *&BaseOp, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const; @@ -112,16 +123,21 @@ public: /// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly. /// /// For unscaled instructions, \p Scale is set to 1. 
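
Since getMemOpInfo becomes a static, table-style query below, here is a minimal standalone C++ sketch of how a (Scale, Width, MinOffset, MaxOffset) table pairs with the clamping added to isAArch64FrameOffsetLegal earlier in this patch. The three rows use the real encoding ranges for those opcodes; the struct and helper names are illustrative:

    #include <cstdint>
    #include <cstdio>

    struct MemOpInfo {
      const char *Name;
      unsigned Scale;          // bytes per immediate unit
      unsigned Width;          // bytes accessed
      int64_t MinOff, MaxOff;  // immediate range, in Scale units
    };

    // Representative rows: scaled unsigned 12-bit, unscaled signed 9-bit,
    // and scaled signed 7-bit (register pair) addressing.
    static const MemOpInfo Ops[] = {
        {"LDRXui", 8, 8, 0, 4095},
        {"LDURXi", 1, 8, -256, 255},
        {"LDPXi", 8, 16, -64, 63},
    };

    // Fold as much of a byte offset into the immediate field as the encoding
    // allows; the caller must materialize the residual in a scratch register.
    int64_t foldOffset(const MemOpInfo &Op, int64_t Offset, int64_t &Residual) {
      int64_t Imm = Offset / (int64_t)Op.Scale; // truncating, keeps remainder out
      if (Imm < Op.MinOff) Imm = Op.MinOff;
      if (Imm > Op.MaxOff) Imm = Op.MaxOff;
      Residual = Offset - Imm * (int64_t)Op.Scale;
      return Imm;
    }

    int main() {
      int64_t Residual;
      int64_t Imm = foldOffset(Ops[0], 40000, Residual); // too big for LDRXui
      // Prints the clamped byte offset (4095 * 8 = 32760) plus the leftover.
      std::printf("ldr x0, [sp, #%lld]  + residual %lld bytes\n",
                  (long long)(Imm * 8), (long long)Residual);
      return 0;
    }
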
- bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, - int64_t &MinOffset, int64_t &MaxOffset) const; + static bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, + int64_t &MinOffset, int64_t &MaxOffset); - bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2, + bool shouldClusterMemOps(const MachineOperand &BaseOp1, + const MachineOperand &BaseOp2, unsigned NumLoads) const override; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef Indices) const; + void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc, unsigned Opcode, unsigned ZeroReg, + llvm::ArrayRef Indices) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; @@ -146,7 +162,8 @@ public: foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, - LiveIntervals *LIS = nullptr) const override; + LiveIntervals *LIS = nullptr, + VirtRegMap *VRM = nullptr) const override; /// \returns true if a branch from an instruction with opcode \p BranchOpc /// bytes is capable of jumping to a position \p BrOffset bytes away. @@ -251,6 +268,13 @@ public: #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" +protected: + /// If the specific machine instruction is a instruction that moves/copies + /// value from one register to another register return true along with + /// @Source machine operand and @Destination machine operand. + bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source, + const MachineOperand *&Destination) const override; + private: /// Sets the offsets on outlined instructions in \p MBB which use SP /// so that they will be valid post-outlining. @@ -277,7 +301,8 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag = MachineInstr::NoFlags, - bool SetNZCV = false, bool NeedsWinCFI = false); + bool SetNZCV = false, bool NeedsWinCFI = false, + bool *HasWinCFI = nullptr); /// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the /// FP. Return false if the offset could not be handled directly in MI, and diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index c24b8b36441b..eed53f36d574 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -1,9 +1,8 @@ //=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -108,6 +107,16 @@ def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, "fuse-aes">; def HasSVE : Predicate<"Subtarget->hasSVE()">, AssemblerPredicate<"FeatureSVE", "sve">; +def HasSVE2 : Predicate<"Subtarget->hasSVE2()">, + AssemblerPredicate<"FeatureSVE2", "sve2">; +def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">, + AssemblerPredicate<"FeatureSVE2AES", "sve2-aes">; +def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">, + AssemblerPredicate<"FeatureSVE2SM4", "sve2-sm4">; +def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">, + AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">; +def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">, + AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, AssemblerPredicate<"FeatureRCPC", "rcpc">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, @@ -126,6 +135,7 @@ def HasMTE : Predicate<"Subtarget->hasMTE()">, AssemblerPredicate<"FeatureMTE", "mte">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; +def IsWindows : Predicate<"Subtarget->isTargetWindows()">; def UseAlternateSExtLoadCVTF32 : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; @@ -133,6 +143,10 @@ def UseNegativeImmediates : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates", "NegativeImmediates">; +def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisInt<1>]>>; + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. @@ -395,6 +409,12 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>; def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>; def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; +def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; +def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -404,10 +424,10 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; // the Function object through the Subtarget and objections were raised // to that (see post-commit review comments for r301750). let RecomputePerFunction = 1 in { - def ForCodeSize : Predicate<"MF->getFunction().optForSize()">; - def NotForCodeSize : Predicate<"!MF->getFunction().optForSize()">; + def ForCodeSize : Predicate<"MF->getFunction().hasOptSize()">; + def NotForCodeSize : Predicate<"!MF->getFunction().hasOptSize()">; // Avoid generating STRQro if it is slow, unless we're optimizing for code size. 
- def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().optForSize()">; + def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().hasOptSize()">; def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; @@ -703,7 +723,9 @@ let Predicates = [HasPA] in { // v8.3a floating point conversion for javascript let Predicates = [HasJS, HasFPARMv8] in def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, - "fjcvtzs", []> { + "fjcvtzs", + [(set GPR32:$Rd, + (int_aarch64_fjcvtzs FPR64:$Rn))]> { let Inst{31} = 0; } // HasJS, HasFPARMv8 @@ -760,6 +782,13 @@ def MSRpstateImm4 : MSRpstateImm0_15; def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>; +let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in { +def HWASAN_CHECK_MEMACCESS : Pseudo< + (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo), + [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>, + Sched<[]>; +} + // The cycle counter PMC register is PMCCNTR_EL0. let Predicates = [HasPerfMon] in def : Pat<(readcyclecounter), (MRS 0xdce8)>; @@ -1223,11 +1252,11 @@ defm : STOPregister<"stumin","LDUMIN">;// STUMINx // v8.5 Memory Tagging Extension let Predicates = [HasMTE] in { -def IRG : BaseTwoOperand<0b0100, GPR64sp, "irg", null_frag, GPR64sp, GPR64>, +def IRG : BaseTwoOperand<0b0100, GPR64sp, "irg", int_aarch64_irg, GPR64sp, GPR64>, Sched<[]>{ let Inst{31} = 1; } -def GMI : BaseTwoOperand<0b0101, GPR64, "gmi", null_frag, GPR64sp>, Sched<[]>{ +def GMI : BaseTwoOperand<0b0101, GPR64, "gmi", int_aarch64_gmi, GPR64sp>, Sched<[]>{ let Inst{31} = 1; let isNotDuplicable = 1; } @@ -1236,7 +1265,7 @@ def SUBG : AddSubG<1, "subg", null_frag>; def : InstAlias<"irg $dst, $src", (IRG GPR64sp:$dst, GPR64sp:$src, XZR), 1>; -def SUBP : SUBP<0, "subp", null_frag>, Sched<[]>; +def SUBP : SUBP<0, "subp", int_aarch64_subp>, Sched<[]>; def SUBPS : SUBP<1, "subps", null_frag>, Sched<[]>{ let Defs = [NZCV]; } @@ -1244,24 +1273,74 @@ def SUBPS : SUBP<1, "subps", null_frag>, Sched<[]>{ def : InstAlias<"cmpp $lhs, $rhs", (SUBPS XZR, GPR64sp:$lhs, GPR64sp:$rhs), 0>; def LDG : MemTagLoad<"ldg", "\t$Rt, [$Rn, $offset]">; + +def : Pat<(int_aarch64_addg (am_indexedu6s128 GPR64sp:$Rn, uimm6s16:$imm6), imm0_15:$imm4), + (ADDG GPR64sp:$Rn, imm0_63:$imm6, imm0_15:$imm4)>; +def : Pat<(int_aarch64_ldg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)), + (LDG GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>; + def : InstAlias<"ldg $Rt, [$Rn]", (LDG GPR64:$Rt, GPR64sp:$Rn, 0), 1>; -def LDGV : MemTagVector<1, "ldgv", "\t$Rt, [$Rn]!", - (outs GPR64sp:$wback, GPR64:$Rt), (ins GPR64sp:$Rn)> { - let DecoderMethod = "DecodeLoadAllocTagArrayInstruction"; +def LDGM : MemTagVector<1, "ldgm", "\t$Rt, [$Rn]", + (outs GPR64:$Rt), (ins GPR64sp:$Rn)>; +def STGM : MemTagVector<0, "stgm", "\t$Rt, [$Rn]", + (outs), (ins GPR64:$Rt, GPR64sp:$Rn)>; +def STZGM : MemTagVector<0, "stzgm", "\t$Rt, [$Rn]", + (outs), (ins GPR64:$Rt, GPR64sp:$Rn)> { + let Inst{23} = 0; } -def STGV : MemTagVector<0, "stgv", "\t$Rt, [$Rn]!", - (outs GPR64sp:$wback), (ins GPR64:$Rt, GPR64sp:$Rn)>; defm STG : MemTagStore<0b00, "stg">; defm STZG : MemTagStore<0b01, "stzg">; defm ST2G : MemTagStore<0b10, "st2g">; defm STZ2G : MemTagStore<0b11, "stz2g">; +def : Pat<(AArch64stg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)), + (STGOffset $Rn, $Rm, $imm)>; +def : 
Pat<(AArch64stzg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)), + (STZGOffset $Rn, $Rm, $imm)>; +def : Pat<(AArch64st2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)), + (ST2GOffset $Rn, $Rm, $imm)>; +def : Pat<(AArch64stz2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)), + (STZ2GOffset $Rn, $Rm, $imm)>; + defm STGP : StorePairOffset <0b01, 0, GPR64z, simm7s16, "stgp">; def STGPpre : StorePairPreIdx <0b01, 0, GPR64z, simm7s16, "stgp">; def STGPpost : StorePairPostIdx<0b01, 0, GPR64z, simm7s16, "stgp">; +def : Pat<(int_aarch64_stg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)), + (STGOffset GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>; + +def : Pat<(int_aarch64_stgp (am_indexed7s128 GPR64sp:$Rn, simm7s16:$imm), GPR64:$Rt, GPR64:$Rt2), + (STGPi $Rt, $Rt2, $Rn, $imm)>; + +def IRGstack + : Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rsp, GPR64:$Rm), []>, + Sched<[]>; +def TAGPstack + : Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, uimm6s16:$imm6, GPR64sp:$Rm, imm0_15:$imm4), []>, + Sched<[]>; + +// Explicit SP in the first operand prevents ShrinkWrap optimization +// from leaving this instruction out of the stack frame. When IRGstack +// is transformed into IRG, this operand is replaced with the actual +// register / expression for the tagged base pointer of the current function. +def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; + +// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address. +// $Rn_wback is one past the end of the range. +let isCodeGenOnly=1, mayStore=1 in { +def STGloop + : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + Sched<[WriteAdr, WriteST]>; + +def STZGloop + : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + Sched<[WriteAdr, WriteST]>; +} + } // Predicates = [HasMTE] //===----------------------------------------------------------------------===// @@ -3052,6 +3131,27 @@ defm : FPToIntegerPats; defm : FPToIntegerPats; defm : FPToIntegerPats; +let Predicates = [HasFullFP16] in { + def : Pat<(i32 (lround f16:$Rn)), + (!cast(FCVTASUWHr) f16:$Rn)>; + def : Pat<(i64 (lround f16:$Rn)), + (!cast(FCVTASUXHr) f16:$Rn)>; + def : Pat<(i64 (llround f16:$Rn)), + (!cast(FCVTASUXHr) f16:$Rn)>; +} +def : Pat<(i32 (lround f32:$Rn)), + (!cast(FCVTASUWSr) f32:$Rn)>; +def : Pat<(i32 (lround f64:$Rn)), + (!cast(FCVTASUWDr) f64:$Rn)>; +def : Pat<(i64 (lround f32:$Rn)), + (!cast(FCVTASUXSr) f32:$Rn)>; +def : Pat<(i64 (lround f64:$Rn)), + (!cast(FCVTASUXDr) f64:$Rn)>; +def : Pat<(i64 (llround f32:$Rn)), + (!cast(FCVTASUXSr) f32:$Rn)>; +def : Pat<(i64 (llround f64:$Rn)), + (!cast(FCVTASUXDr) f64:$Rn)>; + //===----------------------------------------------------------------------===// // Scaled integer to floating point conversion instructions. 
//===----------------------------------------------------------------------===// @@ -3116,6 +3216,27 @@ let Predicates = [HasFRInt3264] in { defm FRINT64X : FRIntNNT<0b11, "frint64x">; } // HasFRInt3264 +let Predicates = [HasFullFP16] in { + def : Pat<(i32 (lrint f16:$Rn)), + (FCVTZSUWHr (!cast(FRINTXHr) f16:$Rn))>; + def : Pat<(i64 (lrint f16:$Rn)), + (FCVTZSUXHr (!cast(FRINTXHr) f16:$Rn))>; + def : Pat<(i64 (llrint f16:$Rn)), + (FCVTZSUXHr (!cast(FRINTXHr) f16:$Rn))>; +} +def : Pat<(i32 (lrint f32:$Rn)), + (FCVTZSUWSr (!cast(FRINTXSr) f32:$Rn))>; +def : Pat<(i32 (lrint f64:$Rn)), + (FCVTZSUWDr (!cast(FRINTXDr) f64:$Rn))>; +def : Pat<(i64 (lrint f32:$Rn)), + (FCVTZSUXSr (!cast(FRINTXSr) f32:$Rn))>; +def : Pat<(i64 (lrint f64:$Rn)), + (FCVTZSUXDr (!cast(FRINTXDr) f64:$Rn))>; +def : Pat<(i64 (llrint f32:$Rn)), + (FCVTZSUXSr (!cast(FRINTXSr) f32:$Rn))>; +def : Pat<(i64 (llrint f64:$Rn)), + (FCVTZSUXDr (!cast(FRINTXDr) f64:$Rn))>; + //===----------------------------------------------------------------------===// // Floating point two operand instructions. //===----------------------------------------------------------------------===// @@ -3489,7 +3610,7 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast("FABD"#VT) VT:$Rn, V } defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>; defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; @@ -5314,6 +5435,8 @@ def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), v (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>; def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)), (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>; +def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR16:$imm)), + (SCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>; def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (and FPR32:$Rn, (i32 65535)), vecshiftR16:$imm)), @@ -5342,6 +5465,16 @@ def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR64:$imm)), (i64 (IMPLICIT_DEF)), (FCVTZUh FPR16:$Rn, vecshiftR64:$imm), hsub))>; +def : Pat<(i32 (int_aarch64_neon_facge (f16 FPR16:$Rn), (f16 FPR16:$Rm))), + (i32 (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), + (FACGE16 FPR16:$Rn, FPR16:$Rm), + hsub))>; +def : Pat<(i32 (int_aarch64_neon_facgt (f16 FPR16:$Rn), (f16 FPR16:$Rm))), + (i32 (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), + (FACGT16 FPR16:$Rn, FPR16:$Rm), + hsub))>; defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>; defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; @@ -6031,6 +6164,7 @@ def : Pat<(i32 (trunc GPR64sp:$src)), // __builtin_trap() uses the BRK instruction on AArch64. def : Pat<(trap), (BRK 1)>; +def : Pat<(debugtrap), (BRK 0xF000)>, Requires<[IsWindows]>; // Multiply high patterns which multiply the lower subvector using smull/umull // and the upper subvector with smull2/umull2. 
Then shuffle the high the high @@ -6147,6 +6281,7 @@ def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; // Natural vector casts (128 bit) def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; @@ -6801,5 +6936,8 @@ def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; +def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>; +def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>; + include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 5eb589bf66d5..4e13fb8e2027 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -1,9 +1,8 @@ //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -19,11 +18,14 @@ #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/Optional.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -57,6 +59,15 @@ private: /// the patterns that don't require complex C++. bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + // A lowering phase that runs before any selection attempts. + + void preISelLower(MachineInstr &I) const; + + // An early selection function that runs before the selectImpl() call. + bool earlySelect(MachineInstr &I) const; + + bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, @@ -65,15 +76,84 @@ private: bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; + bool selectVectorASHR(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; + // Helper to generate an equivalent of scalar_to_vector into a new register, // returned via 'Dst'. 
- bool emitScalarToVector(unsigned &Dst, const LLT DstTy, - const TargetRegisterClass *DstRC, unsigned Scalar, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineRegisterInfo &MRI) const; + MachineInstr *emitScalarToVector(unsigned EltSize, + const TargetRegisterClass *DstRC, + Register Scalar, + MachineIRBuilder &MIRBuilder) const; + + /// Emit a lane insert into \p DstReg, or a new vector register if None is + /// provided. + /// + /// The lane inserted into is defined by \p LaneIdx. The vector source + /// register is given by \p SrcReg. The register containing the element is + /// given by \p EltReg. + MachineInstr *emitLaneInsert(Optional DstReg, Register SrcReg, + Register EltReg, unsigned LaneIdx, + const RegisterBank &RB, + MachineIRBuilder &MIRBuilder) const; + bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; + + void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI, + SmallVectorImpl> &Idxs) const; + bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectSplitVectorUnmerge(MachineInstr &I, + MachineRegisterInfo &MRI) const; + bool selectIntrinsicWithSideEffects(MachineInstr &I, + MachineRegisterInfo &MRI) const; + bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const; + + unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const; + MachineInstr *emitLoadFromConstantPool(Constant *CPVal, + MachineIRBuilder &MIRBuilder) const; + + // Emit a vector concat operation. + MachineInstr *emitVectorConcat(Optional Dst, Register Op1, + Register Op2, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, + MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitTST(const Register &LHS, const Register &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitExtractVectorElt(Optional DstReg, + const RegisterBank &DstRB, LLT ScalarTy, + Register VecReg, unsigned LaneIdx, + MachineIRBuilder &MIRBuilder) const; + + /// Helper function for selecting G_FCONSTANT. If the G_FCONSTANT can be + /// materialized using a FMOV instruction, then update MI and return it. + /// Otherwise, do nothing and return a nullptr. + MachineInstr *emitFMovForFConstant(MachineInstr &MI, + MachineRegisterInfo &MRI) const; + + /// Emit a CSet for a compare. + MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred, + MachineIRBuilder &MIRBuilder) const; + + // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. + // We use these manually instead of using the importer since it doesn't + // support SDNodeXForm. 
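
For reference, the arithmetic behind the shift renderers declared just below is the standard LSL-to-UBFM alias. A standalone sketch of the 32-bit case (function names are illustrative; the 64-bit variants use 64 and 63 in place of 32 and 31):

    #include <cassert>

    // "lsl wd, wn, #s" is an alias of "ubfm wd, wn, #((32 - s) % 32), #(31 - s)".
    unsigned shiftA32(unsigned S) { return (32 - S) % 32; } // immr
    unsigned shiftB32(unsigned S) { return 31 - S; }        // imms

    int main() {
      // lsl w0, w1, #4  ==  ubfm w0, w1, #28, #27
      assert(shiftA32(4) == 28 && shiftB32(4) == 27);
      return 0;
    }
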
+ ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const; + ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const; + ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; + ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; ComplexRendererFns selectArithImmed(MachineOperand &Root) const; @@ -109,6 +189,14 @@ private: void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned char OpFlags) const; + // Optimization methods. + bool tryOptVectorShuffle(MachineInstr &I) const; + bool tryOptVectorDup(MachineInstr &MI) const; + bool tryOptSelect(MachineInstr &MI) const; + MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, + MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const; + const AArch64TargetMachine &TM; const AArch64Subtarget &STI; const AArch64InstrInfo &TII; @@ -177,6 +265,70 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, return nullptr; } +/// Given a register bank, and size in bits, return the smallest register class +/// that can represent that combination. +static const TargetRegisterClass * +getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits, + bool GetAllRegSet = false) { + unsigned RegBankID = RB.getID(); + + if (RegBankID == AArch64::GPRRegBankID) { + if (SizeInBits <= 32) + return GetAllRegSet ? &AArch64::GPR32allRegClass + : &AArch64::GPR32RegClass; + if (SizeInBits == 64) + return GetAllRegSet ? &AArch64::GPR64allRegClass + : &AArch64::GPR64RegClass; + } + + if (RegBankID == AArch64::FPRRegBankID) { + switch (SizeInBits) { + default: + return nullptr; + case 8: + return &AArch64::FPR8RegClass; + case 16: + return &AArch64::FPR16RegClass; + case 32: + return &AArch64::FPR32RegClass; + case 64: + return &AArch64::FPR64RegClass; + case 128: + return &AArch64::FPR128RegClass; + } + } + + return nullptr; +} + +/// Returns the correct subregister to use for a given register class. +static bool getSubRegForClass(const TargetRegisterClass *RC, + const TargetRegisterInfo &TRI, unsigned &SubReg) { + switch (TRI.getRegSizeInBits(*RC)) { + case 8: + SubReg = AArch64::bsub; + break; + case 16: + SubReg = AArch64::hsub; + break; + case 32: + if (RC == &AArch64::GPR32RegClass) + SubReg = AArch64::sub_32; + else + SubReg = AArch64::ssub; + break; + case 64: + SubReg = AArch64::dsub; + break; + default: + LLVM_DEBUG( + dbgs() << "Couldn't find appropriate subregister for register class."); + return false; + } + + return true; +} + /// Check whether \p I is a currently unsupported binary operation: /// - it has an unsized type /// - an operand is not a vreg @@ -332,107 +484,209 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, return GenericOpc; } -static bool selectFP16CopyFromGPR32(MachineInstr &I, const TargetInstrInfo &TII, - MachineRegisterInfo &MRI, unsigned SrcReg) { - // Copies from gpr32 to fpr16 need to use a sub-register copy. - unsigned CopyReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY)) - .addDef(CopyReg) - .addUse(SrcReg); - unsigned SubRegCopy = MRI.createVirtualRegister(&AArch64::FPR16RegClass); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY)) - .addDef(SubRegCopy) - .addUse(CopyReg, 0, AArch64::hsub); +#ifndef NDEBUG +/// Helper function that verifies that we have a valid copy at the end of +/// selectCopy. Verifies that the source and dest have the expected sizes and +/// then returns true. 
+static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); + const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); + + // Make sure the size of the source and dest line up. + assert( + (DstSize == SrcSize || + // Copies are a mean to setup initial types, the number of + // bits may not exactly match. + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || + // Copies are a mean to copy bits around, as long as we are + // on the same register class, that's fine. Otherwise, that + // means we need some SUBREG_TO_REG or AND & co. + (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) && + "Copy with different width?!"); + + // Check the size of the destination. + assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) && + "GPRs cannot get more than 64-bit width values"); + + return true; +} +#endif +/// Helper function for selectCopy. Inserts a subregister copy from +/// \p *From to \p *To, linking it up to \p I. +/// +/// e.g, given I = "Dst = COPY SrcReg", we'll transform that into +/// +/// CopyReg (From class) = COPY SrcReg +/// SubRegCopy (To class) = COPY CopyReg:SubReg +/// Dst = COPY SubRegCopy +static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI, unsigned SrcReg, + const TargetRegisterClass *From, + const TargetRegisterClass *To, + unsigned SubReg) { + MachineIRBuilder MIB(I); + auto Copy = MIB.buildCopy({From}, {SrcReg}); + auto SubRegCopy = MIB.buildInstr(TargetOpcode::COPY, {To}, {}) + .addReg(Copy.getReg(0), 0, SubReg); MachineOperand &RegOp = I.getOperand(1); - RegOp.setReg(SubRegCopy); + RegOp.setReg(SubRegCopy.getReg(0)); + + // It's possible that the destination register won't be constrained. Make + // sure that happens. + if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg())) + RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); + return true; } +/// Helper function to get the source and destination register classes for a +/// copy. Returns a std::pair containing the source register class for the +/// copy, and the destination register class for the copy. If a register class +/// cannot be determined, then it will be nullptr. +static std::pair +getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + unsigned DstReg = I.getOperand(0).getReg(); + unsigned SrcReg = I.getOperand(1).getReg(); + const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); + + // Special casing for cross-bank copies of s1s. We can technically represent + // a 1-bit value with any size of register. The minimum size for a GPR is 32 + // bits. So, we need to put the FPR on 32 bits as well. + // + // FIXME: I'm not sure if this case holds true outside of copies. If it does, + // then we can pull it into the helpers that get the appropriate class for a + // register bank. Or make a new helper that carries along some constraint + // information. 
+ if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1)) + SrcSize = DstSize = 32; + + return {getMinClassForRegBank(SrcRegBank, SrcSize, true), + getMinClassForRegBank(DstRegBank, DstSize, true)}; +} + static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { unsigned DstReg = I.getOperand(0).getReg(); unsigned SrcReg = I.getOperand(1).getReg(); + const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { - if (TRI.getRegClass(AArch64::FPR16RegClassID)->contains(DstReg) && - !TargetRegisterInfo::isPhysicalRegister(SrcReg)) { - const RegisterBank &RegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank( - MRI.getType(SrcReg), RegBank, RBI, /* GetAllRegSet */ true); - if (SrcRC == &AArch64::GPR32allRegClass) - return selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); - } - assert(I.isCopy() && "Generic operators do not allow physical registers"); - return true; - } - - const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); - const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - (void)DstSize; - const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); - (void)SrcSize; - assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && - "No phys reg on generic operators"); - assert( - (DstSize == SrcSize || - // Copies are a mean to setup initial types, the number of - // bits may not exactly match. - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI)) || - // Copies are a mean to copy bits around, as long as we are - // on the same register class, that's fine. Otherwise, that - // means we need some SUBREG_TO_REG or AND & co. - (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) && - "Copy with different width?!"); - assert((DstSize <= 64 || RegBank.getID() == AArch64::FPRRegBankID) && - "GPRs cannot get more than 64-bit width values"); + // Find the correct register classes for the source and destination registers. + const TargetRegisterClass *SrcRC; + const TargetRegisterClass *DstRC; + std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI); - const TargetRegisterClass *RC = getRegClassForTypeOnBank( - MRI.getType(DstReg), RegBank, RBI, /* GetAllRegSet */ true); - if (!RC) { - LLVM_DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n'); + if (!DstRC) { + LLVM_DEBUG(dbgs() << "Unexpected dest size " + << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n'); return false; } - if (!TargetRegisterInfo::isPhysicalRegister(SrcReg)) { - const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(SrcReg); - const TargetRegisterClass *SrcRC = - RegClassOrBank.dyn_cast(); - const RegisterBank *RB = nullptr; + // A couple helpers below, for making sure that the copy we produce is valid. + + // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want + // to verify that the src and dst are the same size, since that's handled by + // the SUBREG_TO_REG. + bool KnownValid = false; + + // Returns true, or asserts if something we don't expect happens. Instead of + // returning true, we return isValidCopy() to ensure that we verify the + // result. + auto CheckCopy = [&]() { + // If we have a bitcast or something, we can't have physical registers. 
+ assert( + (I.isCopy() || + (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()) && + !TargetRegisterInfo::isPhysicalRegister(I.getOperand(1).getReg()))) && + "No phys reg on generic operator!"); + assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI)); + (void)KnownValid; + return true; + }; + + // Is this a copy? If so, then we may need to insert a subregister copy, or + // a SUBREG_TO_REG. + if (I.isCopy()) { + // Yes. Check if there's anything to fix up. if (!SrcRC) { - RB = RegClassOrBank.get(); - SrcRC = getRegClassForTypeOnBank(MRI.getType(SrcReg), *RB, RBI, true); - } - // Copies from fpr16 to gpr32 need to use SUBREG_TO_REG. - if (RC == &AArch64::GPR32allRegClass && SrcRC == &AArch64::FPR16RegClass) { - unsigned PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); - BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(AArch64::SUBREG_TO_REG)) - .addDef(PromoteReg) - .addImm(0) - .addUse(SrcReg) - .addImm(AArch64::hsub); - MachineOperand &RegOp = I.getOperand(1); - RegOp.setReg(PromoteReg); - } else if (RC == &AArch64::FPR16RegClass && - SrcRC == &AArch64::GPR32allRegClass) { - selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); + LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n"); + return false; + } + + // Is this a cross-bank copy? + if (DstRegBank.getID() != SrcRegBank.getID()) { + // If we're doing a cross-bank copy on different-sized registers, we need + // to do a bit more work. + unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); + unsigned DstSize = TRI.getRegSizeInBits(*DstRC); + + if (SrcSize > DstSize) { + // We're doing a cross-bank copy into a smaller register. We need a + // subregister copy. First, get a register class that's on the same bank + // as the destination, but the same size as the source. + const TargetRegisterClass *SubregRC = + getMinClassForRegBank(DstRegBank, SrcSize, true); + assert(SubregRC && "Didn't get a register class for subreg?"); + + // Get the appropriate subregister for the destination. + unsigned SubReg = 0; + if (!getSubRegForClass(DstRC, TRI, SubReg)) { + LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); + return false; + } + + // Now, insert a subregister copy using the new register class. + selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg); + return CheckCopy(); + } + + else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && + SrcSize == 16) { + // Special case for FPR16 to GPR32. + // FIXME: This can probably be generalized like the above case. + unsigned PromoteReg = + MRI.createVirtualRegister(&AArch64::FPR32RegClass); + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG), PromoteReg) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::hsub); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(PromoteReg); + + // Promise that the copy is implicitly validated by the SUBREG_TO_REG. + KnownValid = true; + } } + + // If the destination is a physical register, then there's nothing to + // change, so we're done. + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + return CheckCopy(); } - // No need to constrain SrcReg. It will get constrained when - // we hit another of its use or its defs. - // Copies do not have constraints. - if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { + // No need to constrain SrcReg. It will get constrained when we hit another + // of its use or its defs. Copies do not have constraints. 
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) << " operand\n"); return false; } I.setDesc(TII.get(AArch64::COPY)); - return true; + return CheckCopy(); } static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { @@ -511,6 +765,46 @@ static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { return GenericOpc; } +static unsigned selectSelectOpc(MachineInstr &I, MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + bool IsFP = (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != + AArch64::GPRRegBankID); + LLT Ty = MRI.getType(I.getOperand(0).getReg()); + if (Ty == LLT::scalar(32)) + return IsFP ? AArch64::FCSELSrrr : AArch64::CSELWr; + else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) + return IsFP ? AArch64::FCSELDrrr : AArch64::CSELXr; + return 0; +} + +/// Helper function to select the opcode for a G_FCMP. +static unsigned selectFCMPOpc(MachineInstr &I, MachineRegisterInfo &MRI) { + // If this is a compare against +0.0, then we don't have to explicitly + // materialize a constant. + const ConstantFP *FPImm = getConstantFPVRegVal(I.getOperand(3).getReg(), MRI); + bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); + unsigned OpSize = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); + if (OpSize != 32 && OpSize != 64) + return 0; + unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, + {AArch64::FCMPSri, AArch64::FCMPDri}}; + return CmpOpcTbl[ShouldUseImm][OpSize == 64]; +} + +/// Returns true if \p P is an unsigned integer comparison predicate. +static bool isUnsignedICMPPred(const CmpInst::Predicate P) { + switch (P) { + default: + return false; + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + return true; + } +} + static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { switch (P) { default: @@ -595,7 +889,7 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P, bool AArch64InstructionSelector::selectCompareBranch( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { - const unsigned CondReg = I.getOperand(0).getReg(); + const Register CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); MachineInstr *CCMI = MRI.getVRegDef(CondReg); if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) @@ -603,14 +897,25 @@ bool AArch64InstructionSelector::selectCompareBranch( if (CCMI->getOpcode() != TargetOpcode::G_ICMP) return false; - unsigned LHS = CCMI->getOperand(2).getReg(); - unsigned RHS = CCMI->getOperand(3).getReg(); - if (!getConstantVRegVal(RHS, MRI)) + Register LHS = CCMI->getOperand(2).getReg(); + Register RHS = CCMI->getOperand(3).getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); + if (!VRegAndVal) std::swap(RHS, LHS); - const auto RHSImm = getConstantVRegVal(RHS, MRI); - if (!RHSImm || *RHSImm != 0) - return false; + VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); + if (!VRegAndVal || VRegAndVal->Value != 0) { + MachineIRBuilder MIB(I); + // If we can't select a CBZ then emit a cmp + Bcc. 
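
Concretely, the fallback below covers comparisons against nonzero values, where no compare-and-branch form exists (hand-sketched selections, not compiler output):

    if (x == 0) goto L;    =>   cbz  x0, L       ; compare-and-branch form
    if (x == 42) goto L;   =>   cmp  x0, #42     ; no cbz form for this,
                                b.eq L           ; so emit cmp + b.cond
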
+ if (!emitIntegerCompare(CCMI->getOperand(2), CCMI->getOperand(3), + CCMI->getOperand(1), MIB)) + return false; + const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( + (CmpInst::Predicate)CCMI->getOperand(1).getPredicate()); + MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); + I.eraseFromParent(); + return true; + } const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) @@ -638,6 +943,74 @@ bool AArch64InstructionSelector::selectCompareBranch( return true; } +bool AArch64InstructionSelector::selectVectorSHL( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_SHL); + Register DstReg = I.getOperand(0).getReg(); + const LLT Ty = MRI.getType(DstReg); + Register Src1Reg = I.getOperand(1).getReg(); + Register Src2Reg = I.getOperand(2).getReg(); + + if (!Ty.isVector()) + return false; + + unsigned Opc = 0; + if (Ty == LLT::vector(4, 32)) { + Opc = AArch64::USHLv4i32; + } else if (Ty == LLT::vector(2, 32)) { + Opc = AArch64::USHLv2i32; + } else { + LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); + return false; + } + + MachineIRBuilder MIB(I); + auto UShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Src2Reg}); + constrainSelectedInstRegOperands(*UShl, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectVectorASHR( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_ASHR); + Register DstReg = I.getOperand(0).getReg(); + const LLT Ty = MRI.getType(DstReg); + Register Src1Reg = I.getOperand(1).getReg(); + Register Src2Reg = I.getOperand(2).getReg(); + + if (!Ty.isVector()) + return false; + + // There is not a shift right register instruction, but the shift left + // register instruction takes a signed value, where negative numbers specify a + // right shift. 
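
A portable scalar model of that per-lane SSHL rule, one lane only (standalone sketch; assumes arithmetic right shift on signed int, as AArch64 compilers provide):

    #include <cassert>
    #include <cstdint>

    // SSHL shifts left for non-negative amounts and right for negative ones.
    int32_t sshlLane(int32_t V, int8_t Amt) {
      return Amt >= 0 ? (int32_t)((uint32_t)V << Amt) : (V >> -Amt);
    }

    int main() {
      int32_t V = -64, S = 3;
      // Arithmetic shift right by S == SSHL by -S, hence the NEG + SSHL pair.
      assert(sshlLane(V, (int8_t)-S) == (V >> S));
      return 0;
    }
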
+ + unsigned Opc = 0; + unsigned NegOpc = 0; + const TargetRegisterClass *RC = nullptr; + if (Ty == LLT::vector(4, 32)) { + Opc = AArch64::SSHLv4i32; + NegOpc = AArch64::NEGv4i32; + RC = &AArch64::FPR128RegClass; + } else if (Ty == LLT::vector(2, 32)) { + Opc = AArch64::SSHLv2i32; + NegOpc = AArch64::NEGv2i32; + RC = &AArch64::FPR64RegClass; + } else { + LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); + return false; + } + + MachineIRBuilder MIB(I); + auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); + constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); + auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); + constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectVaStartAAPCS( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { return false; @@ -646,9 +1019,9 @@ bool AArch64InstructionSelector::selectVaStartAAPCS( bool AArch64InstructionSelector::selectVaStartDarwin( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { AArch64FunctionInfo *FuncInfo = MF.getInfo(); - unsigned ListReg = I.getOperand(0).getReg(); + Register ListReg = I.getOperand(0).getReg(); - unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) @@ -684,9 +1057,9 @@ void AArch64InstructionSelector::materializeLargeCMVal( MovZ->addOperand(MF, MachineOperand::CreateImm(0)); constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); - auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags, unsigned Offset, - unsigned ForceDstReg) { - unsigned DstReg = ForceDstReg + auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, + Register ForceDstReg) { + Register DstReg = ForceDstReg ? ForceDstReg : MRI.createVirtualRegister(&AArch64::GPR64RegClass); auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); @@ -702,13 +1075,105 @@ void AArch64InstructionSelector::materializeLargeCMVal( constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); return DstReg; }; - unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(), + Register DstReg = BuildMovK(MovZ.getReg(0), AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); return; } +void AArch64InstructionSelector::preISelLower(MachineInstr &I) const { + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + switch (I.getOpcode()) { + case TargetOpcode::G_SHL: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: { + // These shifts are legalized to have 64 bit shift amounts because we want + // to take advantage of the existing imported selection patterns that assume + // the immediates are s64s. However, if the shifted type is 32 bits and for + // some reason we receive input GMIR that has an s64 shift amount that's not + // a G_CONSTANT, insert a truncate so that we can still select the s32 + // register-register variant. 
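
Sketched in generic MIR (invented vregs; before and after preISelLower, syntax approximate):

    %res:_(s32) = G_SHL %val:_(s32), %amt:_(s64)    ; s64 amount, not a G_CONSTANT

    %amt32:gpr(s32) = COPY %amt.sub_32              ; 64->32 truncating subreg copy
    %res:_(s32) = G_SHL %val:_(s32), %amt32:_(s32)  ; matches the s32 reg-reg pattern
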
+ unsigned SrcReg = I.getOperand(1).getReg(); + unsigned ShiftReg = I.getOperand(2).getReg(); + const LLT ShiftTy = MRI.getType(ShiftReg); + const LLT SrcTy = MRI.getType(SrcReg); + if (SrcTy.isVector()) + return; + assert(!ShiftTy.isVector() && "unexpected vector shift ty"); + if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64) + return; + auto *AmtMI = MRI.getVRegDef(ShiftReg); + assert(AmtMI && "could not find a vreg definition for shift amount"); + if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) { + // Insert a subregister copy to implement a 64->32 trunc + MachineIRBuilder MIB(I); + auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) + .addReg(ShiftReg, 0, AArch64::sub_32); + MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); + I.getOperand(2).setReg(Trunc.getReg(0)); + } + return; + } + default: + return; + } +} + +bool AArch64InstructionSelector::earlySelectSHL( + MachineInstr &I, MachineRegisterInfo &MRI) const { + // We try to match the immediate variant of LSL, which is actually an alias + // for a special case of UBFM. Otherwise, we fall back to the imported + // selector which will match the register variant. + assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); + const auto &MO = I.getOperand(2); + auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI); + if (!VRegAndVal) + return false; + + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + if (DstTy.isVector()) + return false; + bool Is64Bit = DstTy.getSizeInBits() == 64; + auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); + auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO); + MachineIRBuilder MIB(I); + + if (!Imm1Fn || !Imm2Fn) + return false; + + auto NewI = + MIB.buildInstr(Is64Bit ? 
AArch64::UBFMXri : AArch64::UBFMWri, + {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); + + for (auto &RenderFn : *Imm1Fn) + RenderFn(NewI); + for (auto &RenderFn : *Imm2Fn) + RenderFn(NewI); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { + assert(I.getParent() && "Instruction should be in a basic block!"); + assert(I.getParent()->getParent() && "Instruction should be in a function!"); + + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + switch (I.getOpcode()) { + case TargetOpcode::G_SHL: + return earlySelectSHL(I, MRI); + default: + return false; + } +} + bool AArch64InstructionSelector::select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { assert(I.getParent() && "Instruction should be in a basic block!"); @@ -727,30 +1192,27 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI.getType(DefReg); - const TargetRegisterClass *DefRC = nullptr; - if (TargetRegisterInfo::isPhysicalRegister(DefReg)) { - DefRC = TRI.getRegClass(DefReg); - } else { - const RegClassOrRegBank &RegClassOrBank = - MRI.getRegClassOrRegBank(DefReg); + const RegClassOrRegBank &RegClassOrBank = + MRI.getRegClassOrRegBank(DefReg); - DefRC = RegClassOrBank.dyn_cast(); + const TargetRegisterClass *DefRC + = RegClassOrBank.dyn_cast(); + if (!DefRC) { + if (!DefTy.isValid()) { + LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); + return false; + } + const RegisterBank &RB = *RegClassOrBank.get(); + DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI); if (!DefRC) { - if (!DefTy.isValid()) { - LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); - return false; - } - const RegisterBank &RB = *RegClassOrBank.get(); - DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI); - if (!DefRC) { - LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); - return false; - } + LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); + return false; } } + I.setDesc(TII.get(TargetOpcode::PHI)); return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); @@ -769,12 +1231,27 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return false; } + // Try to do some lowering before we start instruction selecting. These + // lowerings are purely transformations on the input G_MIR and so selection + // must continue after any modification of the instruction. + preISelLower(I); + + // There may be patterns where the importer can't deal with them optimally, + // but does select it to a suboptimal sequence so our custom C++ selection + // code later never has a chance to work on it. Therefore, we have an early + // selection attempt here to give priority to certain selection routines + // over the imported ones. + if (earlySelect(I)) + return true; + if (selectImpl(I, CoverageInfo)) return true; LLT Ty = I.getOperand(0).isReg() ? 
MRI.getType(I.getOperand(0).getReg()) : LLT{}; + MachineIRBuilder MIB(I); + switch (Opcode) { case TargetOpcode::G_BRCOND: { if (Ty.getSizeInBits() > 32) { @@ -786,7 +1263,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return false; } - const unsigned CondReg = I.getOperand(0).getReg(); + const Register CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z @@ -826,15 +1303,57 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } + case TargetOpcode::G_BRJT: + return selectBrJT(I, MRI); + + case TargetOpcode::G_BSWAP: { + // Handle vector types for G_BSWAP directly. + Register DstReg = I.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + + // We should only get vector types here; everything else is handled by the + // importer right now. + if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) { + LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n"); + return false; + } + + // Only handle 4 and 2 element vectors for now. + // TODO: 16-bit elements. + unsigned NumElts = DstTy.getNumElements(); + if (NumElts != 4 && NumElts != 2) { + LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n"); + return false; + } + + // Choose the correct opcode for the supported types. Right now, that's + // v2s32, v4s32, and v2s64. + unsigned Opc = 0; + unsigned EltSize = DstTy.getElementType().getSizeInBits(); + if (EltSize == 32) + Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8 + : AArch64::REV32v16i8; + else if (EltSize == 64) + Opc = AArch64::REV64v16i8; + + // We should always get something by the time we get here... + assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?"); + + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_FCONSTANT: case TargetOpcode::G_CONSTANT: { const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); const LLT p0 = LLT::pointer(0, 64); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI.getType(DefReg); const unsigned DefSize = DefTy.getSizeInBits(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); @@ -861,7 +1380,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return false; } else { // s32 and s64 are covered by tablegen. - if (Ty != p0) { + if (Ty != p0 && Ty != s8 && Ty != s16) { LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty << " constant, expected: " << s32 << ", " << s64 << ", or " << p0 << '\n'); @@ -876,25 +1395,27 @@ bool AArch64InstructionSelector::select(MachineInstr &I, } } + // We allow G_CONSTANT of types < 32b. const unsigned MovOpc = - DefSize == 32 ? AArch64::MOVi32imm : AArch64::MOVi64imm; - - I.setDesc(TII.get(MovOpc)); + DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; if (isFP) { + // Either emit a FMOV, or emit a copy to emit a normal mov. const TargetRegisterClass &GPRRC = DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass; const TargetRegisterClass &FPRRC = DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass; - const unsigned DefGPRReg = MRI.createVirtualRegister(&GPRRC); + // Can we use a FMOV instruction to represent the immediate? 
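
The FMOV-encodable immediates are exactly +/- n/16 * 2^e with n in [16, 31] and e in [-3, 4], and notably never 0.0, which is one reason the copy-based fallback below exists. A brute-force standalone checker (illustrative, not the in-tree predicate):

    #include <cmath>
    #include <cstdio>

    bool isFMovImmediate(double V) {
      if (!std::isfinite(V) || V == 0.0)
        return false;
      double M = std::fabs(V);
      for (int E = -3; E <= 4; ++E)
        for (int N = 16; N <= 31; ++N)
          if (M == std::ldexp(N / 16.0, E)) // exact: values have <= 5 mantissa bits
            return true;
      return false;
    }

    int main() {
      std::printf("1.0: %d  0.1: %d  0.0: %d\n",
                  isFMovImmediate(1.0),  // 1: fmov d0, #1.0
                  isFMovImmediate(0.1),  // 0: not representable
                  isFMovImmediate(0.0)); // 0: takes the mov + copy path
      return 0;
    }
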
+ if (emitFMovForFConstant(I, MRI)) + return true; + + // Nope. Emit a copy and use a normal mov instead. + const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC); MachineOperand &RegOp = I.getOperand(0); RegOp.setReg(DefGPRReg); - - BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(), - TII.get(AArch64::COPY)) - .addDef(DefReg) - .addUse(DefGPRReg); + MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.buildCopy({DefReg}, {DefGPRReg}); if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); @@ -913,6 +1434,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, I.getOperand(1).ChangeToImmediate(Val); } + I.setDesc(TII.get(MovOpc)); constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } @@ -936,11 +1458,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - unsigned DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); - BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(), - TII.get(AArch64::COPY)) - .addDef(I.getOperand(0).getReg()) - .addUse(DstReg, 0, AArch64::sub_32); + Register DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) + .addReg(DstReg, 0, AArch64::sub_32); RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR32RegClass, MRI); I.getOperand(0).setReg(DstReg); @@ -969,7 +1490,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - unsigned SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); BuildMI(MBB, I.getIterator(), I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) .addDef(SrcReg) @@ -1026,8 +1547,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } + case TargetOpcode::G_ZEXTLOAD: case TargetOpcode::G_LOAD: case TargetOpcode::G_STORE: { + bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; + MachineIRBuilder MIB(I); + LLT PtrTy = MRI.getType(I.getOperand(1).getReg()); if (PtrTy != LLT::pointer(0, 64)) { @@ -1043,7 +1568,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, } unsigned MemSizeInBits = MemOp.getSize() * 8; - const unsigned PtrReg = I.getOperand(1).getReg(); + const Register PtrReg = I.getOperand(1).getReg(); #ifndef NDEBUG const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); // Sanity-check the pointer register. @@ -1053,7 +1578,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, "Load/Store pointer operand isn't a pointer"); #endif - const unsigned ValReg = I.getOperand(0).getReg(); + const Register ValReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); const unsigned NewOpc = @@ -1098,6 +1623,25 @@ bool AArch64InstructionSelector::select(MachineInstr &I, } } + if (IsZExtLoad) { + // The zextload from a smaller type to i32 should be handled by the importer. + if (MRI.getType(ValReg).getSizeInBits() != 64) + return false; + // If we have a ZEXTLOAD then change the load's type to be a narrower reg + // and zero-extend with SUBREG_TO_REG.
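// A minimal standalone sketch (illustrative only, not part of the imported
// diff) of the invariant this G_ZEXTLOAD path relies on: a 32-bit load
// defines a W register, and any write to a W register clears bits [63:32]
// of the enclosing X register, so SUBREG_TO_REG records the zero-extension
// without emitting extra code. Names here are hypothetical.
#include <cstdint>
uint64_t zextLoad32To64(const uint32_t *P) {
  uint32_t W = *P;                 // the narrowed load (e.g. an LDRW form)
  return static_cast<uint64_t>(W); // SUBREG_TO_REG: the zero-extend is free
}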
+ Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + Register DstReg = I.getOperand(0).getReg(); + I.getOperand(0).setReg(LdReg); + + MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) + .addImm(0) + .addUse(LdReg) + .addImm(AArch64::sub_32); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, + MRI); + } return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -1107,7 +1651,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) { @@ -1134,10 +1678,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: - case TargetOpcode::G_OR: + case TargetOpcode::G_ASHR: + if (MRI.getType(I.getOperand(0).getReg()).isVector()) + return selectVectorASHR(I, MRI); + LLVM_FALLTHROUGH; case TargetOpcode::G_SHL: + if (Opcode == TargetOpcode::G_SHL && + MRI.getType(I.getOperand(0).getReg()).isVector()) + return selectVectorSHL(I, MRI); + LLVM_FALLTHROUGH; + case TargetOpcode::G_OR: case TargetOpcode::G_LSHR: - case TargetOpcode::G_ASHR: case TargetOpcode::G_GEP: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) @@ -1145,7 +1696,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, const unsigned OpSize = Ty.getSizeInBits(); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); @@ -1160,6 +1711,43 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } + case TargetOpcode::G_UADDO: { + // TODO: Support other types. + unsigned OpSize = Ty.getSizeInBits(); + if (OpSize != 32 && OpSize != 64) { + LLVM_DEBUG( + dbgs() + << "G_UADDO currently only supported for 32 and 64 b types.\n"); + return false; + } + + // TODO: Support vectors. + if (Ty.isVector()) { + LLVM_DEBUG(dbgs() << "G_UADDO currently only supported for scalars.\n"); + return false; + } + + // Add and set the set condition flag. + unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr; + MachineIRBuilder MIRBuilder(I); + auto AddsMI = MIRBuilder.buildInstr( + AddsOpc, {I.getOperand(0).getReg()}, + {I.getOperand(2).getReg(), I.getOperand(3).getReg()}); + constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI); + + // Now, put the overflow result in the register given by the first operand + // to the G_UADDO. CSINC increments the result when the predicate is false, + // so to get the increment when it's true, we need to use the inverse. In + // this case, we want to increment when carry is set. 
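// A standalone model (illustrative only, not part of the imported diff) of
// the ADDS + CSINC sequence being emitted here for G_UADDO: the sum comes
// from the add, and the overflow result is exactly the carry-out, which
// CSINC materializes as 0 or 1 via the (inverted) HS condition.
#include <cstdint>
struct UAddO32 { uint32_t Sum; uint32_t Overflow; };
UAddO32 uaddo32(uint32_t A, uint32_t B) {
  uint32_t Sum = A + B;               // ADDSWrr also sets the carry flag
  uint32_t Carry = Sum < A ? 1u : 0u; // carry set (HS) <=> unsigned overflow
  return {Sum, Carry};                // CSINC WZR, WZR, inv(HS) yields 0/1
}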
+ auto CsetMI = MIRBuilder + .buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()}, + {Register(AArch64::WZR), Register(AArch64::WZR)}) + .addImm(getInvertedCondCode(AArch64CC::HS)); + constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + case TargetOpcode::G_PTR_MASK: { uint64_t Align = I.getOperand(2).getImm(); if (Align >= 64 || Align == 0) @@ -1176,8 +1764,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I, const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); @@ -1234,8 +1822,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I, } case TargetOpcode::G_ANYEXT: { - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); if (RBDst.getID() != AArch64::GPRRegBankID) { @@ -1266,7 +1854,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, // At this point G_ANYEXT is just like a plain COPY, but we need // to explicitly form the 64-bit value if any. if (DstSize > 32) { - unsigned ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); + Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) .addDef(ExtSrc) .addImm(0) @@ -1283,8 +1871,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I, const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), SrcTy = MRI.getType(I.getOperand(1).getReg()); const bool isSigned = Opcode == TargetOpcode::G_SEXT; - const unsigned DefReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DefReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) { @@ -1302,7 +1890,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return false; } - const unsigned SrcXReg = + const Register SrcXReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) .addDef(SrcXReg) @@ -1358,11 +1946,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_BITCAST: // Imported SelectionDAG rules can handle every bitcast except those that // bitcast from a type to the same type. Ideally, these shouldn't occur - // but we might not run an optimizer that deletes them. - if (MRI.getType(I.getOperand(0).getReg()) == - MRI.getType(I.getOperand(1).getReg())) - return selectCopy(I, TII, MRI, TRI, RBI); - return false; + // but we might not run an optimizer that deletes them. The other exception + // is bitcasts involving pointer types, as SelectionDAG has no knowledge + // of them. 
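// As a standalone illustration (not part of the imported diff): a same-size
// bitcast never changes any bits, so lowering it as a plain register copy
// is sound. The C++ analogue is a bit-for-bit reinterpretation:
#include <cstdint>
#include <cstring>
uint32_t bitcastFloatToU32(float F) {
  uint32_t U;
  std::memcpy(&U, &F, sizeof(U)); // no instruction needed at register level
  return U;
}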
+ return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_SELECT: { if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { @@ -1371,20 +1958,14 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return false; } - const unsigned CondReg = I.getOperand(1).getReg(); - const unsigned TReg = I.getOperand(2).getReg(); - const unsigned FReg = I.getOperand(3).getReg(); - - unsigned CSelOpc = 0; + const Register CondReg = I.getOperand(1).getReg(); + const Register TReg = I.getOperand(2).getReg(); + const Register FReg = I.getOperand(3).getReg(); - if (Ty == LLT::scalar(32)) { - CSelOpc = AArch64::CSELWr; - } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) { - CSelOpc = AArch64::CSELXr; - } else { - return false; - } + if (tryOptSelect(I)) + return true; + Register CSelOpc = selectSelectOpc(I, MRI, RBI); MachineInstr &TstMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri)) .addDef(AArch64::WZR) @@ -1404,81 +1985,55 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return true; } case TargetOpcode::G_ICMP: { + if (Ty.isVector()) + return selectVectorICmp(I, MRI); + if (Ty != LLT::scalar(32)) { LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty << ", expected: " << LLT::scalar(32) << '\n'); return false; } - unsigned CmpOpc = 0; - unsigned ZReg = 0; + MachineIRBuilder MIRBuilder(I); + if (!emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), + MIRBuilder)) + return false; + emitCSetForICMP(I.getOperand(0).getReg(), I.getOperand(1).getPredicate(), + MIRBuilder); + I.eraseFromParent(); + return true; + } - LLT CmpTy = MRI.getType(I.getOperand(2).getReg()); - if (CmpTy == LLT::scalar(32)) { - CmpOpc = AArch64::SUBSWrr; - ZReg = AArch64::WZR; - } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) { - CmpOpc = AArch64::SUBSXrr; - ZReg = AArch64::XZR; - } else { + case TargetOpcode::G_FCMP: { + if (Ty != LLT::scalar(32)) { + LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty + << ", expected: " << LLT::scalar(32) << '\n'); return false; } - // CSINC increments the result by one when the condition code is false. - // Therefore, we have to invert the predicate to get an increment by 1 when - // the predicate is true. 
- const AArch64CC::CondCode invCC = - changeICMPPredToAArch64CC(CmpInst::getInversePredicate( - (CmpInst::Predicate)I.getOperand(1).getPredicate())); + unsigned CmpOpc = selectFCMPOpc(I, MRI); + if (!CmpOpc) + return false; - MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) - .addDef(ZReg) - .addUse(I.getOperand(2).getReg()) - .addUse(I.getOperand(3).getReg()); - - MachineInstr &CSetMI = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) - .addDef(I.getOperand(0).getReg()) - .addUse(AArch64::WZR) - .addUse(AArch64::WZR) - .addImm(invCC); - - constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); - - I.eraseFromParent(); - return true; - } - - case TargetOpcode::G_FCMP: { - if (Ty != LLT::scalar(32)) { - LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty - << ", expected: " << LLT::scalar(32) << '\n'); - return false; - } - - unsigned CmpOpc = 0; - LLT CmpTy = MRI.getType(I.getOperand(2).getReg()); - if (CmpTy == LLT::scalar(32)) { - CmpOpc = AArch64::FCMPSrr; - } else if (CmpTy == LLT::scalar(64)) { - CmpOpc = AArch64::FCMPDrr; - } else { - return false; - } - - // FIXME: regbank + // FIXME: regbank AArch64CC::CondCode CC1, CC2; changeFCMPPredToAArch64CC( (CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2); - MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) - .addUse(I.getOperand(2).getReg()) - .addUse(I.getOperand(3).getReg()); + // Partially build the compare. Decide if we need to add a use for the + // third operand based off whether or not we're comparing against 0.0. + auto CmpMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) + .addUse(I.getOperand(2).getReg()); - const unsigned DefReg = I.getOperand(0).getReg(); - unsigned Def1Reg = DefReg; + // If we don't have an immediate compare, then we need to add a use of the + // register which wasn't used for the immediate. + // Note that the immediate will always be the last operand. + if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) + CmpMI = CmpMI.addUse(I.getOperand(3).getReg()); + + const Register DefReg = I.getOperand(0).getReg(); + Register Def1Reg = DefReg; if (CC2 != AArch64CC::AL) Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); @@ -1490,7 +2045,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, .addImm(getInvertedCondCode(CC1)); if (CC2 != AArch64CC::AL) { - unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + Register Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); MachineInstr &CSet2MI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) .addDef(Def2Reg) @@ -1505,8 +2060,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI); constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI); } - - constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); I.eraseFromParent(); @@ -1515,19 +2069,14 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_VASTART: return STI.isTargetDarwin() ? 
selectVaStartDarwin(I, MF, MRI) : selectVaStartAAPCS(I, MF, MRI); + case TargetOpcode::G_INTRINSIC: + return selectIntrinsic(I, MRI); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: - if (!I.getOperand(0).isIntrinsicID()) - return false; - if (I.getOperand(0).getIntrinsicID() != Intrinsic::trap) - return false; - BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::BRK)) - .addImm(1); - I.eraseFromParent(); - return true; + return selectIntrinsicWithSideEffects(I, MRI); case TargetOpcode::G_IMPLICIT_DEF: { I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - const unsigned DstReg = I.getOperand(0).getReg(); + const Register DstReg = I.getOperand(0).getReg(); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB, RBI); @@ -1552,44 +2101,374 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); } } + case TargetOpcode::G_INTRINSIC_TRUNC: + return selectIntrinsicTrunc(I, MRI); + case TargetOpcode::G_INTRINSIC_ROUND: + return selectIntrinsicRound(I, MRI); case TargetOpcode::G_BUILD_VECTOR: return selectBuildVector(I, MRI); case TargetOpcode::G_MERGE_VALUES: return selectMergeValues(I, MRI); + case TargetOpcode::G_UNMERGE_VALUES: + return selectUnmergeValues(I, MRI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return selectShuffleVector(I, MRI); + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + return selectExtractElt(I, MRI); + case TargetOpcode::G_INSERT_VECTOR_ELT: + return selectInsertElt(I, MRI); + case TargetOpcode::G_CONCAT_VECTORS: + return selectConcatVectors(I, MRI); + case TargetOpcode::G_JUMP_TABLE: + return selectJumpTable(I, MRI); } return false; } -bool AArch64InstructionSelector::emitScalarToVector( - unsigned &Dst, const LLT DstTy, const TargetRegisterClass *DstRC, - unsigned Scalar, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, MachineRegisterInfo &MRI) const { - Dst = MRI.createVirtualRegister(DstRC); +bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, + MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); + Register JTAddr = I.getOperand(0).getReg(); + unsigned JTI = I.getOperand(1).getIndex(); + Register Index = I.getOperand(2).getReg(); + MachineIRBuilder MIB(I); + + Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); + MIB.buildInstr(AArch64::JumpTableDest32, {TargetReg, ScratchReg}, + {JTAddr, Index}) + .addJumpTableIndex(JTI); + + // Build the indirect branch. + MIB.buildInstr(AArch64::BR, {}, {TargetReg}); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectJumpTable( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); + assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); + + Register DstReg = I.getOperand(0).getReg(); + unsigned JTI = I.getOperand(1).getIndex(); + // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. 
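// A standalone model (illustrative only, not part of the imported diff) of
// the ADRP + ADD split mentioned above, which the MO_PAGE / MO_PAGEOFF
// flags just below describe: ADRP materializes the 4 KiB page base of the
// jump table and the ADD supplies the low 12 bits.
#include <cstdint>
uint64_t pageOf(uint64_t Addr) { return Addr & ~0xfffULL; }      // ADRP result
uint64_t pageOffsetOf(uint64_t Addr) { return Addr & 0xfffULL; } // ADD #imm12
// For any address A: pageOf(A) + pageOffsetOf(A) == A.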
+ MachineIRBuilder MIB(I); + auto MovMI = + MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) + .addJumpTableIndex(JTI, AArch64II::MO_PAGE) + .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectIntrinsicTrunc( + MachineInstr &I, MachineRegisterInfo &MRI) const { + const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); + + // Select the correct opcode. + unsigned Opc = 0; + if (!SrcTy.isVector()) { + switch (SrcTy.getSizeInBits()) { + default: + case 16: + Opc = AArch64::FRINTZHr; + break; + case 32: + Opc = AArch64::FRINTZSr; + break; + case 64: + Opc = AArch64::FRINTZDr; + break; + } + } else { + unsigned NumElts = SrcTy.getNumElements(); + switch (SrcTy.getElementType().getSizeInBits()) { + default: + break; + case 16: + if (NumElts == 4) + Opc = AArch64::FRINTZv4f16; + else if (NumElts == 8) + Opc = AArch64::FRINTZv8f16; + break; + case 32: + if (NumElts == 2) + Opc = AArch64::FRINTZv2f32; + else if (NumElts == 4) + Opc = AArch64::FRINTZv4f32; + break; + case 64: + if (NumElts == 2) + Opc = AArch64::FRINTZv2f64; + break; + } + } + + if (!Opc) { + // Didn't get an opcode above, bail. + LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n"); + return false; + } + + // Legalization would have set us up perfectly for this; we just need to + // set the opcode and move on. + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectIntrinsicRound( + MachineInstr &I, MachineRegisterInfo &MRI) const { + const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); + + // Select the correct opcode. + unsigned Opc = 0; + if (!SrcTy.isVector()) { + switch (SrcTy.getSizeInBits()) { + default: + case 16: + Opc = AArch64::FRINTAHr; + break; + case 32: + Opc = AArch64::FRINTASr; + break; + case 64: + Opc = AArch64::FRINTADr; + break; + } + } else { + unsigned NumElts = SrcTy.getNumElements(); + switch (SrcTy.getElementType().getSizeInBits()) { + default: + break; + case 16: + if (NumElts == 4) + Opc = AArch64::FRINTAv4f16; + else if (NumElts == 8) + Opc = AArch64::FRINTAv8f16; + break; + case 32: + if (NumElts == 2) + Opc = AArch64::FRINTAv2f32; + else if (NumElts == 4) + Opc = AArch64::FRINTAv4f32; + break; + case 64: + if (NumElts == 2) + Opc = AArch64::FRINTAv2f64; + break; + } + } + + if (!Opc) { + // Didn't get an opcode above, bail. + LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n"); + return false; + } + + // Legalization would have set us up perfectly for this; we just need to + // set the opcode and move on. + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +bool AArch64InstructionSelector::selectVectorICmp( + MachineInstr &I, MachineRegisterInfo &MRI) const { + Register DstReg = I.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + Register SrcReg = I.getOperand(2).getReg(); + Register Src2Reg = I.getOperand(3).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + + unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); + unsigned NumElts = DstTy.getNumElements(); + + // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b + // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16 + // Third index is cc opcode: + // 0 == eq + // 1 == ugt + // 2 == uge + // 3 == ult + // 4 == ule + // 5 == sgt + // 6 == sge + // 7 == slt + // 8 == sle + // ne is done by negating 'eq' result. 
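// The lookup below turns (element size, element count) into the two table
// indices by log2. A standalone model (illustrative only, not part of the
// imported diff):
unsigned log2u(unsigned V) { unsigned L = 0; while (V >>= 1) ++L; return L; }
unsigned eltIdx(unsigned EltSizeBits) { return log2u(EltSizeBits / 8); } // 8b->0 .. 64b->3
unsigned numEltsIdx(unsigned NumElts) { return log2u(NumElts / 2); }     // v2->0 .. v16->3
// e.g. a v4i32 compare uses OpcTable[eltIdx(32)][numEltsIdx(4)], i.e. OpcTable[2][1].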
+ + // This table below assumes that for some comparisons the operands will be + // commuted. + // ult op == commute + ugt op + // ule op == commute + uge op + // slt op == commute + sgt op + // sle op == commute + sge op + unsigned PredIdx = 0; + bool SwapOperands = false; + CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); + switch (Pred) { + case CmpInst::ICMP_NE: + case CmpInst::ICMP_EQ: + PredIdx = 0; + break; + case CmpInst::ICMP_UGT: + PredIdx = 1; + break; + case CmpInst::ICMP_UGE: + PredIdx = 2; + break; + case CmpInst::ICMP_ULT: + PredIdx = 3; + SwapOperands = true; + break; + case CmpInst::ICMP_ULE: + PredIdx = 4; + SwapOperands = true; + break; + case CmpInst::ICMP_SGT: + PredIdx = 5; + break; + case CmpInst::ICMP_SGE: + PredIdx = 6; + break; + case CmpInst::ICMP_SLT: + PredIdx = 7; + SwapOperands = true; + break; + case CmpInst::ICMP_SLE: + PredIdx = 8; + SwapOperands = true; + break; + default: + llvm_unreachable("Unhandled icmp predicate"); + return false; + } + + // This table obviously should be tablegen'd when we have our GISel native + // tablegen selector. + + static const unsigned OpcTable[4][4][9] = { + { + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, + AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8, + AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8}, + {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, + AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8, + AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8} + }, + { + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, + AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16, + AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16}, + {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, + AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16, + AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */} + }, + { + {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, + AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32, + AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32}, + {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, + AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32, + AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */} + }, + { + {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, + AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64, + AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* 
invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */}, + {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, + 0 /* invalid */} + }, + }; + unsigned EltIdx = Log2_32(SrcEltSize / 8); + unsigned NumEltsIdx = Log2_32(NumElts / 2); + unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx]; + if (!Opc) { + LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode"); + return false; + } + + const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); + const TargetRegisterClass *SrcRC = + getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true); + if (!SrcRC) { + LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); + return false; + } + + unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0; + if (SrcTy.getSizeInBits() == 128) + NotOpc = NotOpc ? AArch64::NOTv16i8 : 0; + + if (SwapOperands) + std::swap(SrcReg, Src2Reg); + + MachineIRBuilder MIB(I); + auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg}); + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + + // Invert if we had a 'ne' cc. + if (NotOpc) { + Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp}); + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + } else { + MIB.buildCopy(DstReg, Cmp.getReg(0)); + } + RBI.constrainGenericRegister(DstReg, *SrcRC, MRI); + I.eraseFromParent(); + return true; +} - unsigned UndefVec = MRI.createVirtualRegister(DstRC); - MachineInstr &UndefMI = *BuildMI(MBB, MBBI, MBBI->getDebugLoc(), - TII.get(TargetOpcode::IMPLICIT_DEF)) - .addDef(UndefVec); +MachineInstr *AArch64InstructionSelector::emitScalarToVector( + unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, + MachineIRBuilder &MIRBuilder) const { + auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); auto BuildFn = [&](unsigned SubregIndex) { - MachineInstr &InsMI = *BuildMI(MBB, MBBI, MBBI->getDebugLoc(), - TII.get(TargetOpcode::INSERT_SUBREG)) - .addDef(Dst) - .addUse(UndefVec) - .addUse(Scalar) - .addImm(SubregIndex); - constrainSelectedInstRegOperands(UndefMI, TII, TRI, RBI); - return constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); + auto Ins = + MIRBuilder + .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) + .addImm(SubregIndex); + constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); + return &*Ins; }; - switch (DstTy.getElementType().getSizeInBits()) { + switch (EltSize) { + case 16: + return BuildFn(AArch64::hsub); case 32: return BuildFn(AArch64::ssub); case 64: return BuildFn(AArch64::dsub); default: - return false; + return nullptr; } } @@ -1610,14 +2489,14 @@ bool AArch64InstructionSelector::selectMergeValues( return false; auto *DstRC = &AArch64::GPR64RegClass; - unsigned SubToRegDef = MRI.createVirtualRegister(DstRC); + Register SubToRegDef = MRI.createVirtualRegister(DstRC); MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::SUBREG_TO_REG)) .addDef(SubToRegDef) .addImm(0) .addUse(I.getOperand(1).getReg()) .addImm(AArch64::sub_32); - unsigned SubToRegDef2 = MRI.createVirtualRegister(DstRC); + Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); // Need to anyext the second scalar before we can use bfm MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, 
I.getDebugLoc(), TII.get(TargetOpcode::SUBREG_TO_REG)) @@ -1639,122 +2518,1362 @@ bool AArch64InstructionSelector::selectMergeValues( return true; } -bool AArch64InstructionSelector::selectBuildVector( +static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, + const unsigned EltSize) { + // Choose a lane copy opcode and subregister based off of the size of the + // vector's elements. + switch (EltSize) { + case 16: + CopyOpc = AArch64::CPYi16; + ExtractSubReg = AArch64::hsub; + break; + case 32: + CopyOpc = AArch64::CPYi32; + ExtractSubReg = AArch64::ssub; + break; + case 64: + CopyOpc = AArch64::CPYi64; + ExtractSubReg = AArch64::dsub; + break; + default: + // Unknown size, bail out. + LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); + return false; + } + return true; +} + +MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( + Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy, + Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + unsigned CopyOpc = 0; + unsigned ExtractSubReg = 0; + if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { + LLVM_DEBUG( + dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); + return nullptr; + } + + const TargetRegisterClass *DstRC = + getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true); + if (!DstRC) { + LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); + return nullptr; + } + + const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); + const LLT &VecTy = MRI.getType(VecReg); + const TargetRegisterClass *VecRC = + getRegClassForTypeOnBank(VecTy, VecRB, RBI, true); + if (!VecRC) { + LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); + return nullptr; + } + + // The register that we're going to copy into. + Register InsertReg = VecReg; + if (!DstReg) + DstReg = MRI.createVirtualRegister(DstRC); + // If the lane index is 0, we just use a subregister COPY. + if (LaneIdx == 0) { + auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) + .addReg(VecReg, 0, ExtractSubReg); + RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); + return &*Copy; + } + + // Lane copies require 128-bit wide registers. If we're dealing with an + // unpacked vector, then we need to move up to that width. Insert an implicit + // def and a subregister insert to get us there. + if (VecTy.getSizeInBits() != 128) { + MachineInstr *ScalarToVector = emitScalarToVector( + VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); + if (!ScalarToVector) + return nullptr; + InsertReg = ScalarToVector->getOperand(0).getReg(); + } + + MachineInstr *LaneCopyMI = + MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); + constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); + + // Make sure that we actually constrain the initial copy. + RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); + return LaneCopyMI; +} + +bool AArch64InstructionSelector::selectExtractElt( MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); - // Until we port more of the optimized selections, for now just use a vector - // insert sequence. - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); - unsigned EltSize = EltTy.getSizeInBits(); - if (EltSize < 32 || EltSize > 64) - return false; // Don't support all element types yet.
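// A standalone model (illustrative only, not part of the imported diff) of
// the lane-extract strategy used by emitExtractVectorElt above: the CPYi*
// lane copies read from a 128-bit Q register, so a 64-bit vector is first
// placed into the low half of a Q register (IMPLICIT_DEF + INSERT_SUBREG),
// leaving the top half undefined.
#include <cstdint>
struct Q128 { uint64_t Lo, Hi; };             // model of a Q register
Q128 widenDToQ(uint64_t D) { return {D, 0}; } // top half is a don't-care
uint32_t extractLane32(const Q128 &V, unsigned Idx) { // CPYi32-style read
  uint64_t Half = Idx < 2 ? V.Lo : V.Hi;
  return static_cast<uint32_t>(Half >> ((Idx & 1) * 32));
}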
- const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); - unsigned Opc; - unsigned SubregIdx; + assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && + "unexpected opcode!"); + Register DstReg = I.getOperand(0).getReg(); + const LLT NarrowTy = MRI.getType(DstReg); + const Register SrcReg = I.getOperand(1).getReg(); + const LLT WideTy = MRI.getType(SrcReg); + (void)WideTy; + assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && + "source register size too small!"); + assert(NarrowTy.isScalar() && "cannot extract vector into vector!"); + + // Need the lane index to determine the correct copy opcode. + MachineOperand &LaneIdxOp = I.getOperand(2); + assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); + + if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { + LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); + return false; + } + + // Find the index to extract from. + auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); + if (!VRegAndVal) + return false; + unsigned LaneIdx = VRegAndVal->Value; + + MachineIRBuilder MIRBuilder(I); + + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, + LaneIdx, MIRBuilder); + if (!Extract) + return false; + + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectSplitVectorUnmerge( + MachineInstr &I, MachineRegisterInfo &MRI) const { + unsigned NumElts = I.getNumOperands() - 1; + Register SrcReg = I.getOperand(NumElts).getReg(); + const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); + const LLT SrcTy = MRI.getType(SrcReg); + + assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); + if (SrcTy.getSizeInBits() > 128) { + LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); + return false; + } + + MachineIRBuilder MIB(I); + + // We implement a split vector operation by treating the sub-vectors as + // scalars and extracting them. + const RegisterBank &DstRB = + *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); + for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { + Register Dst = I.getOperand(OpIdx).getReg(); + MachineInstr *Extract = + emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); + if (!Extract) + return false; + } + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectUnmergeValues( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && + "unexpected opcode"); + + // TODO: Handle unmerging into GPRs and from scalars to scalars. + if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != + AArch64::FPRRegBankID || + RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != + AArch64::FPRRegBankID) { + LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " + "currently unsupported.\n"); + return false; + } + + // The last operand is the vector source register, and every other operand is + // a register to unpack into. 
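// For reference, a standalone model (illustrative only, not part of the
// imported diff) of what G_UNMERGE_VALUES computes: one wide value split
// into equal narrow pieces, lowest piece first.
#include <array>
#include <cstdint>
std::array<uint32_t, 2> unmerge64To32x2(uint64_t Src) {
  return {static_cast<uint32_t>(Src), static_cast<uint32_t>(Src >> 32)};
}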
+ unsigned NumElts = I.getNumOperands() - 1; + Register SrcReg = I.getOperand(NumElts).getReg(); + const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); + const LLT WideTy = MRI.getType(SrcReg); + (void)WideTy; + assert(WideTy.isVector() && "can only unmerge from vector types!"); + assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && + "source register size too small!"); + + if (!NarrowTy.isScalar()) + return selectSplitVectorUnmerge(I, MRI); + + MachineIRBuilder MIB(I); + + // Choose a lane copy opcode and subregister based off of the size of the + // vector's elements. + unsigned CopyOpc = 0; + unsigned ExtractSubReg = 0; + if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) + return false; + + // Set up for the lane copies. + MachineBasicBlock &MBB = *I.getParent(); + + // Stores the registers we'll be copying from. + SmallVector<Register, 4> InsertRegs; + + // We'll use the first register twice, so we only need NumElts-1 registers. + unsigned NumInsertRegs = NumElts - 1; + + // If our elements fit into exactly 128 bits, then we can copy from the source + // directly. Otherwise, we need to do a bit of setup with some subregister + // inserts. + if (NarrowTy.getSizeInBits() * NumElts == 128) { + InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg); + } else { + // No. We have to perform subregister inserts. For each insert, create an + // implicit def and a subregister insert, and save the register we create. + for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) { + Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + MachineInstr &ImpDefMI = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF), + ImpDefReg); + + // Now, create the subregister insert from SrcReg. + Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + MachineInstr &InsMI = + *BuildMI(MBB, I, I.getDebugLoc(), + TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) + .addUse(ImpDefReg) + .addUse(SrcReg) + .addImm(AArch64::dsub); + + constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); + + // Save the register so that we can copy from it after. + InsertRegs.push_back(InsertReg); + } + } + + // Now that we've created any necessary subregister inserts, we can + // create the copies. + // + // Perform the first copy separately as a subregister copy. + Register CopyTo = I.getOperand(0).getReg(); + auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {}) + .addReg(InsertRegs[0], 0, ExtractSubReg); + constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI); + + // Now, perform the remaining copies as vector lane copies. + unsigned LaneIdx = 1; + for (Register InsReg : InsertRegs) { + Register CopyTo = I.getOperand(LaneIdx).getReg(); + MachineInstr &CopyInst = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) + .addUse(InsReg) + .addImm(LaneIdx); + constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); + ++LaneIdx; + } + + // Separately constrain the first copy's destination. Because of the + // limitation in constrainOperandRegClass, we can't guarantee that this will + // actually be constrained. So, do it ourselves using the second operand.
+ const TargetRegisterClass *RC = + MRI.getRegClassOrNull(I.getOperand(1).getReg()); + if (!RC) { + LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); + return false; + } + + RBI.constrainGenericRegister(CopyTo, *RC, MRI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectConcatVectors( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && + "Unexpected opcode"); + Register Dst = I.getOperand(0).getReg(); + Register Op1 = I.getOperand(1).getReg(); + Register Op2 = I.getOperand(2).getReg(); + MachineIRBuilder MIRBuilder(I); + MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder); + if (!ConcatMI) + return false; + I.eraseFromParent(); + return true; +} + +void AArch64InstructionSelector::collectShuffleMaskIndices( + MachineInstr &I, MachineRegisterInfo &MRI, + SmallVectorImpl<Optional<int64_t>> &Idxs) const { + MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg()); + assert( MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR && "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR"); + // Find the constant indices. + for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) { + // Look through copies. + MachineInstr *ScalarDef = + getDefIgnoringCopies(MaskDef->getOperand(i).getReg(), MRI); + assert(ScalarDef && "Could not find vreg def of shufflevec index op"); + if (ScalarDef->getOpcode() != TargetOpcode::G_CONSTANT) { + // This must be an undef if it's not a constant. + assert(ScalarDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF); + Idxs.push_back(None); + } else { + Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue()); + } + } +} + +unsigned +AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal, + MachineFunction &MF) const { + Type *CPTy = CPVal->getType(); + unsigned Align = MF.getDataLayout().getPrefTypeAlignment(CPTy); + if (Align == 0) + Align = MF.getDataLayout().getTypeAllocSize(CPTy); + + MachineConstantPool *MCP = MF.getConstantPool(); + return MCP->getConstantPoolIndex(CPVal, Align); +} + +MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( + Constant *CPVal, MachineIRBuilder &MIRBuilder) const { + unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF()); + + auto Adrp = + MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) + .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); + + MachineInstr *LoadMI = nullptr; + switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) { + case 16: + LoadMI = + &*MIRBuilder + .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp}) + .addConstantPoolIndex(CPIdx, 0, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + break; + case 8: + LoadMI = &*MIRBuilder + .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp}) + .addConstantPoolIndex( CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + break; + default: + LLVM_DEBUG(dbgs() << "Could not load from constant pool of type " + << *CPVal->getType()); + return nullptr; + } + constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); + constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); + return LoadMI; +} + +/// Return an <Opc, SubregIdx> pair to do a vector elt insert of a given +/// size and RB.
+static std::pair<unsigned, unsigned> +getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { + unsigned Opc, SubregIdx; if (RB.getID() == AArch64::GPRRegBankID) { if (EltSize == 32) { Opc = AArch64::INSvi32gpr; SubregIdx = AArch64::ssub; - } else { + } else if (EltSize == 64) { Opc = AArch64::INSvi64gpr; SubregIdx = AArch64::dsub; + } else { + llvm_unreachable("invalid elt size!"); } } else { - if (EltSize == 32) { + if (EltSize == 8) { + Opc = AArch64::INSvi8lane; + SubregIdx = AArch64::bsub; + } else if (EltSize == 16) { + Opc = AArch64::INSvi16lane; + SubregIdx = AArch64::hsub; + } else if (EltSize == 32) { Opc = AArch64::INSvi32lane; SubregIdx = AArch64::ssub; - } else { + } else if (EltSize == 64) { Opc = AArch64::INSvi64lane; SubregIdx = AArch64::dsub; - } - } - - if (EltSize * DstTy.getNumElements() != 128) - return false; // Don't handle unpacked vectors yet. - - unsigned DstVec = 0; - const TargetRegisterClass *DstRC = getRegClassForTypeOnBank( - DstTy, RBI.getRegBank(AArch64::FPRRegBankID), RBI); - emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(), - *I.getParent(), I.getIterator(), MRI); - for (unsigned i = 2, e = DstTy.getSizeInBits() / EltSize + 1; i < e; ++i) { - unsigned InsDef; - // For the last insert re-use the dst reg of the G_BUILD_VECTOR. - if (i + 1 < e) - InsDef = MRI.createVirtualRegister(DstRC); - else - InsDef = I.getOperand(0).getReg(); - unsigned LaneIdx = i - 1; - if (RB.getID() == AArch64::FPRRegBankID) { - unsigned ImpDef = MRI.createVirtualRegister(DstRC); - MachineInstr &ImpDefMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(TargetOpcode::IMPLICIT_DEF)) - .addDef(ImpDef); - unsigned InsSubDef = MRI.createVirtualRegister(DstRC); - MachineInstr &InsSubMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(TargetOpcode::INSERT_SUBREG)) - .addDef(InsSubDef) - .addUse(ImpDef) - .addUse(I.getOperand(i).getReg()) - .addImm(SubregIdx); - MachineInstr &InsEltMI = - *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc)) - .addDef(InsDef) - .addUse(DstVec) - .addImm(LaneIdx) - .addUse(InsSubDef) - .addImm(0); - constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(InsSubMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(InsEltMI, TII, TRI, RBI); - DstVec = InsDef; } else { - MachineInstr &InsMI = - *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc)) - .addDef(InsDef) - .addUse(DstVec) - .addImm(LaneIdx) - .addUse(I.getOperand(i).getReg()); - constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); - DstVec = InsDef; + } else { + llvm_unreachable("invalid elt size!"); } } - I.eraseFromParent(); - return true; + return std::make_pair(Opc, SubregIdx); } -/// SelectArithImmed - Select an immediate value that can be represented as -/// a 12-bit value shifted left by either 0 or 12. If so, return true with -/// Val set to the 12-bit value and Shift set to the shifter operand.
-InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { - MachineInstr &MI = *Root.getParent(); - MachineBasicBlock &MBB = *MI.getParent(); +MachineInstr * +AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + static const unsigned OpcTable[2][2]{{AArch64::ADDSXrr, AArch64::ADDSXri}, + {AArch64::ADDSWrr, AArch64::ADDSWri}}; + bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); + auto ImmFns = selectArithImmed(RHS); + unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; + Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; + + auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS.getReg()}); + + // If we matched a valid constant immediate, add those operands. + if (ImmFns) { + for (auto &RenderFn : *ImmFns) + RenderFn(CmpMI); + } else { + CmpMI.addUse(RHS.getReg()); + } + + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; +} + +MachineInstr * +AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS, + MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + unsigned RegSize = MRI.getType(LHS).getSizeInBits(); + bool Is32Bit = (RegSize == 32); + static const unsigned OpcTable[2][2]{{AArch64::ANDSXrr, AArch64::ANDSXri}, + {AArch64::ANDSWrr, AArch64::ANDSWri}}; + Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; + + // We might be able to fold in an immediate into the TST. We need to make sure + // it's a logical immediate though, since ANDS requires that. + auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI); + bool IsImmForm = ValAndVReg.hasValue() && + AArch64_AM::isLogicalImmediate(ValAndVReg->Value, RegSize); + unsigned Opc = OpcTable[Is32Bit][IsImmForm]; + auto TstMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS}); + + if (IsImmForm) + TstMI.addImm( + AArch64_AM::encodeLogicalImmediate(ValAndVReg->Value, RegSize)); + else + TstMI.addUse(RHS); + + constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); + return &*TstMI; +} + +MachineInstr *AArch64InstructionSelector::emitIntegerCompare( + MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + + // Fold the compare if possible. + MachineInstr *FoldCmp = + tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder); + if (FoldCmp) + return FoldCmp; + + // Can't fold into a CMN. Just emit a normal compare. + unsigned CmpOpc = 0; + Register ZReg; + + LLT CmpTy = MRI.getType(LHS.getReg()); + assert((CmpTy.isScalar() || CmpTy.isPointer()) && + "Expected scalar or pointer"); + if (CmpTy == LLT::scalar(32)) { + CmpOpc = AArch64::SUBSWrr; + ZReg = AArch64::WZR; + } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) { + CmpOpc = AArch64::SUBSXrr; + ZReg = AArch64::XZR; + } else { + return nullptr; + } + + // Try to match immediate forms. + auto ImmFns = selectArithImmed(RHS); + if (ImmFns) + CmpOpc = CmpOpc == AArch64::SUBSWrr ? AArch64::SUBSWri : AArch64::SUBSXri; + + auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addDef(ZReg).addUse(LHS.getReg()); + // If we matched a valid constant immediate, add those operands. 
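// selectArithImmed (per its doc comment above) accepts a 12-bit value
// optionally shifted left by 12. A standalone legality check (illustrative
// only, not part of the imported diff):
#include <cstdint>
bool isArithImmed(uint64_t V) {
  return (V & ~0xfffULL) == 0 ||       // fits in imm12, LSL #0
         (V & ~(0xfffULL << 12)) == 0; // fits in imm12, LSL #12
}
// isArithImmed(0x123) and isArithImmed(0x123000) hold; isArithImmed(0x123456) does not.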
+ if (ImmFns) { + for (auto &RenderFn : *ImmFns) + RenderFn(CmpMI); + } else { + CmpMI.addUse(RHS.getReg()); + } + + // Make sure that we can constrain the compare that we emitted. + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; +} + +MachineInstr *AArch64InstructionSelector::emitVectorConcat( + Optional<Register> Dst, Register Op1, Register Op2, + MachineIRBuilder &MIRBuilder) const { + // We implement a vector concat by: + // 1. Use scalar_to_vector to insert the lower vector into the larger dest + // 2. Insert the upper vector into the destination's upper element + // TODO: some of this code is common with G_BUILD_VECTOR handling. + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + + const LLT Op1Ty = MRI.getType(Op1); + const LLT Op2Ty = MRI.getType(Op2); + + if (Op1Ty != Op2Ty) { + LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys"); + return nullptr; + } + assert(Op1Ty.isVector() && "Expected a vector for vector concat"); + + if (Op1Ty.getSizeInBits() >= 128) { + LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors"); + return nullptr; + } + + // At the moment we just support 64 bit vector concats. + if (Op1Ty.getSizeInBits() != 64) { + LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors"); + return nullptr; + } + + const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); + const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); + const TargetRegisterClass *DstRC = + getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2); + + MachineInstr *WidenedOp1 = + emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); + MachineInstr *WidenedOp2 = + emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); + if (!WidenedOp1 || !WidenedOp2) { + LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); + return nullptr; + } + + // Now do the insert of the upper element. + unsigned InsertOpc, InsSubRegIdx; + std::tie(InsertOpc, InsSubRegIdx) = + getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits()); + + if (!Dst) + Dst = MRI.createVirtualRegister(DstRC); + auto InsElt = + MIRBuilder + .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()}) + .addImm(1) /* Lane index */ + .addUse(WidenedOp2->getOperand(0).getReg()) + .addImm(0); + constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); + return &*InsElt; +} + +MachineInstr *AArch64InstructionSelector::emitFMovForFConstant( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_FCONSTANT && + "Expected a G_FCONSTANT!"); + MachineOperand &ImmOp = I.getOperand(1); + unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); + + // Only handle 32 and 64 bit defs for now. + if (DefSize != 32 && DefSize != 64) + return nullptr; + + // Don't handle null values using FMOV. + if (ImmOp.getFPImm()->isNullValue()) + return nullptr; + + // Get the immediate representation for the FMOV. + const APFloat &ImmValAPF = ImmOp.getFPImm()->getValueAPF(); + int Imm = DefSize == 32 ? AArch64_AM::getFP32Imm(ImmValAPF) + : AArch64_AM::getFP64Imm(ImmValAPF); + + // If this is -1, it means the immediate can't be represented as the requested + // floating point value. Bail. + if (Imm == -1) + return nullptr; + + // Update MI to represent the new FMOV instruction, constrain it, and return. + ImmOp.ChangeToImmediate(Imm); + unsigned MovOpc = DefSize == 32 ?
AArch64::FMOVSi : AArch64::FMOVDi; + I.setDesc(TII.get(MovOpc)); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return &I; +} + +MachineInstr * +AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred, + MachineIRBuilder &MIRBuilder) const { + // CSINC increments the result when the predicate is false. Invert it. + const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC( + CmpInst::getInversePredicate((CmpInst::Predicate)Pred)); + auto I = + MIRBuilder + .buildInstr(AArch64::CSINCWr, {DefReg}, {Register(AArch64::WZR), Register(AArch64::WZR)}) + .addImm(InvCC); + constrainSelectedInstRegOperands(*I, TII, TRI, RBI); + return &*I; +} + +bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { + MachineIRBuilder MIB(I); + MachineRegisterInfo &MRI = *MIB.getMRI(); + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + + // We want to recognize this pattern: + // + // $z = G_FCMP pred, $x, $y + // ... + // $w = G_SELECT $z, $a, $b + // + // Where the value of $z is *only* ever used by the G_SELECT (possibly with + // some copies/truncs in between.) + // + // If we see this, then we can emit something like this: + // + // fcmp $x, $y + // fcsel $w, $a, $b, pred + // + // Rather than emitting both of the rather long sequences in the standard + // G_FCMP/G_SELECT select methods. + + // First, check if the condition is defined by a compare. + MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); + while (CondDef) { + // We can only fold if all of the defs have one use. + if (!MRI.hasOneUse(CondDef->getOperand(0).getReg())) + return false; + + // We can skip over G_TRUNC since the condition is 1-bit. + // Truncating/extending can have no impact on the value. + unsigned Opc = CondDef->getOpcode(); + if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC) + break; + + // Can't see past copies from physregs. + if (Opc == TargetOpcode::COPY && + TargetRegisterInfo::isPhysicalRegister(CondDef->getOperand(1).getReg())) + return false; + + CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); + } + + // Is the condition defined by a compare? + if (!CondDef) + return false; + + unsigned CondOpc = CondDef->getOpcode(); + if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) + return false; + + AArch64CC::CondCode CondCode; + if (CondOpc == TargetOpcode::G_ICMP) { + CondCode = changeICMPPredToAArch64CC( + (CmpInst::Predicate)CondDef->getOperand(1).getPredicate()); + if (!emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), + CondDef->getOperand(1), MIB)) { + LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); + return false; + } + } else { + // Get the condition code for the select. + AArch64CC::CondCode CondCode2; + changeFCMPPredToAArch64CC( + (CmpInst::Predicate)CondDef->getOperand(1).getPredicate(), CondCode, + CondCode2); + + // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two + // instructions to emit the comparison. + // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be + // unnecessary. + if (CondCode2 != AArch64CC::AL) + return false; + + // Make sure we'll be able to select the compare. + unsigned CmpOpc = selectFCMPOpc(*CondDef, MRI); + if (!CmpOpc) + return false; + + // Emit a new compare. 
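// FCMPSri and FCMPDri compare a register against an implicit #0.0, so only
// the register-register forms need the second use added just below. A
// standalone sketch of that operand-count decision (illustrative only, not
// part of the imported diff):
unsigned fcmpRegisterOperandCount(bool ComparesAgainstZero) {
  return ComparesAgainstZero ? 1u : 2u; // the zero is encoded, not a register
}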
+ auto Cmp = MIB.buildInstr(CmpOpc, {}, {CondDef->getOperand(2).getReg()}); + if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) + Cmp.addUse(CondDef->getOperand(3).getReg()); + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + } + + // Emit the select. + unsigned CSelOpc = selectSelectOpc(I, MRI, RBI); + auto CSel = + MIB.buildInstr(CSelOpc, {I.getOperand(0).getReg()}, + {I.getOperand(2).getReg(), I.getOperand(3).getReg()}) + .addImm(CondCode); + constrainSelectedInstRegOperands(*CSel, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( + MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && + "Unexpected MachineOperand"); + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + // We want to find this sort of thing: + // x = G_SUB 0, y + // G_ICMP z, x + // + // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. + // e.g: + // + // cmn z, y + + // Helper lambda to detect the subtract followed by the compare. + // Takes in the def of the LHS or RHS, and checks if it's a subtract from 0. + auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) { + if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB) + return false; + + // Need to make sure NZCV is the same at the end of the transformation. + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return false; + + // We want to match against SUBs. + if (DefMI->getOpcode() != TargetOpcode::G_SUB) + return false; + + // Make sure that we're getting + // x = G_SUB 0, y + auto ValAndVReg = + getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI); + if (!ValAndVReg || ValAndVReg->Value != 0) + return false; + + // This can safely be represented as a CMN. + return true; + }; + + // Check if the RHS or LHS of the G_ICMP is defined by a SUB + MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); + MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); + CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate(); + const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P); + + // Given this: + // + // x = G_SUB 0, y + // G_ICMP x, z + // + // Produce this: + // + // cmn y, z + if (IsCMN(LHSDef, CC)) + return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); + + // Same idea here, but with the RHS of the compare instead: + // + // Given this: + // + // x = G_SUB 0, y + // G_ICMP z, x + // + // Produce this: + // + // cmn z, y + if (IsCMN(RHSDef, CC)) + return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); + + // Given this: + // + // z = G_AND x, y + // G_ICMP z, 0 + // + // Produce this if the compare is signed: + // + // tst x, y + if (!isUnsignedICMPPred(P) && LHSDef && + LHSDef->getOpcode() == TargetOpcode::G_AND) { + // Make sure that the RHS is 0. + auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI); + if (!ValAndVReg || ValAndVReg->Value != 0) + return nullptr; + + return emitTST(LHSDef->getOperand(1).getReg(), + LHSDef->getOperand(2).getReg(), MIRBuilder); + } + + return nullptr; +} + +bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const { + // Try to match a vector splat operation into a dup instruction. 
+ // We're looking for this pattern: + // %scalar:gpr(s64) = COPY $x0 + // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF + // %cst0:gpr(s32) = G_CONSTANT i32 0 + // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32) + // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32) + // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, + // %zerovec(<2 x s32>) + // + // ...into: + // %splat = DUP %scalar + // We use the regbank of the scalar to determine which kind of dup to use. + MachineIRBuilder MIB(I); + MachineRegisterInfo &MRI = *MIB.getMRI(); + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + using namespace TargetOpcode; + using namespace MIPatternMatch; + + // Begin matching the insert. + auto *InsMI = + getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI); + if (!InsMI) + return false; + // Match the undef vector operand. + auto *UndefMI = + getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI); + if (!UndefMI) + return false; + // Match the scalar being splatted. + Register ScalarReg = InsMI->getOperand(2).getReg(); + const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI); + // Match the index constant 0. + int64_t Index = 0; + if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index) + return false; + + // The shuffle's second operand doesn't matter if the mask is all zero. + auto *ZeroVec = getOpcodeDef(G_BUILD_VECTOR, I.getOperand(3).getReg(), MRI); + if (!ZeroVec) + return false; + int64_t Zero = 0; + if (!mi_match(ZeroVec->getOperand(1).getReg(), MRI, m_ICst(Zero)) || Zero) + return false; + for (unsigned i = 1, e = ZeroVec->getNumOperands() - 1; i < e; ++i) { + if (ZeroVec->getOperand(i).getReg() != ZeroVec->getOperand(1).getReg()) + return false; // This wasn't an all zeros vector. + } + + // We're done, now find out what kind of splat we need. + LLT VecTy = MRI.getType(I.getOperand(0).getReg()); + LLT EltTy = VecTy.getElementType(); + if (VecTy.getSizeInBits() != 128 || EltTy.getSizeInBits() < 32) { + LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 128b yet"); + return false; + } + bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID; + static const unsigned OpcTable[2][2] = { + {AArch64::DUPv4i32gpr, AArch64::DUPv2i64gpr}, + {AArch64::DUPv4i32lane, AArch64::DUPv2i64lane}}; + unsigned Opc = OpcTable[IsFP][EltTy.getSizeInBits() == 64]; + + // For FP splats, we need to widen the scalar reg via undef too. 
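+ // (The lane forms of DUP read from a full 128-bit vector register, so an
+ // FPR scalar is first moved into lane 0 of an FPR128 with
+ // emitScalarToVector before the DUP itself is built.)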
+ if (IsFP) { + MachineInstr *Widen = emitScalarToVector( + EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB); + if (!Widen) + return false; + ScalarReg = Widen->getOperand(0).getReg(); + } + auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg}); + if (IsFP) + Dup.addImm(0); + constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const { + if (TM.getOptLevel() == CodeGenOpt::None) + return false; + if (tryOptVectorDup(I)) + return true; + return false; +} + +bool AArch64InstructionSelector::selectShuffleVector( + MachineInstr &I, MachineRegisterInfo &MRI) const { + if (tryOptVectorShuffle(I)) + return true; + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + Register Src1Reg = I.getOperand(1).getReg(); + const LLT Src1Ty = MRI.getType(Src1Reg); + Register Src2Reg = I.getOperand(2).getReg(); + const LLT Src2Ty = MRI.getType(Src2Reg); + + MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); + LLVMContext &Ctx = MF.getFunction().getContext(); + + // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask + // operand, it comes in as a normal vector value which we have to analyze to + // find the mask indices. If the mask element is undef, then + // collectShuffleMaskIndices() will add a None entry for that index into + // the list. + SmallVector<Optional<int>, 8> Mask; + collectShuffleMaskIndices(I, MRI, Mask); + assert(!Mask.empty() && "Expected to find mask indices"); + + // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if + // it's originated from a <1 x T> type. Those should have been lowered into + // G_BUILD_VECTOR earlier. + if (!Src1Ty.isVector() || !Src2Ty.isVector()) { + LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); + return false; + } - // This function is called from the addsub_shifted_imm ComplexPattern, - // which lists [imm] as the list of opcode it's interested in, however - // we still need to check whether the operand is actually an immediate - // here because the ComplexPattern opcode list is only used in - // root-level opcode matching. + unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; + + SmallVector<Constant *, 64> CstIdxs; + for (auto &MaybeVal : Mask) { + // For now, any undef indexes we'll just assume to be 0. This should be + // optimized in future, e.g. to select DUP etc. + int Val = MaybeVal.hasValue() ? *MaybeVal : 0; + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); + } + } + + MachineIRBuilder MIRBuilder(I); + + // Use a constant pool to load the index vector for TBL. + Constant *CPVal = ConstantVector::get(CstIdxs); + MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder); + if (!IndexLoad) { + LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); + return false; + } + + if (DstTy.getSizeInBits() != 128) { + assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); + // This case can be done with TBL1. + MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIRBuilder); + if (!Concat) { + LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); + return false; + } + + // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
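+ // (TBLv16i8One reads one 128-bit table register and a vector of byte
+ // indices; index bytes past the end of the table produce 0, and only the
+ // low 64 bits of the result are copied out below.)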
+ IndexLoad = + emitScalarToVector(64, &AArch64::FPR128RegClass, + IndexLoad->getOperand(0).getReg(), MIRBuilder); + + auto TBL1 = MIRBuilder.buildInstr( + AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, + {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); + constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); + + auto Copy = + MIRBuilder + .buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) + .addReg(TBL1.getReg(0), 0, AArch64::dsub); + RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); + I.eraseFromParent(); + return true; + } + + // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive + // Q registers for regalloc. + auto RegSeq = MIRBuilder + .buildInstr(TargetOpcode::REG_SEQUENCE, + {&AArch64::QQRegClass}, {Src1Reg}) + .addImm(AArch64::qsub0) + .addUse(Src2Reg) + .addImm(AArch64::qsub1); + + auto TBL2 = + MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()}, + {RegSeq, IndexLoad->getOperand(0).getReg()}); + constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI); + constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +MachineInstr *AArch64InstructionSelector::emitLaneInsert( + Optional<Register> DstReg, Register SrcReg, Register EltReg, + unsigned LaneIdx, const RegisterBank &RB, + MachineIRBuilder &MIRBuilder) const { + MachineInstr *InsElt = nullptr; + const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + + // Create a register to define with the insert if one wasn't passed in. + if (!DstReg) + DstReg = MRI.createVirtualRegister(DstRC); + + unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); + unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; + + if (RB.getID() == AArch64::FPRRegBankID) { + auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); + InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) + .addImm(LaneIdx) + .addUse(InsSub->getOperand(0).getReg()) + .addImm(0); + } else { + InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) + .addImm(LaneIdx) + .addUse(EltReg); + } + + constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); + return InsElt; +} + +bool AArch64InstructionSelector::selectInsertElt( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT); + + // Get information on the destination. + Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + unsigned VecSize = DstTy.getSizeInBits(); + + // Get information on the element we want to insert into the destination. + Register EltReg = I.getOperand(2).getReg(); + const LLT EltTy = MRI.getType(EltReg); + unsigned EltSize = EltTy.getSizeInBits(); + if (EltSize < 16 || EltSize > 64) + return false; // Don't support all element types yet. + + // Find the definition of the index. Bail out if it's not defined by a + // G_CONSTANT. + Register IdxReg = I.getOperand(3).getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI); + if (!VRegAndVal) + return false; + unsigned LaneIdx = VRegAndVal->Value; + + // Perform the lane insert. + Register SrcReg = I.getOperand(1).getReg(); + const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); + MachineIRBuilder MIRBuilder(I); + + if (VecSize < 128) { + // If the vector we're inserting into is smaller than 128 bits, widen it + // to 128 to do the insert.
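+ // (The INS instructions only operate on 128-bit registers, so the result
+ // is demoted back to the original width via a ssub/dsub subregister copy
+ // further down.)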
+ MachineInstr *ScalarToVec = emitScalarToVector( + VecSize, &AArch64::FPR128RegClass, SrcReg, MIRBuilder); + if (!ScalarToVec) + return false; + SrcReg = ScalarToVec->getOperand(0).getReg(); + } + + // Create an insert into a new FPR128 register. + // Note that if our vector is already 128 bits, we end up emitting an extra + // register. + MachineInstr *InsMI = + emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIRBuilder); + + if (VecSize < 128) { + // If we had to widen to perform the insert, then we have to demote back to + // the original size to get the result we want. + Register DemoteVec = InsMI->getOperand(0).getReg(); + const TargetRegisterClass *RC = + getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize); + if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { + LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); + return false; + } + unsigned SubReg = 0; + if (!getSubRegForClass(RC, TRI, SubReg)) + return false; + if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { + LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize + << "\n"); + return false; + } + MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) + .addReg(DemoteVec, 0, SubReg); + RBI.constrainGenericRegister(DstReg, *RC, MRI); + } else { + // No widening needed. + InsMI->getOperand(0).setReg(DstReg); + constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); + } + + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectBuildVector( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + // Until we port more of the optimized selections, for now just use a vector + // insert sequence. + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); + unsigned EltSize = EltTy.getSizeInBits(); + if (EltSize < 16 || EltSize > 64) + return false; // Don't support all element types yet. + const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); + MachineIRBuilder MIRBuilder(I); + + const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; + MachineInstr *ScalarToVec = + emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, + I.getOperand(1).getReg(), MIRBuilder); + if (!ScalarToVec) + return false; + + Register DstVec = ScalarToVec->getOperand(0).getReg(); + unsigned DstSize = DstTy.getSizeInBits(); + + // Keep track of the last MI we inserted. Later on, we might be able to save + // a copy using it. + MachineInstr *PrevMI = nullptr; + for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { + // Note that if we don't do a subregister copy, we can end up making an + // extra register. + PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB, + MIRBuilder); + DstVec = PrevMI->getOperand(0).getReg(); + } + + // If DstTy's size in bits is less than 128, then emit a subregister copy + // from DstVec to the last register we've defined. + if (DstSize < 128) { + // Force this to be FPR using the destination vector. 
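+ // (Same demotion as in selectInsertElt above: the insert chain was built
+ // on an FPR128, so a 32/64-bit result is extracted with a subregister
+ // copy.)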
+ const TargetRegisterClass *RC = + getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize); + if (!RC) + return false; + if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { + LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); + return false; + } + + unsigned SubReg = 0; + if (!getSubRegForClass(RC, TRI, SubReg)) + return false; + if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { + LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize + << "\n"); + return false; + } + + Register Reg = MRI.createVirtualRegister(RC); + Register DstReg = I.getOperand(0).getReg(); + + MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {}) + .addReg(DstVec, 0, SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(Reg); + RBI.constrainGenericRegister(DstReg, *RC, MRI); + } else { + // We don't need a subregister copy. Save a copy by re-using the + // destination register on the final insert. + assert(PrevMI && "PrevMI was null?"); + PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); + constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); + } + + I.eraseFromParent(); + return true; +} + +/// Helper function to find an intrinsic ID on a MachineInstr. Returns the +/// ID if it exists, and 0 otherwise. +static unsigned findIntrinsicID(MachineInstr &I) { + auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) { + return Op.isIntrinsicID(); + }); + if (IntrinOp == I.operands_end()) + return 0; + return IntrinOp->getIntrinsicID(); +} + +/// Helper function to emit the correct opcode for an llvm.aarch64.stlxr +/// intrinsic. +static unsigned getStlxrOpcode(unsigned NumBytesToStore) { + switch (NumBytesToStore) { + // TODO: 1, 2, and 4 byte stores. + case 8: + return AArch64::STLXRX; + default: + LLVM_DEBUG(dbgs() << "Unexpected number of bytes to store! (" + << NumBytesToStore << ")\n"); + break; + } + return 0; +} + +bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( + MachineInstr &I, MachineRegisterInfo &MRI) const { + // Find the intrinsic ID. + unsigned IntrinID = findIntrinsicID(I); + if (!IntrinID) + return false; + MachineIRBuilder MIRBuilder(I); + + // Select the instruction. + switch (IntrinID) { + default: + return false; + case Intrinsic::trap: + MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1); + break; + case Intrinsic::debugtrap: + if (!STI.isTargetWindows()) + return false; + MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); + break; + case Intrinsic::aarch64_stlxr: + Register StatReg = I.getOperand(0).getReg(); + assert(RBI.getSizeInBits(StatReg, MRI, TRI) == 32 && + "Status register must be 32 bits!"); + Register SrcReg = I.getOperand(2).getReg(); + + if (RBI.getSizeInBits(SrcReg, MRI, TRI) != 64) { + LLVM_DEBUG(dbgs() << "Only support 64-bit sources right now.\n"); + return false; + } + + Register PtrReg = I.getOperand(3).getReg(); + assert(MRI.getType(PtrReg).isPointer() && "Expected pointer operand"); + + // Expect only one memory operand.
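+ // (The MMO is what carries the store width: an i64 stlxr has a single
+ // 8-byte memory operand, which getStlxrOpcode above maps to STLXRX.)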
+ if (!I.hasOneMemOperand()) + return false; + + const MachineMemOperand *MemOp = *I.memoperands_begin(); + unsigned NumBytesToStore = MemOp->getSize(); + unsigned Opc = getStlxrOpcode(NumBytesToStore); + if (!Opc) + return false; + + auto StoreMI = MIRBuilder.buildInstr(Opc, {StatReg}, {SrcReg, PtrReg}); + constrainSelectedInstRegOperands(*StoreMI, TII, TRI, RBI); + } + + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectIntrinsic( + MachineInstr &I, MachineRegisterInfo &MRI) const { + unsigned IntrinID = findIntrinsicID(I); + if (!IntrinID) + return false; + MachineIRBuilder MIRBuilder(I); + + switch (IntrinID) { + default: + break; + case Intrinsic::aarch64_crypto_sha1h: + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(2).getReg(); + + // FIXME: Should this be an assert? + if (MRI.getType(DstReg).getSizeInBits() != 32 || + MRI.getType(SrcReg).getSizeInBits() != 32) + return false; + + // The operation has to happen on FPRs. Set up some new FPR registers for + // the source and destination if they are on GPRs. + if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { + SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + MIRBuilder.buildCopy({SrcReg}, {I.getOperand(2)}); + + // Make sure the copy ends up getting constrained properly. + RBI.constrainGenericRegister(I.getOperand(2).getReg(), + AArch64::GPR32RegClass, MRI); + } + + if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) + DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + + // Actually insert the instruction. + auto SHA1Inst = MIRBuilder.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); + constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); + + // Did we create a new register for the destination? + if (DstReg != I.getOperand(0).getReg()) { + // Yep. Copy the result of the instruction back into the original + // destination. 
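+ // (Net effect when both values started life on GPRs:
+ // %src_fpr = COPY %src_gpr; %dst_fpr = SHA1Hrr %src_fpr;
+ // %dst_gpr = COPY %dst_fpr.)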
+ MIRBuilder.buildCopy({I.getOperand(0)}, {DstReg}); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), + AArch64::GPR32RegClass, MRI); + } + + I.eraseFromParent(); + return true; + } + return false; +} + +static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) { + auto &MI = *Root.getParent(); + auto &MBB = *MI.getParent(); + auto &MF = *MBB.getParent(); + auto &MRI = MF.getRegInfo(); uint64_t Immed; if (Root.isImm()) Immed = Root.getImm(); else if (Root.isCImm()) Immed = Root.getCImm()->getZExtValue(); else if (Root.isReg()) { - MachineInstr *Def = MRI.getVRegDef(Root.getReg()); - if (Def->getOpcode() != TargetOpcode::G_CONSTANT) + auto ValAndVReg = + getConstantVRegValWithLookThrough(Root.getReg(), MRI, true); + if (!ValAndVReg) return None; - MachineOperand &Op1 = Def->getOperand(1); - if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64) - return None; - Immed = Op1.getCImm()->getZExtValue(); + Immed = ValAndVReg->Value; } else return None; + return Immed; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 31) + return None; + uint64_t Enc = (32 - *MaybeImmed) & 0x1f; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 31) + return None; + uint64_t Enc = 31 - *MaybeImmed; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 63) + return None; + uint64_t Enc = (64 - *MaybeImmed) & 0x3f; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None || *MaybeImmed > 63) + return None; + uint64_t Enc = 63 - *MaybeImmed; + return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; +} + +/// SelectArithImmed - Select an immediate value that can be represented as +/// a 12-bit value shifted left by either 0 or 12. If so, return true with +/// Val set to the 12-bit value and Shift set to the shifter operand. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcode it's interested in, however + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching.
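+ // For example, 0xFFF encodes as (4095, LSL #0) and 0x1000 as (1, LSL #12),
+ // while 0x1001 has set bits in both halves and is rejected below.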
+ auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None) + return None; + uint64_t Immed = *MaybeImmed; unsigned ShiftAmt; if (Immed >> 12 == 0) { diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 6f7fb7a8bc21..a985b330eafa 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -1,9 +1,8 @@ //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -22,8 +21,11 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" +#define DEBUG_TYPE "aarch64-legalinfo" + using namespace llvm; using namespace LegalizeActions; +using namespace LegalizeMutations; using namespace LegalityPredicates; AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { @@ -46,9 +48,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { const LLT v2s32 = LLT::vector(2, 32); const LLT v4s32 = LLT::vector(4, 32); const LLT v2s64 = LLT::vector(2, 64); + const LLT v2p0 = LLT::vector(2, p0); getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({p0, s1, s8, s16, s32, s64, v2s64}) + .legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64}) .clampScalar(0, s1, s64) .widenScalarToNextPow2(0, 8) .fewerElementsIf( @@ -65,33 +68,58 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { }); getActionDefinitionsBuilder(G_PHI) - .legalFor({p0, s16, s32, s64}) + .legalFor({p0, s16, s32, s64, v2s32, v4s32, v2s64}) .clampScalar(0, s16, s64) .widenScalarToNextPow2(0); getActionDefinitionsBuilder(G_BSWAP) - .legalFor({s32, s64}) + .legalFor({s32, s64, v4s32, v2s32, v2s64}) .clampScalar(0, s16, s64) .widenScalarToNextPow2(0); - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) - .legalFor({s32, s64, v2s32, v4s32, v2s64}) + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) + .legalFor({s32, s64, v2s32, v4s32, v2s64, v8s16, v16s8}) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0) .clampNumElements(0, v2s32, v4s32) .clampNumElements(0, v2s64, v2s64) .moreElementsToNextPow2(0); + getActionDefinitionsBuilder(G_SHL) + .legalFor({{s32, s32}, {s64, s64}, + {v2s32, v2s32}, {v4s32, v4s32}, {v2s64, v2s64}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .clampNumElements(0, v2s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + .moreElementsToNextPow2(0) + .minScalarSameAs(1, 0); + getActionDefinitionsBuilder(G_GEP) .legalFor({{p0, s64}}) .clampScalar(1, s64, s64); getActionDefinitionsBuilder(G_PTR_MASK).legalFor({p0}); - getActionDefinitionsBuilder({G_LSHR, G_ASHR, G_SDIV, G_UDIV}) + getActionDefinitionsBuilder({G_SDIV, G_UDIV}) .legalFor({s32, s64}) .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0); + .widenScalarToNextPow2(0) + .scalarize(0); + + getActionDefinitionsBuilder({G_LSHR, G_ASHR}) + .customIf([=](const LegalityQuery &Query) { + const auto &SrcTy = Query.Types[0]; + const auto &AmtTy = Query.Types[1]; + return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && + AmtTy.getSizeInBits() == 
32; + }) + .legalFor( + {{s32, s32}, {s32, s64}, {s64, s64}, {v2s32, v2s32}, {v4s32, v4s32}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s64) + .minScalarSameAs(1, 0); getActionDefinitionsBuilder({G_SREM, G_UREM}) .lowerFor({s1, s8, s16, s32, s64}); @@ -101,15 +129,26 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64}); - getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO}) + getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO}) .legalFor({{s32, s1}, {s64, s1}}); - getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV}) - .legalFor({s32, s64}); + getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) + .legalFor({s32, s64, v2s64, v4s32, v2s32}); - getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64}); + getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64}); - getActionDefinitionsBuilder(G_FCEIL) + getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT, + G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, + G_FNEARBYINT}) + // If we don't have full FP16 support, then scalarize the elements of + // vectors containing fp16 types. + .fewerElementsIf( + [=, &ST](const LegalityQuery &Query) { + const auto &Ty = Query.Types[0]; + return Ty.isVector() && Ty.getElementType() == s16 && + !ST.hasFullFP16(); + }, + [=](const LegalityQuery &Query) { return std::make_pair(0, s16); }) // If we don't have full FP16 support, then widen s16 to s32 if we // encounter it. .widenScalarIf( @@ -117,7 +156,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return Query.Types[0] == s16 && !ST.hasFullFP16(); }, [=](const LegalityQuery &Query) { return std::make_pair(0, s32); }) - .legalFor({s16, s32, s64, v2s32, v4s32, v2s64}); + .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16}); + + getActionDefinitionsBuilder( + {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW}) + // We need a call for these, so we always need to scalarize. + .scalarize(0) + // Regardless of FP16 support, widen 16-bit elements to 32-bits. 
+ .minScalar(0, s32) + .libcallFor({s32, s64, v2s32, v4s32, v2s64}); getActionDefinitionsBuilder(G_INSERT) .unsupportedIf([=](const LegalityQuery &Query) { @@ -158,12 +205,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .widenScalarToNextPow2(0); getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) - .legalForTypesWithMemSize({{s32, p0, 8}, - {s32, p0, 16}, - {s32, p0, 32}, - {s64, p0, 64}, - {p0, p0, 64}, - {v2s32, p0, 64}}) + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 8, 2}, + {s64, p0, 16, 2}, + {s64, p0, 32, 4}, + {s64, p0, 64, 8}, + {p0, p0, 64, 8}, + {v2s32, p0, 64, 8}}) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0) // TODO: We could support sum-of-pow2's but the lowering code doesn't know @@ -172,16 +222,30 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { // Lower anything left over into G_*EXT and G_LOAD .lower(); + auto IsPtrVecPred = [=](const LegalityQuery &Query) { + const LLT &ValTy = Query.Types[0]; + if (!ValTy.isVector()) + return false; + const LLT EltTy = ValTy.getElementType(); + return EltTy.isPointer() && EltTy.getAddressSpace() == 0; + }; + getActionDefinitionsBuilder(G_LOAD) - .legalForTypesWithMemSize({{s8, p0, 8}, - {s16, p0, 16}, - {s32, p0, 32}, - {s64, p0, 64}, - {p0, p0, 64}, - {v2s32, p0, 64}}) + .legalForTypesWithMemDesc({{s8, p0, 8, 8}, + {s16, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 64, 8}, + {p0, p0, 64, 8}, + {v8s8, p0, 64, 8}, + {v16s8, p0, 128, 8}, + {v4s16, p0, 64, 8}, + {v8s16, p0, 128, 8}, + {v2s32, p0, 64, 8}, + {v4s32, p0, 128, 8}, + {v2s64, p0, 128, 8}}) // These extends are also legal - .legalForTypesWithMemSize({{s32, p0, 8}, - {s32, p0, 16}}) + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}}) .clampScalar(0, s8, s64) .widenScalarToNextPow2(0) // TODO: We could support sum-of-pow2's but the lowering code doesn't know @@ -191,16 +255,22 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .lowerIf([=](const LegalityQuery &Query) { return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; }) - .clampNumElements(0, v2s32, v2s32) - .clampMaxNumElements(0, s64, 1); + .clampMaxNumElements(0, s32, 2) + .clampMaxNumElements(0, s64, 1) + .customIf(IsPtrVecPred); getActionDefinitionsBuilder(G_STORE) - .legalForTypesWithMemSize({{s8, p0, 8}, - {s16, p0, 16}, - {s32, p0, 32}, - {s64, p0, 64}, - {p0, p0, 64}, - {v2s32, p0, 64}}) + .legalForTypesWithMemDesc({{s8, p0, 8, 8}, + {s16, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 64, 8}, + {p0, p0, 64, 8}, + {v16s8, p0, 128, 8}, + {v4s16, p0, 64, 8}, + {v8s16, p0, 128, 8}, + {v2s32, p0, 64, 8}, + {v4s32, p0, 128, 8}, + {v2s64, p0, 128, 8}}) .clampScalar(0, s8, s64) .widenScalarToNextPow2(0) // TODO: We could support sum-of-pow2's but the lowering code doesn't know @@ -210,23 +280,48 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return Query.Types[0].isScalar() && Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; }) - .clampNumElements(0, v2s32, v2s32) - .clampMaxNumElements(0, s64, 1); + .clampMaxNumElements(0, s32, 2) + .clampMaxNumElements(0, s64, 1) + .customIf(IsPtrVecPred); // Constants getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({p0, s32, s64}) - .clampScalar(0, s32, s64) + .legalFor({p0, s8, s16, s32, s64}) + .clampScalar(0, s8, s64) .widenScalarToNextPow2(0); getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({s32, s64}) .clampScalar(0, s32, s64); getActionDefinitionsBuilder(G_ICMP) 
- .legalFor({{s32, s32}, {s32, s64}, {s32, p0}}) + .legalFor({{s32, s32}, + {s32, s64}, + {s32, p0}, + {v4s32, v4s32}, + {v2s32, v2s32}, + {v2s64, v2s64}, + {v2s64, v2p0}, + {v4s16, v4s16}, + {v8s16, v8s16}, + {v8s8, v8s8}, + {v16s8, v16s8}}) .clampScalar(0, s32, s32) .clampScalar(1, s32, s64) - .widenScalarToNextPow2(1); + .minScalarEltSameAsIf( + [=](const LegalityQuery &Query) { + const LLT &Ty = Query.Types[0]; + const LLT &SrcTy = Query.Types[1]; + return Ty.isVector() && !SrcTy.getElementType().isPointer() && + Ty.getElementType() != SrcTy.getElementType(); + }, + 0, 1) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; }, + 1, s32) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0, + s64) + .widenScalarOrEltToNextPow2(1); getActionDefinitionsBuilder(G_FCMP) .legalFor({{s32, s32}, {s32, s64}}) @@ -236,24 +331,48 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { // Extensions getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) - .legalForCartesianProduct({s8, s16, s32, s64}, {s1, s8, s16, s32}); + .legalIf([=](const LegalityQuery &Query) { + unsigned DstSize = Query.Types[0].getSizeInBits(); + + // Make sure that we have something that will fit in a register, and + // make sure it's a power of 2. + if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) + return false; + + const LLT &SrcTy = Query.Types[1]; + + // Special case for s1. + if (SrcTy == s1) + return true; + + // Make sure we fit in a register otherwise. Don't bother checking that + // the source type is below 128 bits. We shouldn't be allowing anything + // through which is wider than the destination in the first place. + unsigned SrcSize = SrcTy.getSizeInBits(); + if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) + return false; + + return true; + }); + + getActionDefinitionsBuilder(G_TRUNC).alwaysLegal(); // FP conversions getActionDefinitionsBuilder(G_FPTRUNC).legalFor( - {{s16, s32}, {s16, s64}, {s32, s64}}); + {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}); getActionDefinitionsBuilder(G_FPEXT).legalFor( - {{s32, s16}, {s64, s16}, {s64, s32}}); + {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}); // Conversions getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) - .legalForCartesianProduct({s32, s64}) + .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0) .clampScalar(1, s32, s64) .widenScalarToNextPow2(1); getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) - .legalForCartesianProduct({s32, s64}) + .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) .clampScalar(1, s32, s64) .widenScalarToNextPow2(1) .clampScalar(0, s32, s64) @@ -264,10 +383,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); // Select + // FIXME: We can probably do a bit better than just scalarizing vector + // selects. getActionDefinitionsBuilder(G_SELECT) .legalFor({{s32, s1}, {s64, s1}, {p0, s1}}) .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0); + .widenScalarToNextPow2(0) + .scalarize(0); // Pointer-handling getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); @@ -291,7 +413,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { // number of bits but it's what the previous code described and fixing // it breaks tests. 
.legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, - v8s16, v4s16, v2s16, v4s32, v2s32, v2s64}); + v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, + v2p0}); getActionDefinitionsBuilder(G_VASTART).legalFor({p0}); @@ -335,11 +458,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { } return false; }; - auto scalarize = - [](const LegalityQuery &Query, unsigned TypeIdx) { - const LLT &Ty = Query.Types[TypeIdx]; - return std::make_pair(TypeIdx, Ty.getElementType()); - }; // FIXME: This rule is horrible, but specifies the same as what we had // before with the particularly strange definitions removed (e.g. @@ -353,10 +471,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { // Break up vectors with weird elements into scalars .fewerElementsIf( [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, - [=](const LegalityQuery &Query) { return scalarize(Query, 0); }) + scalarize(0)) .fewerElementsIf( [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, - [=](const LegalityQuery &Query) { return scalarize(Query, 1); }) + scalarize(1)) // Clamp the big scalar to s8-s512 and make it either a power of 2, 192, // or 384. .clampScalar(BigTyIdx, s8, s512) @@ -397,16 +515,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0; }) // Any vectors left are the wrong size. Scalarize them. - .fewerElementsIf([](const LegalityQuery &Query) { return true; }, - [](const LegalityQuery &Query) { - return std::make_pair( - 0, Query.Types[0].getElementType()); - }) - .fewerElementsIf([](const LegalityQuery &Query) { return true; }, - [](const LegalityQuery &Query) { - return std::make_pair( - 1, Query.Types[1].getElementType()); - }); + .scalarize(0) + .scalarize(1); } getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) @@ -417,11 +527,24 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .minScalar(2, s64) .legalIf([=](const LegalityQuery &Query) { const LLT &VecTy = Query.Types[1]; - return VecTy == v4s32 || VecTy == v2s64; + return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || + VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32; + }); + + getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) + .legalIf([=](const LegalityQuery &Query) { + const LLT &VecTy = Query.Types[0]; + // TODO: Support s8 and s16 + return VecTy == v2s32 || VecTy == v4s32 || VecTy == v2s64; }); getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalFor({{v4s32, s32}, {v2s64, s64}}) + .legalFor({{v4s16, s16}, + {v8s16, s16}, + {v2s32, s32}, + {v4s32, s32}, + {v2p0, p0}, + {v2s64, s64}}) .clampNumElements(0, v4s32, v4s32) .clampNumElements(0, v2s64, v2s64) @@ -432,6 +555,42 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { }) .minScalarSameAs(1, 0); + getActionDefinitionsBuilder(G_CTLZ).legalForCartesianProduct( + {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) + .scalarize(1); + + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) + .legalIf([=](const LegalityQuery &Query) { + const LLT &DstTy = Query.Types[0]; + const LLT &SrcTy = Query.Types[1]; + // For now just support the TBL2 variant which needs the source vectors + // to be the same size as the dest. 
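+ // e.g. v2s64 = G_SHUFFLE_VECTOR v2s64, v2s64, mask is legal here,
+ // while mixed-size shuffles are rejected for now.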
+ if (DstTy != SrcTy) + return false; + for (auto &Ty : {v2s32, v4s32, v2s64}) { + if (DstTy == Ty) + return true; + } + return false; + }) + // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors); we + // just want those lowered into G_BUILD_VECTOR + .lowerIf([=](const LegalityQuery &Query) { + return !Query.Types[1].isVector(); + }) + .clampNumElements(0, v4s32, v4s32) + .clampNumElements(0, v2s64, v2s64); + + getActionDefinitionsBuilder(G_CONCAT_VECTORS) + .legalFor({{v4s32, v2s32}, {v8s16, v4s16}}); + + getActionDefinitionsBuilder(G_JUMP_TABLE) + .legalFor({{p0}, {s64}}); + + getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) { + return Query.Types[0] == p0 && Query.Types[1] == s64; + }); + computeTables(); verify(*ST.getInstrInfo()); } @@ -446,37 +605,106 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, return false; case TargetOpcode::G_VAARG: return legalizeVaArg(MI, MRI, MIRBuilder); + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: + return legalizeLoadStore(MI, MRI, MIRBuilder, Observer); + case TargetOpcode::G_SHL: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: + return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); } llvm_unreachable("expected switch to return"); } +bool AArch64LegalizerInfo::legalizeShlAshrLshr( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + assert(MI.getOpcode() == TargetOpcode::G_ASHR || + MI.getOpcode() == TargetOpcode::G_LSHR || + MI.getOpcode() == TargetOpcode::G_SHL); + // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the + // imported patterns can select it later. Either way, it will be legal. + Register AmtReg = MI.getOperand(2).getReg(); + auto *CstMI = MRI.getVRegDef(AmtReg); + assert(CstMI && "expected to find a vreg def"); + if (CstMI->getOpcode() != TargetOpcode::G_CONSTANT) + return true; + // Check the shift amount is in range for an immediate form. + unsigned Amount = CstMI->getOperand(1).getCImm()->getZExtValue(); + if (Amount > 31) + return true; // This will have to remain a register variant. + assert(MRI.getType(AmtReg).getSizeInBits() == 32); + MIRBuilder.setInstr(MI); + auto ExtCst = MIRBuilder.buildZExt(LLT::scalar(64), AmtReg); + MI.getOperand(2).setReg(ExtCst.getReg(0)); + return true; +} + +bool AArch64LegalizerInfo::legalizeLoadStore( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + assert(MI.getOpcode() == TargetOpcode::G_STORE || + MI.getOpcode() == TargetOpcode::G_LOAD); + // Here we just try to handle vector loads/stores where our value type might + // have pointer elements, which the SelectionDAG importer can't handle. To + // allow the existing patterns for s64 to fire for p0, we just try to bitcast + // the value to use s64 types. + + // Custom legalization requires that the instruction, if not deleted, be fully + // legalized. In order to allow further legalization of the instruction, we + // create a new instruction and erase the existing one.
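+ // For example, a G_STORE of <2 x p0> is rebuilt as roughly:
+ //
+ // %bc:_(<2 x s64>) = G_BITCAST %val(<2 x p0>)
+ // G_STORE %bc(<2 x s64>), %ptr(p0)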
+ + unsigned ValReg = MI.getOperand(0).getReg(); + const LLT ValTy = MRI.getType(ValReg); + + if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || + ValTy.getElementType().getAddressSpace() != 0) { + LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store"); + return false; + } + + MIRBuilder.setInstr(MI); + unsigned PtrSize = ValTy.getElementType().getSizeInBits(); + const LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize); + auto &MMO = **MI.memoperands_begin(); + if (MI.getOpcode() == TargetOpcode::G_STORE) { + auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg}); + MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO); + } else { + unsigned NewReg = MRI.createGenericVirtualRegister(NewTy); + auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO); + MIRBuilder.buildBitcast({ValReg}, {NewLoad}); + } + MI.eraseFromParent(); + return true; +} + bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const { MIRBuilder.setInstr(MI); MachineFunction &MF = MIRBuilder.getMF(); unsigned Align = MI.getOperand(2).getImm(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned ListPtr = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register ListPtr = MI.getOperand(1).getReg(); LLT PtrTy = MRI.getType(ListPtr); LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); const unsigned PtrSize = PtrTy.getSizeInBits() / 8; - unsigned List = MRI.createGenericVirtualRegister(PtrTy); + Register List = MRI.createGenericVirtualRegister(PtrTy); MIRBuilder.buildLoad( List, ListPtr, *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, PtrSize, /* Align = */ PtrSize)); - unsigned DstPtr; + Register DstPtr; if (Align > PtrSize) { // Realign the list to the actual required alignment. auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1); - unsigned ListTmp = MRI.createGenericVirtualRegister(PtrTy); - MIRBuilder.buildGEP(ListTmp, List, AlignMinus1->getOperand(0).getReg()); + auto ListTmp = MIRBuilder.buildGEP(PtrTy, List, AlignMinus1.getReg(0)); DstPtr = MRI.createGenericVirtualRegister(PtrTy); MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align)); @@ -489,11 +717,9 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, ValSize, std::max(Align, PtrSize))); - unsigned SizeReg = MRI.createGenericVirtualRegister(IntPtrTy); - MIRBuilder.buildConstant(SizeReg, alignTo(ValSize, PtrSize)); + auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrSize)); - unsigned NewList = MRI.createGenericVirtualRegister(PtrTy); - MIRBuilder.buildGEP(NewList, DstPtr, SizeReg); + auto NewList = MIRBuilder.buildGEP(PtrTy, DstPtr, Size.getReg(0)); MIRBuilder.buildStore( NewList, ListPtr, diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h index 77e8bdc7623c..f3362a18620f 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/lib/Target/AArch64/AArch64LegalizerInfo.h @@ -1,9 +1,8 @@ //===- AArch64LegalizerInfo --------------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -35,6 +34,12 @@ public: private: bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const; + bool legalizeLoadStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; + bool legalizeShlAshrLshr(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; }; } // End llvm namespace. #endif diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index aa732a99469c..65b5f906e3f6 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1,9 +1,8 @@ //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -934,8 +933,6 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, ? getLdStOffsetOp(*StoreI).getImm() : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; - int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); - int Imms = Immr + Width - 1; unsigned DestReg = IsStoreXReg ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, &AArch64::GPR64RegClass) @@ -945,8 +942,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && "Invalid offset"); - Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); - Imms = Immr + Width - 1; + int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + int Imms = Immr + Width - 1; if (UnscaledLdOffset == UnscaledStOffset) { uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N | ((Immr) << 6) // immr diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index d71359223b1b..e7d4a2789a28 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -1,9 +1,8 @@ //==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h index aa30fe1fa707..8f3148a98410 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.h +++ b/lib/Target/AArch64/AArch64MCInstLower.h @@ -1,9 +1,8 @@ //===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 5183e7d3c0d0..0efeeb272ec1 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -1,9 +1,8 @@ //=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -92,6 +91,11 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// other stack allocations. bool CalleeSaveStackHasFreeSpace = false; + /// SRetReturnReg - sret lowering includes returning the value of the + /// returned struct in a register. This field holds the virtual register into + /// which the sret argument is passed. + unsigned SRetReturnReg = 0; + /// Has a value when it is known whether or not the function uses a /// redzone, and no value otherwise. /// Initialized during frame lowering, unless the function has the noredzone @@ -101,6 +105,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms; + + // Offset from SP-at-entry to the tagged base pointer. + // Tagged base pointer is set up to point to the first (lowest address) tagged + // stack slot. + unsigned TaggedBasePointerOffset; + public: AArch64FunctionInfo() = default; @@ -166,6 +176,9 @@ public: unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } + unsigned getSRetReturnReg() const { return SRetReturnReg; } + void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } + unsigned getJumpTableEntrySize(int Idx) const { auto It = JumpTableEntryInfo.find(Idx); if (It != JumpTableEntryInfo.end()) @@ -217,6 +230,13 @@ public: return ForwardedMustTailRegParms; } + unsigned getTaggedBasePointerOffset() const { + return TaggedBasePointerOffset; + } + void setTaggedBasePointerOffset(unsigned Offset) { + TaggedBasePointerOffset = Offset; + } + private: // Hold the lists of LOHs. MILOHContainer LOHContainerSet; diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp index bc596dd38b6e..9a2103579a6a 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -1,9 +1,8 @@ //===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64MacroFusion.h b/lib/Target/AArch64/AArch64MacroFusion.h index 32d90d4c40d6..4e7ccbe4baab 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.h +++ b/lib/Target/AArch64/AArch64MacroFusion.h @@ -1,9 +1,8 @@ //===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index ccf646575296..aff861aae6be 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -1,9 +1,8 @@ //===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This file contains the AArch64 / Cortex-A57 specific register allocation diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/lib/Target/AArch64/AArch64PBQPRegAlloc.h index b99c1d1d6b3e..5ea91b4a1967 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.h +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.h @@ -1,9 +1,8 @@ //==- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints --*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h index 9e9eec48c555..f443cd03935c 100644 --- a/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -1,9 +1,8 @@ //===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64PfmCounters.td b/lib/Target/AArch64/AArch64PfmCounters.td index 16ba3e4282a0..b1d1664e3f1b 100644 --- a/lib/Target/AArch64/AArch64PfmCounters.td +++ b/lib/Target/AArch64/AArch64PfmCounters.td @@ -1,9 +1,8 @@ //===-- AArch64PfmCounters.td - AArch64 Hardware Counters --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index 3da9306e6460..5f7245bfbd74 100644 --- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -1,9 +1,8 @@ //=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -44,6 +43,10 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, switch (MI.getOpcode()) { default: return false; + case TargetOpcode::COPY: + return Helper.tryCombineCopy(MI); + case TargetOpcode::G_BR: + return Helper.tryCombineBr(MI); case TargetOpcode::G_LOAD: case TargetOpcode::G_SEXTLOAD: case TargetOpcode::G_ZEXTLOAD: diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp index 01d8a35bbc23..a594ecb71fc9 100644 --- a/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -1,9 +1,8 @@ //==- AArch64PromoteConstant.cpp - Promote constant to global for AArch64 --==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -494,7 +493,8 @@ void AArch64PromoteConstant::insertDefinitions(Function &F, for (const auto &IPI : InsertPts) { // Create the load of the global variable. 
IRBuilder<> Builder(IPI.first); - LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV); + LoadInst *LoadedCst = + Builder.CreateLoad(PromotedGV.getValueType(), &PromotedGV); LLVM_DEBUG(dbgs() << "**********\n"); LLVM_DEBUG(dbgs() << "New def: "); LLVM_DEBUG(LoadedCst->print(dbgs())); diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp index fcb0b36a9f6d..0d75ab7ac8a9 100644 --- a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp +++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -1,9 +1,8 @@ //=- AArch64RedundantCopyElimination.cpp - Remove useless copy for AArch64 -=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // This pass removes unnecessary copies/moves in BBs based on a dominating // condition. @@ -380,8 +379,8 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) { bool IsCopy = MI->isCopy(); bool IsMoveImm = MI->isMoveImmediate(); if (IsCopy || IsMoveImm) { - MCPhysReg DefReg = MI->getOperand(0).getReg(); - MCPhysReg SrcReg = IsCopy ? MI->getOperand(1).getReg() : 0; + Register DefReg = MI->getOperand(0).getReg(); + Register SrcReg = IsCopy ? MI->getOperand(1).getReg() : Register(); int64_t SrcImm = IsMoveImm ? MI->getOperand(1).getImm() : 0; if (!MRI->isReserved(DefReg) && ((IsCopy && (SrcReg == AArch64::XZR || SrcReg == AArch64::WZR)) || diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 68c48a5ec216..b52259cc9acd 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -1,9 +1,8 @@ //===- AArch64RegisterBankInfo.cpp ----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -243,12 +242,17 @@ const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass( case AArch64::GPR32RegClassID: case AArch64::GPR32spRegClassID: case AArch64::GPR32sponlyRegClassID: + case AArch64::GPR32argRegClassID: case AArch64::GPR32allRegClassID: case AArch64::GPR64commonRegClassID: case AArch64::GPR64RegClassID: case AArch64::GPR64spRegClassID: case AArch64::GPR64sponlyRegClassID: + case AArch64::GPR64argRegClassID: case AArch64::GPR64allRegClassID: + case AArch64::GPR64noipRegClassID: + case AArch64::GPR64common_and_GPR64noipRegClassID: + case AArch64::GPR64noip_and_tcGPR64RegClassID: case AArch64::tcGPR64RegClassID: case AArch64::WSeqPairsClassRegClassID: case AArch64::XSeqPairsClassRegClassID: @@ -385,11 +389,26 @@ static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: + case TargetOpcode::G_FMA: case TargetOpcode::G_FDIV: case TargetOpcode::G_FCONSTANT: case TargetOpcode::G_FPEXT: case TargetOpcode::G_FPTRUNC: case TargetOpcode::G_FCEIL: + case TargetOpcode::G_FFLOOR: + case TargetOpcode::G_FNEARBYINT: + case TargetOpcode::G_FNEG: + case TargetOpcode::G_FCOS: + case TargetOpcode::G_FSIN: + case TargetOpcode::G_FLOG10: + case TargetOpcode::G_FLOG: + case TargetOpcode::G_FLOG2: + case TargetOpcode::G_FSQRT: + case TargetOpcode::G_FABS: + case TargetOpcode::G_FEXP: + case TargetOpcode::G_FRINT: + case TargetOpcode::G_INTRINSIC_TRUNC: + case TargetOpcode::G_INTRINSIC_ROUND: return true; } return false; @@ -438,6 +457,54 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping( getValueMapping(RBIdx, Size), NumOperands); } +bool AArch64RegisterBankInfo::hasFPConstraints( + const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + unsigned Op = MI.getOpcode(); + + // Do we have an explicit floating point instruction? + if (isPreISelGenericFloatingPointOpcode(Op)) + return true; + + // No. Check if we have a copy-like instruction. If we do, then we could + // still be fed by floating point instructions. + if (Op != TargetOpcode::COPY && !MI.isPHI()) + return false; + + // MI is copy-like. Return true if it outputs an FPR. + return getRegBank(MI.getOperand(0).getReg(), MRI, TRI) == + &AArch64::FPRRegBank; +} + +bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: + case TargetOpcode::G_FCMP: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI); +} + +bool AArch64RegisterBankInfo::onlyDefinesFP( + const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + case TargetOpcode::G_INSERT_VECTOR_ELT: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI); +} + const RegisterBankInfo::InstructionMapping & AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const unsigned Opc = MI.getOpcode(); @@ -470,10 +537,6 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_AND: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: - // Shifts. 
- case TargetOpcode::G_SHL: - case TargetOpcode::G_LSHR: - case TargetOpcode::G_ASHR: // Floating point ops. case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: @@ -487,6 +550,17 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { DefaultMappingID, /*Cost*/ 1, getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()), /*NumOperands*/ 2); + } + // Shifts. + case TargetOpcode::G_SHL: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: { + LLT ShiftAmtTy = MRI.getType(MI.getOperand(2).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (ShiftAmtTy.getSizeInBits() == 64 && SrcTy.getSizeInBits() == 32) + return getInstructionMapping(DefaultMappingID, 1, + &ValMappings[Shift64Imm], 3); + return getSameKindOfOperandsMapping(MI); } case TargetOpcode::COPY: { unsigned DstReg = MI.getOperand(0).getReg(); @@ -563,10 +637,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { switch (Opc) { case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: + if (MRI.getType(MI.getOperand(0).getReg()).isVector()) + break; OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; break; case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: + if (MRI.getType(MI.getOperand(0).getReg()).isVector()) + break; OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; break; case TargetOpcode::G_FCMP: @@ -600,15 +678,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // assume this was a floating point load in the IR. // If it was not, we would have had a bitcast before // reaching that instruction. - unsigned UseOpc = UseMI.getOpcode(); - if (isPreISelGenericFloatingPointOpcode(UseOpc) || - // Check if we feed a copy-like instruction with - // floating point constraints. In that case, we are still - // feeding fp instructions, but indirectly - // (e.g., through ABI copies). - ((UseOpc == TargetOpcode::COPY || UseMI.isPHI()) && - getRegBank(UseMI.getOperand(0).getReg(), MRI, TRI) == - &AArch64::FPRRegBank)) { + if (onlyUsesFP(UseMI, MRI, TRI)) { OpRegBankIdx[0] = PMI_FirstFPR; break; } @@ -621,18 +691,134 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (!VReg) break; MachineInstr *DefMI = MRI.getVRegDef(VReg); - unsigned DefOpc = DefMI->getOpcode(); - if (isPreISelGenericFloatingPointOpcode(DefOpc) || - // Check if we come from a copy-like instruction with - // floating point constraints. In that case, we are still - // fed by fp instructions, but indirectly - // (e.g., through ABI copies). - ((DefOpc == TargetOpcode::COPY || DefMI->isPHI()) && - getRegBank(DefMI->getOperand(0).getReg(), MRI, TRI) == - &AArch64::FPRRegBank)) + if (onlyDefinesFP(*DefMI, MRI, TRI)) OpRegBankIdx[0] = PMI_FirstFPR; break; } + break; + case TargetOpcode::G_SELECT: { + // If the destination is FPR, preserve that. + if (OpRegBankIdx[0] != PMI_FirstGPR) + break; + + // If we're taking in vectors, we have no choice but to put everything on + // FPRs. + LLT SrcTy = MRI.getType(MI.getOperand(2).getReg()); + if (SrcTy.isVector()) { + for (unsigned Idx = 0; Idx < 4; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + break; + } + + // Try to minimize the number of copies. If we have more floating point + // constrained values than not, then we'll put everything on FPR. Otherwise, + // everything has to be on GPR. + unsigned NumFP = 0; + + // Check if the uses of the result always produce floating point values. + // + // For example: + // + // %z = G_SELECT %cond %x %y + // fpr = G_FOO %z ... 
+ if (any_of( + MRI.use_instructions(MI.getOperand(0).getReg()), + [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) + ++NumFP; + + // Check if the defs of the source values always produce floating point + // values. + // + // For example: + // + // %x = G_SOMETHING_ALWAYS_FLOAT %a ... + // %z = G_SELECT %cond %x %y + // + // Also check whether or not the sources have already been decided to be + // FPR. Keep track of this. + // + // This doesn't check the condition, since it's just whatever is in NZCV. + // This isn't passed explicitly in a register to fcsel/csel. + for (unsigned Idx = 2; Idx < 4; ++Idx) { + unsigned VReg = MI.getOperand(Idx).getReg(); + MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank || + onlyDefinesFP(*DefMI, MRI, TRI)) + ++NumFP; + } + + // If we have more FP constraints than not, then move everything over to + // FPR. + if (NumFP >= 2) + for (unsigned Idx = 0; Idx < 4; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + + break; + } + case TargetOpcode::G_UNMERGE_VALUES: { + // If the first operand belongs to an FPR register bank, then make sure that + // we preserve that. + if (OpRegBankIdx[0] != PMI_FirstGPR) + break; + + LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg()); + // UNMERGE into scalars from a vector should always use FPR. + // Likewise if any of the uses are FP instructions. + if (SrcTy.isVector() || + any_of(MRI.use_instructions(MI.getOperand(0).getReg()), + [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) { + // Set the register bank of every operand to FPR. + for (unsigned Idx = 0, NumOperands = MI.getNumOperands(); + Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; + } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + // Destination and source need to be FPRs. + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + + // Index needs to be a GPR. + OpRegBankIdx[2] = PMI_FirstGPR; + break; + case TargetOpcode::G_INSERT_VECTOR_ELT: + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + + // The element may be either a GPR or FPR. Preserve that behaviour. + if (getRegBank(MI.getOperand(2).getReg(), MRI, TRI) == &AArch64::FPRRegBank) + OpRegBankIdx[2] = PMI_FirstFPR; + else + OpRegBankIdx[2] = PMI_FirstGPR; + + // Index needs to be a GPR. + OpRegBankIdx[3] = PMI_FirstGPR; + break; + case TargetOpcode::G_BUILD_VECTOR: + // If the first source operand belongs to an FPR register bank, then make + // sure that we preserve that. + if (OpRegBankIdx[1] != PMI_FirstGPR) + break; + unsigned VReg = MI.getOperand(1).getReg(); + if (!VReg) + break; + + // Get the instruction that defined the source operand reg, and check if + // it's a floating point operation. Or, if it's a type like s16 which + // doesn't have an exact size GPR register class. + MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned DefOpc = DefMI->getOpcode(); + const LLT SrcTy = MRI.getType(VReg); + if (isPreISelGenericFloatingPointOpcode(DefOpc) || + SrcTy.getSizeInBits() < 32) { + // Have a floating point op. + // Make sure every operand gets mapped to an FPR register class. + unsigned NumOperands = MI.getNumOperands(); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; } // Finally construct the computed mapping.
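An aside on the G_SELECT mapping above: the pass counts FP-constrained neighbours (an FP use of the result, plus each source that is FP-defined or already on FPR) and only moves the whole select onto FPR when at least two vote that way, so a lone FP neighbour costs one cross-bank copy instead of several. A minimal standalone sketch of that vote, using toy types rather than the GlobalISel API:

#include <iostream>
#include <vector>

// Toy stand-ins for the two AArch64 register banks.
enum class Bank { GPR, FPR };

// Mirrors the NumFP >= 2 test in getInstrMapping: each FP-constrained
// neighbour of the select (an FP use of the result, or an FP-defined
// source value) contributes one vote toward mapping everything to FPR.
Bank chooseSelectBank(const std::vector<bool> &NeighbourIsFP) {
  unsigned NumFP = 0;
  for (bool IsFP : NeighbourIsFP)
    NumFP += IsFP;
  return NumFP >= 2 ? Bank::FPR : Bank::GPR;
}

int main() {
  // One FP use of the result plus one FP-defined input: select goes to FPR.
  std::cout << (chooseSelectBank({true, true, false}) == Bank::FPR) << '\n';
  // A single FP neighbour: stay on GPR and pay a single copy instead.
  std::cout << (chooseSelectBank({false, true, false}) == Bank::GPR) << '\n';
}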
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h index 008221dbef58..016fed65eb2a 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -1,9 +1,8 @@ //===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -58,6 +57,7 @@ protected: FPExt16To64Idx = 43, FPExt32To64Idx = 45, FPExt64To128Idx = 47, + Shift64Imm = 49 }; static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx, @@ -114,6 +114,18 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo { const InstructionMapping & getSameKindOfOperandsMapping(const MachineInstr &MI) const; + /// Returns true if the output of \p MI must be stored in an FPR register. + bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + + /// Returns true if the source registers of \p MI must all be FPRs. + bool onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + + /// Returns true if the destination register of \p MI must be an FPR. + bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + public: AArch64RegisterBankInfo(const TargetRegisterInfo &TRI); diff --git a/lib/Target/AArch64/AArch64RegisterBanks.td b/lib/Target/AArch64/AArch64RegisterBanks.td index eee584708f69..7bbd992890d1 100644 --- a/lib/Target/AArch64/AArch64RegisterBanks.td +++ b/lib/Target/AArch64/AArch64RegisterBanks.td @@ -1,9 +1,8 @@ //=- AArch64RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 96ae45ae3d0d..6d5a4e3d2f76 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1,9 +1,8 @@ //===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -217,11 +216,8 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, } bool AArch64RegisterInfo::isAnyArgRegReserved(const MachineFunction &MF) const { - // FIXME: Get the list of argument registers from TableGen. - static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, - AArch64::X3, AArch64::X4, AArch64::X5, - AArch64::X6, AArch64::X7 }; - return std::any_of(std::begin(GPRArgRegs), std::end(GPRArgRegs), + return std::any_of(std::begin(*AArch64::GPR64argRegClass.MC), + std::end(*AArch64::GPR64argRegClass.MC), [this, &MF](MCPhysReg r){return isReservedReg(MF, r);}); } @@ -283,7 +279,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { return false; } -unsigned +Register AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP; @@ -457,15 +453,34 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, - /*PreferFP=*/true); + /*PreferFP=*/true, + /*ForSimm=*/false); Offset += MI.getOperand(FIOperandNum + 1).getImm(); MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); return; } + if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) { + MachineOperand &FI = MI.getOperand(FIOperandNum); + Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex); + FI.ChangeToImmediate(Offset); + return; + } + + if (MI.getOpcode() == AArch64::TAGPstack) { + // TAGPstack must use the virtual frame register in its 3rd operand. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + FrameReg = MI.getOperand(3).getReg(); + Offset = + MFI.getObjectOffset(FrameIndex) + AFI->getTaggedBasePointerOffset(); + } else { + Offset = TFI->resolveFrameIndexReference( + MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); + } + // Modify MI as necessary to handle as much of 'Offset' as possible - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) return; @@ -519,3 +534,13 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return 16; } } + +unsigned AArch64RegisterInfo::getLocalAddressRegister( + const MachineFunction &MF) const { + const auto &MFI = MF.getFrameInfo(); + if (!MF.hasEHFunclets() && !MFI.hasVarSizedObjects()) + return AArch64::SP; + else if (needsStackRealignment(MF)) + return getBaseRegister(); + return getFrameRegister(MF); +} diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h index c4153228a7c0..2c3f82c530d8 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -1,9 +1,8 @@ //==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -114,7 +113,7 @@ public: unsigned getBaseRegister() const; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -122,6 +121,8 @@ public: bool trackLivenessAfterRegAlloc(const MachineFunction&) const override { return true; } + + unsigned getLocalAddressRegister(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index d3710cea0687..61fc0795c242 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1,9 +1,8 @@ //=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -188,6 +187,10 @@ def GPR64z : RegisterOperand<GPR64> { let GIZeroRegister = XZR; } +// GPR argument registers. +def GPR32arg : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 7)>; +def GPR64arg : RegisterClass<"AArch64", [i64], 64, (sequence "X%u", 0, 7)>; + // GPR register classes which include WZR/XZR AND SP/WSP. This is not a // constraint used by any instructions, it is used as a common super-class. def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>; @@ -206,6 +209,11 @@ def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X2 // BTI-protected function. def rtcGPR64 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>; +// Register set that excludes registers that are reserved for procedure calls. +// This is used for pseudo-instructions that are actually implemented using a +// procedure call. +def GPR64noip : RegisterClass<"AArch64", [i64], 64, (sub GPR64, X16, X17, LR)>; + // GPR register classes for post increment amount of vector load/store that // has alternate printing when Rm=31 and prints a constant immediate value // equal to the total number of bytes transferred.
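The GPR32arg/GPR64arg classes above exist so that isAnyArgRegReserved, earlier in this patch, can scan a TableGen-emitted register list instead of a hard-coded X0-X7 table. A minimal sketch of that any-of scan, with illustrative register numbers (the real MCPhysReg values come from the generated AArch64GenRegisterInfo tables):

#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>

using MCPhysReg = uint16_t;

// Stand-in for the emitted GPR64arg register list (X0..X7); the values
// 0..7 are purely illustrative, not the real AArch64 register encodings.
constexpr std::array<MCPhysReg, 8> GPR64argRegs = {0, 1, 2, 3, 4, 5, 6, 7};

// Pretend the target reserved X3 (value 3 in this toy numbering), as a
// platform might with something like -ffixed-x3.
bool isReservedReg(MCPhysReg R) { return R == 3; }

bool isAnyArgRegReserved() {
  return std::any_of(GPR64argRegs.begin(), GPR64argRegs.end(),
                     [](MCPhysReg R) { return isReservedReg(R); });
}

int main() {
  std::cout << isAnyArgRegReserved() << '\n'; // prints 1: X3 is reserved
}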
@@ -649,10 +657,12 @@ def FPR128Op : RegisterOperand { // ARMv8.1a atomic CASP register operands -def WSeqPairs : RegisterTuples<[sube32, subo32], - [(rotl GPR32, 0), (rotl GPR32, 1)]>; -def XSeqPairs : RegisterTuples<[sube64, subo64], - [(rotl GPR64, 0), (rotl GPR64, 1)]>; +def WSeqPairs : RegisterTuples<[sube32, subo32], + [(decimate (rotl GPR32, 0), 2), + (decimate (rotl GPR32, 1), 2)]>; +def XSeqPairs : RegisterTuples<[sube64, subo64], + [(decimate (rotl GPR64, 0), 2), + (decimate (rotl GPR64, 1), 2)]>; def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32, (add WSeqPairs)>{ diff --git a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index af555f6d2266..854670079e40 100644 --- a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -1,8 +1,7 @@ // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td index 0fde68011e86..79ab42f4c080 100644 --- a/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1,9 +1,8 @@ //=- AArch64SVEInstrInfo.td - AArch64 SVE Instructions -*- tablegen -*-----=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -26,10 +25,10 @@ let Predicates = [HasSVE] in { defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub">; defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub">; - def AND_ZZZ : sve_int_bin_cons_log<0b00, "and">; - def ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">; - def EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">; - def BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">; + defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and">; + defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">; + defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">; + defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">; defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add">; defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub">; @@ -876,10 +875,10 @@ let Predicates = [HasSVE] in { defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">; // Predicated shifts - defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b000, "asr">; - defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b001, "lsr">; - defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b011, "lsl">; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b100, "asrd">; + defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">; + defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">; + defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd">; defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">; defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">; @@ -1022,3 +1021,406 @@ let Predicates = [HasSVE] in { def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn", (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>; } + +let Predicates = [HasSVE2] in { + // SVE2 integer multiply-add (indexed) + defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla">; + defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls">; + + // SVE2 saturating multiply-add high (indexed) + defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah">; + defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh">; + + // SVE2 saturating multiply-add high (vectors, unpredicated) + defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah">; + defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh">; + + // SVE2 integer multiply (indexed) + defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul">; + + // SVE2 saturating multiply high (indexed) + defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh">; + defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh">; + + // SVE2 signed saturating doubling multiply high (unpredicated) + defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh">; + defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh">; + + // SVE2 integer multiply vectors (unpredicated) + defm MUL_ZZZ : sve2_int_mul<0b000, "mul">; + defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh">; + defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh">; + def PMUL_ZZZ_B : sve2_int_mul<0b00, 0b001, "pmul", ZPR8>; + + // SVE2 complex integer dot product (indexed) + defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot">; + + // SVE2 complex integer dot product + defm CDOT_ZZZ : sve2_cintx_dot<"cdot">; + + // SVE2 complex integer multiply-add (indexed) + defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla">; + // SVE2 complex saturating multiply-add (indexed) + defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah">; + + // SVE2 complex integer multiply-add 
+ defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla">; + defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">; + + // SVE2 integer multiply long (indexed) + defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">; + defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">; + defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">; + defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">; + + // SVE2 saturating multiply (indexed) + defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">; + defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">; + + // SVE2 integer multiply-add long (indexed) + defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb">; + defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt">; + defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb">; + defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt">; + defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb">; + defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt">; + defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb">; + defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt">; + + // SVE2 integer multiply-add long (vectors, unpredicated) + defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb">; + defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt">; + defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb">; + defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt">; + defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb">; + defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt">; + defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb">; + defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt">; + + // SVE2 saturating multiply-add long (indexed) + defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb">; + defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt">; + defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb">; + defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt">; + + // SVE2 saturating multiply-add long (vectors, unpredicated) + defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb">; + defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt">; + defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb">; + defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt">; + + // SVE2 saturating multiply-add interleaved long + defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt">; + defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">; + + // SVE2 integer halving add/subtract (predicated) + defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd">; + defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd">; + defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub">; + defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub">; + defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd">; + defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd">; + defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr">; + defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr">; + + // SVE2 integer pairwise add and accumulate long + defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp">; + defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp">; + + // SVE2 integer pairwise arithmetic + defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp">; + defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp">; + defm 
UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp">; + defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp">; + defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp">; + + // SVE2 integer unary operations (predicated) + defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">; + defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte">; + defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs">; + defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">; + + // SVE2 saturating add/subtract + defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd">; + defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd">; + defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub">; + defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub">; + defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd">; + defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd">; + defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr">; + defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr">; + + // SVE2 saturating/rounding bitwise shift left (predicated) + defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl">; + defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl">; + defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr">; + defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr">; + defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl">; + defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl">; + defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl">; + defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl">; + defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr">; + defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr">; + defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">; + defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">; + + // SVE2 integer add/subtract long + defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">; + defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">; + defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb">; + defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt">; + defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb">; + defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt">; + defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb">; + defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt">; + defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb">; + defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt">; + defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb">; + defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt">; + + // SVE2 integer add/subtract wide + defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">; + defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">; + defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">; + defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">; + defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">; + defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">; + defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">; + defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">; + + // SVE2 integer multiply long + defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb">; + defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt">; + defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb">; + defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt">; + defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb">; + defm UMULLT_ZZZ : 
sve2_wide_int_arith_long<0b11111, "umullt">; + defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb">; + defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">; + + // SVE2 bitwise shift and insert + defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">; + defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">; + + // SVE2 bitwise shift right and accumulate + defm SSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">; + defm USRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">; + defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">; + defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">; + + // SVE2 complex integer add + defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">; + defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd">; + + // SVE2 integer absolute difference and accumulate + defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba">; + defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba">; + + // SVE2 integer absolute difference and accumulate long + defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb">; + defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt">; + defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb">; + defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt">; + + // SVE2 integer add/subtract long with carry + defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb">; + defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt">; + defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">; + defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">; + + // SVE2 bitwise shift right narrow + defm SQSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">; + defm SQSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">; + defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">; + defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">; + defm SHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">; + defm SHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">; + defm RSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">; + defm RSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">; + defm SQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">; + defm SQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">; + defm SQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">; + defm SQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">; + defm UQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">; + defm UQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">; + defm UQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">; + defm UQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">; + + // SVE2 integer add/subtract narrow high part + defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b000, "addhnb">; + defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b001, "addhnt">; + defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">; + defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">; + defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b100, "subhnb">; + defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b101, "subhnt">; + defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">; + defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">; + + // SVE2 saturating extract narrow + defm 
SQXTNB_ZZ : sve2_int_sat_extract_narrow<0b000, "sqxtnb">; + defm SQXTNT_ZZ : sve2_int_sat_extract_narrow<0b001, "sqxtnt">; + defm UQXTNB_ZZ : sve2_int_sat_extract_narrow<0b010, "uqxtnb">; + defm UQXTNT_ZZ : sve2_int_sat_extract_narrow<0b011, "uqxtnt">; + defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">; + defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">; + + // SVE2 character match + defm MATCH_PPzZZ : sve2_char_match<0b0, "match">; + defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch">; + + // SVE2 bitwise exclusive-or interleaved + defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt">; + defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">; + + // SVE2 bitwise shift left long + defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">; + defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">; + defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">; + defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">; + + // SVE2 integer add/subtract interleaved long + defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">; + defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt">; + defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb">; + + // SVE2 histogram generation (segment) + def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg">; + + // SVE2 histogram generation (vector) + defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">; + + // SVE2 floating-point convert precision + defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">; + defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">; + defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">; + + // SVE2 floating-point pairwise operations + defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">; + defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp">; + defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp">; + defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp">; + defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp">; + + // SVE2 floating-point multiply-add long (indexed) + def FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb">; + def FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt">; + def FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb">; + def FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt">; + + // SVE2 floating-point multiply-add long + def FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb">; + def FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt">; + def FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb">; + def FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt">; + + // SVE2 bitwise ternary operations + defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">; + defm BCAX_ZZZZ_D : sve2_int_bitwise_ternary_op<0b010, "bcax">; + def BSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b001, "bsl">; + def BSL1N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b011, "bsl1n">; + def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">; + def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">; + + // sve_int_rotate_imm + defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">; + + // SVE2 extract vector (immediate offset, constructive) + def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; + + // SVE floating-point convert precision + def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>; + + // SVE floating-point convert to integer + defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">; + + // Non-temporal 
contiguous loads (vector + register) + defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; + defm LDNT1B_ZZR_S : sve2_mem_cldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; + defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; + defm LDNT1H_ZZR_S : sve2_mem_cldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; + defm LDNT1W_ZZR_S : sve2_mem_cldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; + + defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; + defm LDNT1B_ZZR_D : sve2_mem_cldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; + defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; + defm LDNT1H_ZZR_D : sve2_mem_cldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; + defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; + defm LDNT1W_ZZR_D : sve2_mem_cldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; + defm LDNT1D_ZZR_D : sve2_mem_cldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + + // SVE2 vector splice (constructive) + defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; + + // Predicated shifts + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">; + defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">; + + // Non-temporal contiguous stores (vector + register) + defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; + defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; + defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; + + defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; + defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; + defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; + defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + + // SVE table lookup (three sources) + defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">; + defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">; + + // SVE integer compare scalar count and limit + defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">; + defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">; + defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">; + defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi">; + + defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege">; + defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt">; + defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">; + defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">; + + // SVE pointer conflict compare + defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">; + defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">; +} + +let Predicates = [HasSVE2AES] in { + // SVE2 crypto destructive binary operations + def AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8>; + def AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8>; + + // SVE2 crypto unary operations + def AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc">; + def AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc">; + + // PMULLB and PMULLT instructions which operate with 64-bit source and + // 128-bit destination elements are enabled with crypto extensions, similar + // to NEON PMULL2 instruction. 
+ def PMULLB_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11010, "pmullb", + ZPR128, ZPR64, ZPR64>; + def PMULLT_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11011, "pmullt", + ZPR128, ZPR64, ZPR64>; +} + +let Predicates = [HasSVE2SM4] in { + // SVE2 crypto constructive binary operations + def SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32>; + // SVE2 crypto destructive binary operations + def SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32>; +} + +let Predicates = [HasSVE2SHA3] in { + // SVE2 crypto constructive binary operations + def RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64>; +} + +let Predicates = [HasSVE2BitPerm] in { + // SVE2 bitwise permute + defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext">; + defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep">; + defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp">; +} diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td index f253a4f3e25a..a6df0f3f083c 100644 --- a/lib/Target/AArch64/AArch64SchedA53.td +++ b/lib/Target/AArch64/AArch64SchedA53.td @@ -1,9 +1,8 @@ //==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,7 +26,7 @@ def CortexA53Model : SchedMachineModel { // v 1.0 Spreadsheet let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; } diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td index ade03f23f8c7..9f566d1c7079 100644 --- a/lib/Target/AArch64/AArch64SchedA57.td +++ b/lib/Target/AArch64/AArch64SchedA57.td @@ -1,9 +1,8 @@ //=- AArch64SchedA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -32,7 +31,7 @@ def CortexA57Model : SchedMachineModel { let LoopMicroOpBufferSize = 16; let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/lib/Target/AArch64/AArch64SchedA57WriteRes.td index 55005e1d9ed1..987ed3c4ebfb 100644 --- a/lib/Target/AArch64/AArch64SchedA57WriteRes.td +++ b/lib/Target/AArch64/AArch64SchedA57WriteRes.td @@ -1,9 +1,8 @@ //=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td index 7a474ba8ef9b..798ecb7508c0 100644 --- a/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/lib/Target/AArch64/AArch64SchedCyclone.td @@ -1,9 +1,8 @@ //=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -19,7 +18,7 @@ def CycloneModel : SchedMachineModel { let MispredictPenalty = 16; // 14-19 cycles are typical. let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64SchedExynosM1.td b/lib/Target/AArch64/AArch64SchedExynosM1.td index f757d53b6c1c..f1e76e2c20d3 100644 --- a/lib/Target/AArch64/AArch64SchedExynosM1.td +++ b/lib/Target/AArch64/AArch64SchedExynosM1.td @@ -1,9 +1,8 @@ //=- AArch64SchedExynosM1.td - Samsung Exynos M1 Sched Defs --*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,7 +24,7 @@ def ExynosM1Model : SchedMachineModel { let MispredictPenalty = 14; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64SchedExynosM3.td b/lib/Target/AArch64/AArch64SchedExynosM3.td index 15935088a17e..c9d29d75d9db 100644 --- a/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -1,9 +1,8 @@ //=- AArch64SchedExynosM3.td - Samsung Exynos M3 Sched Defs --*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,7 +24,7 @@ def ExynosM3Model : SchedMachineModel { let MispredictPenalty = 16; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64SchedExynosM4.td b/lib/Target/AArch64/AArch64SchedExynosM4.td index 4d892465b3f2..c8bf05f16131 100644 --- a/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -1,9 +1,8 @@ //=- AArch64SchedExynosM4.td - Samsung Exynos M4 Sched Defs --*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,7 +24,7 @@ def ExynosM4Model : SchedMachineModel { let MispredictPenalty = 16; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; } //===----------------------------------------------------------------------===// @@ -239,7 +238,6 @@ def M4WriteNEONK : SchedWriteRes<[M4UnitNSHF, M4UnitS0]> { let Latency = 5; let NumMicroOps = 2; } def M4WriteNEONL : SchedWriteRes<[M4UnitNMUL]> { let Latency = 3; } -def M4WriteNEONM : SchedWriteRes<[M4UnitNMUL]> { let Latency = 3; } def M4WriteNEONN : SchedWriteRes<[M4UnitNMSC, M4UnitNMSC]> { let Latency = 5; let NumMicroOps = 2; } @@ -480,8 +478,6 @@ def M4WriteCOPY : SchedWriteVariant<[SchedVar, SchedVar]>; def M4WriteMOVI : SchedWriteVariant<[SchedVar, SchedVar]>; -def M4WriteMULL : SchedWriteVariant<[SchedVar, - SchedVar]>; // Fast forwarding. def M4ReadAESM1 : SchedReadAdvance<+1, [M4WriteNCRY1]>; @@ -489,7 +485,8 @@ def M4ReadFMACM1 : SchedReadAdvance<+1, [M4WriteFMAC4, M4WriteFMAC4H, M4WriteFMAC5]>; def M4ReadNMULM1 : SchedReadAdvance<+1, [M4WriteNMUL3]>; -def M4ReadMULLP2 : SchedReadAdvance<-2, [M4WriteNEONM]>; +def M4ReadNMULP2 : SchedReadAdvance<-2, [M4WriteNMUL3]>; + //===----------------------------------------------------------------------===// // Coarse scheduling model. @@ -662,10 +659,8 @@ def : InstRW<[M4WriteNEONK], (instregex "^FMOVDXHighr")>; def : InstRW<[M4WriteFCVT3H], (instregex "^F(RECP|RSQRT)Ev1f16")>; def : InstRW<[M4WriteFCVT3], (instregex "^F(RECP|RSQRT)Ev1i(32|64)")>; def : InstRW<[M4WriteNMSC1], (instregex "^FRECPXv1")>; -def : InstRW<[M4WriteFMAC4H, - M4ReadFMACM1], (instregex "^F(RECP|RSQRT)S16")>; -def : InstRW<[M4WriteFMAC4, - M4ReadFMACM1], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[M4WriteFMAC4H], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[M4WriteFMAC4], (instregex "^F(RECP|RSQRT)S(32|64)")>; // FP load instructions.
def : InstRW<[WriteVLD], (instregex "^LDR[SDQ]l")>; @@ -736,14 +731,20 @@ def : InstRW<[M4WriteNALU1], (instregex "^(AND|BIC|EOR|NOT|ORN|ORR)v")>; def : InstRW<[M4WriteNMSC1], (instregex "^[SU](MIN|MAX)v")>; def : InstRW<[M4WriteNMSC2], (instregex "^[SU](MIN|MAX)Pv")>; def : InstRW<[M4WriteNHAD3], (instregex "^[SU](MIN|MAX)Vv")>; -def : InstRW<[M4WriteNMUL3], (instregex "^(SQR?D)?MULH?v")>; def : InstRW<[M4WriteNMUL3, M4ReadNMULM1], (instregex "^ML[AS]v")>; -def : InstRW<[M4WriteNMUL3], (instregex "^SQRDML[AS]H")>; -def : InstRW<[M4WriteMULL, - M4ReadMULLP2], (instregex "^(S|U|SQD)ML[AS]Lv")>; -def : InstRW<[M4WriteMULL, - M4ReadMULLP2], (instregex "^(S|U|SQD)MULLv")>; +def : InstRW<[M4WriteNMUL3, + M4ReadNMULM1], (instregex "^(SQR?D)?MULH?v")>; +def : InstRW<[M4WriteNMUL3, + M4ReadNMULM1], (instregex "^SQRDML[AS]H")>; +def : InstRW<[M4WriteNMUL3, + M4ReadNMULM1], (instregex "^(S|U|SQD)ML[AS]L(v1(i32|i64)|v2i32|v4i16|v8i8)")>; +def : InstRW<[M4WriteNMUL3, + M4ReadNMULP2], (instregex "^(S|U|SQD)ML[AS]L(v4i32|v8i16|v16i8)")>; +def : InstRW<[M4WriteNMUL3, + M4ReadNMULM1], (instregex "^(S|U|SQD)MULL(v1(i32|i64)|v2i32|v4i16|v8i8)")>; +def : InstRW<[M4WriteNMUL3, + M4ReadNMULP2], (instregex "^(S|U|SQD)MULL(v4i32|v8i16|v16i8)")>; def : InstRW<[M4WriteNMUL3], (instregex "^[SU]DOT(lane)?v")>; def : InstRW<[M4WriteNHAD3], (instregex "^[SU]ADALPv")>; def : InstRW<[M4WriteNSHT4A], (instregex "^[SU]R?SRA[dv]")>; @@ -808,10 +809,8 @@ def : InstRW<[M4WriteNALU1], (instregex "^FMOVv.f(32|64)")>; def : InstRW<[M4WriteFCVT3H], (instregex "^F(RECP|RSQRT)Ev[248]f16")>; def : InstRW<[M4WriteFCVT3], (instregex "^F(RECP|RSQRT)Ev[248]f(32|64)")>; def : InstRW<[M4WriteFCVT3], (instregex "^U(RECP|RSQRT)Ev[24]i32")>; -def : InstRW<[M4WriteFMAC4H, - M4ReadFMACM1], (instregex "^F(RECP|RSQRT)Sv.f16")>; -def : InstRW<[M4WriteFMAC4, - M4ReadFMACM1], (instregex "^F(RECP|RSQRT)Sv.f(32|64)")>; +def : InstRW<[M4WriteFMAC4H], (instregex "^F(RECP|RSQRT)Sv.f16")>; +def : InstRW<[M4WriteFMAC4], (instregex "^F(RECP|RSQRT)Sv.f(32|64)")>; def : InstRW<[M4WriteNSHF1], (instregex "^REV(16|32|64)v")>; def : InstRW<[M4WriteNSHFA], (instregex "^TB[LX]v(8|16)i8One")>; def : InstRW<[M4WriteNSHFB], (instregex "^TB[LX]v(8|16)i8Two")>; diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td index 84825458e47c..92d03963de57 100644 --- a/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/lib/Target/AArch64/AArch64SchedFalkor.td @@ -1,9 +1,8 @@ //==- AArch64SchedFalkor.td - Falkor Scheduling Definitions -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -24,7 +23,7 @@ def FalkorModel : SchedMachineModel { let MispredictPenalty = 11; // Minimum branch misprediction penalty. let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; // FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0; diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index ff14e639d1a5..697a0f69c58c 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -1,9 +1,8 @@ //==- AArch64SchedFalkorDetails.td - Falkor Scheduling Defs -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td index 68de3e077c96..0e1a24103121 100644 --- a/lib/Target/AArch64/AArch64SchedKryo.td +++ b/lib/Target/AArch64/AArch64SchedKryo.td @@ -1,9 +1,8 @@ //==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -28,7 +27,7 @@ def KryoModel : SchedMachineModel { let LoopMicroOpBufferSize = 16; let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td index cf4cdabb8cbf..4c60992e6351 100644 --- a/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -1,9 +1,8 @@ //=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64SchedPredExynos.td b/lib/Target/AArch64/AArch64SchedPredExynos.td index 48c54230e9d8..0c1d82d354c0 100644 --- a/lib/Target/AArch64/AArch64SchedPredExynos.td +++ b/lib/Target/AArch64/AArch64SchedPredExynos.td @@ -1,9 +1,8 @@ //===- AArch64SchedPredExynos.td - AArch64 Sched Preds -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -103,17 +102,6 @@ def ExynosScaledIdxPred : MCSchedPredicate<ExynosScaledIdxFn>; // Identify FP instructions. def ExynosFPPred : MCSchedPredicate<CheckAny<[CheckHForm, CheckSForm, CheckDForm, CheckQForm]>>; -// Identify whether an instruction whose result is a long vector -// operates on the upper half of the input registers. -def ExynosLongVectorUpperFn : TIIPredicate< - "isExynosLongVectorUpper", - MCOpcodeSwitchStatement< - [MCOpcodeSwitchCase< - IsLongVectorUpperOp.ValidOpcodes, - MCReturnStatement<TruePred>>], - MCReturnStatement<FalsePred>>>; -def ExynosLongVectorUpperPred : MCSchedPredicate<ExynosLongVectorUpperFn>; - // Identify 128-bit NEON instructions. def ExynosQFormPred : MCSchedPredicate<CheckQForm>; diff --git a/lib/Target/AArch64/AArch64SchedPredicates.td b/lib/Target/AArch64/AArch64SchedPredicates.td index dbaf11fc95dd..0ef0f3f8675a 100644 --- a/lib/Target/AArch64/AArch64SchedPredicates.td +++ b/lib/Target/AArch64/AArch64SchedPredicates.td @@ -1,9 +1,8 @@ //===- AArch64SchedPredicates.td - AArch64 Sched Preds -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -268,59 +267,6 @@ def IsStoreRegOffsetOp : CheckOpcode<[STRBBroW, STRBBroX, def IsLoadStoreRegOffsetOp : CheckOpcode<!listconcat(IsLoadRegOffsetOp.ValidOpcodes, IsStoreRegOffsetOp.ValidOpcodes)>; -// Identify whether an instruction whose result is a long vector -// operates on the upper half of the input registers. -def IsLongVectorUpperOp : CheckOpcode<[FCVTLv8i16, FCVTLv4i32, - FCVTNv8i16, FCVTNv4i32, - FCVTXNv4f32, - PMULLv16i8, PMULLv2i64, - RADDHNv8i16_v16i8, RADDHNv4i32_v8i16, RADDHNv2i64_v4i32, - RSHRNv16i8_shift, RSHRNv8i16_shift, RSHRNv4i32_shift, - RSUBHNv8i16_v16i8, RSUBHNv4i32_v8i16, RSUBHNv2i64_v4i32, - SABALv16i8_v8i16, SABALv8i16_v4i32, SABALv4i32_v2i64, - SABDLv16i8_v8i16, SABDLv8i16_v4i32, SABDLv4i32_v2i64, - SADDLv16i8_v8i16, SADDLv8i16_v4i32, SADDLv4i32_v2i64, - SADDWv16i8_v8i16, SADDWv8i16_v4i32, SADDWv4i32_v2i64, - SHLLv16i8, SHLLv8i16, SHLLv4i32, - SHRNv16i8_shift, SHRNv8i16_shift, SHRNv4i32_shift, - SMLALv16i8_v8i16, SMLALv8i16_v4i32, SMLALv4i32_v2i64, - SMLALv8i16_indexed, SMLALv4i32_indexed, - SMLSLv16i8_v8i16, SMLSLv8i16_v4i32, SMLSLv4i32_v2i64, - SMLSLv8i16_indexed, SMLSLv4i32_indexed, - SMULLv16i8_v8i16, SMULLv8i16_v4i32, SMULLv4i32_v2i64, - SMULLv8i16_indexed, SMULLv4i32_indexed, - SQDMLALv8i16_v4i32, SQDMLALv4i32_v2i64, - SQDMLALv8i16_indexed, SQDMLALv4i32_indexed, - SQDMLSLv8i16_v4i32, SQDMLSLv4i32_v2i64, - SQDMLSLv8i16_indexed, SQDMLSLv4i32_indexed, - SQDMULLv8i16_v4i32, SQDMULLv4i32_v2i64, - SQDMULLv8i16_indexed, SQDMULLv4i32_indexed, - SQRSHRNv16i8_shift, SQRSHRNv8i16_shift, SQRSHRNv4i32_shift, - SQRSHRUNv16i8_shift, SQRSHRUNv8i16_shift, SQRSHRUNv4i32_shift, - SQSHRNv16i8_shift, SQSHRNv8i16_shift, SQSHRNv4i32_shift, - SQSHRUNv16i8_shift, SQSHRUNv8i16_shift, SQSHRUNv4i32_shift, - SQXTNv16i8, SQXTNv8i16, SQXTNv4i32, - SQXTUNv16i8, SQXTUNv8i16, SQXTUNv4i32, - SSHLLv16i8_shift, SSHLLv8i16_shift, SSHLLv4i32_shift, - SSUBLv16i8_v8i16, SSUBLv8i16_v4i32, SSUBLv4i32_v2i64, - SSUBWv16i8_v8i16, SSUBWv8i16_v4i32, SSUBWv4i32_v2i64, - UABALv16i8_v8i16, UABALv8i16_v4i32, UABALv4i32_v2i64, - UABDLv16i8_v8i16, UABDLv8i16_v4i32,
UABDLv4i32_v2i64, - UADDLv16i8_v8i16, UADDLv8i16_v4i32, UADDLv4i32_v2i64, - UADDWv16i8_v8i16, UADDWv8i16_v4i32, UADDWv4i32_v2i64, - UMLALv16i8_v8i16, UMLALv8i16_v4i32, UMLALv4i32_v2i64, - UMLALv8i16_indexed, UMLALv4i32_indexed, - UMLSLv16i8_v8i16, UMLSLv8i16_v4i32, UMLSLv4i32_v2i64, - UMLSLv8i16_indexed, UMLSLv4i32_indexed, - UMULLv16i8_v8i16, UMULLv8i16_v4i32, UMULLv4i32_v2i64, - UMULLv8i16_indexed, UMULLv4i32_indexed, - UQSHRNv16i8_shift, UQSHRNv8i16_shift, UQSHRNv4i32_shift, - UQXTNv16i8, UQXTNv8i16, UQXTNv4i32, - USHLLv16i8_shift, USHLLv8i16_shift, USHLLv4i32_shift, - USUBLv16i8_v8i16, USUBLv8i16_v4i32, USUBLv4i32_v2i64, - USUBWv16i8_v8i16, USUBWv8i16_v4i32, USUBWv4i32_v2i64, - XTNv16i8, XTNv8i16, XTNv4i32]>; - // Target predicates. // Identify an instruction that effectively transfers a register to another. diff --git a/lib/Target/AArch64/AArch64SchedThunderX.td b/lib/Target/AArch64/AArch64SchedThunderX.td index fbbd3850d0fd..3b6aecf5c035 100644 --- a/lib/Target/AArch64/AArch64SchedThunderX.td +++ b/lib/Target/AArch64/AArch64SchedThunderX.td @@ -1,9 +1,8 @@ //==- AArch64SchedThunderX.td - Cavium ThunderX T8X Scheduling Definitions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -26,7 +25,7 @@ def ThunderXT8XModel : SchedMachineModel { let PostRAScheduler = 1; // Use PostRA scheduler. let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; diff --git a/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td index bee3392b6d3b..674ea19b082f 100644 --- a/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -1,9 +1,8 @@ //=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 ---*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -26,7 +25,7 @@ def ThunderX2T99Model : SchedMachineModel { let PostRAScheduler = 1; // Using PostRA sched. let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = SVEUnsupported.F; // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td index f55ba4d42fce..49c0c1782236 100644 --- a/lib/Target/AArch64/AArch64Schedule.td +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -1,9 +1,8 @@ //==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index a719d47618e5..60dbace03ca6 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -1,9 +1,8 @@ //===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -57,3 +56,91 @@ bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner( CodeGenOpt::Level OptLevel) const { return OptLevel >= CodeGenOpt::Aggressive; } + +static const int kSetTagLoopThreshold = 176; + +static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Ptr, uint64_t ObjSize, + const MachineMemOperand *BaseMemOperand, + bool ZeroData) { + MachineFunction &MF = DAG.getMachineFunction(); + unsigned ObjSizeScaled = ObjSize / 16; + + SDValue TagSrc = Ptr; + if (Ptr.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Ptr)->getIndex(); + Ptr = DAG.getTargetFrameIndex(FI, MVT::i64); + // A frame index operand may end up as [SP + offset] => it is fine to use SP + // register as the tag source. + TagSrc = DAG.getRegister(AArch64::SP, MVT::i64); + } + + const unsigned OpCode1 = ZeroData ? AArch64ISD::STZG : AArch64ISD::STG; + const unsigned OpCode2 = ZeroData ?
AArch64ISD::STZ2G : AArch64ISD::ST2G; + + SmallVector<SDValue, 1> OutChains; + unsigned OffsetScaled = 0; + while (OffsetScaled < ObjSizeScaled) { + if (ObjSizeScaled - OffsetScaled >= 2) { + SDValue AddrNode = DAG.getMemBasePlusOffset(Ptr, OffsetScaled * 16, dl); + SDValue St = DAG.getMemIntrinsicNode( + OpCode2, dl, DAG.getVTList(MVT::Other), + {Chain, TagSrc, AddrNode}, + MVT::v4i64, + MF.getMachineMemOperand(BaseMemOperand, OffsetScaled * 16, 16 * 2)); + OffsetScaled += 2; + OutChains.push_back(St); + continue; + } + + if (ObjSizeScaled - OffsetScaled > 0) { + SDValue AddrNode = DAG.getMemBasePlusOffset(Ptr, OffsetScaled * 16, dl); + SDValue St = DAG.getMemIntrinsicNode( + OpCode1, dl, DAG.getVTList(MVT::Other), + {Chain, TagSrc, AddrNode}, + MVT::v2i64, + MF.getMachineMemOperand(BaseMemOperand, OffsetScaled * 16, 16)); + OffsetScaled += 1; + OutChains.push_back(St); + } + } + + SDValue Res = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); + return Res; +} + +SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Addr, + SDValue Size, MachinePointerInfo DstPtrInfo, bool ZeroData) const { + uint64_t ObjSize = cast<ConstantSDNode>(Size)->getZExtValue(); + assert(ObjSize % 16 == 0); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *BaseMemOperand = MF.getMachineMemOperand( + DstPtrInfo, MachineMemOperand::MOStore, ObjSize, 16); + + bool UseSetTagRangeLoop = + kSetTagLoopThreshold >= 0 && (int)ObjSize >= kSetTagLoopThreshold; + if (!UseSetTagRangeLoop) + return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand, + ZeroData); + + if (ObjSize % 32 != 0) { + SDNode *St1 = DAG.getMachineNode( + ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl, + {MVT::i64, MVT::Other}, + {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain}); + DAG.setNodeMemRefs(cast<MachineSDNode>(St1), {BaseMemOperand}); + ObjSize -= 16; + Addr = SDValue(St1, 0); + Chain = SDValue(St1, 1); + } + + const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other}; + SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain}; + SDNode *St = DAG.getMachineNode( + ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops); + + DAG.setNodeMemRefs(cast<MachineSDNode>(St), {BaseMemOperand}); + return SDValue(St, 2); +} diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 7e4f11091226..d0967fb973cc 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -1,9 +1,8 @@ //===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -24,6 +23,10 @@ public: SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const override; + SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Op1, SDValue Op2, + MachinePointerInfo DstPtrInfo, + bool ZeroData) const override; bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override; }; } diff --git a/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/lib/Target/AArch64/AArch64SpeculationHardening.cpp index e9699b0367d3..3087e6ce441d 100644 --- a/lib/Target/AArch64/AArch64SpeculationHardening.cpp +++ b/lib/Target/AArch64/AArch64SpeculationHardening.cpp @@ -1,9 +1,8 @@ //===- AArch64SpeculationHardening.cpp - Harden Against Missspeculation --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -103,6 +102,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" @@ -146,25 +146,31 @@ private: BitVector RegsAlreadyMasked; bool functionUsesHardeningRegister(MachineFunction &MF) const; - bool instrumentControlFlow(MachineBasicBlock &MBB); + bool instrumentControlFlow(MachineBasicBlock &MBB, + bool &UsesFullSpeculationBarrier); bool endsWithCondControlFlow(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, AArch64CC::CondCode &CondCode) const; void insertTrackingCode(MachineBasicBlock &SplitEdgeBB, AArch64CC::CondCode &CondCode, DebugLoc DL) const; - void insertSPToRegTaintPropagation(MachineBasicBlock *MBB, + void insertSPToRegTaintPropagation(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const; - void insertRegToSPTaintPropagation(MachineBasicBlock *MBB, + void insertRegToSPTaintPropagation(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned TmpReg) const; + void insertFullSpeculationBarrier(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const; bool slhLoads(MachineBasicBlock &MBB); bool makeGPRSpeculationSafe(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineInstr &MI, unsigned Reg); - bool lowerSpeculationSafeValuePseudos(MachineBasicBlock &MBB); + bool lowerSpeculationSafeValuePseudos(MachineBasicBlock &MBB, + bool UsesFullSpeculationBarrier); bool expandSpeculationSafeValue(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI); + MachineBasicBlock::iterator MBBI, + bool UsesFullSpeculationBarrier); bool insertCSDB(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL); }; @@ -207,15 +213,19 @@ bool AArch64SpeculationHardening::endsWithCondControlFlow( return true; } +void AArch64SpeculationHardening::insertFullSpeculationBarrier( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL) const { + // A full control flow speculation barrier consists of (DSB SYS + ISB) + 
BuildMI(MBB, MBBI, DL, TII->get(AArch64::DSB)).addImm(0xf); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ISB)).addImm(0xf); +} + void AArch64SpeculationHardening::insertTrackingCode( MachineBasicBlock &SplitEdgeBB, AArch64CC::CondCode &CondCode, DebugLoc DL) const { if (UseControlFlowSpeculationBarrier) { - // insert full control flow speculation barrier (DSB SYS + ISB) - BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::ISB)) .addImm(0xf); - BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::DSB)) .addImm(0xf); + insertFullSpeculationBarrier(SplitEdgeBB, SplitEdgeBB.begin(), DL); } else { BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::CSELXr)) .addDef(MisspeculatingTaintReg) @@ -227,7 +237,7 @@ void AArch64SpeculationHardening::insertTrackingCode( } bool AArch64SpeculationHardening::instrumentControlFlow( - MachineBasicBlock &MBB) { + MachineBasicBlock &MBB, bool &UsesFullSpeculationBarrier) { LLVM_DEBUG(dbgs() << "Instrument control flow tracking on MBB: " << MBB); bool Modified = false; @@ -263,55 +273,105 @@ bool AArch64SpeculationHardening::instrumentControlFlow( } // Perform correct code generation around function calls and before returns. - { - SmallVector<MachineInstr *, 4> ReturnInstructions; - SmallVector<MachineInstr *, 4> CallInstructions; + // The below variables record the return/terminator instructions and the call + // instructions respectively, including which register is available as a + // temporary register just before the recorded instructions. + SmallVector<std::pair<MachineInstr *, unsigned>, 4> ReturnInstructions; + SmallVector<std::pair<MachineInstr *, unsigned>, 4> CallInstructions; + // If a temporary register is not available for at least one of the + // instructions for which we need to transfer taint to the stack pointer, we + // need to insert a full speculation barrier. + // TmpRegisterNotAvailableEverywhere tracks that condition. + bool TmpRegisterNotAvailableEverywhere = false; + + RegScavenger RS; + RS.enterBasicBlock(MBB); + + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); I++) { + MachineInstr &MI = *I; + if (!MI.isReturn() && !MI.isCall()) + continue; - for (MachineInstr &MI : MBB) { - if (MI.isReturn()) - ReturnInstructions.push_back(&MI); - else if (MI.isCall()) - CallInstructions.push_back(&MI); - } + // The RegScavenger represents registers available *after* the MI + // instruction pointed to by RS.getCurrentPosition(). + // We need to have a register that is available *before* the MI is executed. + if (I != MBB.begin()) + RS.forward(std::prev(I)); + // FIXME: The below just finds *an* unused register. Maybe code could be + // optimized more if this looks for the register that isn't used for the + // longest time around this place, to enable more scheduling freedom. Not + // sure if that would actually result in a big performance difference + // though. Maybe RegisterScavenger::findSurvivorBackwards has some logic + // already to do this - but it's unclear if that could easily be used here. + unsigned TmpReg = RS.FindUnusedReg(&AArch64::GPR64commonRegClass); + LLVM_DEBUG(dbgs() << "RS finds " + << ((TmpReg == 0) ?
"no register " : "register "); + if (TmpReg != 0) dbgs() << printReg(TmpReg, TRI) << " "; + dbgs() << "to be available at MI " << MI); + if (TmpReg == 0) + TmpRegisterNotAvailableEverywhere = true; + if (MI.isReturn()) + ReturnInstructions.push_back({&MI, TmpReg}); + else if (MI.isCall()) + CallInstructions.push_back({&MI, TmpReg}); + } - Modified |= - (ReturnInstructions.size() > 0) || (CallInstructions.size() > 0); + if (TmpRegisterNotAvailableEverywhere) { + // When a temporary register is not available everywhere in this basic + // basic block where a propagate-taint-to-sp operation is needed, just + // emit a full speculation barrier at the start of this basic block, which + // renders the taint/speculation tracking in this basic block unnecessary. + insertFullSpeculationBarrier(MBB, MBB.begin(), + (MBB.begin())->getDebugLoc()); + UsesFullSpeculationBarrier = true; + Modified = true; + } else { + for (auto MI_Reg : ReturnInstructions) { + assert(MI_Reg.second != 0); + LLVM_DEBUG( + dbgs() + << " About to insert Reg to SP taint propagation with temp register " + << printReg(MI_Reg.second, TRI) + << " on instruction: " << *MI_Reg.first); + insertRegToSPTaintPropagation(MBB, MI_Reg.first, MI_Reg.second); + Modified = true; + } - for (MachineInstr *Return : ReturnInstructions) - insertRegToSPTaintPropagation(Return->getParent(), Return, AArch64::X17); - for (MachineInstr *Call : CallInstructions) { + for (auto MI_Reg : CallInstructions) { + assert(MI_Reg.second != 0); + LLVM_DEBUG(dbgs() << " About to insert Reg to SP and back taint " + "propagation with temp register " + << printReg(MI_Reg.second, TRI) + << " around instruction: " << *MI_Reg.first); // Just after the call: - MachineBasicBlock::iterator i = Call; - i++; - insertSPToRegTaintPropagation(Call->getParent(), i); + insertSPToRegTaintPropagation( + MBB, std::next((MachineBasicBlock::iterator)MI_Reg.first)); // Just before the call: - insertRegToSPTaintPropagation(Call->getParent(), Call, AArch64::X17); + insertRegToSPTaintPropagation(MBB, MI_Reg.first, MI_Reg.second); + Modified = true; } } - return Modified; } void AArch64SpeculationHardening::insertSPToRegTaintPropagation( - MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) const { + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { // If full control flow speculation barriers are used, emit a control flow // barrier to block potential miss-speculation in flight coming in to this // function. 
if (UseControlFlowSpeculationBarrier) { - // insert full control flow speculation barrier (DSB SYS + ISB) - BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::DSB)).addImm(0xf); - BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ISB)).addImm(0xf); + insertFullSpeculationBarrier(MBB, MBBI, DebugLoc()); return; } // CMP SP, #0 === SUBS xzr, SP, #0 - BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::SUBSXri)) + BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::SUBSXri)) .addDef(AArch64::XZR) .addUse(AArch64::SP) .addImm(0) .addImm(0); // no shift // CSETM x16, NE === CSINV x16, xzr, xzr, EQ - BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::CSINVXr)) + BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::CSINVXr)) .addDef(MisspeculatingTaintReg) .addUse(AArch64::XZR) .addUse(AArch64::XZR) @@ -319,7 +379,7 @@ void AArch64SpeculationHardening::insertSPToRegTaintPropagation( } void AArch64SpeculationHardening::insertRegToSPTaintPropagation( - MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned TmpReg) const { // If full control flow speculation barriers are used, there will not be // miss-speculation when returning from this function, and therefore, also @@ -328,19 +388,19 @@ void AArch64SpeculationHardening::insertRegToSPTaintPropagation( return; // mov Xtmp, SP === ADD Xtmp, SP, #0 - BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri)) + BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri)) .addDef(TmpReg) .addUse(AArch64::SP) .addImm(0) .addImm(0); // no shift // and Xtmp, Xtmp, TaintReg === AND Xtmp, Xtmp, TaintReg, #0 - BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ANDXrs)) + BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::ANDXrs)) .addDef(TmpReg, RegState::Renamable) .addUse(TmpReg, RegState::Kill | RegState::Renamable) .addUse(MisspeculatingTaintReg, RegState::Kill) .addImm(0); // mov SP, Xtmp === ADD SP, Xtmp, #0 - BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri)) + BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri)) .addDef(AArch64::SP) .addUse(TmpReg, RegState::Kill) .addImm(0) @@ -484,7 +544,8 @@ bool AArch64SpeculationHardening::slhLoads(MachineBasicBlock &MBB) { /// \brief If MBBI references a pseudo instruction that should be expanded /// here, do the expansion and return true. Otherwise return false. bool AArch64SpeculationHardening::expandSpeculationSafeValue( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + bool UsesFullSpeculationBarrier) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); bool Is64Bit = true; @@ -499,7 +560,7 @@ bool AArch64SpeculationHardening::expandSpeculationSafeValue( // Just remove the SpeculationSafe pseudo's if control flow // miss-speculation isn't happening because we're already inserting barriers // to guarantee that. 
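// A rough sketch of what expansion looks like when tracking is active,
// assuming x16 is the taint register: a pseudo such as
//   SpeculationSafeValueX xDst, xSrc
// becomes a masking AND, roughly
//   and xDst, xSrc, x16   // value forced to zero under mis-speculation
// with a CSDB emitted before the first hardened use by the caller,
// lowerSpeculationSafeValuePseudos(). When a full DSB+ISB barrier already
// protects this block, the pseudo can simply be deleted, as handled here.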
- if (!UseControlFlowSpeculationBarrier) { + if (!UseControlFlowSpeculationBarrier && !UsesFullSpeculationBarrier) { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); // Mark this register and all its aliasing registers as needing to be @@ -537,7 +598,7 @@ bool AArch64SpeculationHardening::insertCSDB(MachineBasicBlock &MBB, } bool AArch64SpeculationHardening::lowerSpeculationSafeValuePseudos( - MachineBasicBlock &MBB) { + MachineBasicBlock &MBB, bool UsesFullSpeculationBarrier) { bool Modified = false; RegsNeedingCSDBBeforeUse.reset(); @@ -572,15 +633,16 @@ bool AArch64SpeculationHardening::lowerSpeculationSafeValuePseudos( break; } - if (NeedToEmitBarrier) + if (NeedToEmitBarrier && !UsesFullSpeculationBarrier) Modified |= insertCSDB(MBB, MBBI, DL); - Modified |= expandSpeculationSafeValue(MBB, MBBI); + Modified |= + expandSpeculationSafeValue(MBB, MBBI, UsesFullSpeculationBarrier); MBBI = NMBBI; } - if (RegsNeedingCSDBBeforeUse.any()) + if (RegsNeedingCSDBBeforeUse.any() && !UsesFullSpeculationBarrier) Modified |= insertCSDB(MBB, MBBI, DL); return Modified; @@ -609,7 +671,7 @@ bool AArch64SpeculationHardening::runOnMachineFunction(MachineFunction &MF) { Modified |= slhLoads(MBB); } - // 2.a Add instrumentation code to function entry and exits. + // 2. Add instrumentation code to function entry and exits. LLVM_DEBUG( dbgs() << "***** AArch64SpeculationHardening - track control flow *****\n"); @@ -620,17 +682,15 @@ bool AArch64SpeculationHardening::runOnMachineFunction(MachineFunction &MF) { EntryBlocks.push_back(LPI.LandingPadBlock); for (auto Entry : EntryBlocks) insertSPToRegTaintPropagation( - Entry, Entry->SkipPHIsLabelsAndDebug(Entry->begin())); - - // 2.b Add instrumentation code to every basic block. - for (auto &MBB : MF) - Modified |= instrumentControlFlow(MBB); + *Entry, Entry->SkipPHIsLabelsAndDebug(Entry->begin())); - LLVM_DEBUG(dbgs() << "***** AArch64SpeculationHardening - Lowering " - "SpeculationSafeValue Pseudos *****\n"); - // Step 3: Lower SpeculationSafeValue pseudo instructions. - for (auto &MBB : MF) - Modified |= lowerSpeculationSafeValuePseudos(MBB); + // 3. Add instrumentation code to every basic block. + for (auto &MBB : MF) { + bool UsesFullSpeculationBarrier = false; + Modified |= instrumentControlFlow(MBB, UsesFullSpeculationBarrier); + Modified |= + lowerSpeculationSafeValuePseudos(MBB, UsesFullSpeculationBarrier); + } return Modified; } diff --git a/lib/Target/AArch64/AArch64StackTagging.cpp b/lib/Target/AArch64/AArch64StackTagging.cpp new file mode 100644 index 000000000000..6e99c48bf1d7 --- /dev/null +++ b/lib/Target/AArch64/AArch64StackTagging.cpp @@ -0,0 +1,345 @@ +//===- AArch64StackTagging.cpp - Stack tagging in IR --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" +#include <cassert> +#include <iterator> +#include <utility> + +using namespace llvm; + +#define DEBUG_TYPE "stack-tagging" + +static constexpr unsigned kTagGranuleSize = 16; + +namespace { +class AArch64StackTagging : public FunctionPass { + struct AllocaInfo { + AllocaInst *AI; + SmallVector<IntrinsicInst *, 2> LifetimeStart; + SmallVector<IntrinsicInst *, 2> LifetimeEnd; + SmallVector<DbgVariableIntrinsic *, 2> DbgVariableIntrinsics; + int Tag; // -1 for non-tagged allocations + }; + +public: + static char ID; // Pass ID, replacement for typeid + + AArch64StackTagging() : FunctionPass(ID) { + initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry()); + } + + bool isInterestingAlloca(const AllocaInst &AI); + void alignAndPadAlloca(AllocaInfo &Info); + + void tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr, + uint64_t Size); + void untagAlloca(AllocaInst *AI, Instruction *InsertBefore, uint64_t Size); + + Instruction * + insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas, + const DominatorTree *DT); + bool runOnFunction(Function &F) override; + + StringRef getPassName() const override { return "AArch64 Stack Tagging"; } + +private: + Function *F; + Function *SetTagFunc; + const DataLayout *DL; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } +}; + +} // end anonymous namespace + +char AArch64StackTagging::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", false, false) +INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", false, false) + +FunctionPass *llvm::createAArch64StackTaggingPass() { + return new AArch64StackTagging(); +} + +bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { + // FIXME: support dynamic allocas + bool IsInteresting = + AI.getAllocatedType()->isSized() && AI.isStaticAlloca() && + // alloca() may be called with 0 size, ignore it.
+ AI.getAllocationSizeInBits(*DL).getValue() > 0 && + // inalloca allocas are not treated as static, and we don't want + // dynamic alloca instrumentation for them as well. + !AI.isUsedWithInAlloca() && + // swifterror allocas are register promoted by ISel + !AI.isSwiftError(); + return IsInteresting; +} + +void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore, + Value *Ptr, uint64_t Size) { + IRBuilder<> IRB(InsertBefore); + IRB.CreateCall(SetTagFunc, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); +} + +void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore, + uint64_t Size) { + IRBuilder<> IRB(InsertBefore); + IRB.CreateCall(SetTagFunc, {IRB.CreatePointerCast(AI, IRB.getInt8PtrTy()), + ConstantInt::get(IRB.getInt64Ty(), Size)}); +} + +Instruction *AArch64StackTagging::insertBaseTaggedPointer( + const MapVector<AllocaInst *, AllocaInfo> &Allocas, + const DominatorTree *DT) { + BasicBlock *PrologueBB = nullptr; + // Try sinking IRG as deep as possible to avoid hurting shrink wrap. + for (auto &I : Allocas) { + const AllocaInfo &Info = I.second; + AllocaInst *AI = Info.AI; + if (Info.Tag < 0) + continue; + if (!PrologueBB) { + PrologueBB = AI->getParent(); + continue; + } + PrologueBB = DT->findNearestCommonDominator(PrologueBB, AI->getParent()); + } + assert(PrologueBB); + + IRBuilder<> IRB(&PrologueBB->front()); + Function *IRG_SP = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_irg_sp); + Instruction *Base = + IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())}); + Base->setName("basetag"); + return Base; +} + +void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { + unsigned NewAlignment = std::max(Info.AI->getAlignment(), kTagGranuleSize); + Info.AI->setAlignment(NewAlignment); + + uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; + uint64_t AlignedSize = alignTo(Size, kTagGranuleSize); + if (Size == AlignedSize) + return; + + // Add padding to the alloca. + Type *AllocatedType = + Info.AI->isArrayAllocation() + ?
ArrayType::get( + Info.AI->getAllocatedType(), + dyn_cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue()) + : Info.AI->getAllocatedType(); + Type *PaddingType = + ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size); + Type *TypeWithPadding = StructType::get(AllocatedType, PaddingType); + auto *NewAI = new AllocaInst( + TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI); + NewAI->takeName(Info.AI); + NewAI->setAlignment(Info.AI->getAlignment()); + NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca()); + NewAI->setSwiftError(Info.AI->isSwiftError()); + NewAI->copyMetadata(*Info.AI); + + auto *NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI); + Info.AI->replaceAllUsesWith(NewPtr); + Info.AI->eraseFromParent(); + Info.AI = NewAI; +} + +// FIXME: check for MTE extension +bool AArch64StackTagging::runOnFunction(Function &Fn) { + if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag)) + return false; + + F = &Fn; + DL = &Fn.getParent()->getDataLayout(); + + MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order + SmallVector<Instruction *, 8> RetVec; + DenseMap<Value *, AllocaInst *> AllocaForValue; + SmallVector<Instruction *, 8> UnrecognizedLifetimes; + + for (auto &BB : *F) { + for (BasicBlock::iterator IT = BB.begin(); IT != BB.end(); ++IT) { + Instruction *I = &*IT; + if (auto *AI = dyn_cast<AllocaInst>(I)) { + Allocas[AI].AI = AI; + continue; + } + + if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(I)) { + if (auto *AI = + dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation())) { + Allocas[AI].DbgVariableIntrinsics.push_back(DVI); + } + continue; + } + + auto *II = dyn_cast<IntrinsicInst>(I); + if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end)) { + AllocaInst *AI = + llvm::findAllocaForValue(II->getArgOperand(1), AllocaForValue); + if (!AI) { + UnrecognizedLifetimes.push_back(I); + continue; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Allocas[AI].LifetimeStart.push_back(II); + else + Allocas[AI].LifetimeEnd.push_back(II); + } + + if (isa<ReturnInst>(I) || isa<ResumeInst>(I) || isa<CleanupReturnInst>(I)) + RetVec.push_back(I); + } + } + + if (Allocas.empty()) + return false; + + int NextTag = 0; + int NumInterestingAllocas = 0; + for (auto &I : Allocas) { + AllocaInfo &Info = I.second; + assert(Info.AI); + + if (!isInterestingAlloca(*Info.AI)) { + Info.Tag = -1; + continue; + } + + alignAndPadAlloca(Info); + NumInterestingAllocas++; + Info.Tag = NextTag; + NextTag = (NextTag + 1) % 16; + } + + if (NumInterestingAllocas == 0) + return true; + + SetTagFunc = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); + + // Compute DT only if the function has the attribute, there is more than one + // interesting alloca, and it is not available for free. + Instruction *Base; + if (NumInterestingAllocas > 1) { + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + if (DTWP) { + Base = insertBaseTaggedPointer(Allocas, &DTWP->getDomTree()); + } else { + DominatorTree DT(*F); + Base = insertBaseTaggedPointer(Allocas, &DT); + } + } else { + Base = insertBaseTaggedPointer(Allocas, nullptr); + } + + for (auto &I : Allocas) { + const AllocaInfo &Info = I.second; + AllocaInst *AI = Info.AI; + if (Info.Tag < 0) + continue; + + // Replace alloca with tagp(alloca).
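// Illustrative IR for the rewrite performed below (names assumed):
//   %buf = alloca [48 x i8], align 16
// becomes
//   %basetag = call i8* @llvm.aarch64.irg.sp(i64 0)   ; once per function
//   %buf = alloca [48 x i8], align 16
//   %buf.tag = call [48 x i8]* @llvm.aarch64.tagp(... %buf, i8* %basetag, i64 N)
// with every prior use of %buf redirected to %buf.tag. The tagp call is
// created with a null pointer operand and only patched to point at %buf after
// replaceAllUsesWith(), so the intrinsic does not end up using its own result.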
+ IRBuilder<> IRB(Info.AI->getNextNode()); + Function *TagP = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()}); + Instruction *TagPCall = + IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base, + ConstantInt::get(IRB.getInt64Ty(), Info.Tag)}); + if (Info.AI->hasName()) + TagPCall->setName(Info.AI->getName() + ".tag"); + Info.AI->replaceAllUsesWith(TagPCall); + TagPCall->setOperand(0, Info.AI); + + if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 && + Info.LifetimeEnd.size() == 1) { + IntrinsicInst *Start = Info.LifetimeStart[0]; + uint64_t Size = + dyn_cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); + Size = alignTo(Size, kTagGranuleSize); + tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size); + untagAlloca(AI, Info.LifetimeEnd[0], Size); + } else { + uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; + Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy()); + tagAlloca(AI, &*IRB.GetInsertPoint(), Ptr, Size); + for (auto &RI : RetVec) { + untagAlloca(AI, RI, Size); + } + // We may have inserted tag/untag outside of any lifetime interval. + // Remove all lifetime intrinsics for this alloca. + for (auto &II : Info.LifetimeStart) + II->eraseFromParent(); + for (auto &II : Info.LifetimeEnd) + II->eraseFromParent(); + } + + // Fixup debug intrinsics to point to the new alloca. + for (auto DVI : Info.DbgVariableIntrinsics) + DVI->setArgOperand( + 0, + MetadataAsValue::get(F->getContext(), LocalAsMetadata::get(Info.AI))); + } + + // If we have instrumented at least one alloca, all unrecognized lifetime + // intrinsics have to go. + for (auto &I : UnrecognizedLifetimes) + I->eraseFromParent(); + + return true; +} diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index d5643d384283..0e84a00df006 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -1,9 +1,8 @@ //===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -148,7 +147,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { for (auto &MI : MBB) { if (!isNarrowFPStore(MI)) continue; - MachineOperand *BaseOp; + const MachineOperand *BaseOp; int64_t Offset; if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) && BaseOp->isReg()) { diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index dd30d25b2b50..3bc89b91c3f7 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -1,9 +1,8 @@ //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -83,6 +82,7 @@ void AArch64Subtarget::initializeProperties() { case CortexA72: case CortexA73: case CortexA75: + case CortexA76: PrefFunctionAlignment = 4; break; case Cyclone: diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 82f7bb755951..0c84cfb8329a 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -1,9 +1,8 @@ //===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -46,6 +45,7 @@ public: CortexA72, CortexA73, CortexA75, + CortexA76, Cyclone, ExynosM1, ExynosM3, @@ -93,6 +93,12 @@ protected: bool HasPAN_RWV = false; bool HasCCPP = false; + // Armv8.2 Crypto extensions + bool HasSM4 = false; + bool HasSHA3 = false; + bool HasSHA2 = false; + bool HasAES = false; + // ARMv8.3 extensions bool HasPA = false; bool HasJS = false; @@ -110,15 +116,10 @@ protected: bool HasTLB_RMI = false; bool HasFMI = false; bool HasRCPC_IMMO = false; - // ARMv8.4 Crypto extensions - bool HasSM4 = true; - bool HasSHA3 = true; - - bool HasSHA2 = true; - bool HasAES = true; bool HasLSLFast = false; bool HasSVE = false; + bool HasSVE2 = false; bool HasRCPC = false; bool HasAggressiveFMA = false; @@ -134,6 +135,12 @@ protected: bool HasRandGen = false; bool HasMTE = false; + // Arm SVE2 extensions + bool HasSVE2AES = false; + bool HasSVE2SM4 = false; + bool HasSVE2SHA3 = false; + bool HasSVE2BitPerm = false; + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
bool HasZeroCycleRegMove = false; @@ -173,6 +180,9 @@ protected: bool DisableLatencySchedHeuristic = false; bool UseRSqrt = false; bool Force32BitJumpTables = false; + bool UseEL1ForTP = false; + bool UseEL2ForTP = false; + bool UseEL3ForTP = false; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; @@ -324,6 +334,10 @@ public: hasFuseCCSelect() || hasFuseLiterals(); } + bool useEL1ForTP() const { return UseEL1ForTP; } + bool useEL2ForTP() const { return UseEL2ForTP; } + bool useEL3ForTP() const { return UseEL3ForTP; } + bool useRSqrt() const { return UseRSqrt; } bool force32BitJumpTables() const { return Force32BitJumpTables; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } @@ -353,6 +367,7 @@ public: bool hasSPE() const { return HasSPE; } bool hasLSLFast() const { return HasLSLFast; } bool hasSVE() const { return HasSVE; } + bool hasSVE2() const { return HasSVE2; } bool hasRCPC() const { return HasRCPC; } bool hasAggressiveFMA() const { return HasAggressiveFMA; } bool hasAlternativeNZCV() const { return HasAlternativeNZCV; } @@ -365,6 +380,11 @@ public: bool hasBTI() const { return HasBTI; } bool hasRandGen() const { return HasRandGen; } bool hasMTE() const { return HasMTE; } + // Arm SVE2 extensions + bool hasSVE2AES() const { return HasSVE2AES; } + bool hasSVE2SM4() const { return HasSVE2SM4; } + bool hasSVE2SHA3() const { return HasSVE2SHA3; } + bool hasSVE2BitPerm() const { return HasSVE2BitPerm; } bool isLittleEndian() const { return IsLittle; } diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td index a804fb11175b..536a6591478b 100644 --- a/lib/Target/AArch64/AArch64SystemOperands.td +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -1,9 +1,8 @@ //===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -1458,6 +1457,7 @@ def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0110, 0b0101, 0b000>; def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0110, 0b0110, 0b000>; def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0110, 0b0110, 0b000>; def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0110, 0b0110, 0b001>; +def : ROSysReg<"GMID_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b100>; } // HasMTE // Cyclone specific system registers diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 4e016525f7e4..865461480499 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -1,9 +1,8 @@ //===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,9 +16,11 @@ #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "TargetInfo/AArch64TargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/CSEConfigBase.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -178,6 +179,7 @@ extern "C" void LLVMInitializeAArch64Target() { initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); initializeAArch64SpeculationHardeningPass(*PR); + initializeAArch64StackTaggingPass(*PR); } //===----------------------------------------------------------------------===// @@ -209,8 +211,8 @@ static std::string computeDataLayout(const Triple &TT, static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional<Reloc::Model> RM) { - // AArch64 Darwin is always PIC. - if (TT.isOSDarwin()) + // AArch64 Darwin and Windows are always PIC. + if (TT.isOSDarwin() || TT.isOSWindows()) return Reloc::PIC_; // On ELF platforms the default static relocation model has a smart enough // linker to cope with referencing external symbols defined in a shared @@ -384,6 +386,8 @@ public: void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + + std::unique_ptr<CSEConfigBase> getCSEConfig() const override; }; } // end anonymous namespace @@ -397,6 +401,10 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { return new AArch64PassConfig(*this, PM); } +std::unique_ptr<CSEConfigBase> AArch64PassConfig::getCSEConfig() const { + return getStandardCSEConfigForOpt(TM->getOptLevel()); +} + void AArch64PassConfig::addIRPasses() { // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg // ourselves. @@ -439,6 +447,8 @@ void AArch64PassConfig::addIRPasses() { // invariant. addPass(createLICMPass()); } + + addPass(createAArch64StackTaggingPass()); } // Pass Pipeline Configuration @@ -455,7 +465,20 @@ bool AArch64PassConfig::addPreISel() { EnableGlobalMerge == cl::BOU_TRUE) { bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) && (EnableGlobalMerge == cl::BOU_UNSET); - addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize)); + + // Merging of extern globals is enabled by default on non-Mach-O as we + // expect it to be generally either beneficial or harmless. On Mach-O it + // is disabled as we emit the .subsections_via_symbols directive which + // means that merging extern globals is not safe. + bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO(); + + // FIXME: extern global merging is only enabled when we optimise for size + // because there are some regressions with it also enabled for performance.
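// Taken together: extern-global merging ends up enabled only for non-Mach-O
// targets that are optimizing for size; Mach-O keeps it off for correctness,
// and everything else keeps it off pending the performance FIXME above.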
+ if (!OnlyOptimizeForSize) + MergeExternalByDefault = false; + + addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize, + MergeExternalByDefault)); } return false; diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 8d28a5e30ebf..5264efb89b9c 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -1,9 +1,8 @@ //==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 8ae72a7ddb57..1c3d5d0743ad 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 9077eb7902fd..7ead363d42fe 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -1,9 +1,8 @@ //===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a256cb7c9215..a4b78f2a7d6b 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1,12 +1,12 @@ //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#include "AArch64ExpandImm.h" #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/LoopInfo.h" @@ -50,8 +50,9 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) { Val = ~Val; // Calculate how many moves we will need to materialize this constant. - unsigned LZ = countLeadingZeros((uint64_t)Val); - return (64 - LZ + 15) / 16; + SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; + AArch64_IMM::expandMOVImm(Val, 64, Insn); + return Insn.size(); } /// Calculate the cost of materializing the given constant. @@ -665,7 +666,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); - if (!UseMaskForCond && !UseMaskForGaps && + if (!UseMaskForCond && !UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 08c1a8924220..10c15a139b4c 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -1,9 +1,8 @@ //===- AArch64TargetTransformInfo.h - AArch64 specific TTI ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -166,6 +165,10 @@ public: return false; } + unsigned getGISelRematGlobalCost() const { + return 2; + } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 6cc9b67e4d27..f4c55d48d215 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1,9 +1,8 @@ //==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -11,6 +10,7 @@ #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "MCTargetDesc/AArch64TargetStreamer.h" +#include "TargetInfo/AArch64TargetInfo.h" #include "AArch64InstrInfo.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/APFloat.h" @@ -242,11 +242,13 @@ public: if (S.getTargetStreamer() == nullptr) new AArch64TargetStreamer(S); - // Alias .hword/.word/xword to the target-independent .2byte/.4byte/.8byte - // directives as they have the same form and semantics: - /// ::= (.hword | .word | .xword ) [ expression (, expression)* ] + // Alias .hword/.word/.[dx]word to the target-independent + // .2byte/.4byte/.8byte directives as they have the same form and + // semantics: + /// ::= (.hword | .word | .dword | .xword ) [ expression (, expression)* ] Parser.addAliasForDirective(".hword", ".2byte"); Parser.addAliasForDirective(".word", ".4byte"); + Parser.addAliasForDirective(".dword", ".8byte"); Parser.addAliasForDirective(".xword", ".8byte"); // Initialize the set of available features. @@ -1079,8 +1081,7 @@ public: if (Kind != k_Register || Reg.Kind != RegKind::SVEPredicateVector) return DiagnosticPredicateTy::NoMatch; - if (isSVEVectorReg() && - (ElementWidth == 0 || Reg.ElementWidth == ElementWidth)) + if (isSVEVectorReg() && (Reg.ElementWidth == ElementWidth)) return DiagnosticPredicateTy::Match; return DiagnosticPredicateTy::NearMatch; @@ -1091,8 +1092,7 @@ public: if (Kind != k_Register || Reg.Kind != RegKind::SVEDataVector) return DiagnosticPredicateTy::NoMatch; - if (isSVEVectorReg() && - (ElementWidth == 0 || Reg.ElementWidth == ElementWidth)) + if (isSVEVectorReg() && Reg.ElementWidth == ElementWidth) return DiagnosticPredicateTy::Match; return DiagnosticPredicateTy::NearMatch; @@ -1272,9 +1272,11 @@ public: bool isExtend64() const { if (!isExtend()) return false; - // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class). + // Make sure the extend expects a 32-bit source register. AArch64_AM::ShiftExtendType ET = getShiftExtendType(); - return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX; + return ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB || + ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH || + ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW; } bool isExtendLSL64() const { @@ -2473,7 +2475,7 @@ OperandMatchResultTy AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); - const MCExpr *Expr; + const MCExpr *Expr = nullptr; if (Parser.getTok().is(AsmToken::Hash)) { Parser.Lex(); // Eat hash token. 
@@ -2500,6 +2502,7 @@ AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && + ELFRefKind != AArch64MCExpr::VK_ABS_PAGE_NC && ELFRefKind != AArch64MCExpr::VK_GOT_PAGE && ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE && ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) { @@ -2523,7 +2526,7 @@ AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { OperandMatchResultTy AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) { SMLoc S = getLoc(); - const MCExpr *Expr; + const MCExpr *Expr = nullptr; // Leave anything with a bracket to the default for SVE if (getParser().getTok().is(AsmToken::LBrac)) @@ -2621,7 +2624,7 @@ AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) { // Operand should start from # or should be integer, emit error otherwise. return MatchOperand_NoMatch; - const MCExpr *Imm; + const MCExpr *Imm = nullptr; if (parseSymbolicImmVal(Imm)) return MatchOperand_ParseFail; else if (Parser.getTok().isNot(AsmToken::Comma)) { @@ -2660,7 +2663,7 @@ AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) { Parser.Lex(); // Eat the number // Just in case the optional lsl #0 is used for immediates other than zero. - if (ShiftAmount == 0 && Imm != 0) { + if (ShiftAmount == 0 && Imm != nullptr) { SMLoc E = Parser.getTok().getLoc(); Operands.push_back(AArch64Operand::CreateImm(Imm, S, E, getContext())); return MatchOperand_Success; @@ -2833,6 +2836,11 @@ static const struct Extension { {"pan-rwv", {AArch64::FeaturePAN_RWV}}, {"ccpp", {AArch64::FeatureCCPP}}, {"sve", {AArch64::FeatureSVE}}, + {"sve2", {AArch64::FeatureSVE2}}, + {"sve2-aes", {AArch64::FeatureSVE2AES}}, + {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, + {"sve2-sha3", {AArch64::FeatureSVE2SHA3}}, + {"bitperm", {AArch64::FeatureSVE2BitPerm}}, // FIXME: Unsupported extensions {"pan", {}}, {"lor", {}}, @@ -3260,6 +3268,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12) .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12) .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC) + .Case("pg_hi21_nc", AArch64MCExpr::VK_ABS_PAGE_NC) .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2) .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1) .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC) @@ -4098,15 +4107,6 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, "unpredictable STXP instruction, status is also a source"); break; } - case AArch64::LDGV: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rn = Inst.getOperand(1).getReg(); - if (RI->isSubRegisterEq(Rt, Rn)) { - return Error(Loc[0], - "unpredictable LDGV instruction, writeback register is also " - "the target register"); - } - } } @@ -4167,7 +4167,8 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, } } -static std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS, +static std::string AArch64MnemonicSpellCheck(StringRef S, + const FeatureBitset &FBS, unsigned VariantID = 0); bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, @@ -4199,7 +4200,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, return Error(Loc, "expected AArch64 condition code"); case Match_AddSubRegExtendSmall: return Error(Loc, - "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); + "expected '[su]xt[bhw]' with optional integer in range 
[0, 4]"); case Match_AddSubRegExtendLarge: return Error(Loc, "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); @@ -4442,7 +4443,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, case Match_InvalidZPR64LSL64: return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #3'"); case Match_InvalidZPR0: - return Error(Loc, "expected register without element width sufix"); + return Error(Loc, "expected register without element width suffix"); case Match_InvalidZPR8: case Match_InvalidZPR16: case Match_InvalidZPR32: @@ -4470,11 +4471,15 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, case Match_InvalidSVEPredicateDReg: return Error(Loc, "invalid predicate register."); case Match_InvalidSVEPredicate3bAnyReg: + return Error(Loc, "invalid restricted predicate register, expected p0..p7 (without element suffix)"); case Match_InvalidSVEPredicate3bBReg: + return Error(Loc, "invalid restricted predicate register, expected p0.b..p7.b"); case Match_InvalidSVEPredicate3bHReg: + return Error(Loc, "invalid restricted predicate register, expected p0.h..p7.h"); case Match_InvalidSVEPredicate3bSReg: + return Error(Loc, "invalid restricted predicate register, expected p0.s..p7.s"); case Match_InvalidSVEPredicate3bDReg: - return Error(Loc, "restricted predicate has range [0, 7]."); + return Error(Loc, "invalid restricted predicate register, expected p0.d..p7.d"); case Match_InvalidSVEExactFPImmOperandHalfOne: return Error(Loc, "Invalid floating point constant, expected 0.5 or 1.0."); case Match_InvalidSVEExactFPImmOperandHalfTwo: @@ -4777,10 +4782,12 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } MCInst Inst; + FeatureBitset MissingFeatures; // First try to match against the secondary set of tables containing the // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). unsigned MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); + MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures, + MatchingInlineAsm, 1); // If that fails, try against the alternate table containing long-form NEON: // "fadd v0.2s, v1.2s, v2.2s" @@ -4789,9 +4796,11 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // long-form match also fails. auto ShortFormNEONErrorInfo = ErrorInfo; auto ShortFormNEONMatchResult = MatchResult; + auto ShortFormNEONMissingFeatures = MissingFeatures; MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures, + MatchingInlineAsm, 0); // Now, both matches failed, and the long-form match failed on the mnemonic // suffix token operand. The short-form match failure is probably more @@ -4801,6 +4810,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ((AArch64Operand &)*Operands[1]).isTokenSuffix()) { MatchResult = ShortFormNEONMatchResult; ErrorInfo = ShortFormNEONErrorInfo; + MissingFeatures = ShortFormNEONMissingFeatures; } } @@ -4819,17 +4829,15 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return false; } case Match_MissingFeature: { - assert(ErrorInfo && "Unknown missing feature!"); + assert(MissingFeatures.any() && "Unknown missing feature!"); // Special case the error message for the very common case where only // a single subtarget feature is missing (neon, e.g.). 
std::string Msg = "instruction requires:"; - uint64_t Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { - if (ErrorInfo & Mask) { + for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) { + if (MissingFeatures[i]) { Msg += " "; - Msg += getSubtargetFeatureName(ErrorInfo & Mask); + Msg += getSubtargetFeatureName(i); } - Mask <<= 1; } return Error(IDLoc, Msg); } @@ -5148,7 +5156,7 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { FeatureBitset ToggleFeatures = EnableFeature ? (~Features & Extension.Features) : ( Features & Extension.Features); - uint64_t Features = + FeatureBitset Features = ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); setAvailableFeatures(Features); break; @@ -5160,15 +5168,9 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { /// parseDirectiveArchExtension /// ::= .arch_extension [no]feature bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) { - MCAsmParser &Parser = getParser(); - - if (getLexer().isNot(AsmToken::Identifier)) - return Error(getLexer().getLoc(), "expected architecture extension name"); + SMLoc ExtLoc = getLoc(); - const AsmToken &Tok = Parser.getTok(); - StringRef Name = Tok.getString(); - SMLoc ExtLoc = Tok.getLoc(); - Lex(); + StringRef Name = getParser().parseStringToEndOfStatement().trim(); if (parseToken(AsmToken::EndOfStatement, "unexpected token in '.arch_extension' directive")) @@ -5192,7 +5194,7 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) { FeatureBitset ToggleFeatures = EnableFeature ? (~Features & Extension.Features) : (Features & Extension.Features); - uint64_t Features = + FeatureBitset Features = ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); setAvailableFeatures(Features); return false; @@ -5257,7 +5259,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { FeatureBitset ToggleFeatures = EnableFeature ? (~Features & Extension.Features) : ( Features & Extension.Features); - uint64_t Features = + FeatureBitset Features = ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); setAvailableFeatures(Features); FoundExtension = true; @@ -5518,6 +5520,8 @@ extern "C" void LLVMInitializeAArch64AsmParser() { RegisterMCAsmParser X(getTheAArch64leTarget()); RegisterMCAsmParser Y(getTheAArch64beTarget()); RegisterMCAsmParser Z(getTheARM64Target()); + RegisterMCAsmParser W(getTheARM64_32Target()); + RegisterMCAsmParser V(getTheAArch64_32Target()); } #define GET_REGISTER_MATCHER diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 4102f1eb5cc1..145ffef6f6f9 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1,9 +1,8 @@ //===- AArch64Disassembler.cpp - Disassembler for AArch64 -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,6 +13,7 @@ #include "AArch64ExternalSymbolizer.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "TargetInfo/AArch64TargetInfo.h" #include "Utils/AArch64BaseInfo.h" #include "llvm-c/Disassembler.h" #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" @@ -220,11 +220,6 @@ static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeLoadAllocTagArrayInstruction(MCInst &Inst, - uint32_t insn, - uint64_t address, - const void* Decoder); - static bool Check(DecodeStatus &Out, DecodeStatus In) { switch (In) { case MCDisassembler::Success: @@ -292,11 +287,19 @@ extern "C" void LLVMInitializeAArch64Disassembler() { createAArch64ExternalSymbolizer); TargetRegistry::RegisterMCSymbolizer(getTheAArch64beTarget(), createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCDisassembler(getTheAArch64_32Target(), + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(getTheAArch64_32Target(), + createAArch64ExternalSymbolizer); TargetRegistry::RegisterMCDisassembler(getTheARM64Target(), createAArch64Disassembler); TargetRegistry::RegisterMCSymbolizer(getTheARM64Target(), createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCDisassembler(getTheARM64_32Target(), + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(getTheARM64_32Target(), + createAArch64ExternalSymbolizer); } static const unsigned FPR128DecoderTable[] = { @@ -1619,7 +1622,7 @@ static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, case AArch64::MOVIv4s_msl: case AArch64::MVNIv2s_msl: case AArch64::MVNIv4s_msl: - Inst.addOperand(MCOperand::createImm(cmode & 1 ? 0x110 : 0x108)); + Inst.addOperand(MCOperand::createImm((cmode & 1) ? 
0x110 : 0x108)); break; } @@ -1779,8 +1782,8 @@ static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst, if (RegNo & 0x1) return Fail; - unsigned Register = AArch64MCRegisterClasses[RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); + unsigned Reg = AArch64MCRegisterClasses[RegClassID].getRegister(RegNo / 2); + Inst.addOperand(MCOperand::createReg(Reg)); return Success; } @@ -1852,25 +1855,3 @@ static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm, Inst.addOperand(MCOperand::createImm(Imm + 1)); return Success; } - -static DecodeStatus DecodeLoadAllocTagArrayInstruction(MCInst &Inst, - uint32_t insn, - uint64_t address, - const void* Decoder) { - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rt = fieldFromInstruction(insn, 0, 5); - - // Outputs - DecodeGPR64spRegisterClass(Inst, Rn, address, Decoder); - DecodeGPR64RegisterClass(Inst, Rt, address, Decoder); - - // Input (Rn again) - Inst.addOperand(Inst.getOperand(0)); - - //Do this post decode since the raw number for xzr and sp is the same - if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) { - return SoftFail; - } else { - return Success; - } -} diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index bc2f7f181699..2ba5a695701f 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -1,9 +1,8 @@ //===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index 342655a29b1d..3f815ac8c3d0 100644 --- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -1,9 +1,8 @@ //===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h index 49e844963797..dc72331660cc 100644 --- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -1,9 +1,8 @@ //===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp deleted file mode 100644 index dcf2dd251149..000000000000 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ /dev/null @@ -1,1582 +0,0 @@ -//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an AArch64 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#include "AArch64InstPrinter.h" -#include "MCTargetDesc/AArch64AddressingModes.h" -#include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include - -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "AArch64GenAsmWriter.inc" -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "AArch64GenAsmWriter1.inc" - -AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - -AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : AArch64InstPrinter(MAI, MII, MRI) {} - -void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - // This is for .cfi directives. - OS << getRegisterName(RegNo); -} - -void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, - const MCSubtargetInfo &STI) { - // Check for special encodings and print the canonical alias instead. - - unsigned Opcode = MI->getOpcode(); - - if (Opcode == AArch64::SYSxt) - if (printSysAlias(MI, STI, O)) { - printAnnotation(O, Annot); - return; - } - - // SBFM/UBFM should print to a nicer aliased form if possible. 
- if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri || - Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) { - const MCOperand &Op0 = MI->getOperand(0); - const MCOperand &Op1 = MI->getOperand(1); - const MCOperand &Op2 = MI->getOperand(2); - const MCOperand &Op3 = MI->getOperand(3); - - bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri); - bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri); - if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { - const char *AsmMnemonic = nullptr; - - switch (Op3.getImm()) { - default: - break; - case 7: - if (IsSigned) - AsmMnemonic = "sxtb"; - else if (!Is64Bit) - AsmMnemonic = "uxtb"; - break; - case 15: - if (IsSigned) - AsmMnemonic = "sxth"; - else if (!Is64Bit) - AsmMnemonic = "uxth"; - break; - case 31: - // *xtw is only valid for signed 64-bit operations. - if (Is64Bit && IsSigned) - AsmMnemonic = "sxtw"; - break; - } - - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); - printAnnotation(O, Annot); - return; - } - } - - // All immediate shifts are aliases, implemented using the Bitfield - // instruction. In all cases the immediate shift amount shift must be in - // the range 0 to (reg.size -1). - if (Op2.isImm() && Op3.isImm()) { - const char *AsmMnemonic = nullptr; - int shift = 0; - int64_t immr = Op2.getImm(); - int64_t imms = Op3.getImm(); - if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { - AsmMnemonic = "lsl"; - shift = 31 - imms; - } else if (Opcode == AArch64::UBFMXri && imms != 0x3f && - ((imms + 1 == immr))) { - AsmMnemonic = "lsl"; - shift = 63 - imms; - } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) { - AsmMnemonic = "asr"; - shift = immr; - } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) { - AsmMnemonic = "asr"; - shift = immr; - } - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; - printAnnotation(O, Annot); - return; - } - } - - // SBFIZ/UBFIZ aliases - if (Op2.getImm() > Op3.getImm()) { - O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) - << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; - printAnnotation(O, Annot); - return; - } - - // Otherwise SBFX/UBFX is the preferred form - O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) - << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; - printAnnotation(O, Annot); - return; - } - - if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) { - const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 - const MCOperand &Op2 = MI->getOperand(2); - int ImmR = MI->getOperand(3).getImm(); - int ImmS = MI->getOperand(4).getImm(); - - if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) && - (ImmR == 0 || ImmS < ImmR)) { - // BFC takes precedence over its entire range, sligtly differently to BFI. - int BitWidth = Opcode == AArch64::BFMXri ? 
64 : 32; - int LSB = (BitWidth - ImmR) % BitWidth; - int Width = ImmS + 1; - - O << "\tbfc\t" << getRegisterName(Op0.getReg()) - << ", #" << LSB << ", #" << Width; - printAnnotation(O, Annot); - return; - } else if (ImmS < ImmR) { - // BFI alias - int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32; - int LSB = (BitWidth - ImmR) % BitWidth; - int Width = ImmS + 1; - - O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " - << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; - printAnnotation(O, Annot); - return; - } - - int LSB = ImmR; - int Width = ImmS - ImmR + 1; - // Otherwise BFXIL the preferred form - O << "\tbfxil\t" - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) - << ", #" << LSB << ", #" << Width; - printAnnotation(O, Annot); - return; - } - - // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift - // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be - // printed. - if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi || - Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && - MI->getOperand(1).isExpr()) { - if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) - O << "\tmovz\t"; - else - O << "\tmovn\t"; - - O << getRegisterName(MI->getOperand(0).getReg()) << ", #"; - MI->getOperand(1).getExpr()->print(O, &MAI); - return; - } - - if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) && - MI->getOperand(2).isExpr()) { - O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"; - MI->getOperand(2).getExpr()->print(O, &MAI); - return; - } - - // MOVZ, MOVN and "ORR wzr, #imm" instructions are aliases for MOV, but their - // domains overlap so they need to be prioritized. The chain is "MOVZ lsl #0 > - // MOVZ lsl #N > MOVN lsl #0 > MOVN lsl #N > ORR". The highest instruction - // that can represent the move is the MOV alias, and the rest get printed - // normally. - if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) && - MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { - int RegWidth = Opcode == AArch64::MOVZXi ? 64 : 32; - int Shift = MI->getOperand(2).getImm(); - uint64_t Value = (uint64_t)MI->getOperand(1).getImm() << Shift; - - if (AArch64_AM::isMOVZMovAlias(Value, Shift, - Opcode == AArch64::MOVZXi ? 64 : 32)) { - O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << formatImm(SignExtend64(Value, RegWidth)); - return; - } - } - - if ((Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && - MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { - int RegWidth = Opcode == AArch64::MOVNXi ? 64 : 32; - int Shift = MI->getOperand(2).getImm(); - uint64_t Value = ~((uint64_t)MI->getOperand(1).getImm() << Shift); - if (RegWidth == 32) - Value = Value & 0xffffffff; - - if (AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth)) { - O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << formatImm(SignExtend64(Value, RegWidth)); - return; - } - } - - if ((Opcode == AArch64::ORRXri || Opcode == AArch64::ORRWri) && - (MI->getOperand(1).getReg() == AArch64::XZR || - MI->getOperand(1).getReg() == AArch64::WZR) && - MI->getOperand(2).isImm()) { - int RegWidth = Opcode == AArch64::ORRXri ? 
64 : 32; - uint64_t Value = AArch64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), RegWidth); - if (!AArch64_AM::isAnyMOVWMovAlias(Value, RegWidth)) { - O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << formatImm(SignExtend64(Value, RegWidth)); - return; - } - } - - if (Opcode == AArch64::CompilerBarrier) { - O << '\t' << MAI.getCommentString() << " COMPILER BARRIER"; - printAnnotation(O, Annot); - return; - } - - // Instruction TSB is specified as a one operand instruction, but 'csync' is - // not encoded, so for printing it is treated as a special case here: - if (Opcode == AArch64::TSB) { - O << "\ttsb\tcsync"; - return; - } - - if (!printAliasInstr(MI, STI, O)) - printInstruction(MI, STI, O); - - printAnnotation(O, Annot); -} - -static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, - bool &IsTbx) { - switch (Opcode) { - case AArch64::TBXv8i8One: - case AArch64::TBXv8i8Two: - case AArch64::TBXv8i8Three: - case AArch64::TBXv8i8Four: - IsTbx = true; - Layout = ".8b"; - return true; - case AArch64::TBLv8i8One: - case AArch64::TBLv8i8Two: - case AArch64::TBLv8i8Three: - case AArch64::TBLv8i8Four: - IsTbx = false; - Layout = ".8b"; - return true; - case AArch64::TBXv16i8One: - case AArch64::TBXv16i8Two: - case AArch64::TBXv16i8Three: - case AArch64::TBXv16i8Four: - IsTbx = true; - Layout = ".16b"; - return true; - case AArch64::TBLv16i8One: - case AArch64::TBLv16i8Two: - case AArch64::TBLv16i8Three: - case AArch64::TBLv16i8Four: - IsTbx = false; - Layout = ".16b"; - return true; - default: - return false; - } -} - -struct LdStNInstrDesc { - unsigned Opcode; - const char *Mnemonic; - const char *Layout; - int ListOperand; - bool HasLane; - int NaturalOffset; -}; - -static const LdStNInstrDesc LdStNInstInfo[] = { - { AArch64::LD1i8, "ld1", ".b", 1, true, 0 }, - { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, - { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, - { AArch64::LD1i64, "ld1", ".d", 1, true, 0 }, - { AArch64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, - { AArch64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, - { AArch64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, - { AArch64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, - { AArch64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, - { AArch64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, - { AArch64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, - { AArch64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, - { AArch64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, - { AArch64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, - { AArch64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, - { AArch64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, - { AArch64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, - { AArch64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, - { AArch64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, - { AArch64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, - { AArch64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, - { AArch64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, - { AArch64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, - { AArch64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 }, - { AArch64::LD1Onev16b, "ld1", ".16b", 0, false, 0 }, - { AArch64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, - { AArch64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, - { AArch64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, - { AArch64::LD1Onev8b, "ld1", ".8b", 0, false, 0 }, - { AArch64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, - { AArch64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, - { AArch64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, - { AArch64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, 
- { AArch64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, - { AArch64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 }, - { AArch64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, - { AArch64::LD1Onev8b_POST, "ld1", ".8b", 1, false, 8 }, - { AArch64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, - { AArch64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, - { AArch64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, - { AArch64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, - { AArch64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, - { AArch64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, - { AArch64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, - { AArch64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, - { AArch64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, - { AArch64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, - { AArch64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, - { AArch64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, - { AArch64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, - { AArch64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, - { AArch64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, - { AArch64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, - { AArch64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, - { AArch64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, - { AArch64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, - { AArch64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, - { AArch64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, - { AArch64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, - { AArch64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, - { AArch64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, - { AArch64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, - { AArch64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, - { AArch64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, - { AArch64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, - { AArch64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, - { AArch64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, - { AArch64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, - { AArch64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, - { AArch64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, - { AArch64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, - { AArch64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, - { AArch64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, - { AArch64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, - { AArch64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, - { AArch64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, - { AArch64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, - { AArch64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, - { AArch64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 }, - { AArch64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, - { AArch64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, - { AArch64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, - { AArch64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, - { AArch64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, - { AArch64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, - { AArch64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, - { AArch64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 }, - { AArch64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, - { AArch64::LD2i8, "ld2", ".b", 1, true, 0 }, - { AArch64::LD2i16, "ld2", ".h", 1, true, 0 }, - { AArch64::LD2i32, "ld2", ".s", 1, true, 0 }, - { AArch64::LD2i64, "ld2", ".d", 1, true, 0 }, - { AArch64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, - { AArch64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, - { AArch64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, - { AArch64::LD2i64_POST, "ld2", ".d", 2, true, 
16 }, - { AArch64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, - { AArch64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 }, - { AArch64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, - { AArch64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, - { AArch64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, - { AArch64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, - { AArch64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, - { AArch64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, - { AArch64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, - { AArch64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, - { AArch64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, - { AArch64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, - { AArch64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, - { AArch64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, - { AArch64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, - { AArch64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, - { AArch64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, - { AArch64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, - { AArch64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, - { AArch64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, - { AArch64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, - { AArch64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, - { AArch64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, - { AArch64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, - { AArch64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, - { AArch64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, - { AArch64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, - { AArch64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, - { AArch64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, - { AArch64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, - { AArch64::LD3i8, "ld3", ".b", 1, true, 0 }, - { AArch64::LD3i16, "ld3", ".h", 1, true, 0 }, - { AArch64::LD3i32, "ld3", ".s", 1, true, 0 }, - { AArch64::LD3i64, "ld3", ".d", 1, true, 0 }, - { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, - { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, - { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, - { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, - { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, - { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, - { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, - { AArch64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, - { AArch64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, - { AArch64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, - { AArch64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, - { AArch64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, - { AArch64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, - { AArch64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, - { AArch64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, - { AArch64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, - { AArch64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, - { AArch64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, - { AArch64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, - { AArch64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, - { AArch64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, - { AArch64::LD3Threev8h, "ld3", ".8h", 0, false, 0 }, - { AArch64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, - { AArch64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, - { AArch64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, - { AArch64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, - { AArch64::LD3Threev2s, "ld3", ".2s", 0, false, 0 }, - { AArch64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, - { AArch64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, - { AArch64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, - { 
AArch64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, - { AArch64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 }, - { AArch64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, - { AArch64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, - { AArch64::LD4i8, "ld4", ".b", 1, true, 0 }, - { AArch64::LD4i16, "ld4", ".h", 1, true, 0 }, - { AArch64::LD4i32, "ld4", ".s", 1, true, 0 }, - { AArch64::LD4i64, "ld4", ".d", 1, true, 0 }, - { AArch64::LD4i8_POST, "ld4", ".b", 2, true, 4 }, - { AArch64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, - { AArch64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, - { AArch64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, - { AArch64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, - { AArch64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, - { AArch64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, - { AArch64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, - { AArch64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, - { AArch64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, - { AArch64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, - { AArch64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, - { AArch64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, - { AArch64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, - { AArch64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, - { AArch64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, - { AArch64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, - { AArch64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, - { AArch64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, - { AArch64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, - { AArch64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, - { AArch64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, - { AArch64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, - { AArch64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, - { AArch64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, - { AArch64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, - { AArch64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, - { AArch64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, - { AArch64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, - { AArch64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, - { AArch64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, - { AArch64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, - { AArch64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, - { AArch64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, - { AArch64::ST1i8, "st1", ".b", 0, true, 0 }, - { AArch64::ST1i16, "st1", ".h", 0, true, 0 }, - { AArch64::ST1i32, "st1", ".s", 0, true, 0 }, - { AArch64::ST1i64, "st1", ".d", 0, true, 0 }, - { AArch64::ST1i8_POST, "st1", ".b", 1, true, 1 }, - { AArch64::ST1i16_POST, "st1", ".h", 1, true, 2 }, - { AArch64::ST1i32_POST, "st1", ".s", 1, true, 4 }, - { AArch64::ST1i64_POST, "st1", ".d", 1, true, 8 }, - { AArch64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, - { AArch64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, - { AArch64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, - { AArch64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, - { AArch64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, - { AArch64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, - { AArch64::ST1Onev2s, "st1", ".2s", 0, false, 0 }, - { AArch64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, - { AArch64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, - { AArch64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, - { AArch64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, - { AArch64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, - { AArch64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, - { AArch64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, - { AArch64::ST1Onev2s_POST, "st1", ".2s", 1, 
false, 8 }, - { AArch64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, - { AArch64::ST1Twov16b, "st1", ".16b", 0, false, 0 }, - { AArch64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, - { AArch64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, - { AArch64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, - { AArch64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, - { AArch64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, - { AArch64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, - { AArch64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, - { AArch64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, - { AArch64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 }, - { AArch64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, - { AArch64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, - { AArch64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, - { AArch64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, - { AArch64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, - { AArch64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, - { AArch64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, - { AArch64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, - { AArch64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, - { AArch64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, - { AArch64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, - { AArch64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, - { AArch64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, - { AArch64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, - { AArch64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, - { AArch64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, - { AArch64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, - { AArch64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, - { AArch64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, - { AArch64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, - { AArch64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, - { AArch64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, - { AArch64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, - { AArch64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, - { AArch64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, - { AArch64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, - { AArch64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, - { AArch64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, - { AArch64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, - { AArch64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, - { AArch64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, - { AArch64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, - { AArch64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, - { AArch64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, - { AArch64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 }, - { AArch64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, - { AArch64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, - { AArch64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, - { AArch64::ST2i8, "st2", ".b", 0, true, 0 }, - { AArch64::ST2i16, "st2", ".h", 0, true, 0 }, - { AArch64::ST2i32, "st2", ".s", 0, true, 0 }, - { AArch64::ST2i64, "st2", ".d", 0, true, 0 }, - { AArch64::ST2i8_POST, "st2", ".b", 1, true, 2 }, - { AArch64::ST2i16_POST, "st2", ".h", 1, true, 4 }, - { AArch64::ST2i32_POST, "st2", ".s", 1, true, 8 }, - { AArch64::ST2i64_POST, "st2", ".d", 1, true, 16 }, - { AArch64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, - { AArch64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, - { AArch64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, - { AArch64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, - { AArch64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, - { AArch64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, - { 
AArch64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, - { AArch64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, - { AArch64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, - { AArch64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, - { AArch64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, - { AArch64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, - { AArch64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, - { AArch64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, - { AArch64::ST3i8, "st3", ".b", 0, true, 0 }, - { AArch64::ST3i16, "st3", ".h", 0, true, 0 }, - { AArch64::ST3i32, "st3", ".s", 0, true, 0 }, - { AArch64::ST3i64, "st3", ".d", 0, true, 0 }, - { AArch64::ST3i8_POST, "st3", ".b", 1, true, 3 }, - { AArch64::ST3i16_POST, "st3", ".h", 1, true, 6 }, - { AArch64::ST3i32_POST, "st3", ".s", 1, true, 12 }, - { AArch64::ST3i64_POST, "st3", ".d", 1, true, 24 }, - { AArch64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, - { AArch64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, - { AArch64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, - { AArch64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, - { AArch64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, - { AArch64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, - { AArch64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, - { AArch64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, - { AArch64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, - { AArch64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, - { AArch64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, - { AArch64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, - { AArch64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, - { AArch64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, - { AArch64::ST4i8, "st4", ".b", 0, true, 0 }, - { AArch64::ST4i16, "st4", ".h", 0, true, 0 }, - { AArch64::ST4i32, "st4", ".s", 0, true, 0 }, - { AArch64::ST4i64, "st4", ".d", 0, true, 0 }, - { AArch64::ST4i8_POST, "st4", ".b", 1, true, 4 }, - { AArch64::ST4i16_POST, "st4", ".h", 1, true, 8 }, - { AArch64::ST4i32_POST, "st4", ".s", 1, true, 16 }, - { AArch64::ST4i64_POST, "st4", ".d", 1, true, 32 }, - { AArch64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, - { AArch64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, - { AArch64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, - { AArch64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, - { AArch64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, - { AArch64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, - { AArch64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, - { AArch64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, - { AArch64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, - { AArch64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 }, - { AArch64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, - { AArch64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, - { AArch64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, - { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, -}; - -static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { - unsigned Idx; - for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) - if (LdStNInstInfo[Idx].Opcode == Opcode) - return &LdStNInstInfo[Idx]; - - return nullptr; -} - -void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, - const MCSubtargetInfo &STI) { - unsigned Opcode = MI->getOpcode(); - StringRef Layout; - - bool IsTbx; - if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { - O << "\t" << (IsTbx ? 
"tbx" : "tbl") << Layout << '\t' - << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", "; - - unsigned ListOpNum = IsTbx ? 2 : 1; - printVectorList(MI, ListOpNum, STI, O, ""); - - O << ", " - << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg); - printAnnotation(O, Annot); - return; - } - - if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { - O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; - - // Now onto the operands: first a vector list with possible lane - // specifier. E.g. { v0 }[2] - int OpNum = LdStDesc->ListOperand; - printVectorList(MI, OpNum++, STI, O, ""); - - if (LdStDesc->HasLane) - O << '[' << MI->getOperand(OpNum++).getImm() << ']'; - - // Next the address: [xN] - unsigned AddrReg = MI->getOperand(OpNum++).getReg(); - O << ", [" << getRegisterName(AddrReg) << ']'; - - // Finally, there might be a post-indexed offset. - if (LdStDesc->NaturalOffset != 0) { - unsigned Reg = MI->getOperand(OpNum++).getReg(); - if (Reg != AArch64::XZR) - O << ", " << getRegisterName(Reg); - else { - assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); - O << ", #" << LdStDesc->NaturalOffset; - } - } - - printAnnotation(O, Annot); - return; - } - - AArch64InstPrinter::printInst(MI, O, Annot, STI); -} - -bool AArch64InstPrinter::printSysAlias(const MCInst *MI, - const MCSubtargetInfo &STI, - raw_ostream &O) { -#ifndef NDEBUG - unsigned Opcode = MI->getOpcode(); - assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); -#endif - - const MCOperand &Op1 = MI->getOperand(0); - const MCOperand &Cn = MI->getOperand(1); - const MCOperand &Cm = MI->getOperand(2); - const MCOperand &Op2 = MI->getOperand(3); - - unsigned Op1Val = Op1.getImm(); - unsigned CnVal = Cn.getImm(); - unsigned CmVal = Cm.getImm(); - unsigned Op2Val = Op2.getImm(); - - uint16_t Encoding = Op2Val; - Encoding |= CmVal << 3; - Encoding |= CnVal << 7; - Encoding |= Op1Val << 11; - - bool NeedsReg; - std::string Ins; - std::string Name; - - if (CnVal == 7) { - switch (CmVal) { - default: return false; - // Maybe IC, maybe Prediction Restriction - case 1: - switch (Op1Val) { - default: return false; - case 0: goto Search_IC; - case 3: goto Search_PRCTX; - } - // Prediction Restriction aliases - case 3: { - Search_PRCTX: - const AArch64PRCTX::PRCTX *PRCTX = AArch64PRCTX::lookupPRCTXByEncoding(Encoding >> 3); - if (!PRCTX || !PRCTX->haveFeatures(STI.getFeatureBits())) - return false; - - NeedsReg = PRCTX->NeedsReg; - switch (Op2Val) { - default: return false; - case 4: Ins = "cfp\t"; break; - case 5: Ins = "dvp\t"; break; - case 7: Ins = "cpp\t"; break; - } - Name = std::string(PRCTX->Name); - } - break; - // IC aliases - case 5: { - Search_IC: - const AArch64IC::IC *IC = AArch64IC::lookupICByEncoding(Encoding); - if (!IC || !IC->haveFeatures(STI.getFeatureBits())) - return false; - - NeedsReg = IC->NeedsReg; - Ins = "ic\t"; - Name = std::string(IC->Name); - } - break; - // DC aliases - case 4: case 6: case 10: case 11: case 12: case 13: case 14: - { - const AArch64DC::DC *DC = AArch64DC::lookupDCByEncoding(Encoding); - if (!DC || !DC->haveFeatures(STI.getFeatureBits())) - return false; - - NeedsReg = true; - Ins = "dc\t"; - Name = std::string(DC->Name); - } - break; - // AT aliases - case 8: case 9: { - const AArch64AT::AT *AT = AArch64AT::lookupATByEncoding(Encoding); - if (!AT || !AT->haveFeatures(STI.getFeatureBits())) - return false; - - NeedsReg = true; - Ins = "at\t"; - Name = std::string(AT->Name); - } - break; - } - } else if (CnVal == 
8) { - // TLBI aliases - const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding); - if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits())) - return false; - - NeedsReg = TLBI->NeedsReg; - Ins = "tlbi\t"; - Name = std::string(TLBI->Name); - } - else - return false; - - std::string Str = Ins + Name; - std::transform(Str.begin(), Str.end(), Str.begin(), ::tolower); - - O << '\t' << Str; - if (NeedsReg) - O << ", " << getRegisterName(MI->getOperand(4).getReg()); - - return true; -} - -void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg); - } else if (Op.isImm()) { - printImm(MI, OpNo, STI, O); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - Op.getExpr()->print(O, &MAI); - } -} - -void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - O << "#" << formatImm(Op.getImm()); -} - -void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - O << format("#%#llx", Op.getImm()); -} - -void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, - unsigned Imm, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - if (Reg == AArch64::XZR) - O << "#" << Imm; - else - O << getRegisterName(Reg); - } else - llvm_unreachable("unknown operand kind in printPostIncOperand64"); -} - -void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isReg() && "Non-register vreg operand!"); - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg, AArch64::vreg); -} - -void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); - O << "c" << Op.getImm(); -} - -void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - if (MO.isImm()) { - unsigned Val = (MO.getImm() & 0xfff); - assert(Val == MO.getImm() && "Add/sub immediate out of range!"); - unsigned Shift = - AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); - O << '#' << formatImm(Val); - if (Shift != 0) - printShifter(MI, OpNum + 1, STI, O); - - if (CommentStream) - *CommentStream << '=' << formatImm(Val << Shift) << '\n'; - } else { - assert(MO.isExpr() && "Unexpected operand type!"); - MO.getExpr()->print(O, &MAI); - printShifter(MI, OpNum + 1, STI, O); - } -} - -template -void AArch64InstPrinter::printLogicalImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint64_t Val = MI->getOperand(OpNum).getImm(); - O << "#0x"; - O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 8 * sizeof(T))); -} - -void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - // LSL #0 should not be printed. 
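The printSysAlias code above keys all of its table lookups (PRCTX, IC, DC, AT, TLBI) off a single 14-bit value packed from op1, CRn, CRm and op2. A hedged sketch of that packing, with the bit layout spelled out (the function name is illustrative, not LLVM API):

  #include <cstdint>

  static uint16_t packSysEncoding(unsigned Op1, unsigned Cn, unsigned Cm,
                                  unsigned Op2) {
    uint16_t Encoding = Op2;  // op2: bits [2:0]
    Encoding |= Cm << 3;      // CRm: bits [6:3]
    Encoding |= Cn << 7;      // CRn: bits [10:7]
    Encoding |= Op1 << 11;    // op1: bits [13:11]
    return Encoding;
  }

Note that the prediction-restriction (PRCTX) lookup above first drops the op2 field (Encoding >> 3), while the other tables match the full value, and CRn == 8 routes to the TLBI table.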
- if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL && - AArch64_AM::getShiftValue(Val) == 0) - return; - O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val)) - << " #" << AArch64_AM::getShiftValue(Val); -} - -void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printShifter(MI, OpNum + 1, STI, O); -} - -void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printArithExtend(MI, OpNum + 1, STI, O); -} - -void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val); - unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val); - - // If the destination or first source register operand is [W]SP, print - // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at - // all. - if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) { - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src1 = MI->getOperand(1).getReg(); - if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) && - ExtType == AArch64_AM::UXTX) || - ((Dest == AArch64::WSP || Src1 == AArch64::WSP) && - ExtType == AArch64_AM::UXTW) ) { - if (ShiftVal != 0) - O << ", lsl #" << ShiftVal; - return; - } - } - O << ", " << AArch64_AM::getShiftExtendName(ExtType); - if (ShiftVal != 0) - O << " #" << ShiftVal; -} - -static void printMemExtendImpl(bool SignExtend, bool DoShift, - unsigned Width, char SrcRegKind, - raw_ostream &O) { - // sxtw, sxtx, uxtw or lsl (== uxtx) - bool IsLSL = !SignExtend && SrcRegKind == 'x'; - if (IsLSL) - O << "lsl"; - else - O << (SignExtend ? 's' : 'u') << "xt" << SrcRegKind; - - if (DoShift || IsLSL) - O << " #" << Log2_32(Width / 8); -} - -void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum, - raw_ostream &O, char SrcRegKind, - unsigned Width) { - bool SignExtend = MI->getOperand(OpNum).getImm(); - bool DoShift = MI->getOperand(OpNum + 1).getImm(); - printMemExtendImpl(SignExtend, DoShift, Width, SrcRegKind, O); -} - -template -void AArch64InstPrinter::printRegWithShiftExtend(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printOperand(MI, OpNum, STI, O); - if (Suffix == 's' || Suffix == 'd') - O << '.' 
<< Suffix; - else - assert(Suffix == 0 && "Unsupported suffix size"); - - bool DoShift = ExtWidth != 8; - if (SignExtend || DoShift || SrcRegKind == 'w') { - O << ", "; - printMemExtendImpl(SignExtend, DoShift, ExtWidth, SrcRegKind, O); - } -} - -void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); - O << AArch64CC::getCondCodeName(CC); -} - -void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); - O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC)); -} - -void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; -} - -template -void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << '#' << formatImm(Scale * MI->getOperand(OpNum).getImm()); -} - -void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, - unsigned Scale, raw_ostream &O) { - const MCOperand MO = MI->getOperand(OpNum); - if (MO.isImm()) { - O << "#" << formatImm(MO.getImm() * Scale); - } else { - assert(MO.isExpr() && "Unexpected operand type!"); - MO.getExpr()->print(O, &MAI); - } -} - -void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, - unsigned Scale, raw_ostream &O) { - const MCOperand MO1 = MI->getOperand(OpNum + 1); - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); - if (MO1.isImm()) { - O << ", #" << formatImm(MO1.getImm() * Scale); - } else { - assert(MO1.isExpr() && "Unexpected operand type!"); - O << ", "; - MO1.getExpr()->print(O, &MAI); - } - O << ']'; -} - -template -void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned prfop = MI->getOperand(OpNum).getImm(); - if (IsSVEPrefetch) { - if (auto PRFM = AArch64SVEPRFM::lookupSVEPRFMByEncoding(prfop)) { - O << PRFM->Name; - return; - } - } else if (auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop)) { - O << PRFM->Name; - return; - } - - O << '#' << formatImm(prfop); -} - -void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned psbhintop = MI->getOperand(OpNum).getImm(); - auto PSB = AArch64PSBHint::lookupPSBByEncoding(psbhintop); - if (PSB) - O << PSB->Name; - else - O << '#' << formatImm(psbhintop); -} - -void AArch64InstPrinter::printBTIHintOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned btihintop = (MI->getOperand(OpNum).getImm() ^ 32) >> 1; - auto BTI = AArch64BTIHint::lookupBTIByEncoding(btihintop); - if (BTI) - O << BTI->Name; - else - O << '#' << formatImm(btihintop); -} - -void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - float FPImm = - MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm()); - - // 8 decimal places are enough to perfectly represent permitted floats. 
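The hint printers above (printPrefetchOp, printPSBHintOp, printBTIHintOp) all follow one fallback pattern: try a by-encoding lookup in a generated table for a symbolic operand name, and print the raw immediate only when no alias exists. A self-contained sketch of the pattern, assuming a hypothetical one-entry table with an illustrative encoding value (the real code uses TableGen-generated helpers such as lookupPSBByEncoding):

  #include <cstdio>

  struct HintEntry { unsigned Encoding; const char *Name; };
  static const HintEntry Hints[] = {{17, "csync"}};

  static const char *lookupHintName(unsigned Enc) {
    for (const auto &H : Hints)
      if (H.Encoding == Enc)
        return H.Name;
    return nullptr; // unknown encoding: caller falls back to the immediate
  }

  static void printHint(unsigned Imm) {
    if (const char *Name = lookupHintName(Imm))
      std::printf("%s", Name);
    else
      std::printf("#%u", Imm);
  }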
- O << format("#%.8f", FPImm); -} - -static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { - while (Stride--) { - switch (Reg) { - default: - llvm_unreachable("Vector register expected!"); - case AArch64::Q0: Reg = AArch64::Q1; break; - case AArch64::Q1: Reg = AArch64::Q2; break; - case AArch64::Q2: Reg = AArch64::Q3; break; - case AArch64::Q3: Reg = AArch64::Q4; break; - case AArch64::Q4: Reg = AArch64::Q5; break; - case AArch64::Q5: Reg = AArch64::Q6; break; - case AArch64::Q6: Reg = AArch64::Q7; break; - case AArch64::Q7: Reg = AArch64::Q8; break; - case AArch64::Q8: Reg = AArch64::Q9; break; - case AArch64::Q9: Reg = AArch64::Q10; break; - case AArch64::Q10: Reg = AArch64::Q11; break; - case AArch64::Q11: Reg = AArch64::Q12; break; - case AArch64::Q12: Reg = AArch64::Q13; break; - case AArch64::Q13: Reg = AArch64::Q14; break; - case AArch64::Q14: Reg = AArch64::Q15; break; - case AArch64::Q15: Reg = AArch64::Q16; break; - case AArch64::Q16: Reg = AArch64::Q17; break; - case AArch64::Q17: Reg = AArch64::Q18; break; - case AArch64::Q18: Reg = AArch64::Q19; break; - case AArch64::Q19: Reg = AArch64::Q20; break; - case AArch64::Q20: Reg = AArch64::Q21; break; - case AArch64::Q21: Reg = AArch64::Q22; break; - case AArch64::Q22: Reg = AArch64::Q23; break; - case AArch64::Q23: Reg = AArch64::Q24; break; - case AArch64::Q24: Reg = AArch64::Q25; break; - case AArch64::Q25: Reg = AArch64::Q26; break; - case AArch64::Q26: Reg = AArch64::Q27; break; - case AArch64::Q27: Reg = AArch64::Q28; break; - case AArch64::Q28: Reg = AArch64::Q29; break; - case AArch64::Q29: Reg = AArch64::Q30; break; - case AArch64::Q30: Reg = AArch64::Q31; break; - // Vector lists can wrap around. - case AArch64::Q31: - Reg = AArch64::Q0; - break; - case AArch64::Z0: Reg = AArch64::Z1; break; - case AArch64::Z1: Reg = AArch64::Z2; break; - case AArch64::Z2: Reg = AArch64::Z3; break; - case AArch64::Z3: Reg = AArch64::Z4; break; - case AArch64::Z4: Reg = AArch64::Z5; break; - case AArch64::Z5: Reg = AArch64::Z6; break; - case AArch64::Z6: Reg = AArch64::Z7; break; - case AArch64::Z7: Reg = AArch64::Z8; break; - case AArch64::Z8: Reg = AArch64::Z9; break; - case AArch64::Z9: Reg = AArch64::Z10; break; - case AArch64::Z10: Reg = AArch64::Z11; break; - case AArch64::Z11: Reg = AArch64::Z12; break; - case AArch64::Z12: Reg = AArch64::Z13; break; - case AArch64::Z13: Reg = AArch64::Z14; break; - case AArch64::Z14: Reg = AArch64::Z15; break; - case AArch64::Z15: Reg = AArch64::Z16; break; - case AArch64::Z16: Reg = AArch64::Z17; break; - case AArch64::Z17: Reg = AArch64::Z18; break; - case AArch64::Z18: Reg = AArch64::Z19; break; - case AArch64::Z19: Reg = AArch64::Z20; break; - case AArch64::Z20: Reg = AArch64::Z21; break; - case AArch64::Z21: Reg = AArch64::Z22; break; - case AArch64::Z22: Reg = AArch64::Z23; break; - case AArch64::Z23: Reg = AArch64::Z24; break; - case AArch64::Z24: Reg = AArch64::Z25; break; - case AArch64::Z25: Reg = AArch64::Z26; break; - case AArch64::Z26: Reg = AArch64::Z27; break; - case AArch64::Z27: Reg = AArch64::Z28; break; - case AArch64::Z28: Reg = AArch64::Z29; break; - case AArch64::Z29: Reg = AArch64::Z30; break; - case AArch64::Z30: Reg = AArch64::Z31; break; - // Vector lists can wrap around. 
- case AArch64::Z31: - Reg = AArch64::Z0; - break; - } - } - return Reg; -} - -template -void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_assert(size == 64 || size == 32, - "Template parameter must be either 32 or 64"); - unsigned Reg = MI->getOperand(OpNum).getReg(); - - unsigned Sube = (size == 32) ? AArch64::sube32 : AArch64::sube64; - unsigned Subo = (size == 32) ? AArch64::subo32 : AArch64::subo64; - - unsigned Even = MRI.getSubReg(Reg, Sube); - unsigned Odd = MRI.getSubReg(Reg, Subo); - O << getRegisterName(Even) << ", " << getRegisterName(Odd); -} - -void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O, - StringRef LayoutSuffix) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - - O << "{ "; - - // Work out how many registers there are in the list (if there is an actual - // list). - unsigned NumRegs = 1; - if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) || - MRI.getRegClass(AArch64::ZPR2RegClassID).contains(Reg) || - MRI.getRegClass(AArch64::QQRegClassID).contains(Reg)) - NumRegs = 2; - else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) || - MRI.getRegClass(AArch64::ZPR3RegClassID).contains(Reg) || - MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg)) - NumRegs = 3; - else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) || - MRI.getRegClass(AArch64::ZPR4RegClassID).contains(Reg) || - MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg)) - NumRegs = 4; - - // Now forget about the list and find out what the first register is. - if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0)) - Reg = FirstReg; - else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0)) - Reg = FirstReg; - else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::zsub0)) - Reg = FirstReg; - - // If it's a D-reg, we need to promote it to the equivalent Q-reg before - // printing (otherwise getRegisterName fails). 
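// (the "vreg" alternate names v0..v31 used below are keyed to the
// 128-bit Q registers, hence the getMatchingSuperReg promotion)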
- if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) { - const MCRegisterClass &FPR128RC = - MRI.getRegClass(AArch64::FPR128RegClassID); - Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC); - } - - for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) { - if (MRI.getRegClass(AArch64::ZPRRegClassID).contains(Reg)) - O << getRegisterName(Reg) << LayoutSuffix; - else - O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix; - - if (i + 1 != NumRegs) - O << ", "; - } - - O << " }"; -} - -void -AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printVectorList(MI, OpNum, STI, O, ""); -} - -template -void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - std::string Suffix("."); - if (NumLanes) - Suffix += itostr(NumLanes) + LaneKind; - else - Suffix += LaneKind; - - printVectorList(MI, OpNum, STI, O, Suffix); -} - -void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << "[" << MI->getOperand(OpNum).getImm() << "]"; -} - -void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << formatImm(Op.getImm() * 4); - return; - } - - // If the branch target is simply an address then print it in hex. - const MCConstantExpr *BranchTarget = - dyn_cast(MI->getOperand(OpNum).getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); - } else { - // Otherwise, just print the expression. - MI->getOperand(OpNum).getExpr()->print(O, &MAI); - } -} - -void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << formatImm(Op.getImm() * (1 << 12)); - return; - } - - // Otherwise, just print the expression. - MI->getOperand(OpNum).getExpr()->print(O, &MAI); -} - -void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - unsigned Opcode = MI->getOpcode(); - - StringRef Name; - if (Opcode == AArch64::ISB) { - auto ISB = AArch64ISB::lookupISBByEncoding(Val); - Name = ISB ? ISB->Name : ""; - } else if (Opcode == AArch64::TSB) { - auto TSB = AArch64TSB::lookupTSBByEncoding(Val); - Name = TSB ? TSB->Name : ""; - } else { - auto DB = AArch64DB::lookupDBByEncoding(Val); - Name = DB ? DB->Name : ""; - } - if (!Name.empty()) - O << Name; - else - O << "#" << Val; -} - -void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - // Horrible hack for the one register that has identical encodings but - // different names in MSR and MRS. 
Because of this, one of MRS and MSR is - // going to get the wrong entry - if (Val == AArch64SysReg::DBGDTRRX_EL0) { - O << "DBGDTRRX_EL0"; - return; - } - - const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); - if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits())) - O << Reg->Name; - else - O << AArch64SysReg::genericRegisterString(Val); -} - -void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - // Horrible hack for the one register that has identical encodings but - // different names in MSR and MRS. Because of this, one of MRS and MSR is - // going to get the wrong entry - if (Val == AArch64SysReg::DBGDTRTX_EL0) { - O << "DBGDTRTX_EL0"; - return; - } - - const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); - if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits())) - O << Reg->Name; - else - O << AArch64SysReg::genericRegisterString(Val); -} - -void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - auto PState = AArch64PState::lookupPStateByEncoding(Val); - if (PState && PState->haveFeatures(STI.getFeatureBits())) - O << PState->Name; - else - O << "#" << formatImm(Val); -} - -void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned RawVal = MI->getOperand(OpNo).getImm(); - uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal); - O << format("#%#016llx", Val); -} - -template -void AArch64InstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - O << "#" << (Val * Angle) + Remainder; -} - -void AArch64InstPrinter::printSVEPattern(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - if (auto Pat = AArch64SVEPredPattern::lookupSVEPREDPATByEncoding(Val)) - O << Pat->Name; - else - O << '#' << formatImm(Val); -} - -template -void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - switch (suffix) { - case 0: - case 'b': - case 'h': - case 's': - case 'd': - case 'q': - break; - default: llvm_unreachable("Invalid kind specifier."); - } - - unsigned Reg = MI->getOperand(OpNum).getReg(); - O << getRegisterName(Reg); - if (suffix != 0) - O << '.' << suffix; -} - -template -void AArch64InstPrinter::printImmSVE(T Value, raw_ostream &O) { - typename std::make_unsigned::type HexValue = Value; - - if (getPrintImmHex()) - O << '#' << formatHex((uint64_t)HexValue); - else - O << '#' << formatDec(Value); - - if (CommentStream) { - // Do the opposite to that used for instruction operands. 
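// (e.g. an operand printed as "#0x2a" gets an "=42" comment and one
// printed as "#42" gets "=0x2a", so both radices are always visible)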
- if (getPrintImmHex()) - *CommentStream << '=' << formatDec(HexValue) << '\n'; - else - *CommentStream << '=' << formatHex((uint64_t)Value) << '\n'; - } -} - -template -void AArch64InstPrinter::printImm8OptLsl(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned UnscaledVal = MI->getOperand(OpNum).getImm(); - unsigned Shift = MI->getOperand(OpNum + 1).getImm(); - assert(AArch64_AM::getShiftType(Shift) == AArch64_AM::LSL && - "Unexepected shift type!"); - - // #0 lsl #8 is never pretty printed - if ((UnscaledVal == 0) && (AArch64_AM::getShiftValue(Shift) != 0)) { - O << '#' << formatImm(UnscaledVal); - printShifter(MI, OpNum + 1, STI, O); - return; - } - - T Val; - if (std::is_signed()) - Val = (int8_t)UnscaledVal * (1 << AArch64_AM::getShiftValue(Shift)); - else - Val = (uint8_t)UnscaledVal * (1 << AArch64_AM::getShiftValue(Shift)); - - printImmSVE(Val, O); -} - -template -void AArch64InstPrinter::printSVELogicalImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - typedef typename std::make_signed::type SignedT; - typedef typename std::make_unsigned::type UnsignedT; - - uint64_t Val = MI->getOperand(OpNum).getImm(); - UnsignedT PrintVal = AArch64_AM::decodeLogicalImmediate(Val, 64); - - // Prefer the default format for 16bit values, hex otherwise. - if ((int16_t)PrintVal == (SignedT)PrintVal) - printImmSVE((T)PrintVal, O); - else if ((uint16_t)PrintVal == PrintVal) - printImmSVE(PrintVal, O); - else - O << '#' << formatHex((uint64_t)PrintVal); -} - -template -void AArch64InstPrinter::printZPRasFPR(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Base; - switch (Width) { - case 8: Base = AArch64::B0; break; - case 16: Base = AArch64::H0; break; - case 32: Base = AArch64::S0; break; - case 64: Base = AArch64::D0; break; - case 128: Base = AArch64::Q0; break; - default: - llvm_unreachable("Unsupported width"); - } - unsigned Reg = MI->getOperand(OpNum).getReg(); - O << getRegisterName(Reg - AArch64::Z0 + Base); -} - -template -void AArch64InstPrinter::printExactFPImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - auto *Imm0Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmIs0); - auto *Imm1Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmIs1); - unsigned Val = MI->getOperand(OpNum).getImm(); - O << "#" << (Val ? Imm1Desc->Repr : Imm0Desc->Repr); -} - -void AArch64InstPrinter::printGPR64as32(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - O << getRegisterName(getWRegFromXReg(Reg)); -} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h deleted file mode 100644 index 4e9982f5b7be..000000000000 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ /dev/null @@ -1,223 +0,0 @@ -//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an AArch64 MCInst to a .s file. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H -#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H - -#include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" -#include "../Utils/AArch64BaseInfo.h" - -namespace llvm { - -class AArch64InstPrinter : public MCInstPrinter { -public: - AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - - // Autogenerated by tblgen. - virtual void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, - const MCSubtargetInfo &STI, - raw_ostream &O); - - virtual StringRef getRegName(unsigned RegNo) const { - return getRegisterName(RegNo); - } - - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = AArch64::NoRegAltName); - -protected: - bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - // Operand printers - void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - template void printImmSVE(T Value, raw_ostream &O); - void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, - raw_ostream &O); - template - void printPostIncOperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printPostIncOperand(MI, OpNo, Amount, O); - } - - void printVRegOperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printSysCROperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAddSubImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printLogicalImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printShifter(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printShiftedRegister(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printExtendedRegister(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printArithExtend(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O, - char SrcRegKind, unsigned Width); - template - void printMemExtend(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O) { - printMemExtend(MI, OpNum, O, SrcRegKind, Width); - } - template - void printRegWithShiftExtend(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printCondCode(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printInverseCondCode(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAlignedLabel(const MCInst *MI, unsigned 
OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, - raw_ostream &O); - void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale, - raw_ostream &O); - - template - void printUImm12Offset(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O) { - printUImm12Offset(MI, OpNum, Scale, O); - } - - template - void printAMIndexedWB(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O) { - printAMIndexedWB(MI, OpNum, BitWidth / 8, O); - } - - void printAMNoIndex(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - template - void printImmScale(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - template - void printPrefetchOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printPSBHintOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printBTIHintOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printFPImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printVectorList(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O, - StringRef LayoutSuffix); - - /// Print a list of vector registers where the type suffix is implicit - /// (i.e. attached to the instruction rather than the registers). - void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - - template - void printTypedVectorList(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printVectorIndex(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAdrpLabel(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printBarrierOption(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printSystemPStateField(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printComplexRotationOp(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - template - void printImm8OptLsl(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printSVELogicalImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printSVEPattern(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printSVERegOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printGPR64as32(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printZPRasFPR(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printExactFPImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); -}; - 
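Nearly every templated printer declared above pairs a runtime worker with a thin wrapper whose non-type template parameter is baked in, so that the TableGen-chosen PrintMethod for an operand class can name one concrete instantiation. A minimal self-contained sketch of that dispatch pattern (illustrative names, not LLVM's own):

  #include <cstdint>
  #include <iostream>

  // Worker: takes the scale at run time, as printUImm12Offset does.
  static void printScaledImm(int64_t Imm, unsigned Scale, std::ostream &OS) {
    OS << '#' << Imm * Scale;
  }

  // Wrapper: fixes the scale at compile time; generated code calls one
  // such instantiation per operand class.
  template <unsigned Scale>
  static void printScaledImm(int64_t Imm, std::ostream &OS) {
    printScaledImm(Imm, Scale, OS);
  }

  int main() {
    printScaledImm<8>(2, std::cout); // a doubleword-scaled offset: "#16"
    std::cout << '\n';
  }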
-class AArch64AppleInstPrinter : public AArch64InstPrinter { -public: - AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - - void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O) override; - bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O) override; - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, - const MCSubtargetInfo &STI, - raw_ostream &O) override; - - StringRef getRegName(unsigned RegNo) const override { - return getRegisterName(RegNo); - } - - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = AArch64::NoRegAltName); -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 688ca755d0b5..05a909f1780a 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -1,9 +1,8 @@ //===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index ed89d991d9fb..6418211a4f55 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -1,15 +1,15 @@ //===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "AArch64.h" #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" @@ -22,8 +22,10 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" using namespace llvm; namespace { @@ -42,6 +44,8 @@ public: return AArch64::NumTargetFixupKinds; } + Optional getFixupKind(StringRef Name) const override; + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { // This table *must* be in the order that the fixup_* kinds are defined @@ -104,6 +108,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { default: llvm_unreachable("Unknown fixup kind!"); + case FK_NONE: case AArch64::fixup_aarch64_tlsdesc_call: return 0; @@ -274,7 +279,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, if (RefKind & AArch64MCExpr::VK_NC) { Value &= 0xFFFF; } - else if (RefKind & AArch64MCExpr::VK_SABS) { + else if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) { if (SignedValue > 0xFFFF || SignedValue < -0xFFFF) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); @@ -305,6 +310,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, if (Value & 0x3) Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3ffffff; + case FK_NONE: case FK_Data_1: case FK_Data_2: case FK_Data_4: @@ -315,6 +321,12 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, } } +Optional AArch64AsmBackend::getFixupKind(StringRef Name) const { + if (TheTriple.isOSBinFormatELF() && Name == "R_AARCH64_NONE") + return FK_NONE; + return MCAsmBackend::getFixupKind(Name); +} + /// getFixupKindContainereSizeInBytes - The number of bytes of the /// container involved in big endian or 0 if the item is little endian unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const { @@ -398,7 +410,7 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, // handle this more cleanly. This may affect the output of -show-mc-encoding. AArch64MCExpr::VariantKind RefKind = static_cast(Target.getRefKind()); - if (RefKind & AArch64MCExpr::VK_SABS) { + if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) { // If the immediate is negative, generate MOVN else MOVZ. // (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ. if (SignedValue < 0) @@ -446,6 +458,10 @@ bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) { + unsigned Kind = Fixup.getKind(); + if (Kind == FK_NONE) + return true; + // The ADRP instruction adds some multiple of 0x1000 to the current PC & // ~0xfff. This means that the required offset to reach a symbol can vary by // up to one step depending on where the ADRP is in memory. 
For example: @@ -458,14 +474,14 @@ bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, // same page as the ADRP and the instruction should encode 0x0. Assuming the // section isn't 0x1000-aligned, we therefore need to delegate this decision // to the linker -- a relocation! - if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) + if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21) return true; AArch64MCExpr::VariantKind RefKind = static_cast(Target.getRefKind()); AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); // LDR GOT relocations need a relocation - if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_ldr_pcrel_imm19 && + if (Kind == AArch64::fixup_aarch64_ldr_pcrel_imm19 && SymLoc == AArch64MCExpr::VK_GOT) return true; return false; @@ -513,6 +529,7 @@ enum CompactUnwindEncodings { // FIXME: This should be in a separate file. class DarwinAArch64AsmBackend : public AArch64AsmBackend { const MCRegisterInfo &MRI; + bool IsILP32; /// Encode compact unwind stack adjustment for frameless functions. /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. @@ -523,13 +540,18 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { public: DarwinAArch64AsmBackend(const Target &T, const Triple &TT, - const MCRegisterInfo &MRI) - : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI) {} + const MCRegisterInfo &MRI, bool IsILP32) + : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI), + IsILP32(IsILP32) {} std::unique_ptr createObjectTargetWriter() const override { - return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64, - MachO::CPU_SUBTYPE_ARM64_ALL); + if (IsILP32) + return createAArch64MachObjectWriter( + MachO::CPU_TYPE_ARM64_32, MachO::CPU_SUBTYPE_ARM64_32_V8, true); + else + return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64, + MachO::CPU_SUBTYPE_ARM64_ALL, false); } /// Generate the compact unwind encoding from the CFI directives. @@ -711,8 +733,10 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, const MCRegisterInfo &MRI, const MCTargetOptions &Options) { const Triple &TheTriple = STI.getTargetTriple(); - if (TheTriple.isOSBinFormatMachO()) - return new DarwinAArch64AsmBackend(T, TheTriple, MRI); + if (TheTriple.isOSBinFormatMachO()) { + const bool IsILP32 = TheTriple.isArch32Bit(); + return new DarwinAArch64AsmBackend(T, TheTriple, MRI, IsILP32); + } if (TheTriple.isOSBinFormatCOFF()) return new COFFAArch64AsmBackend(T, TheTriple); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 2ccd7cef8bef..c871e2c62eac 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -186,6 +185,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx)) return ELF::R_AARCH64_NONE; switch ((unsigned)Fixup.getKind()) { + case FK_NONE: + return ELF::R_AARCH64_NONE; case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 9a7e34b0aeb1..c33f7e957b54 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -1,9 +1,8 @@ //===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -103,8 +102,8 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - bool) override { + void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) override { EmitA64MappingSymbol(); MCELFStreamer::EmitInstruction(Inst, STI); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h index d5b009ec30d1..25c609ee1496 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h @@ -1,9 +1,8 @@ //===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index 4293dcba955e..fe8043fe5ec0 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -1,9 +1,8 @@ //===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp new file mode 100644 index 000000000000..d0a544273b8b --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -0,0 +1,1587 @@ +//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an AArch64 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter.inc" +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter1.inc" + +AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + +AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : AArch64InstPrinter(MAI, MII, MRI) {} + +void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + // This is for .cfi directives. + OS << getRegisterName(RegNo); +} + +void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, + const MCSubtargetInfo &STI) { + // Check for special encodings and print the canonical alias instead. + + unsigned Opcode = MI->getOpcode(); + + if (Opcode == AArch64::SYSxt) + if (printSysAlias(MI, STI, O)) { + printAnnotation(O, Annot); + return; + } + + // SBFM/UBFM should print to a nicer aliased form if possible. 
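// The checks that follow rely on the standard alias identities; for the
// 32-bit forms (a sketch, not exhaustive):
//   lsl wd, wn, #s  ==  ubfm wd, wn, #((32 - s) % 32), #(31 - s)
//                       -> recognised via imms + 1 == immr
//   lsr wd, wn, #s  ==  ubfm wd, wn, #s, #31
//   asr wd, wn, #s  ==  sbfm wd, wn, #s, #31
//                       -> recognised via imms == 0x1f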
+ if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri || + Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) { + const MCOperand &Op0 = MI->getOperand(0); + const MCOperand &Op1 = MI->getOperand(1); + const MCOperand &Op2 = MI->getOperand(2); + const MCOperand &Op3 = MI->getOperand(3); + + bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri); + bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri); + if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + + switch (Op3.getImm()) { + default: + break; + case 7: + if (IsSigned) + AsmMnemonic = "sxtb"; + else if (!Is64Bit) + AsmMnemonic = "uxtb"; + break; + case 15: + if (IsSigned) + AsmMnemonic = "sxth"; + else if (!Is64Bit) + AsmMnemonic = "uxth"; + break; + case 31: + // *xtw is only valid for signed 64-bit operations. + if (Is64Bit && IsSigned) + AsmMnemonic = "sxtw"; + break; + } + + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); + printAnnotation(O, Annot); + return; + } + } + + // All immediate shifts are aliases, implemented using the Bitfield + // instruction. In all cases the immediate shift amount shift must be in + // the range 0 to (reg.size -1). + if (Op2.isImm() && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + int shift = 0; + int64_t immr = Op2.getImm(); + int64_t imms = Op3.getImm(); + if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { + AsmMnemonic = "lsl"; + shift = 31 - imms; + } else if (Opcode == AArch64::UBFMXri && imms != 0x3f && + ((imms + 1 == immr))) { + AsmMnemonic = "lsl"; + shift = 63 - imms; + } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) { + AsmMnemonic = "asr"; + shift = immr; + } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) { + AsmMnemonic = "asr"; + shift = immr; + } + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; + printAnnotation(O, Annot); + return; + } + } + + // SBFIZ/UBFIZ aliases + if (Op2.getImm() > Op3.getImm()) { + O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; + printAnnotation(O, Annot); + return; + } + + // Otherwise SBFX/UBFX is the preferred form + O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; + printAnnotation(O, Annot); + return; + } + + if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) { + const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 + const MCOperand &Op2 = MI->getOperand(2); + int ImmR = MI->getOperand(3).getImm(); + int ImmS = MI->getOperand(4).getImm(); + + if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) && + (ImmR == 0 || ImmS < ImmR)) { + // BFC takes precedence over its entire range, sligtly differently to BFI. + int BitWidth = Opcode == AArch64::BFMXri ? 
64 : 32; + int LSB = (BitWidth - ImmR) % BitWidth; + int Width = ImmS + 1; + + O << "\tbfc\t" << getRegisterName(Op0.getReg()) + << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } else if (ImmS < ImmR) { + // BFI alias + int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32; + int LSB = (BitWidth - ImmR) % BitWidth; + int Width = ImmS + 1; + + O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " + << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } + + int LSB = ImmR; + int Width = ImmS - ImmR + 1; + // Otherwise BFXIL the preferred form + O << "\tbfxil\t" + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) + << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } + + // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift + // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be + // printed. + if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi || + Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isExpr()) { + if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) + O << "\tmovz\t"; + else + O << "\tmovn\t"; + + O << getRegisterName(MI->getOperand(0).getReg()) << ", #"; + MI->getOperand(1).getExpr()->print(O, &MAI); + return; + } + + if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) && + MI->getOperand(2).isExpr()) { + O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"; + MI->getOperand(2).getExpr()->print(O, &MAI); + return; + } + + // MOVZ, MOVN and "ORR wzr, #imm" instructions are aliases for MOV, but their + // domains overlap so they need to be prioritized. The chain is "MOVZ lsl #0 > + // MOVZ lsl #N > MOVN lsl #0 > MOVN lsl #N > ORR". The highest instruction + // that can represent the move is the MOV alias, and the rest get printed + // normally. + if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) && + MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::MOVZXi ? 64 : 32; + int Shift = MI->getOperand(2).getImm(); + uint64_t Value = (uint64_t)MI->getOperand(1).getImm() << Shift; + + if (AArch64_AM::isMOVZMovAlias(Value, Shift, + Opcode == AArch64::MOVZXi ? 64 : 32)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + + if ((Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::MOVNXi ? 64 : 32; + int Shift = MI->getOperand(2).getImm(); + uint64_t Value = ~((uint64_t)MI->getOperand(1).getImm() << Shift); + if (RegWidth == 32) + Value = Value & 0xffffffff; + + if (AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + + if ((Opcode == AArch64::ORRXri || Opcode == AArch64::ORRWri) && + (MI->getOperand(1).getReg() == AArch64::XZR || + MI->getOperand(1).getReg() == AArch64::WZR) && + MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::ORRXri ? 
64 : 32; + uint64_t Value = AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), RegWidth); + if (!AArch64_AM::isAnyMOVWMovAlias(Value, RegWidth)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + + if (Opcode == AArch64::CompilerBarrier) { + O << '\t' << MAI.getCommentString() << " COMPILER BARRIER"; + printAnnotation(O, Annot); + return; + } + + // Instruction TSB is specified as a one operand instruction, but 'csync' is + // not encoded, so for printing it is treated as a special case here: + if (Opcode == AArch64::TSB) { + O << "\ttsb\tcsync"; + return; + } + + if (!printAliasInstr(MI, STI, O)) + printInstruction(MI, STI, O); + + printAnnotation(O, Annot); + + if (atomicBarrierDroppedOnZero(Opcode) && + (MI->getOperand(0).getReg() == AArch64::XZR || + MI->getOperand(0).getReg() == AArch64::WZR)) { + printAnnotation(O, "acquire semantics dropped since destination is zero"); + } +} + +static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, + bool &IsTbx) { + switch (Opcode) { + case AArch64::TBXv8i8One: + case AArch64::TBXv8i8Two: + case AArch64::TBXv8i8Three: + case AArch64::TBXv8i8Four: + IsTbx = true; + Layout = ".8b"; + return true; + case AArch64::TBLv8i8One: + case AArch64::TBLv8i8Two: + case AArch64::TBLv8i8Three: + case AArch64::TBLv8i8Four: + IsTbx = false; + Layout = ".8b"; + return true; + case AArch64::TBXv16i8One: + case AArch64::TBXv16i8Two: + case AArch64::TBXv16i8Three: + case AArch64::TBXv16i8Four: + IsTbx = true; + Layout = ".16b"; + return true; + case AArch64::TBLv16i8One: + case AArch64::TBLv16i8Two: + case AArch64::TBLv16i8Three: + case AArch64::TBLv16i8Four: + IsTbx = false; + Layout = ".16b"; + return true; + default: + return false; + } +} + +struct LdStNInstrDesc { + unsigned Opcode; + const char *Mnemonic; + const char *Layout; + int ListOperand; + bool HasLane; + int NaturalOffset; +}; + +static const LdStNInstrDesc LdStNInstInfo[] = { + { AArch64::LD1i8, "ld1", ".b", 1, true, 0 }, + { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, + { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, + { AArch64::LD1i64, "ld1", ".d", 1, true, 0 }, + { AArch64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, + { AArch64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, + { AArch64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, + { AArch64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, + { AArch64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, + { AArch64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, + { AArch64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, + { AArch64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, + { AArch64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, + { AArch64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, + { AArch64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, + { AArch64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, + { AArch64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, + { AArch64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, + { AArch64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, + { AArch64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, + { AArch64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, + { AArch64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, + { AArch64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, + { AArch64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 }, + { AArch64::LD1Onev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Onev8b, "ld1", ".8b", 0, false, 
0 }, + { AArch64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, + { AArch64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, + { AArch64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 }, + { AArch64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, + { AArch64::LD1Onev8b_POST, "ld1", ".8b", 1, false, 8 }, + { AArch64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, + { AArch64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, + { AArch64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, + { AArch64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, + { AArch64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, + { AArch64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, + { AArch64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, + { AArch64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, + { AArch64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, + { AArch64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, + { AArch64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, + { AArch64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, + { AArch64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, + { AArch64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, + { AArch64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, + { AArch64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, + { AArch64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, + { AArch64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, + { AArch64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, + { AArch64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, + { AArch64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, + { AArch64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, + { AArch64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, + { AArch64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, + { AArch64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, + { AArch64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 }, + { AArch64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, + { AArch64::LD2i8, "ld2", ".b", 1, true, 0 }, + { AArch64::LD2i16, "ld2", ".h", 1, true, 0 }, + { AArch64::LD2i32, "ld2", ".s", 1, true, 0 }, + { AArch64::LD2i64, "ld2", 
".d", 1, true, 0 }, + { AArch64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, + { AArch64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, + { AArch64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, + { AArch64::LD2i64_POST, "ld2", ".d", 2, true, 16 }, + { AArch64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, + { AArch64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 }, + { AArch64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, + { AArch64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, + { AArch64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, + { AArch64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, + { AArch64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, + { AArch64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, + { AArch64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, + { AArch64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, + { AArch64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, + { AArch64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, + { AArch64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, + { AArch64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, + { AArch64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, + { AArch64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, + { AArch64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, + { AArch64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, + { AArch64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, + { AArch64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, + { AArch64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, + { AArch64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, + { AArch64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, + { AArch64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, + { AArch64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, + { AArch64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, + { AArch64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, + { AArch64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, + { AArch64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, + { AArch64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, + { AArch64::LD3i8, "ld3", ".b", 1, true, 0 }, + { AArch64::LD3i16, "ld3", ".h", 1, true, 0 }, + { AArch64::LD3i32, "ld3", ".s", 1, true, 0 }, + { AArch64::LD3i64, "ld3", ".d", 1, true, 0 }, + { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, + { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, + { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, + { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, + { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, + { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, + { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, + { AArch64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, + { AArch64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, + { AArch64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, + { AArch64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, + { AArch64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, + { AArch64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, + { AArch64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, + { AArch64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, + { AArch64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, + { AArch64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, + { AArch64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, + { AArch64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, + { AArch64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, + { AArch64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, + { AArch64::LD3Threev8h, "ld3", ".8h", 0, false, 0 }, + { AArch64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, + { AArch64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, + { AArch64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, + { AArch64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, + { AArch64::LD3Threev2s, "ld3", ".2s", 
0, false, 0 }, + { AArch64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, + { AArch64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, + { AArch64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, + { AArch64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, + { AArch64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 }, + { AArch64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, + { AArch64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, + { AArch64::LD4i8, "ld4", ".b", 1, true, 0 }, + { AArch64::LD4i16, "ld4", ".h", 1, true, 0 }, + { AArch64::LD4i32, "ld4", ".s", 1, true, 0 }, + { AArch64::LD4i64, "ld4", ".d", 1, true, 0 }, + { AArch64::LD4i8_POST, "ld4", ".b", 2, true, 4 }, + { AArch64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, + { AArch64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, + { AArch64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, + { AArch64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, + { AArch64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, + { AArch64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, + { AArch64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, + { AArch64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, + { AArch64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, + { AArch64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, + { AArch64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, + { AArch64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, + { AArch64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, + { AArch64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, + { AArch64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, + { AArch64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, + { AArch64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, + { AArch64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, + { AArch64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, + { AArch64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, + { AArch64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, + { AArch64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, + { AArch64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, + { AArch64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, + { AArch64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, + { AArch64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, + { AArch64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, + { AArch64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, + { AArch64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, + { AArch64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, + { AArch64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, + { AArch64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, + { AArch64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, + { AArch64::ST1i8, "st1", ".b", 0, true, 0 }, + { AArch64::ST1i16, "st1", ".h", 0, true, 0 }, + { AArch64::ST1i32, "st1", ".s", 0, true, 0 }, + { AArch64::ST1i64, "st1", ".d", 0, true, 0 }, + { AArch64::ST1i8_POST, "st1", ".b", 1, true, 1 }, + { AArch64::ST1i16_POST, "st1", ".h", 1, true, 2 }, + { AArch64::ST1i32_POST, "st1", ".s", 1, true, 4 }, + { AArch64::ST1i64_POST, "st1", ".d", 1, true, 8 }, + { AArch64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Onev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, + { AArch64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, + { AArch64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, + { 
AArch64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, + { AArch64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, + { AArch64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, + { AArch64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 }, + { AArch64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, + { AArch64::ST1Twov16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, + { AArch64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 }, + { AArch64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, + { AArch64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, + { AArch64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, + { AArch64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, + { AArch64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, + { AArch64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, + { AArch64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, + { AArch64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, + { AArch64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, + { AArch64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, + { AArch64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, + { AArch64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, + { AArch64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, + { AArch64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, + { AArch64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, + { AArch64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, + { AArch64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, + { AArch64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, + { AArch64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 }, + { AArch64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, + { AArch64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, + { AArch64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, + { AArch64::ST2i8, "st2", ".b", 0, true, 0 }, + { AArch64::ST2i16, "st2", ".h", 0, true, 0 }, + { AArch64::ST2i32, "st2", ".s", 0, true, 0 }, + { AArch64::ST2i64, "st2", ".d", 0, true, 0 }, + { AArch64::ST2i8_POST, "st2", ".b", 1, true, 2 }, + { AArch64::ST2i16_POST, "st2", ".h", 1, true, 4 }, + { AArch64::ST2i32_POST, "st2", ".s", 1, true, 8 }, + { AArch64::ST2i64_POST, "st2", ".d", 1, true, 16 }, + { AArch64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, + { AArch64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, + { 
AArch64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, + { AArch64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, + { AArch64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, + { AArch64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, + { AArch64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, + { AArch64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, + { AArch64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, + { AArch64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, + { AArch64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, + { AArch64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, + { AArch64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, + { AArch64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, + { AArch64::ST3i8, "st3", ".b", 0, true, 0 }, + { AArch64::ST3i16, "st3", ".h", 0, true, 0 }, + { AArch64::ST3i32, "st3", ".s", 0, true, 0 }, + { AArch64::ST3i64, "st3", ".d", 0, true, 0 }, + { AArch64::ST3i8_POST, "st3", ".b", 1, true, 3 }, + { AArch64::ST3i16_POST, "st3", ".h", 1, true, 6 }, + { AArch64::ST3i32_POST, "st3", ".s", 1, true, 12 }, + { AArch64::ST3i64_POST, "st3", ".d", 1, true, 24 }, + { AArch64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, + { AArch64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, + { AArch64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, + { AArch64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, + { AArch64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, + { AArch64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, + { AArch64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, + { AArch64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, + { AArch64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, + { AArch64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, + { AArch64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, + { AArch64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, + { AArch64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, + { AArch64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, + { AArch64::ST4i8, "st4", ".b", 0, true, 0 }, + { AArch64::ST4i16, "st4", ".h", 0, true, 0 }, + { AArch64::ST4i32, "st4", ".s", 0, true, 0 }, + { AArch64::ST4i64, "st4", ".d", 0, true, 0 }, + { AArch64::ST4i8_POST, "st4", ".b", 1, true, 4 }, + { AArch64::ST4i16_POST, "st4", ".h", 1, true, 8 }, + { AArch64::ST4i32_POST, "st4", ".s", 1, true, 16 }, + { AArch64::ST4i64_POST, "st4", ".d", 1, true, 32 }, + { AArch64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, + { AArch64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, + { AArch64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, + { AArch64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, + { AArch64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, + { AArch64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, + { AArch64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, + { AArch64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, + { AArch64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, + { AArch64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 }, + { AArch64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, + { AArch64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, + { AArch64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, + { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, +}; + +static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { + unsigned Idx; + for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) + if (LdStNInstInfo[Idx].Opcode == Opcode) + return &LdStNInstInfo[Idx]; + + return nullptr; +} + +void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, + const MCSubtargetInfo &STI) { + unsigned Opcode = MI->getOpcode(); + StringRef Layout; + + bool IsTbx; + if 
(isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { + O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' + << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", "; + + unsigned ListOpNum = IsTbx ? 2 : 1; + printVectorList(MI, ListOpNum, STI, O, ""); + + O << ", " + << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg); + printAnnotation(O, Annot); + return; + } + + if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; + + // Now onto the operands: first a vector list with possible lane + // specifier. E.g. { v0 }[2] + int OpNum = LdStDesc->ListOperand; + printVectorList(MI, OpNum++, STI, O, ""); + + if (LdStDesc->HasLane) + O << '[' << MI->getOperand(OpNum++).getImm() << ']'; + + // Next the address: [xN] + unsigned AddrReg = MI->getOperand(OpNum++).getReg(); + O << ", [" << getRegisterName(AddrReg) << ']'; + + // Finally, there might be a post-indexed offset. + if (LdStDesc->NaturalOffset != 0) { + unsigned Reg = MI->getOperand(OpNum++).getReg(); + if (Reg != AArch64::XZR) + O << ", " << getRegisterName(Reg); + else { + assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); + O << ", #" << LdStDesc->NaturalOffset; + } + } + + printAnnotation(O, Annot); + return; + } + + AArch64InstPrinter::printInst(MI, O, Annot, STI); +} + +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { +#ifndef NDEBUG + unsigned Opcode = MI->getOpcode(); + assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); +#endif + + const MCOperand &Op1 = MI->getOperand(0); + const MCOperand &Cn = MI->getOperand(1); + const MCOperand &Cm = MI->getOperand(2); + const MCOperand &Op2 = MI->getOperand(3); + + unsigned Op1Val = Op1.getImm(); + unsigned CnVal = Cn.getImm(); + unsigned CmVal = Cm.getImm(); + unsigned Op2Val = Op2.getImm(); + + uint16_t Encoding = Op2Val; + Encoding |= CmVal << 3; + Encoding |= CnVal << 7; + Encoding |= Op1Val << 11; + + bool NeedsReg; + std::string Ins; + std::string Name; + + if (CnVal == 7) { + switch (CmVal) { + default: return false; + // Maybe IC, maybe Prediction Restriction + case 1: + switch (Op1Val) { + default: return false; + case 0: goto Search_IC; + case 3: goto Search_PRCTX; + } + // Prediction Restriction aliases + case 3: { + Search_PRCTX: + const AArch64PRCTX::PRCTX *PRCTX = AArch64PRCTX::lookupPRCTXByEncoding(Encoding >> 3); + if (!PRCTX || !PRCTX->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = PRCTX->NeedsReg; + switch (Op2Val) { + default: return false; + case 4: Ins = "cfp\t"; break; + case 5: Ins = "dvp\t"; break; + case 7: Ins = "cpp\t"; break; + } + Name = std::string(PRCTX->Name); + } + break; + // IC aliases + case 5: { + Search_IC: + const AArch64IC::IC *IC = AArch64IC::lookupICByEncoding(Encoding); + if (!IC || !IC->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = IC->NeedsReg; + Ins = "ic\t"; + Name = std::string(IC->Name); + } + break; + // DC aliases + case 4: case 6: case 10: case 11: case 12: case 13: case 14: + { + const AArch64DC::DC *DC = AArch64DC::lookupDCByEncoding(Encoding); + if (!DC || !DC->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = true; + Ins = "dc\t"; + Name = std::string(DC->Name); + } + break; + // AT aliases + case 8: case 9: { + const AArch64AT::AT *AT = AArch64AT::lookupATByEncoding(Encoding); + if (!AT || !AT->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = true; + Ins 
= "at\t"; + Name = std::string(AT->Name); + } + break; + } + } else if (CnVal == 8) { + // TLBI aliases + const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding); + if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = TLBI->NeedsReg; + Ins = "tlbi\t"; + Name = std::string(TLBI->Name); + } + else + return false; + + std::string Str = Ins + Name; + std::transform(Str.begin(), Str.end(), Str.begin(), ::tolower); + + O << '\t' << Str; + if (NeedsReg) + O << ", " << getRegisterName(MI->getOperand(4).getReg()); + + return true; +} + +void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg); + } else if (Op.isImm()) { + printImm(MI, OpNo, STI, O); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + Op.getExpr()->print(O, &MAI); + } +} + +void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << "#" << formatImm(Op.getImm()); +} + +void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << format("#%#llx", Op.getImm()); +} + +void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, + unsigned Imm, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Reg == AArch64::XZR) + O << "#" << Imm; + else + O << getRegisterName(Reg); + } else + llvm_unreachable("unknown operand kind in printPostIncOperand64"); +} + +void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isReg() && "Non-register vreg operand!"); + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg, AArch64::vreg); +} + +void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); + O << "c" << Op.getImm(); +} + +void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + if (MO.isImm()) { + unsigned Val = (MO.getImm() & 0xfff); + assert(Val == MO.getImm() && "Add/sub immediate out of range!"); + unsigned Shift = + AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); + O << '#' << formatImm(Val); + if (Shift != 0) + printShifter(MI, OpNum + 1, STI, O); + + if (CommentStream) + *CommentStream << '=' << formatImm(Val << Shift) << '\n'; + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + MO.getExpr()->print(O, &MAI); + printShifter(MI, OpNum + 1, STI, O); + } +} + +template +void AArch64InstPrinter::printLogicalImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint64_t Val = MI->getOperand(OpNum).getImm(); + O << "#0x"; + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 8 * sizeof(T))); +} + +void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + // LSL #0 should 
+
+void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    O << getRegisterName(Reg);
+  } else if (Op.isImm()) {
+    printImm(MI, OpNo, STI, O);
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  O << "#" << formatImm(Op.getImm());
+}
+
+void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  O << format("#%#llx", Op.getImm());
+}
+
+void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
+                                             unsigned Imm, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    if (Reg == AArch64::XZR)
+      O << "#" << Imm;
+    else
+      O << getRegisterName(Reg);
+  } else
+    llvm_unreachable("unknown operand kind in printPostIncOperand64");
+}
+
+void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isReg() && "Non-register vreg operand!");
+  unsigned Reg = Op.getReg();
+  O << getRegisterName(Reg, AArch64::vreg);
+}
+
+void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm() && "System instruction C[nm] operands must be immediates!");
+  O << "c" << Op.getImm();
+}
+
+void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    unsigned Val = (MO.getImm() & 0xfff);
+    assert(Val == MO.getImm() && "Add/sub immediate out of range!");
+    unsigned Shift =
+        AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
+    O << '#' << formatImm(Val);
+    if (Shift != 0)
+      printShifter(MI, OpNum + 1, STI, O);
+
+    if (CommentStream)
+      *CommentStream << '=' << formatImm(Val << Shift) << '\n';
+  } else {
+    assert(MO.isExpr() && "Unexpected operand type!");
+    MO.getExpr()->print(O, &MAI);
+    printShifter(MI, OpNum + 1, STI, O);
+  }
+}
+
+template <typename T>
+void AArch64InstPrinter::printLogicalImm(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  uint64_t Val = MI->getOperand(OpNum).getImm();
+  O << "#0x";
+  O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 8 * sizeof(T)));
+}
+
+void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  // LSL #0 should not be printed.
+  if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL &&
+      AArch64_AM::getShiftValue(Val) == 0)
+    return;
+  O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val))
+    << " #" << AArch64_AM::getShiftValue(Val);
+}
+
+void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  O << getRegisterName(MI->getOperand(OpNum).getReg());
+  printShifter(MI, OpNum + 1, STI, O);
+}
+
+void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  O << getRegisterName(MI->getOperand(OpNum).getReg());
+  printArithExtend(MI, OpNum + 1, STI, O);
+}
+
+void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val);
+  unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val);
+
+  // If the destination or first source register operand is [W]SP, print
+  // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at
+  // all.
+  if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) {
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src1 = MI->getOperand(1).getReg();
+    if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) &&
+          ExtType == AArch64_AM::UXTX) ||
+         ((Dest == AArch64::WSP || Src1 == AArch64::WSP) &&
+          ExtType == AArch64_AM::UXTW) ) {
+      if (ShiftVal != 0)
+        O << ", lsl #" << ShiftVal;
+      return;
+    }
+  }
+  O << ", " << AArch64_AM::getShiftExtendName(ExtType);
+  if (ShiftVal != 0)
+    O << " #" << ShiftVal;
+}
+
+static void printMemExtendImpl(bool SignExtend, bool DoShift,
+                               unsigned Width, char SrcRegKind,
+                               raw_ostream &O) {
+  // sxtw, sxtx, uxtw or lsl (== uxtx)
+  bool IsLSL = !SignExtend && SrcRegKind == 'x';
+  if (IsLSL)
+    O << "lsl";
+  else
+    O << (SignExtend ? 's' : 'u') << "xt" << SrcRegKind;
+
+  if (DoShift || IsLSL)
+    O << " #" << Log2_32(Width / 8);
+}
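printMemExtendImpl folds the unsigned-extend-of-an-X-register case into plain lsl and prints the shift amount only when it is meaningful; when printed, the amount is always the log2 of the access width in bytes. A self-contained sketch of the same rule (added for illustration, not part of the patch; memExtendStr is a hypothetical helper name):

    #include <sstream>
    #include <string>

    static std::string memExtendStr(bool SignExtend, bool DoShift,
                                    unsigned Width, char SrcRegKind) {
      std::ostringstream OS;
      unsigned Log2Bytes = 0;
      for (unsigned W = Width / 8; W > 1; W >>= 1)
        ++Log2Bytes; // same value as Log2_32(Width / 8)
      bool IsLSL = !SignExtend && SrcRegKind == 'x';
      if (IsLSL)
        OS << "lsl"; // uxtx with a shift is canonically spelled lsl
      else
        OS << (SignExtend ? 's' : 'u') << "xt" << SrcRegKind;
      if (DoShift || IsLSL)
        OS << " #" << Log2Bytes;
      return OS.str();
    }

    // memExtendStr(true, true, 32, 'w') == "sxtw #2", the extend printed in
    // an operand such as [x1, w2, sxtw #2] on a 32-bit load.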
<< Suffix; + else + assert(Suffix == 0 && "Unsupported suffix size"); + + bool DoShift = ExtWidth != 8; + if (SignExtend || DoShift || SrcRegKind == 'w') { + O << ", "; + printMemExtendImpl(SignExtend, DoShift, ExtWidth, SrcRegKind, O); + } +} + +void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(CC); +} + +void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC)); +} + +void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; +} + +template +void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << '#' << formatImm(Scale * MI->getOperand(OpNum).getImm()); +} + +void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO = MI->getOperand(OpNum); + if (MO.isImm()) { + O << "#" << formatImm(MO.getImm() * Scale); + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + MO.getExpr()->print(O, &MAI); + } +} + +void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO1 = MI->getOperand(OpNum + 1); + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); + if (MO1.isImm()) { + O << ", #" << formatImm(MO1.getImm() * Scale); + } else { + assert(MO1.isExpr() && "Unexpected operand type!"); + O << ", "; + MO1.getExpr()->print(O, &MAI); + } + O << ']'; +} + +template +void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned prfop = MI->getOperand(OpNum).getImm(); + if (IsSVEPrefetch) { + if (auto PRFM = AArch64SVEPRFM::lookupSVEPRFMByEncoding(prfop)) { + O << PRFM->Name; + return; + } + } else if (auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop)) { + O << PRFM->Name; + return; + } + + O << '#' << formatImm(prfop); +} + +void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned psbhintop = MI->getOperand(OpNum).getImm(); + auto PSB = AArch64PSBHint::lookupPSBByEncoding(psbhintop); + if (PSB) + O << PSB->Name; + else + O << '#' << formatImm(psbhintop); +} + +void AArch64InstPrinter::printBTIHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned btihintop = (MI->getOperand(OpNum).getImm() ^ 32) >> 1; + auto BTI = AArch64BTIHint::lookupBTIByEncoding(btihintop); + if (BTI) + O << BTI->Name; + else + O << '#' << formatImm(btihintop); +} + +void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + float FPImm = + MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm()); + + // 8 decimal places are enough to perfectly represent permitted floats. 
+ O << format("#%.8f", FPImm); +} + +static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { + while (Stride--) { + switch (Reg) { + default: + llvm_unreachable("Vector register expected!"); + case AArch64::Q0: Reg = AArch64::Q1; break; + case AArch64::Q1: Reg = AArch64::Q2; break; + case AArch64::Q2: Reg = AArch64::Q3; break; + case AArch64::Q3: Reg = AArch64::Q4; break; + case AArch64::Q4: Reg = AArch64::Q5; break; + case AArch64::Q5: Reg = AArch64::Q6; break; + case AArch64::Q6: Reg = AArch64::Q7; break; + case AArch64::Q7: Reg = AArch64::Q8; break; + case AArch64::Q8: Reg = AArch64::Q9; break; + case AArch64::Q9: Reg = AArch64::Q10; break; + case AArch64::Q10: Reg = AArch64::Q11; break; + case AArch64::Q11: Reg = AArch64::Q12; break; + case AArch64::Q12: Reg = AArch64::Q13; break; + case AArch64::Q13: Reg = AArch64::Q14; break; + case AArch64::Q14: Reg = AArch64::Q15; break; + case AArch64::Q15: Reg = AArch64::Q16; break; + case AArch64::Q16: Reg = AArch64::Q17; break; + case AArch64::Q17: Reg = AArch64::Q18; break; + case AArch64::Q18: Reg = AArch64::Q19; break; + case AArch64::Q19: Reg = AArch64::Q20; break; + case AArch64::Q20: Reg = AArch64::Q21; break; + case AArch64::Q21: Reg = AArch64::Q22; break; + case AArch64::Q22: Reg = AArch64::Q23; break; + case AArch64::Q23: Reg = AArch64::Q24; break; + case AArch64::Q24: Reg = AArch64::Q25; break; + case AArch64::Q25: Reg = AArch64::Q26; break; + case AArch64::Q26: Reg = AArch64::Q27; break; + case AArch64::Q27: Reg = AArch64::Q28; break; + case AArch64::Q28: Reg = AArch64::Q29; break; + case AArch64::Q29: Reg = AArch64::Q30; break; + case AArch64::Q30: Reg = AArch64::Q31; break; + // Vector lists can wrap around. + case AArch64::Q31: + Reg = AArch64::Q0; + break; + case AArch64::Z0: Reg = AArch64::Z1; break; + case AArch64::Z1: Reg = AArch64::Z2; break; + case AArch64::Z2: Reg = AArch64::Z3; break; + case AArch64::Z3: Reg = AArch64::Z4; break; + case AArch64::Z4: Reg = AArch64::Z5; break; + case AArch64::Z5: Reg = AArch64::Z6; break; + case AArch64::Z6: Reg = AArch64::Z7; break; + case AArch64::Z7: Reg = AArch64::Z8; break; + case AArch64::Z8: Reg = AArch64::Z9; break; + case AArch64::Z9: Reg = AArch64::Z10; break; + case AArch64::Z10: Reg = AArch64::Z11; break; + case AArch64::Z11: Reg = AArch64::Z12; break; + case AArch64::Z12: Reg = AArch64::Z13; break; + case AArch64::Z13: Reg = AArch64::Z14; break; + case AArch64::Z14: Reg = AArch64::Z15; break; + case AArch64::Z15: Reg = AArch64::Z16; break; + case AArch64::Z16: Reg = AArch64::Z17; break; + case AArch64::Z17: Reg = AArch64::Z18; break; + case AArch64::Z18: Reg = AArch64::Z19; break; + case AArch64::Z19: Reg = AArch64::Z20; break; + case AArch64::Z20: Reg = AArch64::Z21; break; + case AArch64::Z21: Reg = AArch64::Z22; break; + case AArch64::Z22: Reg = AArch64::Z23; break; + case AArch64::Z23: Reg = AArch64::Z24; break; + case AArch64::Z24: Reg = AArch64::Z25; break; + case AArch64::Z25: Reg = AArch64::Z26; break; + case AArch64::Z26: Reg = AArch64::Z27; break; + case AArch64::Z27: Reg = AArch64::Z28; break; + case AArch64::Z28: Reg = AArch64::Z29; break; + case AArch64::Z29: Reg = AArch64::Z30; break; + case AArch64::Z30: Reg = AArch64::Z31; break; + // Vector lists can wrap around. 
+    case AArch64::Z31:
+      Reg = AArch64::Z0;
+      break;
+    }
+  }
+  return Reg;
+}
+
+template <unsigned size>
+void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI,
+                                                      unsigned OpNum,
+                                                      const MCSubtargetInfo &STI,
+                                                      raw_ostream &O) {
+  static_assert(size == 64 || size == 32,
+                "Template parameter must be either 32 or 64");
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+
+  unsigned Sube = (size == 32) ? AArch64::sube32 : AArch64::sube64;
+  unsigned Subo = (size == 32) ? AArch64::subo32 : AArch64::subo64;
+
+  unsigned Even = MRI.getSubReg(Reg, Sube);
+  unsigned Odd = MRI.getSubReg(Reg, Subo);
+  O << getRegisterName(Even) << ", " << getRegisterName(Odd);
+}
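printGPRSeqPairsClassOperand splits a register that models an even/odd sequential pair (the operand form used by CASP-family instructions) back into its two GPRs via the sube/subo subregister indices. A standalone sketch of the resulting text (my own illustration, not LLVM API):

    #include <cstdio>

    // How an even/odd sequential GPR pair renders, mirroring what
    // printGPRSeqPairsClassOperand<32>/<64> prints for the pair register
    // covering registers N and N+1.
    static void printSeqPair(unsigned N, bool Is64) {
      std::printf("%c%u, %c%u", Is64 ? 'x' : 'w', N, Is64 ? 'x' : 'w', N + 1);
    }
    // printSeqPair(0, true) writes "x0, x1".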
+
+void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O,
+                                         StringRef LayoutSuffix) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+
+  O << "{ ";
+
+  // Work out how many registers there are in the list (if there is an actual
+  // list).
+  unsigned NumRegs = 1;
+  if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) ||
+      MRI.getRegClass(AArch64::ZPR2RegClassID).contains(Reg) ||
+      MRI.getRegClass(AArch64::QQRegClassID).contains(Reg))
+    NumRegs = 2;
+  else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::ZPR3RegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg))
+    NumRegs = 3;
+  else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::ZPR4RegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg))
+    NumRegs = 4;
+
+  // Now forget about the list and find out what the first register is.
+  if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0))
+    Reg = FirstReg;
+  else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0))
+    Reg = FirstReg;
+  else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::zsub0))
+    Reg = FirstReg;
+
+  // If it's a D-reg, we need to promote it to the equivalent Q-reg before
+  // printing (otherwise getRegisterName fails).
+  if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) {
+    const MCRegisterClass &FPR128RC =
+        MRI.getRegClass(AArch64::FPR128RegClassID);
+    Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC);
+  }
+
+  for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
+    if (MRI.getRegClass(AArch64::ZPRRegClassID).contains(Reg))
+      O << getRegisterName(Reg) << LayoutSuffix;
+    else
+      O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+
+    if (i + 1 != NumRegs)
+      O << ", ";
+  }
+
+  O << " }";
+}
+
+void
+AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI,
+                                                   unsigned OpNum,
+                                                   const MCSubtargetInfo &STI,
+                                                   raw_ostream &O) {
+  printVectorList(MI, OpNum, STI, O, "");
+}
+
+template <unsigned NumLanes, char LaneKind>
+void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  std::string Suffix(".");
+  if (NumLanes)
+    Suffix += itostr(NumLanes) + LaneKind;
+  else
+    Suffix += LaneKind;
+
+  printVectorList(MI, OpNum, STI, O, Suffix);
+}
+
+void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
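printTypedVectorList merely glues a lane-layout suffix onto every register that printVectorList emits. A standalone sketch of the suffix construction (added for illustration, not LLVM API):

    #include <string>

    // Mirrors the Suffix construction above: NumLanes == 0 yields an untyped
    // lane kind such as ".b", otherwise ".<lanes><kind>".
    static std::string laneSuffix(unsigned NumLanes, char LaneKind) {
      std::string Suffix(".");
      if (NumLanes)
        Suffix += std::to_string(NumLanes);
      Suffix += LaneKind;
      return Suffix;
    }

    // laneSuffix(4, 's') == ".4s", so a two-register list prints as
    // "{ v0.4s, v1.4s }".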
+
+void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << formatImm(Op.getImm() * 4);
+    return;
+  }
+
+  // If the branch target is simply an address then print it in hex.
+  const MCConstantExpr *BranchTarget =
+      dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
+  int64_t Address;
+  if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+    O << "0x";
+    O.write_hex(Address);
+  } else {
+    // Otherwise, just print the expression.
+    MI->getOperand(OpNum).getExpr()->print(O, &MAI);
+  }
+}
+
+void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << formatImm(Op.getImm() * (1 << 12));
+    return;
+  }
+
+  // Otherwise, just print the expression.
+  MI->getOperand(OpNum).getExpr()->print(O, &MAI);
+}
+
+void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+  unsigned Opcode = MI->getOpcode();
+
+  StringRef Name;
+  if (Opcode == AArch64::ISB) {
+    auto ISB = AArch64ISB::lookupISBByEncoding(Val);
+    Name = ISB ? ISB->Name : "";
+  } else if (Opcode == AArch64::TSB) {
+    auto TSB = AArch64TSB::lookupTSBByEncoding(Val);
+    Name = TSB ? TSB->Name : "";
+  } else {
+    auto DB = AArch64DB::lookupDBByEncoding(Val);
+    Name = DB ? DB->Name : "";
+  }
+  if (!Name.empty())
+    O << Name;
+  else
+    O << "#" << Val;
+}
+
+void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  // Horrible hack for the one register that has identical encodings but
+  // different names in MSR and MRS. Because of this, one of MRS and MSR is
+  // going to get the wrong entry.
+  if (Val == AArch64SysReg::DBGDTRRX_EL0) {
+    O << "DBGDTRRX_EL0";
+    return;
+  }
+
+  const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
+  if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits()))
+    O << Reg->Name;
+  else
+    O << AArch64SysReg::genericRegisterString(Val);
+}
+
+void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  // Horrible hack for the one register that has identical encodings but
+  // different names in MSR and MRS. Because of this, one of MRS and MSR is
+  // going to get the wrong entry.
+  if (Val == AArch64SysReg::DBGDTRTX_EL0) {
+    O << "DBGDTRTX_EL0";
+    return;
+  }
+
+  const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
+  if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits()))
+    O << Reg->Name;
+  else
+    O << AArch64SysReg::genericRegisterString(Val);
+}
+
+void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  auto PState = AArch64PState::lookupPStateByEncoding(Val);
+  if (PState && PState->haveFeatures(STI.getFeatureBits()))
+    O << PState->Name;
+  else
+    O << "#" << formatImm(Val);
+}
+
+void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned RawVal = MI->getOperand(OpNo).getImm();
+  uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal);
+  O << format("#%#016llx", Val);
+}
+
+template <int Angle, int Remainder>
+void AArch64InstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+  O << "#" << (Val * Angle) + Remainder;
+}
+
+void AArch64InstPrinter::printSVEPattern(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  if (auto Pat = AArch64SVEPredPattern::lookupSVEPREDPATByEncoding(Val))
+    O << Pat->Name;
+  else
+    O << '#' << formatImm(Val);
+}
+
+template <char suffix>
+void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  switch (suffix) {
+  case 0:
+  case 'b':
+  case 'h':
+  case 's':
+  case 'd':
+  case 'q':
+    break;
+  default: llvm_unreachable("Invalid kind specifier.");
+  }
+
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  O << getRegisterName(Reg);
+  if (suffix != 0)
+    O << '.' << suffix;
+}
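printComplexRotationOp recovers the rotation in degrees from the encoded field through the affine map Val * Angle + Remainder. As far as I can tell from the instantiations elsewhere in the backend (a hedged reading, not stated in this hunk), the FCMLA-style four-way rotations use <90, 0>, printing #0/#90/#180/#270 for encodings 0-3, while FCADD's two-way rotations use <180, 90>, printing #90 or #270. A standalone check of the arithmetic:

    // Affine decode used above: encoding -> degrees.
    static int complexRotationDegrees(unsigned Val, int Angle, int Remainder) {
      return static_cast<int>(Val) * Angle + Remainder;
    }
    // complexRotationDegrees(2, 90, 0) == 180;
    // complexRotationDegrees(1, 180, 90) == 270.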
+
+template <typename T>
+void AArch64InstPrinter::printImmSVE(T Value, raw_ostream &O) {
+  typename std::make_unsigned<T>::type HexValue = Value;
+
+  if (getPrintImmHex())
+    O << '#' << formatHex((uint64_t)HexValue);
+  else
+    O << '#' << formatDec(Value);
+
+  if (CommentStream) {
+    // Do the opposite to that used for instruction operands.
+    if (getPrintImmHex())
+      *CommentStream << '=' << formatDec(HexValue) << '\n';
+    else
+      *CommentStream << '=' << formatHex((uint64_t)Value) << '\n';
+  }
+}
+
+template <typename T>
+void AArch64InstPrinter::printImm8OptLsl(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  unsigned UnscaledVal = MI->getOperand(OpNum).getImm();
+  unsigned Shift = MI->getOperand(OpNum + 1).getImm();
+  assert(AArch64_AM::getShiftType(Shift) == AArch64_AM::LSL &&
+         "Unexpected shift type!");
+
+  // #0 lsl #8 is never pretty printed
+  if ((UnscaledVal == 0) && (AArch64_AM::getShiftValue(Shift) != 0)) {
+    O << '#' << formatImm(UnscaledVal);
+    printShifter(MI, OpNum + 1, STI, O);
+    return;
+  }
+
+  T Val;
+  if (std::is_signed<T>())
+    Val = (int8_t)UnscaledVal * (1 << AArch64_AM::getShiftValue(Shift));
+  else
+    Val = (uint8_t)UnscaledVal * (1 << AArch64_AM::getShiftValue(Shift));
+
+  printImmSVE(Val, O);
+}
+
+template <typename T>
+void AArch64InstPrinter::printSVELogicalImm(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  typedef typename std::make_signed<T>::type SignedT;
+  typedef typename std::make_unsigned<T>::type UnsignedT;
+
+  uint64_t Val = MI->getOperand(OpNum).getImm();
+  UnsignedT PrintVal = AArch64_AM::decodeLogicalImmediate(Val, 64);
+
+  // Prefer the default format for 16bit values, hex otherwise.
+  if ((int16_t)PrintVal == (SignedT)PrintVal)
+    printImmSVE((T)PrintVal, O);
+  else if ((uint16_t)PrintVal == PrintVal)
+    printImmSVE(PrintVal, O);
+  else
+    O << '#' << formatHex((uint64_t)PrintVal);
+}
+
+template <int Width>
+void AArch64InstPrinter::printZPRasFPR(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  unsigned Base;
+  switch (Width) {
+  case 8: Base = AArch64::B0; break;
+  case 16: Base = AArch64::H0; break;
+  case 32: Base = AArch64::S0; break;
+  case 64: Base = AArch64::D0; break;
+  case 128: Base = AArch64::Q0; break;
+  default:
+    llvm_unreachable("Unsupported width");
+  }
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  O << getRegisterName(Reg - AArch64::Z0 + Base);
+}
+
+template <unsigned ImmIs0, unsigned ImmIs1>
+void AArch64InstPrinter::printExactFPImm(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  auto *Imm0Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmIs0);
+  auto *Imm1Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmIs1);
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  O << "#" << (Val ? Imm1Desc->Repr : Imm0Desc->Repr);
+}
+
+void AArch64InstPrinter::printGPR64as32(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  O << getRegisterName(getWRegFromXReg(Reg));
+}
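printImm8OptLsl scales the 8-bit SVE immediate by the optional left shift before printing, except for the "#0, lsl #8" encoding, which is kept in raw form because scaling it would print a plain #0 and hide the distinct encoding. A standalone sketch of the value computation (my own, under the same signed/unsigned split as above):

    #include <cstdint>

    // Mirrors: Val = (int8_t|uint8_t)UnscaledVal * (1 << shift).
    static int64_t scaledImm8(uint8_t Imm8, unsigned ShiftAmount, bool Signed) {
      if (Signed)
        return static_cast<int8_t>(Imm8) * (1 << ShiftAmount);
      return static_cast<int64_t>(Imm8) * (1 << ShiftAmount);
    }

    // scaledImm8(0xff, 8, /*Signed=*/true) == -256, printed as "#-256";
    // scaledImm8(0xff, 8, /*Signed=*/false) == 65280.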
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
new file mode 100644
index 000000000000..5311f73ca21c
--- /dev/null
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
@@ -0,0 +1,222 @@
+//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AArch64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64INSTPRINTER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64INSTPRINTER_H
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "../Utils/AArch64BaseInfo.h"
+
+namespace llvm {
+
+class AArch64InstPrinter : public MCInstPrinter {
+public:
+  AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+  // Autogenerated by tblgen.
+  virtual void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+                                raw_ostream &O);
+  virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+                               raw_ostream &O);
+  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                       unsigned PrintMethodIdx,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O);
+
+  virtual StringRef getRegName(unsigned RegNo) const {
+    return getRegisterName(RegNo);
+  }
+
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AArch64::NoRegAltName);
+
+protected:
+  bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI,
+                     raw_ostream &O);
+  // Operand printers
+  void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  template <typename T> void printImmSVE(T Value, raw_ostream &O);
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
+                           raw_ostream &O);
+  template <int Amount>
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo,
+                           const MCSubtargetInfo &STI, raw_ostream &O) {
+    printPostIncOperand(MI, OpNo, Amount, O);
+  }
+
+  void printVRegOperand(const MCInst *MI, unsigned OpNo,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSysCROperand(const MCInst *MI, unsigned OpNo,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAddSubImm(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+  template <typename T>
+  void printLogicalImm(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printShifter(const MCInst *MI, unsigned OpNum,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printShiftedRegister(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExtendedRegister(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  void printArithExtend(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                      char SrcRegKind, unsigned Width);
+  template <char SrcRegKind, unsigned Width>
+  void printMemExtend(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O) {
+    printMemExtend(MI, OpNum, O, SrcRegKind, Width);
+  }
+  template <bool SignExtend, int ExtWidth, char SrcRegKind, char Suffix>
+  void printRegWithShiftExtend(const MCInst *MI, unsigned OpNum,
+                               const MCSubtargetInfo &STI, raw_ostream &O);
+  void printCondCode(const MCInst *MI, unsigned OpNum,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printInverseCondCode(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAlignedLabel(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                         raw_ostream &O);
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                        raw_ostream &O);
+
+  template <int Scale>
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O) {
+    printUImm12Offset(MI, OpNum, Scale, O);
+  }
+
+  template <int BitWidth>
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O) {
+    printAMIndexedWB(MI, OpNum, BitWidth / 8, O);
+  }
+
+  void printAMNoIndex(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+
+  template <int Scale>
+  void printImmScale(const MCInst *MI, unsigned OpNum,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+
+  template <bool IsSVEPrefetch = false>
+  void printPrefetchOp(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printPSBHintOp(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printBTIHintOp(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printVectorList(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O,
+                       StringRef LayoutSuffix);
+
+  /// Print a list of vector registers where the type suffix is implicit
+  /// (i.e. attached to the instruction rather than the registers).
+  void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O);
+
+  template <unsigned NumLanes, char LaneKind>
+  void printTypedVectorList(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printVectorIndex(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAdrpLabel(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+  void printBarrierOption(const MCInst *MI, unsigned OpNum,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printMSRSystemRegister(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printMRSSystemRegister(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSystemPStateField(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSIMDType10Operand(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  template <int Angle, int Remainder>
+  void printComplexRotationOp(const MCInst *MI, unsigned OpNo,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  template <unsigned size>
+  void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O);
+  template <typename T>
+  void printImm8OptLsl(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  template <typename T>
+  void printSVELogicalImm(const MCInst *MI, unsigned OpNum,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSVEPattern(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  template <char = 0>
+  void printSVERegOp(const MCInst *MI, unsigned OpNum,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printGPR64as32(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+  template <int Width>
+  void printZPRasFPR(const MCInst *MI, unsigned OpNum,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  template <unsigned ImmIs0, unsigned ImmIs1>
+  void printExactFPImm(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+};
+
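The Apple-syntax subclass declared next overrides printInst and falls back to the base printer for anything it does not special-case. For orientation, a hedged sketch of how a client selects between the two (in LLVM this is done by the createAArch64MCInstPrinter factory keyed on the assembler syntax variant, with variant 1 selecting the Darwin flavor; the makePrinter shape below is illustrative, not a verbatim copy of the patch):

    static MCInstPrinter *makePrinter(unsigned SyntaxVariant,
                                      const MCAsmInfo &MAI,
                                      const MCInstrInfo &MII,
                                      const MCRegisterInfo &MRI) {
      if (SyntaxVariant == 0)
        return new AArch64InstPrinter(MAI, MII, MRI);
      if (SyntaxVariant == 1) // Darwin/Apple assembly syntax.
        return new AArch64AppleInstPrinter(MAI, MII, MRI);
      return nullptr;
    }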
+class AArch64AppleInstPrinter : public AArch64InstPrinter { +public: + AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + + void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O) override; + bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O) override; + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, + const MCSubtargetInfo &STI, + raw_ostream &O) override; + + StringRef getRegName(unsigned RegNo) const override { + return getRegisterName(RegNo); + } + + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AArch64::NoRegAltName); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64INSTPRINTER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 58e4a9c9a9e9..ecff1ab0a8b3 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -131,8 +130,6 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() { CodePointerSize = 8; CommentString = "//"; - ExceptionsType = ExceptionHandling::DwarfCFI; - // The default is dwarf, but WinEH can be enabled optionally, which requires - // WinEHEncodingType to be set. + ExceptionsType = ExceptionHandling::WinEH; WinEHEncodingType = WinEH::EncodingType::Itanium; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index e8570b1c2887..36ae92afc8c1 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -1,9 +1,8 @@ //=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 41cad48f7aea..8cb7a1672983 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -1,9 +1,8 @@ //=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -188,9 +187,10 @@ public: const MCSubtargetInfo &STI) const; private: - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 729486b1020c..0a529321edc8 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -1,9 +1,8 @@ //===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -80,8 +79,7 @@ StringRef AArch64MCExpr::getVariantKindName() const { } void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { - if (getKind() != VK_NONE) - OS << getVariantKindName(); + OS << getVariantKindName(); Expr->print(OS, MAI); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index b6bf254d3835..ec9c95911628 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -1,9 +1,8 @@ //=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -23,8 +22,6 @@ namespace llvm { class AArch64MCExpr : public MCTargetExpr { public: enum VariantKind { - VK_NONE = 0x000, - // Symbol locations specifying (roughly speaking) what calculation should be // performed to construct the final address for the relocated // symbol. E.g. direct, via the GOT, ... diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 0f8198ba4e9b..df12274d9470 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,8 +14,10 @@ #include "AArch64ELFStreamer.h" #include "AArch64MCAsmInfo.h" #include "AArch64WinCOFFStreamer.h" -#include "InstPrinter/AArch64InstPrinter.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64InstPrinter.h" +#include "TargetInfo/AArch64TargetInfo.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCInstrAnalysis.h" @@ -56,11 +57,177 @@ createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { } void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { - for (unsigned Reg = AArch64::NoRegister + 1; - Reg < AArch64::NUM_TARGET_REGS; ++Reg) { - unsigned CV = MRI->getEncodingValue(Reg); - MRI->mapLLVMRegToCVReg(Reg, CV); - } + // Mapping from CodeView to MC register id. + static const struct { + codeview::RegisterId CVReg; + MCPhysReg Reg; + } RegMap[] = { + {codeview::RegisterId::ARM64_W0, AArch64::W0}, + {codeview::RegisterId::ARM64_W1, AArch64::W1}, + {codeview::RegisterId::ARM64_W2, AArch64::W2}, + {codeview::RegisterId::ARM64_W3, AArch64::W3}, + {codeview::RegisterId::ARM64_W4, AArch64::W4}, + {codeview::RegisterId::ARM64_W5, AArch64::W5}, + {codeview::RegisterId::ARM64_W6, AArch64::W6}, + {codeview::RegisterId::ARM64_W7, AArch64::W7}, + {codeview::RegisterId::ARM64_W8, AArch64::W8}, + {codeview::RegisterId::ARM64_W9, AArch64::W9}, + {codeview::RegisterId::ARM64_W10, AArch64::W10}, + {codeview::RegisterId::ARM64_W11, AArch64::W11}, + {codeview::RegisterId::ARM64_W12, AArch64::W12}, + {codeview::RegisterId::ARM64_W13, AArch64::W13}, + {codeview::RegisterId::ARM64_W14, AArch64::W14}, + {codeview::RegisterId::ARM64_W15, AArch64::W15}, + {codeview::RegisterId::ARM64_W16, AArch64::W16}, + {codeview::RegisterId::ARM64_W17, AArch64::W17}, + {codeview::RegisterId::ARM64_W18, AArch64::W18}, + {codeview::RegisterId::ARM64_W19, AArch64::W19}, + {codeview::RegisterId::ARM64_W20, AArch64::W20}, + {codeview::RegisterId::ARM64_W21, AArch64::W21}, + {codeview::RegisterId::ARM64_W22, AArch64::W22}, + {codeview::RegisterId::ARM64_W23, AArch64::W23}, + {codeview::RegisterId::ARM64_W24, AArch64::W24}, + {codeview::RegisterId::ARM64_W25, AArch64::W25}, + {codeview::RegisterId::ARM64_W26, AArch64::W26}, + {codeview::RegisterId::ARM64_W27, AArch64::W27}, + {codeview::RegisterId::ARM64_W28, AArch64::W28}, + {codeview::RegisterId::ARM64_W29, AArch64::W29}, + {codeview::RegisterId::ARM64_W30, AArch64::W30}, + {codeview::RegisterId::ARM64_WZR, AArch64::WZR}, + {codeview::RegisterId::ARM64_X0, AArch64::X0}, + {codeview::RegisterId::ARM64_X1, AArch64::X1}, + {codeview::RegisterId::ARM64_X2, AArch64::X2}, + {codeview::RegisterId::ARM64_X3, AArch64::X3}, + {codeview::RegisterId::ARM64_X4, AArch64::X4}, + {codeview::RegisterId::ARM64_X5, AArch64::X5}, + {codeview::RegisterId::ARM64_X6, AArch64::X6}, + {codeview::RegisterId::ARM64_X7, AArch64::X7}, + {codeview::RegisterId::ARM64_X8, AArch64::X8}, + {codeview::RegisterId::ARM64_X9, AArch64::X9}, + {codeview::RegisterId::ARM64_X10, AArch64::X10}, + {codeview::RegisterId::ARM64_X11, AArch64::X11}, + {codeview::RegisterId::ARM64_X12, AArch64::X12}, + {codeview::RegisterId::ARM64_X13, AArch64::X13}, + {codeview::RegisterId::ARM64_X14, AArch64::X14}, + {codeview::RegisterId::ARM64_X15, AArch64::X15}, + 
{codeview::RegisterId::ARM64_X16, AArch64::X16}, + {codeview::RegisterId::ARM64_X17, AArch64::X17}, + {codeview::RegisterId::ARM64_X18, AArch64::X18}, + {codeview::RegisterId::ARM64_X19, AArch64::X19}, + {codeview::RegisterId::ARM64_X20, AArch64::X20}, + {codeview::RegisterId::ARM64_X21, AArch64::X21}, + {codeview::RegisterId::ARM64_X22, AArch64::X22}, + {codeview::RegisterId::ARM64_X23, AArch64::X23}, + {codeview::RegisterId::ARM64_X24, AArch64::X24}, + {codeview::RegisterId::ARM64_X25, AArch64::X25}, + {codeview::RegisterId::ARM64_X26, AArch64::X26}, + {codeview::RegisterId::ARM64_X27, AArch64::X27}, + {codeview::RegisterId::ARM64_X28, AArch64::X28}, + {codeview::RegisterId::ARM64_FP, AArch64::FP}, + {codeview::RegisterId::ARM64_LR, AArch64::LR}, + {codeview::RegisterId::ARM64_SP, AArch64::SP}, + {codeview::RegisterId::ARM64_ZR, AArch64::XZR}, + {codeview::RegisterId::ARM64_NZCV, AArch64::NZCV}, + {codeview::RegisterId::ARM64_S0, AArch64::S0}, + {codeview::RegisterId::ARM64_S1, AArch64::S1}, + {codeview::RegisterId::ARM64_S2, AArch64::S2}, + {codeview::RegisterId::ARM64_S3, AArch64::S3}, + {codeview::RegisterId::ARM64_S4, AArch64::S4}, + {codeview::RegisterId::ARM64_S5, AArch64::S5}, + {codeview::RegisterId::ARM64_S6, AArch64::S6}, + {codeview::RegisterId::ARM64_S7, AArch64::S7}, + {codeview::RegisterId::ARM64_S8, AArch64::S8}, + {codeview::RegisterId::ARM64_S9, AArch64::S9}, + {codeview::RegisterId::ARM64_S10, AArch64::S10}, + {codeview::RegisterId::ARM64_S11, AArch64::S11}, + {codeview::RegisterId::ARM64_S12, AArch64::S12}, + {codeview::RegisterId::ARM64_S13, AArch64::S13}, + {codeview::RegisterId::ARM64_S14, AArch64::S14}, + {codeview::RegisterId::ARM64_S15, AArch64::S15}, + {codeview::RegisterId::ARM64_S16, AArch64::S16}, + {codeview::RegisterId::ARM64_S17, AArch64::S17}, + {codeview::RegisterId::ARM64_S18, AArch64::S18}, + {codeview::RegisterId::ARM64_S19, AArch64::S19}, + {codeview::RegisterId::ARM64_S20, AArch64::S20}, + {codeview::RegisterId::ARM64_S21, AArch64::S21}, + {codeview::RegisterId::ARM64_S22, AArch64::S22}, + {codeview::RegisterId::ARM64_S23, AArch64::S23}, + {codeview::RegisterId::ARM64_S24, AArch64::S24}, + {codeview::RegisterId::ARM64_S25, AArch64::S25}, + {codeview::RegisterId::ARM64_S26, AArch64::S26}, + {codeview::RegisterId::ARM64_S27, AArch64::S27}, + {codeview::RegisterId::ARM64_S28, AArch64::S28}, + {codeview::RegisterId::ARM64_S29, AArch64::S29}, + {codeview::RegisterId::ARM64_S30, AArch64::S30}, + {codeview::RegisterId::ARM64_S31, AArch64::S31}, + {codeview::RegisterId::ARM64_D0, AArch64::D0}, + {codeview::RegisterId::ARM64_D1, AArch64::D1}, + {codeview::RegisterId::ARM64_D2, AArch64::D2}, + {codeview::RegisterId::ARM64_D3, AArch64::D3}, + {codeview::RegisterId::ARM64_D4, AArch64::D4}, + {codeview::RegisterId::ARM64_D5, AArch64::D5}, + {codeview::RegisterId::ARM64_D6, AArch64::D6}, + {codeview::RegisterId::ARM64_D7, AArch64::D7}, + {codeview::RegisterId::ARM64_D8, AArch64::D8}, + {codeview::RegisterId::ARM64_D9, AArch64::D9}, + {codeview::RegisterId::ARM64_D10, AArch64::D10}, + {codeview::RegisterId::ARM64_D11, AArch64::D11}, + {codeview::RegisterId::ARM64_D12, AArch64::D12}, + {codeview::RegisterId::ARM64_D13, AArch64::D13}, + {codeview::RegisterId::ARM64_D14, AArch64::D14}, + {codeview::RegisterId::ARM64_D15, AArch64::D15}, + {codeview::RegisterId::ARM64_D16, AArch64::D16}, + {codeview::RegisterId::ARM64_D17, AArch64::D17}, + {codeview::RegisterId::ARM64_D18, AArch64::D18}, + {codeview::RegisterId::ARM64_D19, AArch64::D19}, + 
{codeview::RegisterId::ARM64_D20, AArch64::D20},
+      {codeview::RegisterId::ARM64_D21, AArch64::D21},
+      {codeview::RegisterId::ARM64_D22, AArch64::D22},
+      {codeview::RegisterId::ARM64_D23, AArch64::D23},
+      {codeview::RegisterId::ARM64_D24, AArch64::D24},
+      {codeview::RegisterId::ARM64_D25, AArch64::D25},
+      {codeview::RegisterId::ARM64_D26, AArch64::D26},
+      {codeview::RegisterId::ARM64_D27, AArch64::D27},
+      {codeview::RegisterId::ARM64_D28, AArch64::D28},
+      {codeview::RegisterId::ARM64_D29, AArch64::D29},
+      {codeview::RegisterId::ARM64_D30, AArch64::D30},
+      {codeview::RegisterId::ARM64_D31, AArch64::D31},
+      {codeview::RegisterId::ARM64_Q0, AArch64::Q0},
+      {codeview::RegisterId::ARM64_Q1, AArch64::Q1},
+      {codeview::RegisterId::ARM64_Q2, AArch64::Q2},
+      {codeview::RegisterId::ARM64_Q3, AArch64::Q3},
+      {codeview::RegisterId::ARM64_Q4, AArch64::Q4},
+      {codeview::RegisterId::ARM64_Q5, AArch64::Q5},
+      {codeview::RegisterId::ARM64_Q6, AArch64::Q6},
+      {codeview::RegisterId::ARM64_Q7, AArch64::Q7},
+      {codeview::RegisterId::ARM64_Q8, AArch64::Q8},
+      {codeview::RegisterId::ARM64_Q9, AArch64::Q9},
+      {codeview::RegisterId::ARM64_Q10, AArch64::Q10},
+      {codeview::RegisterId::ARM64_Q11, AArch64::Q11},
+      {codeview::RegisterId::ARM64_Q12, AArch64::Q12},
+      {codeview::RegisterId::ARM64_Q13, AArch64::Q13},
+      {codeview::RegisterId::ARM64_Q14, AArch64::Q14},
+      {codeview::RegisterId::ARM64_Q15, AArch64::Q15},
+      {codeview::RegisterId::ARM64_Q16, AArch64::Q16},
+      {codeview::RegisterId::ARM64_Q17, AArch64::Q17},
+      {codeview::RegisterId::ARM64_Q18, AArch64::Q18},
+      {codeview::RegisterId::ARM64_Q19, AArch64::Q19},
+      {codeview::RegisterId::ARM64_Q20, AArch64::Q20},
+      {codeview::RegisterId::ARM64_Q21, AArch64::Q21},
+      {codeview::RegisterId::ARM64_Q22, AArch64::Q22},
+      {codeview::RegisterId::ARM64_Q23, AArch64::Q23},
+      {codeview::RegisterId::ARM64_Q24, AArch64::Q24},
+      {codeview::RegisterId::ARM64_Q25, AArch64::Q25},
+      {codeview::RegisterId::ARM64_Q26, AArch64::Q26},
+      {codeview::RegisterId::ARM64_Q27, AArch64::Q27},
+      {codeview::RegisterId::ARM64_Q28, AArch64::Q28},
+      {codeview::RegisterId::ARM64_Q29, AArch64::Q29},
+      {codeview::RegisterId::ARM64_Q30, AArch64::Q30},
+      {codeview::RegisterId::ARM64_Q31, AArch64::Q31},
+
+  };
+  for (unsigned I = 0; I < array_lengthof(RegMap); ++I)
+    MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg));
 }
 
 static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) {
@@ -166,12 +333,20 @@ public:
   for (uint64_t Byte = 0, End = PltContents.size(); Byte + 7 < End;
        Byte += 4) {
     uint32_t Insn = support::endian::read32le(PltContents.data() + Byte);
+    uint64_t Off = 0;
+    // Check for optional bti c that prefixes adrp in BTI enabled entries
+    if (Insn == 0xd503245f) {
+      Off = 4;
+      Insn = support::endian::read32le(PltContents.data() + Byte + Off);
+    }
     // Check for adrp.
     if ((Insn & 0x9f000000) != 0x90000000)
       continue;
+    Off += 4;
     uint64_t Imm = (((PltSectionVA + Byte) >> 12) << 12) +
                    (((Insn >> 29) & 3) << 12) + (((Insn >> 5) & 0x3ffff) << 14);
-    uint32_t Insn2 = support::endian::read32le(PltContents.data() + Byte + 4);
+    uint32_t Insn2 =
+        support::endian::read32le(PltContents.data() + Byte + Off);
     // Check for: ldr Xt, [Xn, #pimm].
     if (Insn2 >> 22 == 0x3e5) {
       Imm += ((Insn2 >> 10) & 0xfff) << 3;
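The PLT scan above decodes an adrp+ldr pair into the GOT entry address: adrp forms a 4 KiB-aligned page address from a 21-bit immediate split into immlo (bits 30:29) and immhi (bits 23:5), and the following 64-bit ldr adds a 12-bit unsigned offset scaled by 8. A standalone sketch of the same arithmetic (my own, not part of the patch; EntryVA corresponds to PltSectionVA + Byte):

    #include <cstdint>

    static uint64_t decodePltGotEntry(uint64_t EntryVA, uint32_t Adrp,
                                      uint32_t Ldr) {
      uint64_t Page = (EntryVA >> 12) << 12;            // page of the adrp itself
      Page += uint64_t((Adrp >> 29) & 0x3) << 12;       // immlo, in 4 KiB units
      Page += uint64_t((Adrp >> 5) & 0x3ffff) << 14;    // immhi, above immlo
      return Page + (uint64_t((Ldr >> 10) & 0xfff) << 3); // ldr xN, [xM, #pimm*8]
    }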
extern "C" void LLVMInitializeAArch64TargetMC() { for (Target *T : {&getTheAArch64leTarget(), &getTheAArch64beTarget(), - &getTheARM64Target()}) { + &getTheAArch64_32Target(), &getTheARM64Target(), + &getTheARM64_32Target()}) { // Register the MC asm info. RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo); @@ -228,7 +404,8 @@ extern "C" void LLVMInitializeAArch64TargetMC() { } // Register the asm backend. - for (Target *T : {&getTheAArch64leTarget(), &getTheARM64Target()}) + for (Target *T : {&getTheAArch64leTarget(), &getTheAArch64_32Target(), + &getTheARM64Target(), &getTheARM64_32Target()}) TargetRegistry::RegisterMCAsmBackend(*T, createAArch64leAsmBackend); TargetRegistry::RegisterMCAsmBackend(getTheAArch64beTarget(), createAArch64beAsmBackend); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 0f22f69bd5b0..c84c313c1db0 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -1,9 +1,8 @@ //===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -37,10 +36,6 @@ class Triple; class raw_ostream; class raw_pwrite_stream; -Target &getTheAArch64leTarget(); -Target &getTheAArch64beTarget(); -Target &getTheARM64Target(); - MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); @@ -57,7 +52,8 @@ std::unique_ptr createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32); std::unique_ptr -createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype); +createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype, + bool IsILP32); std::unique_ptr createAArch64WinCOFFObjectWriter(); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 1021cdeeb3be..b3ce5ef22eef 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

@@ -38,8 +37,8 @@ class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
                                 unsigned &Log2Size, const MCAssembler &Asm);

 public:
-  AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
-      : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype) {}
+  AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype, bool IsILP32)
+      : MCMachObjectTargetWriter(!IsILP32 /* is64Bit */, CPUType, CPUSubtype) {}

   void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
@@ -405,6 +404,8 @@ void AArch64MachObjectWriter::recordRelocation(
 }

 std::unique_ptr<MCObjectTargetWriter>
-llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) {
-  return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype);
+llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype,
+                                    bool IsILP32) {
+  return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype,
+                                                    IsILP32);
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index a6b8d963bef9..f70752f5303f 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -1,9 +1,8 @@
 //===- AArch64TargetStreamer.cpp - AArch64TargetStreamer class ------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -13,6 +12,7 @@

 #include "AArch64TargetStreamer.h"
 #include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSubtargetInfo.h"

 using namespace llvm;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 73fb9baea3e3..3a0c5d8318dd 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -1,9 +1,8 @@
 //===-- AArch64TargetStreamer.h - AArch64 Target Streamer ------*- C++ -*--===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 7ea7d5f2a20e..a45880a07427 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -1,9 +1,8 @@
 //= AArch64WinCOFFObjectWriter.cpp - AArch64 Windows COFF Object Writer C++ =//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===---------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index b828ab832e9d..37c6fbb03908 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -1,9 +1,8 @@ //===-- AArch64WinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h index ed265a876ab3..8c0656652eed 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h @@ -1,9 +1,8 @@ //===-- AArch64WinCOFFStreamer.h - WinCOFF Streamer for AArch64 -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td index 23a65b345bad..808e59467081 100644 --- a/lib/Target/AArch64/SVEInstrFormats.td +++ b/lib/Target/AArch64/SVEInstrFormats.td @@ -1,9 +1,8 @@ //=-- SVEInstrFormats.td - AArch64 SVE Instruction classes -*- tablegen -*--=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -701,8 +700,8 @@ multiclass sve_int_perm_dup_i<string asm> {
                   (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
 }

-class sve_int_perm_tbl<bits<2> sz8_64, string asm, ZPRRegOp zprty,
-                       RegisterOperand VecList>
+class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm,
+                       ZPRRegOp zprty, RegisterOperand VecList>
 : I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm),
   asm, "\t$Zd, $Zn, $Zm",
   "",
@@ -714,16 +713,18 @@ class sve_int_perm_tbl<bits<2> sz8_64, string asm, ZPRRegOp zprty,
   let Inst{23-22} = sz8_64;
   let Inst{21} = 0b1;
   let Inst{20-16} = Zm;
-  let Inst{15-10} = 0b001100;
+  let Inst{15-13} = 0b001;
+  let Inst{12-11} = opc;
+  let Inst{10} = 0b0;
   let Inst{9-5} = Zn;
   let Inst{4-0} = Zd;
 }

 multiclass sve_int_perm_tbl<string asm> {
-  def _B : sve_int_perm_tbl<0b00, asm, ZPR8, Z_b>;
-  def _H : sve_int_perm_tbl<0b01, asm, ZPR16, Z_h>;
-  def _S : sve_int_perm_tbl<0b10, asm, ZPR32, Z_s>;
-  def _D : sve_int_perm_tbl<0b11, asm, ZPR64, Z_d>;
+  def _B : sve_int_perm_tbl<0b00, 0b10, asm, ZPR8, Z_b>;
+  def _H : sve_int_perm_tbl<0b01, 0b10, asm, ZPR16, Z_h>;
+  def _S : sve_int_perm_tbl<0b10, 0b10, asm, ZPR32, Z_s>;
+  def _D : sve_int_perm_tbl<0b11, 0b10, asm, ZPR64, Z_d>;

   def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
                   (!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 0>;
@@ -735,6 +736,37 @@ multiclass sve_int_perm_tbl<string asm> {
                   (!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zm), 0>;
 }

+multiclass sve2_int_perm_tbl<string asm> {
+  def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>;
+  def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>;
+  def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>;
+  def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>;
+}
+
+class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+  asm, "\t$Zd, $Zn, $Zm",
+  "",
+  []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zm;
+  bits<5> Zn;
+  let Inst{31-24} = 0b00000101;
+  let Inst{23-22} = sz8_64;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Zm;
+  let Inst{15-10} = 0b001011;
+  let Inst{9-5} = Zn;
+  let Inst{4-0} = Zd;
+}
+
+multiclass sve2_int_perm_tbx<string asm> {
+  def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>;
+  def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>;
+  def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>;
+  def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>;
+}
+
 class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
 : I<(outs zprty:$Zd), (ins zprty:$Zn),
   asm, "\t$Zd, $Zn",
@@ -875,6 +907,21 @@ class sve_int_perm_extract_i<string asm>
   let ElementSize = ElementSizeNone;
 }

+class sve2_int_perm_extract_i_cons<string asm>
+: I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, imm0_255:$imm8),
+  asm, "\t$Zd, $Zn, $imm8",
+  "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<8> imm8;
+  let Inst{31-21} = 0b00000101011;
+  let Inst{20-16} = imm8{7-3};
+  let Inst{15-13} = 0b000;
+  let Inst{12-10} = imm8{2-0};
+  let Inst{9-5} = Zn;
+  let Inst{4-0} = Zd;
+}
+
 //===----------------------------------------------------------------------===//
 // SVE Vector Select Group
 //===----------------------------------------------------------------------===//
@@ -1436,6 +1483,132 @@ multiclass sve_fp_fcadd<string asm> {
   def _D : sve_fp_fcadd<0b11, asm, ZPR64>;
 }

+//===----------------------------------------------------------------------===//
+// SVE2 Floating Point Convert Group
+//===----------------------------------------------------------------------===//
+
+class sve2_fp_convert_precision<bits<4> opc, string asm,
+                                ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn),
+  asm, "\t$Zd, $Pg/m, $Zn",
"\t$Zd, $Pg/m, $Zn", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<3> Pg; + let Inst{31-24} = 0b01100100; + let Inst{23-22} = opc{3-2}; + let Inst{21-18} = 0b0010; + let Inst{17-16} = opc{1-0}; + let Inst{15-13} = 0b101; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_fp_convert_down_narrow { + def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>; + def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>; +} + +multiclass sve2_fp_convert_up_long { + def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>; + def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>; +} + +multiclass sve2_fp_convert_down_odd_rounding { + def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Floating Point Pairwise Group +//===----------------------------------------------------------------------===// + +class sve2_fp_pairwise_pred sz, bits<3> opc, string asm, + ZPRRegOp zprty> +: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm), + asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Zm; + bits<5> Zdn; + let Inst{31-24} = 0b01100100; + let Inst{23-22} = sz; + let Inst{21-19} = 0b010; + let Inst{18-16} = opc; + let Inst{15-13} = 0b100; + let Inst{12-10} = Pg; + let Inst{9-5} = Zm; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; +} + +multiclass sve2_fp_pairwise_pred opc, string asm> { + def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>; + def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>; + def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Floating Point Widening Multiply-Add - Indexed Group +//===----------------------------------------------------------------------===// + +class sve2_fp_mla_long_by_indexed_elem opc, string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, + VectorIndexH:$iop), + asm, "\t$Zda, $Zn, $Zm$iop", + "", + []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<3> Zm; + bits<3> iop; + let Inst{31-21} = 0b01100100101; + let Inst{20-19} = iop{2-1}; + let Inst{18-16} = Zm; + let Inst{15-14} = 0b01; + let Inst{13} = opc{1}; + let Inst{12} = 0b0; + let Inst{11} = iop{0}; + let Inst{10} = opc{0}; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +//===----------------------------------------------------------------------===// +// SVE2 Floating Point Widening Multiply-Add Group +//===----------------------------------------------------------------------===// + +class sve2_fp_mla_long opc, string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm), + asm, "\t$Zda, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01100100101; + let Inst{20-16} = Zm; + let Inst{15-14} = 0b10; + let Inst{13} = opc{1}; + let Inst{12-11} = 0b00; + let Inst{10} = opc{0}; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + //===----------------------------------------------------------------------===// // SVE Stack Allocation Group 
//===----------------------------------------------------------------------===// @@ -1536,6 +1709,12 @@ multiclass sve_fp_2op_p_zd_HSD opc, string asm> { def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>; } +multiclass sve2_fp_flogb { + def _H : sve_fp_2op_p_zd<0b0011010, asm, ZPR16, ZPR16, ElementSizeH>; + def _S : sve_fp_2op_p_zd<0b0011100, asm, ZPR32, ZPR32, ElementSizeS>; + def _D : sve_fp_2op_p_zd<0b0011110, asm, ZPR64, ZPR64, ElementSizeD>; +} + //===----------------------------------------------------------------------===// // SVE Floating Point Unary Operations - Unpredicated Group //===----------------------------------------------------------------------===// @@ -1691,6 +1870,112 @@ multiclass sve_int_mlas_vvv_pred opc, string asm> { def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>; } +//===----------------------------------------------------------------------===// +// SVE2 Integer Multiply-Add - Unpredicated Group +//===----------------------------------------------------------------------===// + +class sve2_int_mla sz, bits<5> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15} = 0b0; + let Inst{14-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_mla { + def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>; + def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>; + def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>; + def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>; +} + +multiclass sve2_int_mla_long opc, string asm> { + def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>; + def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>; + def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer Multiply-Add - Indexed Group +//===----------------------------------------------------------------------===// + +class sve2_int_mla_by_indexed_elem sz, bits<6> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, + ZPRRegOp zprty3, Operand itype> +: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop), + asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{15-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_mla_by_indexed_elem opc, bit S, string asm> { + def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> { + bits<3> Zm; + bits<3> iop; + let Inst{22} = iop{2}; + let Inst{20-19} = iop{1-0}; + let Inst{18-16} = Zm; + } + def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> { + bits<3> Zm; + bits<2> iop; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; + } + def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> { + bits<4> Zm; + bit iop; + let Inst{20} = iop; + let Inst{19-16} = 
Zm;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Integer Multiply-Add Long - Indexed Group
+//===----------------------------------------------------------------------===//
+
+multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> {
+  def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
+                                        asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
+    bits<3> Zm;
+    bits<3> iop;
+    let Inst{20-19} = iop{2-1};
+    let Inst{18-16} = Zm;
+    let Inst{11} = iop{0};
+  }
+  def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
+                                        asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
+    bits<4> Zm;
+    bits<2> iop;
+    let Inst{20} = iop{1};
+    let Inst{19-16} = Zm;
+    let Inst{11} = iop{0};
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // SVE Integer Dot Product Group
 //===----------------------------------------------------------------------===//
@@ -1733,32 +2018,671 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
     "", []>, Sched<[]> {
   bits<5> Zda;
   bits<5> Zn;
-  let Inst{31-23} = 0b010001001;
-  let Inst{22} = sz;
+//===----------------------------------------------------------------------===// + +class sve2_complex_int_arith_indexed sz, bits<4> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, + ZPRRegOp zprty3, Operand itype> +: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop, + complexrotateop:$rot), + asm, "\t$Zda, $Zn, $Zm$iop, $rot", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<2> rot; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{15-12} = opc; + let Inst{11-10} = rot; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_cintx_dot_by_indexed_elem { + def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> { + bits<2> iop; + bits<3> Zm; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; + } + def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> { + bit iop; + bits<4> Zm; + let Inst{20} = iop; + let Inst{19-16} = Zm; + } +} + +//===----------------------------------------------------------------------===// +// SVE2 Complex Multiply-Add - Indexed Group +//===----------------------------------------------------------------------===// + +multiclass sve2_cmla_by_indexed_elem { + def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS> { + bits<2> iop; + bits<3> Zm; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; + } + def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD> { + bit iop; + bits<4> Zm; + let Inst{20} = iop; + let Inst{19-16} = Zm; + } +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer Multiply - Unpredicated Group +//===----------------------------------------------------------------------===// + +class sve2_int_mul sz, bits<3> opc, string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zm; + bits<5> Zn; + let Inst{31-24} = 0b00000100; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b011; + let Inst{12-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_mul opc, string asm> { + def _B : sve2_int_mul<0b00, opc, asm, ZPR8>; + def _H : sve2_int_mul<0b01, opc, asm, ZPR16>; + def _S : sve2_int_mul<0b10, opc, asm, ZPR32>; + def _D : sve2_int_mul<0b11, opc, asm, ZPR64>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer Multiply - Indexed Group +//===----------------------------------------------------------------------===// + +class sve2_int_mul_by_indexed_elem sz, bits<4> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, + ZPRRegOp zprty3, Operand itype> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm, itype:$iop), + asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{15-14} = 0b11; + let Inst{13-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_mul_by_indexed_elem opc, string asm> { + def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> { + bits<3> Zm; + bits<3> iop; + let Inst{22} = iop{2}; + let Inst{20-19} = iop{1-0}; + let Inst{18-16} = Zm; + } + 
def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> { + bits<3> Zm; + bits<2> iop; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; + } + def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> { + bits<4> Zm; + bit iop; + let Inst{20} = iop; + let Inst{19-16} = Zm; + } +} + +multiclass sve2_int_mul_long_by_indexed_elem opc, string asm> { + def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm, + ZPR32, ZPR16, ZPR3b16, VectorIndexH> { + bits<3> Zm; + bits<3> iop; + let Inst{20-19} = iop{2-1}; + let Inst{18-16} = Zm; + let Inst{11} = iop{0}; + } + def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm, + ZPR64, ZPR32, ZPR4b32, VectorIndexS> { + bits<4> Zm; + bits<2> iop; + let Inst{20} = iop{1}; + let Inst{19-16} = Zm; + let Inst{11} = iop{0}; + } +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer - Predicated Group +//===----------------------------------------------------------------------===// + +class sve2_int_arith_pred sz, bits<6> opc, string asm, + ZPRRegOp zprty> +: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm), + asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> { + bits<3> Pg; + bits<5> Zm; + bits<5> Zdn; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = opc{5-1}; + let Inst{15-14} = 0b10; + let Inst{13} = opc{0}; + let Inst{12-10} = Pg; + let Inst{9-5} = Zm; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; +} + +multiclass sve2_int_arith_pred opc, string asm> { + def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>; + def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>; + def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>; + def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>; +} + +class sve2_int_sadd_long_accum_pairwise sz, bit U, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zda), (ins PPR3bAny:$Pg, zprty1:$_Zda, zprty2:$Zn), + asm, "\t$Zda, $Pg/m, $Zn", "", []>, Sched<[]> { + bits<3> Pg; + bits<5> Zn; + bits<5> Zda; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21-17} = 0b00010; + let Inst{16} = U; + let Inst{15-13} = 0b101; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = zprty1.ElementSize; +} + +multiclass sve2_int_sadd_long_accum_pairwise { + def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>; + def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>; + def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>; +} + +class sve2_int_un_pred_arit sz, bit Q, bits<2> opc, + string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn), + asm, "\t$Zd, $Pg/m, $Zn", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Zd; + bits<5> Zn; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21-20} = 0b00; + let Inst{19} = Q; + let Inst{18} = 0b0; + let Inst{17-16} = opc; + let Inst{15-13} = 0b101; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; +} + +multiclass sve2_int_un_pred_arit_s opc, string asm> { + def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>; +} + 
+multiclass sve2_int_un_pred_arit opc, string asm> { + def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>; + def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>; + def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>; + def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Widening Integer Arithmetic Group +//===----------------------------------------------------------------------===// + +class sve2_wide_int_arith sz, bits<5> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, ZPRRegOp zprty3> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15} = 0b0; + let Inst{14-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_wide_int_arith_long opc, string asm> { + def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>; + def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>; + def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>; +} + +multiclass sve2_wide_int_arith_wide opc, string asm> { + def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>; + def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>; + def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>; +} + +multiclass sve2_pmul_long opc, string asm> { + def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>; + def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Misc Group +//===----------------------------------------------------------------------===// + +class sve2_misc sz, bits<4> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15-14} = 0b10; + let Inst{13-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_misc_bitwise opc, string asm> { + def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>; + def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>; + def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>; + def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>; +} + +multiclass sve2_bitwise_xor_interleaved { + let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in { + def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8, ZPR8>; + def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>; + def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>; + def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>; + } +} + +multiclass sve2_misc_int_addsub_long_interleaved opc, string asm> { + def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; + def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; + def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; +} + +class sve2_bitwise_shift_left_long tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, + Operand immtype> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm), + asm, "\t$Zd, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> imm; + 
let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b0; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-12} = 0b1010; + let Inst{11-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_bitwise_shift_left_long opc, string asm> { + def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm, + ZPR16, ZPR8, vecshiftL8>; + def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm, + ZPR32, ZPR16, vecshiftL16> { + let Inst{19} = imm{3}; + } + def _D : sve2_bitwise_shift_left_long<{1,?,?}, opc, asm, + ZPR64, ZPR32, vecshiftL32> { + let Inst{20-19} = imm{4-3}; + } +} + +//===----------------------------------------------------------------------===// +// SVE2 Accumulate Group +//===----------------------------------------------------------------------===// + +class sve2_int_bin_cons_shift_imm tsz8_64, bit opc, string asm, + ZPRRegOp zprty, Operand immtype> +: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm), + asm, "\t$Zd, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<6> imm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = tsz8_64{3-2}; + let Inst{21} = 0b0; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-11} = 0b11110; + let Inst{10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_bin_cons_shift_imm_left { + def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + let Inst{20-19} = imm{4-3}; + } + def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + let Inst{22} = imm{5}; + let Inst{20-19} = imm{4-3}; + } +} + +multiclass sve2_int_bin_cons_shift_imm_right { + def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + let Inst{20-19} = imm{4-3}; + } + def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + let Inst{22} = imm{5}; + let Inst{20-19} = imm{4-3}; + } +} + +class sve2_int_bin_accum_cons_shift_imm tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty, Operand immtype> +: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm), + asm, "\t$Zda, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<6> imm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = tsz8_64{3-2}; + let Inst{21} = 0b0; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-12} = 0b1110; + let Inst{11-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_bin_accum_cons_shift_imm_right opc, string asm> { + def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + let Inst{20-19} = imm{4-3}; + } + def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + let Inst{22} = imm{5}; + let Inst{20-19} = imm{4-3}; + } +} + +class 
sve2_int_cadd sz, bit opc, string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, complexrotateopodd:$rot), + asm, "\t$Zdn, $_Zdn, $Zm, $rot", "", []>, Sched<[]> { + bits<5> Zdn; + bits<5> Zm; + bit rot; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21-17} = 0b00000; + let Inst{16} = opc; + let Inst{15-11} = 0b11011; + let Inst{10} = rot; + let Inst{9-5} = Zm; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_cadd { + def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>; + def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>; + def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>; + def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>; +} + +class sve2_int_absdiff_accum sz, bits<4> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15-14} = 0b11; + let Inst{13-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_absdiff_accum { + def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>; + def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>; + def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>; + def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>; +} + +multiclass sve2_int_absdiff_accum_long opc, string asm> { + def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; + def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; + def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; +} + +multiclass sve2_int_addsub_long_carry opc, string asm> { + def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm, + ZPR32, ZPR32>; + def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm, + ZPR64, ZPR64>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Narrowing Group +//===----------------------------------------------------------------------===// + +class sve2_int_bin_cons_shift_imm_narrow tsz8_64, bits<4> opc, + string asm, ZPRRegOp zprty1, + ZPRRegOp zprty2, Operand immtype> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm), + asm, "\t$Zd, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> imm; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-14} = 0b00; + let Inst{13-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_bin_cons_shift_imm_right_narrow opc, string asm> { + def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16, + vecshiftR8>; + def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32, + vecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, opc, asm, ZPR32, ZPR64, + vecshiftR32> { + let Inst{20-19} = imm{4-3}; + } +} + +class sve2_int_addsub_narrow_high sz, bits<3> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm), + asm, 
"\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b011; + let Inst{12-10} = opc; // S, R, T + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_addsub_narrow_high opc, string asm> { + def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>; +} + +class sve2_int_sat_extract_narrow tsz8_64, bits<3> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn), + asm, "\t$Zd, $Zn", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; let Inst{21} = 0b1; - let Inst{15-11} = 0; - let Inst{10} = U; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-13} = 0b000010; + let Inst{12-10} = opc; let Inst{9-5} = Zn; - let Inst{4-0} = Zda; - - let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; - let ElementSize = ElementSizeNone; + let Inst{4-0} = Zd; } -multiclass sve_intx_dot_by_indexed_elem { - def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> { - bits<2> iop; - bits<3> Zm; - let Inst{20-19} = iop; - let Inst{18-16} = Zm; - } - def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> { - bits<1> iop; - bits<4> Zm; - let Inst{20} = iop; - let Inst{19-16} = Zm; - } +multiclass sve2_int_sat_extract_narrow opc, string asm> { + def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>; } //===----------------------------------------------------------------------===// @@ -1983,6 +2907,86 @@ class sve_int_bin_cons_log opc, string asm> let Inst{4-0} = Zd; } +multiclass sve_int_bin_cons_log opc, string asm> { + def NAME : sve_int_bin_cons_log; + + def : InstAlias(NAME) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 1>; + def : InstAlias(NAME) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 1>; + def : InstAlias(NAME) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 1>; +} + +class sve2_int_bitwise_ternary_op_d opc, string asm> +: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, ZPR64:$Zm, ZPR64:$Zk), + asm, "\t$Zdn, $_Zdn, $Zm, $Zk", + "", + []>, Sched<[]> { + bits<5> Zdn; + bits<5> Zk; + bits<5> Zm; + let Inst{31-24} = 0b00000100; + let Inst{23-22} = opc{2-1}; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-11} = 0b00111; + let Inst{10} = opc{0}; + let Inst{9-5} = Zk; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_bitwise_ternary_op opc, string asm> { + def NAME : sve2_int_bitwise_ternary_op_d; + + def : InstAlias(NAME) ZPR8:$Zdn, ZPR8:$Zm, ZPR8:$Zk), 1>; + def : InstAlias(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>; + def : InstAlias(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>; +} + +class sve2_int_rotate_right_imm tsz8_64, string asm, + ZPRRegOp zprty, Operand immtype> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, immtype:$imm), + asm, "\t$Zdn, $_Zdn, $Zm, $imm", + "", + []>, Sched<[]> { + bits<5> Zdn; + bits<5> Zm; + bits<6> imm; + let Inst{31-24} = 0b00000100; + let Inst{23-22} = tsz8_64{3-2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // 
imm3 + let Inst{15-10} = 0b001101; + let Inst{9-5} = Zm; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_rotate_right_imm { + def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_rotate_right_imm<{0,1,?,?}, asm, ZPR32, vecshiftR32> { + let Inst{20-19} = imm{4-3}; + } + def _D : sve2_int_rotate_right_imm<{1,?,?,?}, asm, ZPR64, vecshiftR64> { + let Inst{22} = imm{5}; + let Inst{20-19} = imm{4-3}; + } +} //===----------------------------------------------------------------------===// // SVE Integer Wide Immediate - Predicated Group @@ -2266,6 +3270,32 @@ multiclass sve_int_while8_rr opc, string asm> { def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>; } +class sve2_int_while_rr sz8_64, bits<1> rw, string asm, + PPRRegOp pprty> +: I<(outs pprty:$Pd), (ins GPR64:$Rn, GPR64:$Rm), + asm, "\t$Pd, $Rn, $Rm", + "", []>, Sched<[]> { + bits<4> Pd; + bits<5> Rm; + bits<5> Rn; + let Inst{31-24} = 0b00100101; + let Inst{23-22} = sz8_64; + let Inst{21} = 0b1; + let Inst{20-16} = Rm; + let Inst{15-10} = 0b001100; + let Inst{9-5} = Rn; + let Inst{4} = rw; + let Inst{3-0} = Pd; + + let Defs = [NZCV]; +} + +multiclass sve2_int_while_rr rw, string asm> { + def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>; + def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>; + def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>; + def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>; +} //===----------------------------------------------------------------------===// // SVE Floating Point Fast Reduction Group @@ -2497,9 +3527,9 @@ multiclass sve_int_index_rr { //===----------------------------------------------------------------------===// // SVE Bitwise Shift - Predicated Group //===----------------------------------------------------------------------===// -class sve_int_bin_pred_shift_imm tsz8_64, bits<3> opc, string asm, - ZPRRegOp zprty, Operand immtype, - ElementSizeEnum size> +class sve_int_bin_pred_shift_imm tsz8_64, bits<4> opc, string asm, + ZPRRegOp zprty, Operand immtype, + ElementSizeEnum size> : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm), asm, "\t$Zdn, $Pg/m, $_Zdn, $imm", "", @@ -2509,8 +3539,8 @@ class sve_int_bin_pred_shift_imm tsz8_64, bits<3> opc, string asm, bits<6> imm; let Inst{31-24} = 0b00000100; let Inst{23-22} = tsz8_64{3-2}; - let Inst{21-19} = 0b000; - let Inst{18-16} = opc; + let Inst{21-20} = 0b00; + let Inst{19-16} = opc; let Inst{15-13} = 0b100; let Inst{12-10} = Pg; let Inst{9-8} = tsz8_64{1-0}; @@ -2522,7 +3552,7 @@ class sve_int_bin_pred_shift_imm tsz8_64, bits<3> opc, string asm, let ElementSize = size; } -multiclass sve_int_bin_pred_shift_imm_left opc, string asm> { +multiclass sve_int_bin_pred_shift_imm_left opc, string asm> { def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8, ElementSizeB>; def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16, @@ -2540,7 +3570,7 @@ multiclass sve_int_bin_pred_shift_imm_left opc, string asm> { } } -multiclass sve_int_bin_pred_shift_imm_right opc, string asm> { +multiclass sve_int_bin_pred_shift_imm_right opc, string asm> { def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8, ElementSizeB>; def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16, @@ -2856,6 +3886,43 @@ multiclass 
sve_mem_cstnt_ss msz, string asm, RegisterOperand listty, (!cast(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>; } +class sve2_mem_cstnt_vs_base opc, dag iops, string asm, + RegisterOperand VecList> +: I<(outs VecList:$Zt), iops, + asm, "\t$Zt, $Pg, [$Zn, $Rm]", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Rm; + bits<5> Zn; + bits<5> Zt; + let Inst{31-25} = 0b1110010; + let Inst{24-22} = opc; + let Inst{21} = 0b0; + let Inst{20-16} = Rm; + let Inst{15-13} = 0b001; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zt; + + let mayStore = 1; +} + +multiclass sve2_mem_cstnt_vs opc, string asm, + RegisterOperand listty, ZPRRegOp zprty> { + def _REAL : sve2_mem_cstnt_vs_base; + + def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; +} + class sve_mem_sst_sv opc, bit xs, bit scaled, string asm, RegisterOperand VecList, RegisterOperand zprext> : I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), @@ -3304,6 +4371,30 @@ multiclass sve_int_perm_splice { def _D : sve_int_perm_splice<0b11, asm, ZPR64>; } +class sve2_int_perm_splice_cons sz8_64, string asm, + ZPRRegOp zprty, RegisterOperand VecList> +: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, VecList:$Zn), + asm, "\t$Zd, $Pg, $Zn", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Zn; + bits<5> Zd; + let Inst{31-24} = 0b00000101; + let Inst{23-22} = sz8_64; + let Inst{21-13} = 0b101101100; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_perm_splice_cons { + def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>; + def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>; + def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>; + def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>; +} + class sve_int_perm_rev sz8_64, bits<2> opc, string asm, ZPRRegOp zprty> : I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn), @@ -4003,6 +5094,46 @@ multiclass sve_mem_p_fill { (!cast(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>; } +class sve2_mem_cldnt_vs_base opc, dag iops, string asm, + RegisterOperand VecList> +: I<(outs VecList:$Zt), iops, + asm, "\t$Zt, $Pg/z, [$Zn, $Rm]", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Rm; + bits<5> Zn; + bits<5> Zt; + let Inst{31} = 0b1; + let Inst{30} = opc{4}; + let Inst{29-25} = 0b00010; + let Inst{24-23} = opc{3-2}; + let Inst{22-21} = 0b00; + let Inst{20-16} = Rm; + let Inst{15} = 0b1; + let Inst{14-13} = opc{1-0}; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve2_mem_cldnt_vs opc, string asm, + RegisterOperand listty, ZPRRegOp zprty> { + def _REAL : sve2_mem_cldnt_vs_base; + + def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; +} + //===----------------------------------------------------------------------===// // SVE Memory - 64-bit Gather Group //===----------------------------------------------------------------------===// @@ -4454,3 +5585,132 @@ multiclass sve_int_break_z opc, string asm> { def NAME : sve_int_break; } 
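// Illustrative sketch, not part of the vendored patch: in
// sve2_mem_cldnt_vs_base above, the 5-bit opc is scattered across three
// non-adjacent fields (Inst{30}, Inst{24-23} and Inst{14-13}). Reassembling
// it from an encoded word shows how such split operands round-trip; the
// helper name is invented here.
#include <cstdint>

static uint32_t extractCldntOpc(uint32_t Inst) {
  uint32_t Opc4 = (Inst >> 30) & 0x1;  // Inst{30}    = opc{4}
  uint32_t Opc32 = (Inst >> 23) & 0x3; // Inst{24-23} = opc{3-2}
  uint32_t Opc10 = (Inst >> 13) & 0x3; // Inst{14-13} = opc{1-0}
  return (Opc4 << 4) | (Opc32 << 2) | Opc10;
}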
+//===----------------------------------------------------------------------===// +// SVE2 String Processing Group +//===----------------------------------------------------------------------===// + +class sve2_char_match +: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm), + asm, "\t$Pd, $Pg/z, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<4> Pd; + bits<3> Pg; + bits<5> Zm; + bits<5> Zn; + let Inst{31-23} = 0b010001010; + let Inst{22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b100; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4} = opc; + let Inst{3-0} = Pd; + + let Defs = [NZCV]; +} + +multiclass sve2_char_match { + def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>; + def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Histogram Computation - Segment Group +//===----------------------------------------------------------------------===// + +class sve2_hist_gen_segment +: I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01000101001; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b101000; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +//===----------------------------------------------------------------------===// +// SVE2 Histogram Computation - Vector Group +//===----------------------------------------------------------------------===// + +class sve2_hist_gen_vector +: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm), + asm, "\t$Zd, $Pg/z, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<3> Pg; + bits<5> Zm; + let Inst{31-23} = 0b010001011; + let Inst{22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b110; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_hist_gen_vector { + def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>; + def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Crypto Extensions Group +//===----------------------------------------------------------------------===// + +class sve2_crypto_cons_bin_op +: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01000101001; + let Inst{20-16} = Zm; + let Inst{15-11} = 0b11110; + let Inst{10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +class sve2_crypto_des_bin_op opc, string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm), + asm, "\t$Zdn, $_Zdn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zdn; + bits<5> Zm; + let Inst{31-17} = 0b010001010010001; + let Inst{16} = opc{1}; + let Inst{15-11} = 0b11100; + let Inst{10} = opc{0}; + let Inst{9-5} = Zm; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; +} + +class sve2_crypto_unary_op +: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn), + asm, "\t$Zdn, $_Zdn", + "", + []>, Sched<[]> { + bits<5> Zdn; + let Inst{31-11} = 0b010001010010000011100; + let Inst{10} = opc; + let Inst{9-5} = 0b00000; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; +} diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp index 8fb161574c5b..7f02da6a9516 100644 --- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp +++ 
b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -1,39 +1,50 @@
 //===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

-#include "llvm/ADT/Triple.h"
+#include "TargetInfo/AArch64TargetInfo.h"
 #include "llvm/Support/TargetRegistry.h"
+
 using namespace llvm;

-namespace llvm {
-Target &getTheAArch64leTarget() {
+Target &llvm::getTheAArch64leTarget() {
   static Target TheAArch64leTarget;
   return TheAArch64leTarget;
 }
-Target &getTheAArch64beTarget() {
+Target &llvm::getTheAArch64beTarget() {
   static Target TheAArch64beTarget;
   return TheAArch64beTarget;
 }
-Target &getTheARM64Target() {
+Target &llvm::getTheAArch64_32Target() {
+  static Target TheAArch64_32Target;
+  return TheAArch64_32Target;
+}
+Target &llvm::getTheARM64Target() {
   static Target TheARM64Target;
   return TheARM64Target;
 }
-} // namespace llvm
+Target &llvm::getTheARM64_32Target() {
+  static Target TheARM64_32Target;
+  return TheARM64_32Target;
+}

 extern "C" void LLVMInitializeAArch64TargetInfo() {
   // Now register the "arm64" name for use with "-march". We don't want it to
-  // take possession of the Triple::aarch64 tag though.
+  // take possession of the Triple::aarch64 tags though.
   TargetRegistry::RegisterTarget(getTheARM64Target(), "arm64",
                                  "ARM64 (little endian)", "AArch64",
                                  [](Triple::ArchType) { return false; }, true);
+  TargetRegistry::RegisterTarget(getTheARM64_32Target(), "arm64_32",
+                                 "ARM64 (little endian ILP32)", "AArch64",
+                                 [](Triple::ArchType) { return false; }, true);
   RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z(
       getTheAArch64leTarget(), "aarch64", "AArch64 (little endian)", "AArch64");
   RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W(
       getTheAArch64beTarget(), "aarch64_be", "AArch64 (big endian)", "AArch64");
+  RegisterTarget<Triple::aarch64_32, /*HasJIT=*/true> X(
+      getTheAArch64_32Target(), "aarch64_32", "AArch64 (little endian ILP32)", "AArch64");
 }
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.h b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.h
new file mode 100644
index 000000000000..b3728a11bb5d
--- /dev/null
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.h
@@ -0,0 +1,24 @@
+//===-- AArch64TargetInfo.h - AArch64 Target Implementation -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
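// Illustrative sketch, not part of the vendored patch: once
// LLVMInitializeAArch64TargetInfo has run, the new ILP32 entries resolve
// through the ordinary registry API. A minimal lookup; the function name and
// the triple string are chosen here purely for illustration.
#include "llvm/Support/TargetRegistry.h"
#include <string>

static const llvm::Target *findAArch64_32(std::string &Error) {
  // "aarch64_32" is the triple arch name registered by RegisterTarget X above;
  // Error receives a diagnostic if no matching target was registered.
  return llvm::TargetRegistry::lookupTarget("aarch64_32-apple-watchos", Error);
}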
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_TARGETINFO_AARCH64TARGETINFO_H +#define LLVM_LIB_TARGET_AARCH64_TARGETINFO_AARCH64TARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheAArch64leTarget(); +Target &getTheAArch64beTarget(); +Target &getTheAArch64_32Target(); +Target &getTheARM64Target(); +Target &getTheARM64_32Target(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_TARGETINFO_AARCH64TARGETINFO_H diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index c88155db7037..7bb075c36e79 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -1,9 +1,8 @@ //===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 44c6a6b44895..e5e2fc2cb0df 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -1,9 +1,8 @@ //===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -186,6 +185,49 @@ static inline unsigned getDRegFromBReg(unsigned Reg) { return Reg; } +static inline bool atomicBarrierDroppedOnZero(unsigned Opcode) { + switch (Opcode) { + case AArch64::LDADDAB: case AArch64::LDADDAH: + case AArch64::LDADDAW: case AArch64::LDADDAX: + case AArch64::LDADDALB: case AArch64::LDADDALH: + case AArch64::LDADDALW: case AArch64::LDADDALX: + case AArch64::LDCLRAB: case AArch64::LDCLRAH: + case AArch64::LDCLRAW: case AArch64::LDCLRAX: + case AArch64::LDCLRALB: case AArch64::LDCLRALH: + case AArch64::LDCLRALW: case AArch64::LDCLRALX: + case AArch64::LDEORAB: case AArch64::LDEORAH: + case AArch64::LDEORAW: case AArch64::LDEORAX: + case AArch64::LDEORALB: case AArch64::LDEORALH: + case AArch64::LDEORALW: case AArch64::LDEORALX: + case AArch64::LDSETAB: case AArch64::LDSETAH: + case AArch64::LDSETAW: case AArch64::LDSETAX: + case AArch64::LDSETALB: case AArch64::LDSETALH: + case AArch64::LDSETALW: case AArch64::LDSETALX: + case AArch64::LDSMAXAB: case AArch64::LDSMAXAH: + case AArch64::LDSMAXAW: case AArch64::LDSMAXAX: + case AArch64::LDSMAXALB: case AArch64::LDSMAXALH: + case AArch64::LDSMAXALW: case AArch64::LDSMAXALX: + case AArch64::LDSMINAB: case AArch64::LDSMINAH: + case AArch64::LDSMINAW: case AArch64::LDSMINAX: + case AArch64::LDSMINALB: case AArch64::LDSMINALH: + case AArch64::LDSMINALW: case AArch64::LDSMINALX: + case AArch64::LDUMAXAB: case AArch64::LDUMAXAH: + case AArch64::LDUMAXAW: case AArch64::LDUMAXAX: + case AArch64::LDUMAXALB: case AArch64::LDUMAXALH: + case AArch64::LDUMAXALW: case AArch64::LDUMAXALX: + case AArch64::LDUMINAB: case AArch64::LDUMINAH: + case AArch64::LDUMINAW: case AArch64::LDUMINAX: + case AArch64::LDUMINALB: case AArch64::LDUMINALH: + case AArch64::LDUMINALW: case AArch64::LDUMINALX: + case AArch64::SWPAB: case AArch64::SWPAH: + case AArch64::SWPAW: case AArch64::SWPAX: + case AArch64::SWPALB: case AArch64::SWPALH: + case AArch64::SWPALW: case AArch64::SWPALX: + return true; + } + return false; +} + namespace AArch64CC { // The CondCodes constants map directly to the 4-bit encoding of the condition diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index bb7801c172f6..19a8bd901629 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -1,9 +1,8 @@ //===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
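// Illustrative note, not part of the vendored patch: the
// atomicBarrierDroppedOnZero helper added above exists because, for these
// acquiring AArch64 atomics, the architecture drops the acquire semantics
// when the destination register is WZR/XZR. A pass that rewrites unused
// result registers to the zero register must therefore leave such
// instructions alone. A hedged sketch of that guard (the surrounding loop
// and variable names are invented for illustration):
//
//   for (MachineInstr &MI : MBB) {
//     if (atomicBarrierDroppedOnZero(MI.getOpcode()))
//       continue; // rewriting the dest to WZR/XZR would lose the barrier
//     // ... otherwise the dead definition may be replaced as usual ...
//   }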
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// @@ -51,14 +50,16 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); -FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitcntsPass(); -FunctionPass *createSIFixWWMLivenessPass(); +FunctionPass *createSIPreAllocateWWMRegsPass(); FunctionPass *createSIFormMemoryClausesPass(); -FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &); +FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &, + const TargetMachine *); FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); +FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *); +ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *); FunctionPass *createAMDGPURewriteOutArgumentsPass(); FunctionPass *createSIModeRegisterPass(); @@ -93,6 +94,12 @@ ModulePass *createAMDGPULowerKernelAttributesPass(); void initializeAMDGPULowerKernelAttributesPass(PassRegistry &); extern char &AMDGPULowerKernelAttributesID; +void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &); +extern char &AMDGPUPropagateAttributesEarlyID; + +void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &); +extern char &AMDGPUPropagateAttributesLateID; + void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; @@ -135,6 +142,9 @@ extern char &SIFixupVectorISelID; void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; +void initializeSILowerSGPRSpillsPass(PassRegistry &); +extern char &SILowerSGPRSpillsID; + void initializeSILoadStoreOptimizerPass(PassRegistry &); extern char &SILoadStoreOptimizerID; @@ -150,8 +160,8 @@ extern char &SIInsertSkipsPassID; void initializeSIOptimizeExecMaskingPass(PassRegistry &); extern char &SIOptimizeExecMaskingID; -void initializeSIFixWWMLivenessPass(PassRegistry &); -extern char &SIFixWWMLivenessID; +void initializeSIPreAllocateWWMRegsPass(PassRegistry &); +extern char &SIPreAllocateWWMRegsID; void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &); extern char &AMDGPUSimplifyLibCallsID; @@ -197,9 +207,6 @@ extern char &SIAnnotateControlFlowPassID; void initializeSIMemoryLegalizerPass(PassRegistry&); extern char &SIMemoryLegalizerID; -void initializeSIDebuggerInsertNopsPass(PassRegistry&); -extern char &SIDebuggerInsertNopsID; - void initializeSIModeRegisterPass(PassRegistry&); extern char &SIModeRegisterID; @@ -226,8 +233,11 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; -Target &getTheAMDGPUTarget(); -Target &getTheGCNTarget(); +void initializeGCNRegBankReassignPass(PassRegistry &); +extern char &GCNRegBankReassignID; + +void initializeGCNNSAReassignPass(PassRegistry &); +extern char &GCNNSAReassignID; namespace AMDGPU { enum TargetIndex { @@ -250,21 +260,23 @@ enum TargetIndex { namespace AMDGPUAS { enum : unsigned { // The maximum value for flat, generic, local, private, constant and region. - MAX_AMDGPU_ADDRESS = 6, + MAX_AMDGPU_ADDRESS = 7, FLAT_ADDRESS = 0, ///< Address space for flat memory. 
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - REGION_ADDRESS = 2, ///< Address space for region memory. + REGION_ADDRESS = 2, ///< Address space for region memory. (GDS) - CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2) + CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2). LOCAL_ADDRESS = 3, ///< Address space for local memory. PRIVATE_ADDRESS = 5, ///< Address space for private memory. - CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory + CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory. + + BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers. - /// Address space for direct addressible parameter memory (CONST0) + /// Address space for direct addressible parameter memory (CONST0). PARAM_D_ADDRESS = 6, - /// Address space for indirect addressible parameter memory (VTX1) + /// Address space for indirect addressible parameter memory (VTX1). PARAM_I_ADDRESS = 7, // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 6a4cfe08e491..baeba534012c 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -1,9 +1,8 @@ //===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===------------------------------------------------------------===// @@ -61,6 +60,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", "Have scratch_* flat memory instructions" >; +def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", + "ScalarFlatScratchInsts", + "true", + "Have s_scratch_* flat memory instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -103,6 +108,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; +def FeatureDoesNotSupportXNACK : SubtargetFeature<"no-xnack-support", + "DoesNotSupportXNACK", + "true", + "Hardware does not support XNACK" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. 
The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -116,12 +127,78 @@ def FeatureXNACK : SubtargetFeature<"xnack", "Enable XNACK support" >; +def FeatureCuMode : SubtargetFeature<"cumode", + "EnableCuMode", + "true", + "Enable CU wavefront execution mode" +>; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; +def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", + "LDSMisalignedBug", + "true", + "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" +>; + +def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", + "HasVcmpxPermlaneHazard", + "true", + "TODO: describe me" +>; + +def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard", + "HasVMEMtoScalarWriteHazard", + "true", + "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution." +>; + +def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard", + "HasSMEMtoVectorWriteHazard", + "true", + "s_load_dword followed by v_cmp page faults" +>; + +def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", + "HasInstFwdPrefetchBug", + "true", + "S_INST_PREFETCH instruction causes shader to hang" +>; + +def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", + "HasVcmpxExecWARHazard", + "true", + "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)" +>; + +def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard", + "HasLdsBranchVmemWARHazard", + "true", + "Switching between LDS and VMEM-tex not waiting VM_VSRC=0" +>; + +def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug", + "HasNSAtoVMEMBug", + "true", + "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero" +>; + +def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug", + "HasFlatSegmentOffsetBug", + "true", + "GFX10 bug, inst_offset ignored in flat segment" +>; + +def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug", + "HasOffset3fBug", + "true", + "Branch offset of 3f hardware bug" +>; + class SubtargetFeatureLDSBankCount : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -144,10 +221,10 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts", "Additional instructions for CI+" >; -def FeatureVIInsts : SubtargetFeature<"vi-insts", - "VIInsts", +def FeatureGFX8Insts : SubtargetFeature<"gfx8-insts", + "GFX8Insts", "true", - "Additional instructions for VI+" + "Additional instructions for GFX8+" >; def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", @@ -156,6 +233,18 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", "Additional instructions for GFX9+" >; +def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", + "GFX10Insts", + "true", + "Additional instructions for GFX10+" +>; + +def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts", + "GFX7GFX8GFX9Insts", + "true", + "Instructions shared in GFX7, GFX8, GFX9" +>; + def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", "HasSMemRealTime", "true", @@ -246,12 +335,25 @@ def FeatureDPP : SubtargetFeature<"dpp", "Support DPP (Data Parallel Primitives) extension" >; +// DPP8 allows arbitrary cross-lane swizzling withing groups of 8 lanes. 
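For context on the FeatureDPP8 definition that follows: a DPP8 swizzle names, for each of the 8 lanes in a group, which lane it reads from, so a full pattern is eight 3-bit selects packed into 24 bits. A minimal standalone sketch of that packing (the helper name and the printing are illustrative, not LLVM API):

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Pack a DPP8 lane-select pattern into a 24-bit selector, three bits per
// lane, with lane 0 occupying the low bits.
static uint32_t packDpp8(const std::array<unsigned, 8> &Sel) {
  uint32_t Enc = 0;
  for (unsigned Lane = 0; Lane < 8; ++Lane) {
    assert(Sel[Lane] < 8 && "each source must be a lane within the group of 8");
    Enc |= Sel[Lane] << (3 * Lane);
  }
  return Enc;
}

int main() {
  // dpp8:[1,0,3,2,5,4,7,6] -- swap adjacent lanes within the group.
  uint32_t Enc = packDpp8({1, 0, 3, 2, 5, 4, 7, 6});
  std::printf("dpp8 selector = 0x%06x\n", Enc);
}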
+def FeatureDPP8 : SubtargetFeature<"dpp8", + "HasDPP8", + "true", + "Support DPP8 (Data Parallel Primitives) extension" +>; + def FeatureR128A16 : SubtargetFeature<"r128-a16", "HasR128A16", "true", "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9" >; +def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", + "HasNSAEncoding", + "true", + "Support NSA encoding for image instructions" +>; + def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "HasIntClamp", "true", @@ -270,10 +372,65 @@ def FeatureDLInsts : SubtargetFeature<"dl-insts", "Has v_fmac_f32 and v_xnor_b32 instructions" >; -def FeatureDotInsts : SubtargetFeature<"dot-insts", - "HasDotInsts", +def FeatureDot1Insts : SubtargetFeature<"dot1-insts", + "HasDot1Insts", + "true", + "Has v_dot4_i32_i8 and v_dot8_i32_i4 instructions" +>; + +def FeatureDot2Insts : SubtargetFeature<"dot2-insts", + "HasDot2Insts", + "true", + "Has v_dot2_f32_f16, v_dot2_i32_i16, v_dot2_u32_u16, v_dot4_u32_u8, v_dot8_u32_u4 instructions" +>; + +def FeatureDot3Insts : SubtargetFeature<"dot3-insts", + "HasDot3Insts", + "true", + "Has v_dot8c_i32_i4 instruction" +>; + +def FeatureDot4Insts : SubtargetFeature<"dot4-insts", + "HasDot4Insts", + "true", + "Has v_dot2c_i32_i16 instruction" +>; + +def FeatureDot5Insts : SubtargetFeature<"dot5-insts", + "HasDot5Insts", "true", - "Has v_dot* instructions" + "Has v_dot2c_f32_f16 instruction" +>; + +def FeatureDot6Insts : SubtargetFeature<"dot6-insts", + "HasDot6Insts", + "true", + "Has v_dot4c_i32_i8 instruction" +>; + +def FeatureMAIInsts : SubtargetFeature<"mai-insts", + "HasMAIInsts", + "true", + "Has mAI instructions" +>; + +def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", + "HasPkFmacF16Inst", + "true", + "Has v_pk_fmac_f16 instruction" +>; + +def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts", + "HasAtomicFaddInsts", + "true", + "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, " + "global_atomic_pk_add_f16 instructions" +>; + +def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support", + "DoesNotSupportSRAMECC", + "true", + "Hardware does not support SRAM ECC" >; def FeatureSRAMECC : SubtargetFeature<"sram-ecc", @@ -282,6 +439,36 @@ def FeatureSRAMECC : SubtargetFeature<"sram-ecc", "Enable SRAM ECC" >; +def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", + "HasNoSdstCMPX", + "true", + "V_CMPX does not write VCC/SGPR in addition to EXEC" +>; + +def FeatureVscnt : SubtargetFeature<"vscnt", + "HasVscnt", + "true", + "Has separate store vscnt counter" +>; + +def FeatureRegisterBanking : SubtargetFeature<"register-banking", + "HasRegisterBanking", + "true", + "Has register banking" +>; + +def FeatureVOP3Literal : SubtargetFeature<"vop3-literal", + "HasVOP3Literal", + "true", + "Can use one literal in VOP3" +>; + +def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard", + "HasNoDataDepHazard", + "true", + "Does not need SW waitstates" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -327,13 +514,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; -def FeatureEnableHugePrivateBuffer : SubtargetFeature< - "huge-private-buffer", - "EnableHugePrivateBuffer", - "true", - "Enable private/scratch buffer sizes 
greater than 128 GB" ->; - def FeatureDumpCode : SubtargetFeature <"DumpCode", "DumpCode", "true", @@ -425,103 +605,123 @@ def FeatureDisable : SubtargetFeature<"", "Dummy feature to disable assembler instructions" >; -def FeatureGCN : SubtargetFeature<"gcn", - "IsGCN", - "true", - "GCN or newer GPU" ->; - class GCNSubtargetFeatureGeneration Implies> : - SubtargetFeatureGeneration ; + string FeatureName, + list Implies> : + SubtargetFeatureGeneration ; def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", + "southern-islands", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, - FeatureWavefrontSize64, FeatureGCN, - FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange] + FeatureWavefrontSize64, + FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, + FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", + "sea-islands", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, - FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange] + FeatureWavefrontSize64, FeatureFlatAddressSpace, + FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, + FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + "volcanic-islands", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, - FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts, + FeatureWavefrontSize64, FeatureFlatAddressSpace, + FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, - FeatureIntClamp, FeatureTrigReducedRange + FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, + FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts ] >; def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", + "gfx9", [FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts, + FeatureWavefrontSize64, FeatureFlatAddressSpace, + FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, - FeatureAddNoCarryInsts, FeatureScalarAtomics, FeatureR128A16 + FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, + FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16 ] >; -class SubtargetFeatureISAVersion Implies> - : SubtargetFeature < - "isaver"#Major#"."#Minor#"."#Stepping, - "IsaVersion", - "ISAVersion"#Major#"_"#Minor#"_"#Stepping, - "Instruction set version number", - Implies +def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", + "gfx10", + [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, + FeatureFlatAddressSpace, + FeatureCIInsts, Feature16BitInsts, + FeatureSMemRealTime, FeatureInv2PiInlineImm, + FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P, + FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, + 
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, + FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, + FeatureVOP3Literal, FeatureDPP8, + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC + ] >; -def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0, - [FeatureSouthernIslands, +class FeatureSet Features_> { + list Features = Features_; +} + +def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1, +def FeatureISAVersion6_0_1 : FeatureSet< [FeatureSouthernIslands, FeatureLDSBankCount32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, +def FeatureISAVersion7_0_0 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1, +def FeatureISAVersion7_0_1 : FeatureSet< [FeatureSeaIslands, HalfRate64Ops, FeatureLDSBankCount32, FeatureFastFMAF32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2, +def FeatureISAVersion7_0_2 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16, FeatureFastFMAF32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, +def FeatureISAVersion7_0_3 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4, +def FeatureISAVersion7_0_4 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, +def FeatureISAVersion8_0_1 : FeatureSet< [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, @@ -530,78 +730,151 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, FeatureUnpackedD16VMem, FeatureCodeObjectV3]>; -def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2, +def FeatureISAVersion8_0_2 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, FeatureUnpackedD16VMem, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, +def FeatureISAVersion8_0_3 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureUnpackedD16VMem, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, +def FeatureISAVersion8_1_0 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount16, FeatureXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, +def FeatureISAVersion9_0_0 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureDoesNotSupportXNACK, + FeatureDoesNotSupportSRAMECC]>; -def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, +def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureXNACK, + FeatureDoesNotSupportSRAMECC, FeatureCodeObjectV3]>; -def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4, +def 
FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, FeatureLDSBankCount32, FeatureFmaMixInsts, + FeatureDoesNotSupportXNACK, + FeatureDoesNotSupportSRAMECC, FeatureCodeObjectV3]>; -def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6, +def FeatureISAVersion9_0_6 : FeatureSet< [FeatureGFX9, HalfRate64Ops, FeatureFmaMixInsts, FeatureLDSBankCount32, FeatureDLInsts, - FeatureDotInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3]>; + +def FeatureISAVersion9_0_8 : FeatureSet< + [FeatureGFX9, + HalfRate64Ops, + FeatureFmaMixInsts, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot3Insts, + FeatureDot4Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureMAIInsts, + FeaturePkFmacF16Inst, + FeatureAtomicFaddInsts, FeatureSRAMECC, FeatureCodeObjectV3]>; -def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9, +def FeatureISAVersion9_0_9 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureXNACK, FeatureCodeObjectV3]>; -//===----------------------------------------------------------------------===// -// Debugger related subtarget features. -//===----------------------------------------------------------------------===// - -def FeatureDebuggerInsertNops : SubtargetFeature< - "amdgpu-debugger-insert-nops", - "DebuggerInsertNops", - "true", - "Insert one nop instruction for each high level source statement" ->; +// TODO: Organize more features into groups. +def FeatureGroup { + // Bugs present on gfx10.1. + list GFX10_1_Bugs = [ + FeatureVcmpxPermlaneHazard, + FeatureVMEMtoScalarWriteHazard, + FeatureSMEMtoVectorWriteHazard, + FeatureInstFwdPrefetchBug, + FeatureVcmpxExecWARHazard, + FeatureLdsBranchVmemWARHazard, + FeatureNSAtoVMEMBug, + FeatureOffset3fBug, + FeatureFlatSegmentOffsetBug + ]; +} -def FeatureDebuggerEmitPrologue : SubtargetFeature< - "amdgpu-debugger-emit-prologue", - "DebuggerEmitPrologue", - "true", - "Emit debugger prologue" ->; +def FeatureISAVersion10_1_0 : FeatureSet< + !listconcat(FeatureGroup.GFX10_1_Bugs, + [FeatureGFX10, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureNSAEncoding, + FeatureWavefrontSize32, + FeatureScalarStores, + FeatureScalarAtomics, + FeatureScalarFlatScratchInsts, + FeatureLdsMisalignedBug, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3])>; + +def FeatureISAVersion10_1_1 : FeatureSet< + !listconcat(FeatureGroup.GFX10_1_Bugs, + [FeatureGFX10, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureNSAEncoding, + FeatureWavefrontSize32, + FeatureScalarStores, + FeatureScalarAtomics, + FeatureScalarFlatScratchInsts, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3])>; + +def FeatureISAVersion10_1_2 : FeatureSet< + !listconcat(FeatureGroup.GFX10_1_Bugs, + [FeatureGFX10, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureNSAEncoding, + FeatureWavefrontSize32, + FeatureScalarStores, + FeatureScalarAtomics, + FeatureScalarFlatScratchInsts, + FeatureLdsMisalignedBug, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3])>; //===----------------------------------------------------------------------===// @@ -682,23 +955,71 @@ def NullALU : InstrItinClass; // Predicate helper class //===----------------------------------------------------------------------===// -def isSICI : Predicate< - "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" - 
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" ->, AssemblerPredicate<"!FeatureGCN3Encoding">; +def isGFX6 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureSouthernIslands">; + +def isGFX6GFX7 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">; + +def isGFX6GFX7GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<"!FeatureGCN3Encoding">; + +def isGFX7Only : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">; + +def isGFX7GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">; + +def isGFX7GFX8GFX9 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">; + +def isGFX6GFX7GFX8GFX9 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"!FeatureGFX10Insts">; + +def isGFX7Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"FeatureCIInsts">; + +def isGFX8Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate<"FeatureGFX8Insts">; -def isVI : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate<"FeatureGCN3Encoding">; +def isGFX8Only : Predicate<"Subtarget->getGeneration() ==" + "AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate <"FeatureVolcanicIslands">; -def isGFX9 : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, +def isGFX9Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX9Insts">; -// TODO: Either the name to be changed or we simply use IsCI! 
-def isCIVI : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, - AssemblerPredicate<"FeatureCIInsts">; +def isGFX9Only : Predicate < + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts">; + +def isGFX8GFX9 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">; + +def isGFX10Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, + AssemblerPredicate<"FeatureGFX10Insts">; def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<"FeatureFlatAddressSpace">; @@ -707,6 +1028,8 @@ def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<"FeatureFlatGlobalInsts">; def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, AssemblerPredicate<"FeatureFlatScratchInsts">; +def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">, + AssemblerPredicate<"FeatureScalarFlatScratchInsts">; def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; @@ -716,7 +1039,7 @@ def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<"!FeatureUnpackedD16VMem">; def D16PreservesUnusedBits : - Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">, + Predicate<"Subtarget->d16PreservesUnusedBits()">, AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">; def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; @@ -728,38 +1051,54 @@ def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9 def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">, AssemblerPredicate<"FeatureAddNoCarryInsts">; -def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">, - AssemblerPredicate<"!FeatureAddNoCarryInsts">; +def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; -def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">, - AssemblerPredicate<"!FeatureVOP3P">; - def HasSDWA : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; -def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<"FeatureSDWA,FeatureGFX9">; +def HasSDWA9 : + Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">; + +def HasSDWA10 : + Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">; def HasDPP : Predicate<"Subtarget->hasDPP()">, - AssemblerPredicate<"FeatureDPP">; + AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">; + +def HasDPP8 : Predicate<"Subtarget->hasDPP8()">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP8">; def HasR128A16 : Predicate<"Subtarget->hasR128A16()">, AssemblerPredicate<"FeatureR128A16">; +def HasDPP16 : Predicate<"Subtarget->hasDPP()">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP">; + def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, AssemblerPredicate<"FeatureIntClamp">; def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, AssemblerPredicate<"FeatureMadMixInsts">; +def 
HasScalarStores : Predicate<"Subtarget->hasScalarStores()">, + AssemblerPredicate<"FeatureScalarStores">; + def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, AssemblerPredicate<"FeatureScalarAtomics">; +def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">, + AssemblerPredicate<"FeatureNoSdstCMPX">; + +def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">, + AssemblerPredicate<"!FeatureNoSdstCMPX">; + def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, @@ -773,9 +1112,35 @@ def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, AssemblerPredicate<"FeatureDLInsts">; -def HasDotInsts : Predicate<"Subtarget->hasDotInsts()">, - AssemblerPredicate<"FeatureDotInsts">; +def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">, + AssemblerPredicate<"FeatureDot1Insts">; + +def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">, + AssemblerPredicate<"FeatureDot2Insts">; + +def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">, + AssemblerPredicate<"FeatureDot3Insts">; + +def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">, + AssemblerPredicate<"FeatureDot4Insts">; + +def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">, + AssemblerPredicate<"FeatureDot5Insts">; + +def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, + AssemblerPredicate<"FeatureDot6Insts">; + +def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">, + AssemblerPredicate<"FeatureMAIInsts">; + +def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">, + AssemblerPredicate<"FeaturePkFmacF16Inst">; + +def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">, + AssemblerPredicate<"FeatureAtomicFaddInsts">; +def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">, + AssemblerPredicate<"FeatureOffset3fBug">; def EnableLateCFGStructurize : Predicate< "EnableLateStructurizeCFG">; @@ -784,7 +1149,6 @@ def EnableLateCFGStructurize : Predicate< include "SISchedule.td" include "GCNProcessors.td" include "AMDGPUInstrInfo.td" -include "SIIntrinsics.td" include "AMDGPURegisterInfo.td" include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index 73709ba13643..bba132c3bc46 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -1,9 +1,8 @@ //===- AMDGPUAliasAnalysis ------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -54,20 +53,21 @@ void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } -// These arrays are indexed by address space value enum elements 0 ... 
to 6 -static const AliasResult ASAliasRules[7][7] = { - /* Flat Global Region Group Constant Private Constant 32-bit */ - /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, - /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias}, - /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias , MayAlias}, - /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias}, - /* Constant */ {MayAlias, MayAlias, MayAlias, NoAlias , NoAlias, NoAlias , MayAlias}, - /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias}, - /* Constant 32-bit */ {MayAlias, MayAlias, MayAlias, NoAlias , MayAlias, NoAlias , NoAlias} +// These arrays are indexed by address space value enum elements 0 ... to 7 +static const AliasResult ASAliasRules[8][8] = { + /* Flat Global Region Group Constant Private Constant 32-bit Buffer Fat Ptr */ + /* Flat */ {MayAlias, MayAlias, NoAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, + /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias}, + /* Region */ {NoAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias , NoAlias, NoAlias}, + /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias , NoAlias}, + /* Constant */ {MayAlias, MayAlias, NoAlias, NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}, + /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias , NoAlias}, + /* Constant 32-bit */ {MayAlias, MayAlias, NoAlias, NoAlias , MayAlias, NoAlias , NoAlias , MayAlias}, + /* Buffer Fat Ptr */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias} }; static AliasResult getAliasResult(unsigned AS1, unsigned AS2) { - static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 6, "Addr space out of range"); + static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 7, "Addr space out of range"); if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS) return MayAlias; @@ -76,7 +76,8 @@ static AliasResult getAliasResult(unsigned AS1, unsigned AS2) { } AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { + const MemoryLocation &LocB, + AAQueryInfo &AAQI) { unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace(); unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace(); @@ -85,11 +86,11 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, return Result; // Forward the query to the next alias analysis. - return AAResultBase::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB, AAQI); } bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { + AAQueryInfo &AAQI, bool OrLocal) { const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); unsigned AS = Base->getType()->getPointerAddressSpace(); if (AS == AMDGPUAS::CONSTANT_ADDRESS || @@ -106,7 +107,7 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, // Only assume constant memory for arguments on kernels. 
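The lookup that consumes the table above is intentionally trivial: out-of-range address spaces conservatively answer MayAlias, and every in-range pair is a direct matrix read. A reduced standalone model of getAliasResult (only flat, global, and private are included, with toy numbering; the patch itself uses the full 8x8 table):

#include <cstdio>

enum AliasResult { NoAlias, MayAlias };

static AliasResult getASAliasResult(unsigned AS1, unsigned AS2) {
  constexpr unsigned MaxAS = 2; // flat = 0, global = 1, private = 2 (toy numbering)
  // Submatrix of ASAliasRules: global and private never alias each other,
  // flat may alias either.
  static const AliasResult Rules[3][3] = {
      /* Flat    */ {MayAlias, MayAlias, MayAlias},
      /* Global  */ {MayAlias, MayAlias, NoAlias},
      /* Private */ {MayAlias, NoAlias,  MayAlias},
  };
  if (AS1 > MaxAS || AS2 > MaxAS)
    return MayAlias; // unknown address space: assume the worst
  return Rules[AS1][AS2];
}

int main() {
  std::printf("global vs private: %s\n",
              getASAliasResult(1, 2) == NoAlias ? "NoAlias" : "MayAlias");
}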
switch (F->getCallingConv()) { default: - return AAResultBase::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal); case CallingConv::AMDGPU_LS: case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_ES: @@ -133,5 +134,5 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, return true; } } - return AAResultBase::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal); } diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h index d76c9fc48199..fb722920900f 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -1,9 +1,8 @@ //===- AMDGPUAliasAnalysis --------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -45,8 +44,10 @@ public: /// By definition, this result is stateless and so remains valid. bool invalidate(Function &, const PreservedAnalyses &) { return false; } - AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB); - bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal); + AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB, + AAQueryInfo &AAQI); + bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI, + bool OrLocal); private: bool Aliases(const MDNode *A, const MDNode *B) const; diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index fc65430b745f..4c1dbd4c5304 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 896ac9c87779..419ebb2240ad 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -1,9 +1,8 @@ //===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -46,8 +45,11 @@ namespace { class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: const TargetMachine *TM = nullptr; + SmallVector NodeList; bool addFeatureAttributes(Function &F); + bool processUniformWorkGroupAttribute(); + bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee); public: static char ID; @@ -186,7 +188,6 @@ static bool handleAttr(Function &Parent, const Function &Callee, Parent.addFnAttr(Name); return true; } - return false; } @@ -213,6 +214,56 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee, handleAttr(Parent, Callee, AttrName); } +bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() { + bool Changed = false; + + for (auto *Node : reverse(NodeList)) { + Function *Caller = Node->getFunction(); + + for (auto I : *Node) { + Function *Callee = std::get<1>(I)->getFunction(); + if (Callee) + Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee); + } + } + + return Changed; +} + +bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute( + Function &Caller, Function &Callee) { + + // Check for externally defined function + if (!Callee.hasExactDefinition()) { + Callee.addFnAttr("uniform-work-group-size", "false"); + if (!Caller.hasFnAttribute("uniform-work-group-size")) + Caller.addFnAttr("uniform-work-group-size", "false"); + + return true; + } + // Check if the Caller has the attribute + if (Caller.hasFnAttribute("uniform-work-group-size")) { + // Check if the value of the attribute is true + if (Caller.getFnAttribute("uniform-work-group-size") + .getValueAsString().equals("true")) { + // Propagate the attribute to the Callee, if it does not have it + if (!Callee.hasFnAttribute("uniform-work-group-size")) { + Callee.addFnAttr("uniform-work-group-size", "true"); + return true; + } + } else { + Callee.addFnAttr("uniform-work-group-size", "false"); + return true; + } + } else { + // If the attribute is absent, set it as false + Caller.addFnAttr("uniform-work-group-size", "false"); + Callee.addFnAttr("uniform-work-group-size", "false"); + return true; + } + return false; +} + bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { const GCNSubtarget &ST = TM->getSubtarget(F); bool HasFlat = ST.hasFlatAddressSpace(); @@ -293,15 +344,21 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { } bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { - Module &M = SCC.getCallGraph().getModule(); - Triple TT(M.getTargetTriple()); - bool Changed = false; + for (CallGraphNode *I : SCC) { + // Build a list of CallGraphNodes from most number of uses to least + if (I->getNumReferences()) + NodeList.push_back(I); + else { + processUniformWorkGroupAttribute(); + NodeList.clear(); + } + Function *F = I->getFunction(); + // Add feature attributes if (!F || F->isDeclaration()) continue; - Changed |= addFeatureAttributes(*F); } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index f88e3b0dac86..71121ade0a49 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source 
-// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPUIntrinsicInfo.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index 7465cf22b5a4..99a01ca3a2fd 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -1,15 +1,15 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUArgumentUsageInfo.h" #include "SIRegisterInfo.h" +#include "llvm/Support/NativeFormatting.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -27,9 +27,16 @@ void ArgDescriptor::print(raw_ostream &OS, } if (isRegister()) - OS << "Reg " << printReg(getRegister(), TRI) << '\n'; + OS << "Reg " << printReg(getRegister(), TRI); else - OS << "Stack offset " << getStackOffset() << '\n'; + OS << "Stack offset " << getStackOffset(); + + if (isMasked()) { + OS << " & "; + llvm::write_hex(OS, Mask, llvm::HexPrintStyle::PrefixLower); + } + + OS << '\n'; } char AMDGPUArgumentUsageInfo::ID = 0; diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index f0e6d1b83f15..097730441ed8 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -1,9 +1,8 @@ //==- AMDGPUArgumentrUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -11,6 +10,7 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H #include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/Register.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" @@ -29,22 +29,31 @@ private: friend class AMDGPUArgumentUsageInfo; union { - unsigned Register; + Register Reg; unsigned StackOffset; }; + // Bitmask to locate argument within the register. 
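The Mask field introduced above lets several small arguments share one register, with ~0u reserved to mean "the whole register". A standalone sketch of how such a field could be decoded; the 10-bit packed layout in the demo is an assumed example, not something this hunk specifies:

#include <cstdint>
#include <cstdio>

struct ArgField {
  unsigned Mask; // ~0u means the argument owns the whole register
  bool isMasked() const { return Mask != ~0u; }
};

static uint32_t extractField(uint32_t RegValue, const ArgField &A) {
  if (!A.isMasked())
    return RegValue;
  // Shift the field down by the position of its lowest set mask bit.
  // (__builtin_ctz is a GCC/Clang builtin; LLVM itself would use
  // countTrailingZeros from MathExtras.h.)
  unsigned Shift = __builtin_ctz(A.Mask);
  return (RegValue & A.Mask) >> Shift;
}

int main() {
  ArgField IdY{0x3ffu << 10}; // second 10-bit field of a packed register
  uint32_t Packed = (7u << 20) | (5u << 10) | 3u;
  std::printf("id.y = %u\n", extractField(Packed, IdY)); // prints 5
}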
+ unsigned Mask; + bool IsStack : 1; bool IsSet : 1; - ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false) - : Register(Val), IsStack(IsStack), IsSet(IsSet) {} public: - static ArgDescriptor createRegister(unsigned Reg) { - return ArgDescriptor(Reg, false, true); + ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, + bool IsStack = false, bool IsSet = false) + : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} + + static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { + return ArgDescriptor(Reg, Mask, false, true); + } + + static ArgDescriptor createStack(Register Reg, unsigned Mask = ~0u) { + return ArgDescriptor(Reg, Mask, true, true); } - static ArgDescriptor createStack(unsigned Reg) { - return ArgDescriptor(Reg, true, true); + static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { + return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); } bool isSet() const { @@ -59,9 +68,9 @@ public: return !IsStack; } - unsigned getRegister() const { + Register getRegister() const { assert(!IsStack); - return Register; + return Reg; } unsigned getStackOffset() const { @@ -69,6 +78,14 @@ public: return StackOffset; } + unsigned getMask() const { + return Mask; + } + + bool isMasked() const { + return Mask != ~0u; + } + void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; }; diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2ded7cdb6489..743ac64b8f10 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,7 +19,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" -#include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "R600AsmPrinter.h" @@ -31,10 +30,12 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" @@ -100,7 +101,7 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() { AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) : AsmPrinter(TM, std::move(Streamer)) { - if (IsaInfo::hasCodeObjectV3(getSTI())) + if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) HSAMetadataStream.reset(new MetadataStreamerV3()); else HSAMetadataStream.reset(new MetadataStreamerV2()); @@ -110,7 +111,7 @@ StringRef AMDGPUAsmPrinter::getPassName() const { return "AMDGPU Assembly Printer"; } -const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const { +const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { return TM.getMCSubtargetInfo(); } @@ -121,10 +122,10 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { - if (IsaInfo::hasCodeObjectV3(getSTI())) { + if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) { std::string ExpectedTarget; raw_string_ostream ExpectedTargetOS(ExpectedTarget); - IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS); + IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS); getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget); } @@ -137,9 +138,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { HSAMetadataStream->begin(M); if (TM.getTargetTriple().getOS() == Triple::AMDPAL) - readPALMetadata(M); + getTargetStreamer()->getPALMetadata()->readFromIR(M); - if (IsaInfo::hasCodeObjectV3(getSTI())) + if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) return; // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. @@ -147,7 +148,7 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2. - IsaVersion Version = getIsaVersion(getSTI()->getCPU()); + IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU()); getTargetStreamer()->EmitDirectiveHSACodeObjectISA( Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); } @@ -157,11 +158,11 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!getTargetStreamer()) return; - if (!IsaInfo::hasCodeObjectV3(getSTI())) { + if (!IsaInfo::hasCodeObjectV3(getGlobalSTI())) { // Emit ISA Version (NT_AMD_AMDGPU_ISA). 
std::string ISAVersionString; raw_string_ostream ISAVersionStream(ISAVersionString); - IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream); + IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream); getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); } @@ -172,20 +173,6 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { (void)Success; assert(Success && "Malformed HSA Metadata"); } - - if (!IsaInfo::hasCodeObjectV3(getSTI())) { - // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA). - if (TM.getTargetTriple().getOS() == Triple::AMDPAL) { - // Copy the PAL metadata from the map where we collected it into a vector, - // then write it as a .note. - PALMD::Metadata PALMetadataVector; - for (auto i : PALMetadataMap) { - PALMetadataVector.push_back(i.first); - PALMetadataVector.push_back(i.second); - } - getTargetStreamer()->EmitPALMetadata(PALMetadataVector); - } - } } bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( @@ -225,7 +212,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { const SIMachineFunctionInfo &MFI = *MF->getInfo(); if (!MFI.isEntryFunction()) return; - if (!IsaInfo::hasCodeObjectV3(getSTI()) || + + if (!IsaInfo::hasCodeObjectV3(getGlobalSTI()) || TM.getTargetTriple().getOS() != Triple::AMDHSA) return; @@ -243,23 +231,25 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { if (ReadOnlySection.getAlignment() < 64) ReadOnlySection.setAlignment(64); + const MCSubtargetInfo &STI = MF->getSubtarget(); + SmallString<128> KernelName; getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( - *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), + STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), CurrentProgramInfo.NumVGPRsForWavesPerEU, CurrentProgramInfo.NumSGPRsForWavesPerEU - - IsaInfo::getNumExtraSGPRs(getSTI(), + IsaInfo::getNumExtraSGPRs(&STI, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, - hasXNACK(*getSTI())); + hasXNACK(STI)); Streamer.PopSection(); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { - if (IsaInfo::hasCodeObjectV3(getSTI()) && + if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) && TM.getTargetTriple().getOS() == Triple::AMDHSA) { AsmPrinter::EmitFunctionEntryLabel(); return; @@ -273,8 +263,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { getTargetStreamer()->EmitAMDGPUSymbolType( SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } - const GCNSubtarget &STI = MF->getSubtarget(); - if (STI.dumpCode()) { + if (DumpCodeInstEmitter) { // Disassemble function name label to text. DisasmLines.push_back(MF->getName().str() + ":"); DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); @@ -285,8 +274,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { } void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { - const GCNSubtarget &STI = MBB.getParent()->getSubtarget(); - if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) { + if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. 
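The kernel descriptor emission above reports SGPR usage net of the "extra" system registers (vcc, flat_scratch, xnack_mask) that IsaInfo::getNumExtraSGPRs accounts for. A toy model of that subtraction; the two-register cost per feature is an assumption for the demo, and the real count also depends on the subtarget:

#include <cstdio>

// Count the system SGPRs that sit above the compiler-allocated ones.
static unsigned numExtraSGPRs(bool VCCUsed, bool FlatScratchUsed, bool XNACKUsed) {
  unsigned N = 0;
  if (VCCUsed)
    N += 2; // vcc_lo, vcc_hi
  if (FlatScratchUsed)
    N += 2; // flat_scratch_lo, flat_scratch_hi
  if (XNACKUsed)
    N += 2; // xnack_mask_lo, xnack_mask_hi
  return N;
}

int main() {
  unsigned NumSGPRsForWavesPerEU = 24; // total, including the extra registers
  unsigned Net = NumSGPRsForWavesPerEU - numExtraSGPRs(true, true, false);
  std::printf("descriptor reports %u SGPRs\n", Net); // 24 - 4 = 20
}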
DisasmLines.push_back( (Twine("BB") + Twine(getFunctionNumber()) @@ -298,38 +286,57 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { } void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + if (GV->hasInitializer() && !isa(GV->getInitializer())) { + OutContext.reportError({}, + Twine(GV->getName()) + + ": unsupported initializer for address space"); + return; + } + + // LDS variables aren't emitted in HSA or PAL yet. + const Triple::OSType OS = TM.getTargetTriple().getOS(); + if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) + return; - // Group segment variables aren't emitted in HSA. - if (AMDGPU::isGroupSegment(GV)) + MCSymbol *GVSym = getSymbol(GV); + + GVSym->redefineIfPossible(); + if (GVSym->isDefined() || GVSym->isVariable()) + report_fatal_error("symbol '" + Twine(GVSym->getName()) + + "' is already defined"); + + const DataLayout &DL = GV->getParent()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); + unsigned Align = GV->getAlignment(); + if (!Align) + Align = 4; + + EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); + EmitLinkage(GV, GVSym); + if (auto TS = getTargetStreamer()) + TS->emitAMDGPULDS(GVSym, Size, Align); return; + } AsmPrinter::EmitGlobalVariable(GV); } bool AMDGPUAsmPrinter::doFinalization(Module &M) { CallGraphResourceInfo.clear(); - return AsmPrinter::doFinalization(M); -} -// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the -// frontend into our PALMetadataMap, ready for per-function modification. It -// is a NamedMD containing an MDTuple containing a number of MDNodes each of -// which is an integer value, and each two integer values forms a key=value -// pair that we store as PALMetadataMap[key]=value in the map. -void AMDGPUAsmPrinter::readPALMetadata(Module &M) { - auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata"); - if (!NamedMD || !NamedMD->getNumOperands()) - return; - auto Tuple = dyn_cast(NamedMD->getOperand(0)); - if (!Tuple) - return; - for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) { - auto Key = mdconst::dyn_extract(Tuple->getOperand(I)); - auto Val = mdconst::dyn_extract(Tuple->getOperand(I + 1)); - if (!Key || !Val) - continue; - PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue(); + // Pad with s_code_end to help tools and guard against instruction prefetch + // causing stale data in caches. Arguably this should be done by the linker, + // which is why this isn't done for Mesa. + const MCSubtargetInfo &STI = *getGlobalSTI(); + if (AMDGPU::isGFX10(STI) && + (STI.getTargetTriple().getOS() == Triple::AMDHSA || + STI.getTargetTriple().getOS() == Triple::AMDPAL)) { + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + getTargetStreamer()->EmitCodeEnd(); } + + return AsmPrinter::doFinalization(M); } // Print comments that apply to both callable functions and entry points. 
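The EmitGlobalVariable hunk above sizes an LDS variable from the module's data layout and falls back to 4-byte alignment when the IR specifies none. A standalone sketch of that defaulting; the struct and the printed directive merely mimic the shape of DataLayout and emitAMDGPULDS rather than calling them:

#include <cstdint>
#include <cstdio>

struct LDSGlobal {
  uint64_t AllocSizeInBytes; // what DataLayout::getTypeAllocSize would return
  unsigned ExplicitAlign;    // 0 when the IR carries no alignment
};

static void emitLDSDirective(const char *Name, const LDSGlobal &GV) {
  unsigned Align = GV.ExplicitAlign ? GV.ExplicitAlign : 4; // default to 4
  // Mimics the shape of the target streamer's LDS directive output.
  std::printf(".amdgpu_lds %s, %llu, %u\n", Name,
              (unsigned long long)GV.AllocSizeInBytes, Align);
}

int main() {
  emitLDSDirective("shared_buf", {256, 0}); // no explicit align -> 4
}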
@@ -376,6 +383,10 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; } + if (MF.getSubtarget().isWave32()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; + } return KernelCodeProperties; } @@ -435,6 +446,18 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { EmitProgramInfoSI(MF, CurrentProgramInfo); } + DumpCodeInstEmitter = nullptr; + if (STM.dumpCode()) { + // For -dumpcode, get the assembler out of the streamer, even if it does + // not really want to let us have it. This only works with -filetype=obj. + bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing(); + OutStreamer->setUseAssemblerInfoForParsing(true); + MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); + OutStreamer->setUseAssemblerInfoForParsing(SaveFlag); + if (Assembler) + DumpCodeInstEmitter = Assembler->getEmitterPtr(); + } + DisasmLines.clear(); HexLines.clear(); DisasmLineMaxLen = 0; @@ -486,15 +509,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); - if (MF.getSubtarget().debuggerEmitPrologue()) { - OutStreamer->emitRawComment( - " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + - Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); - OutStreamer->emitRawComment( - " DebuggerPrivateSegmentBufferSGPR: s" + - Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false); - } - OutStreamer->emitRawComment( " COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); @@ -516,7 +530,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); } - if (STM.dumpCode()) { + if (DumpCodeInstEmitter) { OutStreamer->SwitchSection( Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); @@ -620,6 +634,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( HighestVGPRReg = Reg; break; } + MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg); + if (MRI.isPhysRegUsed(AReg)) { + HighestVGPRReg = AReg; + break; + } } MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; @@ -665,8 +684,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::SRC_SHARED_LIMIT: case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SGPR_NULL: continue; + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + llvm_unreachable("src_pops_exiting_wave_id should not be used"); + case AMDGPU::NoRegister: assert(MI.isDebugInstr()); continue; @@ -687,6 +710,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::XNACK_MASK_HI: llvm_unreachable("xnack_mask registers should not be used"); + case AMDGPU::LDS_DIRECT: + llvm_unreachable("lds_direct register should not be used"); + case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: @@ -695,6 +721,15 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::TMA_HI: llvm_unreachable("trap handler registers should not be used"); + case AMDGPU::SRC_VCCZ: + llvm_unreachable("src_vccz register should not be used"); + + case AMDGPU::SRC_EXECZ: + llvm_unreachable("src_execz register should not be used"); + + case AMDGPU::SRC_SCC: + llvm_unreachable("src_scc register should not be used"); + default: break; } @@ -707,6 +742,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo 
AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { IsSGPR = false; Width = 1; + } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) { + IsSGPR = false; + Width = 1; } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -715,9 +753,14 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { IsSGPR = false; Width = 2; + } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { + IsSGPR = false; + Width = 2; } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { IsSGPR = false; Width = 3; + } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { + Width = 3; } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -726,6 +769,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { IsSGPR = false; Width = 4; + } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { + IsSGPR = false; + Width = 4; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -742,6 +788,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { IsSGPR = false; Width = 16; + } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { + IsSGPR = false; + Width = 16; + } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { + IsSGPR = true; + Width = 32; + } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { + IsSGPR = false; + Width = 32; + } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { + IsSGPR = false; + Width = 32; } else { llvm_unreachable("Unknown register class"); } @@ -767,8 +825,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( // 48 SGPRs - vcc, - flat_scr, -xnack int MaxSGPRGuess = - 47 - IsaInfo::getNumExtraSGPRs(getSTI(), true, - ST.hasFlatAddressSpace()); + 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); @@ -779,9 +836,19 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else { // We force CodeGen to run in SCC order, so the callee's register // usage etc. should be the cumulative usage of all callees. + auto I = CallGraphResourceInfo.find(Callee); - assert(I != CallGraphResourceInfo.end() && - "callee should have been handled before caller"); + if (I == CallGraphResourceInfo.end()) { + // Avoid crashing on undefined behavior with an illegal call to a + // kernel. If a callsite's calling convention doesn't match the + // function's, it's undefined behavior. If the callsite calling + // convention does match, that would have errored earlier. + // FIXME: The verifier shouldn't allow this. 
+ if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) + report_fatal_error("invalid call to entry function"); + + llvm_unreachable("callee should have been handled before caller"); + } MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); @@ -825,14 +892,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const GCNSubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); - const SIInstrInfo *TII = STM.getInstrInfo(); - const SIRegisterInfo *RI = &TII->getRegisterInfo(); // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( - getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed); + &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && @@ -918,24 +983,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks( &STM, ProgInfo.NumVGPRsForWavesPerEU); - // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and - // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" - // attribute was requested. - if (STM.debuggerEmitPrologue()) { - ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = - RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); - ProgInfo.DebuggerPrivateSegmentBufferSGPR = - RI->getHWRegIndex(MFI->getScratchRSrcReg()); - } - // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. ProgInfo.FloatMode = getFPMode(MF); - ProgInfo.IEEEMode = STM.enableIEEEBit(MF); + const SIModeRegisterDefaults Mode = MFI->getMode(); + ProgInfo.IEEEMode = Mode.IEEE; // Make clamp modifier on NaN input returns 0. - ProgInfo.DX10Clamp = STM.enableDX10Clamp(); + ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { @@ -963,6 +1019,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, 1ULL << ScratchAlignShift) >> ScratchAlignShift; + if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { + ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; + ProgInfo.MemOrdered = 1; + } + ProgInfo.ComputePGMRSrc1 = S_00B848_VGPRS(ProgInfo.VGPRBlocks) | S_00B848_SGPRS(ProgInfo.SGPRBlocks) | @@ -971,7 +1032,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | - S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + S_00B848_IEEE_MODE(ProgInfo.IEEEMode) | + S_00B848_WGP_MODE(ProgInfo.WgpMode) | + S_00B848_MEM_ORDERED(ProgInfo.MemOrdered); // 0 = X, 1 = XY, 2 = XYZ unsigned TIDIGCompCnt = 0; @@ -1053,71 +1116,38 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, // This is the equivalent of EmitProgramInfoSI above, but for when the OS type // is AMDPAL. It stores each compute/SPI register setting and other PAL -// metadata items into the PALMetadataMap, combining with any provided by the -// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is -// then written as a single block in the .note section. +// metadata items into the PALMD::Metadata, combining with any provided by the +// frontend as LLVM metadata. 
Once all functions are written, the PAL metadata +// is then written as a single block in the .note section. void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { const SIMachineFunctionInfo *MFI = MF.getInfo(); - // Given the calling convention, calculate the register number for rsrc1. In - // principle the register number could change in future hardware, but we know - // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so - // we can use the same fixed value that .AMDGPU.config has for Mesa. Note - // that we use a register number rather than a byte offset, so we need to - // divide by 4. - unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4; - unsigned Rsrc2Reg = Rsrc1Reg + 1; - // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used - // with a constant offset to access any non-register shader-specific PAL - // metadata key. - unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE; - switch (MF.getFunction().getCallingConv()) { - case CallingConv::AMDGPU_PS: - ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_VS: - ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_GS: - ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_ES: - ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_HS: - ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_LS: - ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE; - break; - } - unsigned NumUsedVgprsKey = ScratchSizeKey + - PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE; - unsigned NumUsedSgprsKey = ScratchSizeKey + - PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE; - PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU; - PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU; + auto CC = MF.getFunction().getCallingConv(); + auto MD = getTargetStreamer()->getPALMetadata(); + + MD->setEntryPoint(CC, MF.getFunction().getName()); + MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); + MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { - PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1; - PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2; - // ScratchSize is in bytes, 16 aligned. - PALMetadataMap[ScratchSizeKey] |= - alignTo(CurrentProgramInfo.ScratchSize, 16); + MD->setRsrc1(CC, CurrentProgramInfo.ComputePGMRSrc1); + MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2); } else { - PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | - S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks); + MD->setRsrc1(CC, S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | + S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks)); if (CurrentProgramInfo.ScratchBlocks > 0) - PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1); - // ScratchSize is in bytes, 16 aligned. - PALMetadataMap[ScratchSizeKey] |= - alignTo(CurrentProgramInfo.ScratchSize, 16); + MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); } + // ScratchSize is in bytes, 16 aligned. 
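// The deleted key arithmetic above relied on the per-stage PAL metadata keys
// being laid out at a fixed stride, so any stage's key could be recovered
// from that stage's *_SCRATCH_SIZE key plus an offset measured on the VS
// keys. A sketch of the idea with invented key values (the real ones live in
// PALMD::Key):
enum SketchKey : unsigned {
  VS_SCRATCH_SIZE = 0x10, VS_NUM_USED_VGPRS = 0x11, VS_NUM_USED_SGPRS = 0x12,
  PS_SCRATCH_SIZE = 0x20, PS_NUM_USED_VGPRS = 0x21, PS_NUM_USED_SGPRS = 0x22,
};
constexpr unsigned numUsedVgprsKey(unsigned ScratchSizeKey) {
  return ScratchSizeKey + (VS_NUM_USED_VGPRS - VS_SCRATCH_SIZE);
}
static_assert(numUsedVgprsKey(PS_SCRATCH_SIZE) == PS_NUM_USED_VGPRS,
              "per-stage keys sit at a constant offset in this sketch");
// The 16-byte scratch-size alignment noted in the comment above survives the
// rewrite: the new interface stores alignTo(ScratchSize, 16) just below.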
+ MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - PALMetadataMap[Rsrc2Reg] |= - S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks); - PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable(); - PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr(); + MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + MD->setSpiPsInputEna(MFI->getPSInputEnable()); + MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } + + const GCNSubtarget &STM = MF.getSubtarget(); + if (STM.isWave32()) + MD->setWave32(MF.getFunction().getCallingConv()); } // This is supposed to be log2(Size) @@ -1144,12 +1174,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, const SIMachineFunctionInfo *MFI = MF.getInfo(); const GCNSubtarget &STM = MF.getSubtarget(); - AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI()); + AMDGPU::initDefaultAMDKernelCodeT(Out, &STM); Out.compute_pgm_resource_registers = CurrentProgramInfo.ComputePGMRSrc1 | (CurrentProgramInfo.ComputePGMRSrc2 << 32); - Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; if (CurrentProgramInfo.DynamicCallStack) Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; @@ -1181,9 +1211,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (STM.debuggerSupported()) - Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; - if (STM.isXNACKEnabled()) Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; @@ -1196,22 +1223,14 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. - Out.kernarg_segment_alignment = std::max((size_t)4, + Out.kernarg_segment_alignment = std::max(4, countTrailingZeros(MaxKernArgAlign)); - - if (STM.debuggerEmitPrologue()) { - Out.debug_wavefront_private_segment_offset_sgpr = - CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - Out.debug_private_segment_buffer_sgpr = - CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR; - } } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { // First try the generic code, which knows about modifiers like 'c' and 'n'. - if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O)) + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O)) return false; if (ExtraCode && ExtraCode[0]) { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 167ac4b21e1e..cf77034329ef 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -1,9 +1,8 @@ //===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,6 +32,7 @@ namespace llvm { class AMDGPUMachineFunction; class AMDGPUTargetStreamer; +class MCCodeEmitter; class MCOperand; class GCNSubtarget; @@ -57,12 +57,12 @@ private: DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo; std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream; - std::map<uint32_t, uint32_t> PALMetadataMap; + + MCCodeEmitter *DumpCodeInstEmitter = nullptr; uint64_t getFunctionCodeSize(const MachineFunction &MF) const; SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const; - void readPALMetadata(Module &M); void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, const MachineFunction &MF) const; @@ -95,7 +95,7 @@ public: StringRef getPassName() const override; - const MCSubtargetInfo* getSTI() const; + const MCSubtargetInfo* getGlobalSTI() const; AMDGPUTargetStreamer* getTargetStreamer() const; @@ -137,8 +137,7 @@ public: const MachineBasicBlock *MBB) const override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; protected: mutable std::vector<std::string> DisasmLines, HexLines; diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 644e4fd558ba..8a92e7d923fb 100644 --- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -31,6 +30,7 @@ namespace { enum DPP_CTRL { DPP_ROW_SR1 = 0x111, DPP_ROW_SR2 = 0x112, + DPP_ROW_SR3 = 0x113, DPP_ROW_SR4 = 0x114, DPP_ROW_SR8 = 0x118, DPP_WF_SR1 = 0x138, @@ -40,7 +40,7 @@ enum DPP_CTRL { struct ReplacementInfo { Instruction *I; - Instruction::BinaryOps Op; + AtomicRMWInst::BinOp Op; unsigned ValIdx; bool ValDivergent; }; @@ -55,10 +55,8 @@ private: bool HasDPP; bool IsPixelShader; - void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op, - unsigned ValIdx, bool ValDivergent) const; - - void setConvergent(CallInst *const CI) const; + void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, + bool ValDivergent) const; public: static char ID; @@ -122,16 +120,20 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { break; } - Instruction::BinaryOps Op; + AtomicRMWInst::BinOp Op = I.getOperation(); - switch (I.getOperation()) { + switch (Op) { default: return; case AtomicRMWInst::Add: - Op = Instruction::Add; - break; case AtomicRMWInst::Sub: - Op = Instruction::Sub; + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: break; } @@ -163,7 +165,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { } void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { - Instruction::BinaryOps Op; + AtomicRMWInst::BinOp Op; switch (I.getIntrinsicID()) { default: @@ -171,12 +173,47 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { case Intrinsic::amdgcn_buffer_atomic_add: case Intrinsic::amdgcn_struct_buffer_atomic_add: case Intrinsic::amdgcn_raw_buffer_atomic_add: - Op = Instruction::Add; + Op = AtomicRMWInst::Add; break; case Intrinsic::amdgcn_buffer_atomic_sub: case Intrinsic::amdgcn_struct_buffer_atomic_sub: case Intrinsic::amdgcn_raw_buffer_atomic_sub: - Op = Instruction::Sub; + Op = AtomicRMWInst::Sub; + break; + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_struct_buffer_atomic_and: + case Intrinsic::amdgcn_raw_buffer_atomic_and: + Op = AtomicRMWInst::And; + break; + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_struct_buffer_atomic_or: + case Intrinsic::amdgcn_raw_buffer_atomic_or: + Op = AtomicRMWInst::Or; + break; + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + Op = AtomicRMWInst::Xor; + break; + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + Op = AtomicRMWInst::Min; + break; + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_buffer_atomic_umin: + Op = AtomicRMWInst::UMin; + break; + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_buffer_atomic_smax: + Op = AtomicRMWInst::Max; + break; + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_buffer_atomic_umax: + Op = AtomicRMWInst::UMax; break; } @@ -208,12 +245,68 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { ToReplace.push_back(Info); } +// Use the builder to create the non-atomic 
counterpart of the specified +// atomicrmw binary op. +static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *LHS, Value *RHS) { + CmpInst::Predicate Pred; + + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + return B.CreateBinOp(Instruction::Add, LHS, RHS); + case AtomicRMWInst::Sub: + return B.CreateBinOp(Instruction::Sub, LHS, RHS); + case AtomicRMWInst::And: + return B.CreateBinOp(Instruction::And, LHS, RHS); + case AtomicRMWInst::Or: + return B.CreateBinOp(Instruction::Or, LHS, RHS); + case AtomicRMWInst::Xor: + return B.CreateBinOp(Instruction::Xor, LHS, RHS); + + case AtomicRMWInst::Max: + Pred = CmpInst::ICMP_SGT; + break; + case AtomicRMWInst::Min: + Pred = CmpInst::ICMP_SLT; + break; + case AtomicRMWInst::UMax: + Pred = CmpInst::ICMP_UGT; + break; + case AtomicRMWInst::UMin: + Pred = CmpInst::ICMP_ULT; + break; + } + Value *Cond = B.CreateICmp(Pred, LHS, RHS); + return B.CreateSelect(Cond, LHS, RHS); +} + +static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, + unsigned BitWidth) { + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + case AtomicRMWInst::UMax: + return APInt::getMinValue(BitWidth); + case AtomicRMWInst::And: + case AtomicRMWInst::UMin: + return APInt::getMaxValue(BitWidth); + case AtomicRMWInst::Max: + return APInt::getSignedMinValue(BitWidth); + case AtomicRMWInst::Min: + return APInt::getSignedMaxValue(BitWidth); + } +} + void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, - Instruction::BinaryOps Op, + AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const { - LLVMContext &Context = I.getContext(); - // Start building just before the instruction. IRBuilder<> B(&I); @@ -251,115 +344,130 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, Value *const V = I.getOperand(ValIdx); // We need to know how many lanes are active within the wavefront, and we do - // this by getting the exec register, which tells us all the lanes that are - // active. - MDNode *const RegName = - llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec")); - Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName); - CallInst *const Exec = - B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata}); - setConvergent(Exec); + // this by doing a ballot of active lanes. + CallInst *const Ballot = B.CreateIntrinsic( + Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()}, + {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)}); // We need to know how many lanes are active within the wavefront that are // below us. If we counted each lane linearly starting from 0, a lane is // below us only if its associated index was less than ours. We do this by // using the mbcnt intrinsic. 
- Value *const BitCast = B.CreateBitCast(Exec, VecTy); + Value *const BitCast = B.CreateBitCast(Ballot, VecTy); Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); CallInst *const PartialMbcnt = B.CreateIntrinsic( Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)}); - CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, - {ExtractHi, PartialMbcnt}); + Value *const Mbcnt = + B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, + {ExtractHi, PartialMbcnt}), + Ty, false); - Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false); + Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth)); - Value *LaneOffset = nullptr; + Value *ExclScan = nullptr; Value *NewV = nullptr; // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { - // First we need to set all inactive invocations to 0, so that they can - // correctly contribute to the final result. - CallInst *const SetInactive = B.CreateIntrinsic( - Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)}); - setConvergent(SetInactive); - NewV = SetInactive; - - const unsigned Iters = 6; - const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2, - DPP_ROW_SR4, DPP_ROW_SR8, - DPP_ROW_BCAST15, DPP_ROW_BCAST31}; - const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; - - // This loop performs an inclusive scan across the wavefront, with all lanes + // First we need to set all inactive invocations to the identity value, so + // that they can correctly contribute to the final result. + CallInst *const SetInactive = + B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + + CallInst *const FirstDPP = + B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty, + {Identity, SetInactive, B.getInt32(DPP_WF_SR1), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); + ExclScan = FirstDPP; + + const unsigned Iters = 7; + const unsigned DPPCtrl[Iters] = { + DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4, + DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31}; + const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; + const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf}; + + // This loop performs an exclusive scan across the wavefront, with all lanes // active (by using the WWM intrinsic). for (unsigned Idx = 0; Idx < Iters; Idx++) { - CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty, - {NewV, B.getInt32(DPPCtrl[Idx]), - B.getInt32(RowMask[Idx]), - B.getInt32(0xf), B.getFalse()}); - setConvergent(DPP); - Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP); - - NewV = B.CreateBinOp(Op, NewV, WWM); - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); + Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan; + CallInst *const DPP = B.CreateIntrinsic( + Intrinsic::amdgcn_update_dpp, Ty, + {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]), + B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()}); + + ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP); } - // NewV has returned the inclusive scan of V, but for the lane offset we - // require an exclusive scan. We do this by shifting the values from the - // entire wavefront right by 1, and by setting the bound_ctrl (last argument - // to the intrinsic below) to true, we can guarantee that 0 will be shifted - // into the 0'th invocation. 
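// What the update_dpp loop above builds is a wavefront prefix scan: each step
// combines a lane with the value a fixed distance below it, so after
// O(log2(64)) steps every lane holds the combination of all lanes at or below
// it. The DPP sequence is constrained to row-local shifts plus row
// broadcasts; a plain Hillis-Steele scan over a 64-entry array shows the same
// shape (illustrative only, not the exact DPP schedule):
#include <array>
#include <cassert>
#include <cstdint>
int main() {
  std::array<uint32_t, 64> Lanes;
  for (unsigned I = 0; I < 64; ++I)
    Lanes[I] = 1; // every lane contributes 1
  for (unsigned Shift = 1; Shift < 64; Shift *= 2) {
    std::array<uint32_t, 64> Next = Lanes;
    for (unsigned I = Shift; I < 64; ++I)
      Next[I] = Lanes[I] + Lanes[I - Shift]; // combine with lane Shift below
    Lanes = Next;
  }
  assert(Lanes[63] == 64); // lane 63 ends up with the full reduction
  return 0;
}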
- CallInst *const DPP = - B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty}, - {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf), - B.getInt32(0xf), B.getTrue()}); - setConvergent(DPP); - LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP); + NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan); // Read the value from the last lane, which has accumlated the values of - // each active lane in the wavefront. This will be our new value with which - // we will provide to the atomic operation. + // each active lane in the wavefront. This will be our new value which we + // will provide to the atomic operation. if (TyBitWidth == 64) { Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty()); CallInst *const ReadLaneLo = B.CreateIntrinsic( Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)}); - setConvergent(ReadLaneLo); CallInst *const ReadLaneHi = B.CreateIntrinsic( Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)}); - setConvergent(ReadLaneHi); Value *const PartialInsert = B.CreateInsertElement( UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); Value *const Insert = B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1)); NewV = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { - CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, - {}, {NewV, B.getInt32(63)}); - setConvergent(ReadLane); - NewV = ReadLane; + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {NewV, B.getInt32(63)}); } else { llvm_unreachable("Unhandled atomic bit width"); } + + // Finally mark the readlanes in the WWM section. + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); } else { - // Get the total number of active lanes we have by using popcount. - Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec); - Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false); - - // Calculate the new value we will be contributing to the atomic operation - // for the entire wavefront. - NewV = B.CreateMul(V, CtpopCast); - LaneOffset = B.CreateMul(V, MbcntCast); + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: { + // The new value we will be contributing to the atomic operation is the + // old value times the number of active lanes. + Value *const Ctpop = B.CreateIntCast( + B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false); + NewV = B.CreateMul(V, Ctpop); + break; + } + + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These operations with a uniform value are idempotent: doing the atomic + // operation multiple times has the same effect as doing it once. + NewV = V; + break; + + case AtomicRMWInst::Xor: + // The new value we will be contributing to the atomic operation is the + // old value times the parity of the number of active lanes. + Value *const Ctpop = B.CreateIntCast( + B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false); + NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1)); + break; + } } // We only want a single lane to enter our new control flow, and we do this // by checking if there are any active lanes below us. Only one lane will // have 0 active lanes below us, so that will be the only one to progress. 
- Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0)); + Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0)); // Store I's original basic block before we split the block. BasicBlock *const EntryBB = I.getParent(); @@ -401,20 +509,16 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty()); CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); - setConvergent(ReadFirstLaneLo); CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); - setConvergent(ReadFirstLaneHi); Value *const PartialInsert = B.CreateInsertElement( UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); Value *const Insert = B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); BroadcastI = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { - CallInst *const ReadFirstLane = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); - setConvergent(ReadFirstLane); - BroadcastI = ReadFirstLane; + + BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); } else { llvm_unreachable("Unhandled atomic bit width"); } @@ -423,7 +527,31 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // get our individual lane's slice into the result. We use the lane offset we // previously calculated combined with the atomic result value we got from the // first lane, to get our lane's index into the atomic result. - Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset); + Value *LaneOffset = nullptr; + if (ValDivergent) { + LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); + } else { + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + LaneOffset = B.CreateMul(V, Mbcnt); + break; + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + LaneOffset = B.CreateSelect(Cond, Identity, V); + break; + case AtomicRMWInst::Xor: + LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1)); + break; + } + } + Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); if (IsPixelShader) { // Need a final PHI to reconverge to above the helper lane branch mask. @@ -442,10 +570,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, I.eraseFromParent(); } -void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const { - CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent); -} - INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE, "AMDGPU atomic optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index daef37f9c21f..b107c357196d 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -1,9 +1,8 @@ //===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -21,28 +20,98 @@ #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/LowLevelTypeImpl.h" using namespace llvm; +namespace { + +struct OutgoingArgHandler : public CallLowering::ValueHandler { + OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB, CCAssignFn *AssignFn) + : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + + MachineInstrBuilder MIB; + + Register getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + llvm_unreachable("not implemented"); + } + + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + llvm_unreachable("not implemented"); + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign &VA) override { + MIB.addUse(PhysReg); + MIRBuilder.buildCopy(PhysReg, ValVReg); + } + + bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + const CallLowering::ArgInfo &Info, + CCState &State) override { + return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + } +}; + +} + AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) : CallLowering(&TLI) { } bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const { - // FIXME: Add support for non-void returns. 
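// The OutgoingArgHandler above plugs into GlobalISel's standard outgoing-value
// machinery, which the new lowerReturn below drives: ComputeValueVTs splits
// the IR return type into legal pieces, each piece becomes an ArgInfo, and
// the handler copies every piece into the physical register the calling
// convention assigns while recording that register as a use of the return
// instruction. For a hypothetical `{ i32, float }` shader return this yields
// two pieces, roughly:
//
//   piece 0: i32 at offset 0 -> buildCopy(assigned reg, part 0); MIB.addUse()
//   piece 1: f32 at offset 4 -> buildCopy(assigned reg, part 1); MIB.addUse()
//
// with the concrete registers chosen by RetCC_SI_Shader; the register choices
// here are illustrative, not fixed.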
- if (Val) + ArrayRef VRegs) const { + + MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo(); + MFI->setIfReturnsVoid(!Val); + + if (!Val) { + MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0); + return true; + } + + Register VReg = VRegs[0]; + + const Function &F = MF.getFunction(); + auto &DL = F.getParent()->getDataLayout(); + if (!AMDGPU::isShader(F.getCallingConv())) return false; - MIRBuilder.buildInstr(AMDGPU::S_ENDPGM); + + const AMDGPUTargetLowering &TLI = *getTLI(); + SmallVector SplitVTs; + SmallVector Offsets; + ArgInfo OrigArg{VReg, Val->getType()}; + setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); + + SmallVector SplitArgs; + CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false); + for (unsigned i = 0, e = Offsets.size(); i != e; ++i) { + Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext()); + SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed}); + } + auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG); + OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + return false; + MIRBuilder.insertInstr(RetInstr); + return true; } -unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, +Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, uint64_t Offset) const { @@ -53,12 +122,12 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); LLT PtrType = getLLTForType(*PtrTy, DL); - unsigned DstReg = MRI.createGenericVirtualRegister(PtrType); - unsigned KernArgSegmentPtr = + Register DstReg = MRI.createGenericVirtualRegister(PtrType); + Register KernArgSegmentPtr = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); - unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); + Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); - unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); MIRBuilder.buildConstant(OffsetReg, Offset); MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); @@ -69,14 +138,14 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, uint64_t Offset, unsigned Align, - unsigned DstReg) const { + Register DstReg) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); - unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); + Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | @@ -87,93 +156,233 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, MIRBuilder.buildLoad(DstReg, PtrReg, *MMO); } -bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, - const Function &F, - ArrayRef VRegs) const { - // AMDGPU_GS and 
AMDGP_HS are not supported yet. - if (F.getCallingConv() == CallingConv::AMDGPU_GS || - F.getCallingConv() == CallingConv::AMDGPU_HS) - return false; +static Register findFirstFreeSGPR(CCState &CCInfo) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { + if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { + return AMDGPU::SGPR0 + Reg; + } + } + llvm_unreachable("Cannot allocate sgpr"); +} - MachineFunction &MF = MIRBuilder.getMF(); - const GCNSubtarget *Subtarget = &MF.getSubtarget(); +static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + const LLT S32 = LLT::scalar(32); MachineRegisterInfo &MRI = MF.getRegInfo(); - SIMachineFunctionInfo *Info = MF.getInfo(); - const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - const DataLayout &DL = F.getParent()->getDataLayout(); - SmallVector ArgLocs; - CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + if (Info.hasWorkItemIDX()) { + Register Reg = AMDGPU::VGPR0; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); + } + + if (Info.hasWorkItemIDY()) { + Register Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + } + + if (Info.hasWorkItemIDZ()) { + Register Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + } +} +// Allocate special inputs passed in user SGPRs. +static void allocateHSAUserSGPRs(CCState &CCInfo, + MachineIRBuilder &MIRBuilder, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
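// The user SGPR allocation in the add*() calls below is ABI order, not
// convenience: the runtime preloads these inputs into a fixed sequence of
// SGPRs before launch, so each call both reserves the next tuple and records
// where that input landed. Assuming every input is enabled, the layout is
// roughly:
//
//   s[0:3]   private segment buffer (4 SGPRs)
//   s[4:5]   dispatch ptr           (2 SGPRs)
//   s[6:7]   queue ptr              (2 SGPRs)
//   s[8:9]   kernarg segment ptr    (2 SGPRs)
//   s[10:11] dispatch id            (2 SGPRs)
//   s[12:13] flat scratch init      (2 SGPRs)
//
// Entries that are absent let everything after them shift down.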
- if (Info->hasPrivateSegmentBuffer()) { - unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); - MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + if (Info.hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } - if (Info->hasDispatchPtr()) { - unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); - // FIXME: Need to add reg as live-in + if (Info.hasDispatchPtr()) { + unsigned DispatchPtrReg = Info.addDispatchPtr(TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } - if (Info->hasQueuePtr()) { - unsigned QueuePtrReg = Info->addQueuePtr(*TRI); - // FIXME: Need to add reg as live-in + if (Info.hasQueuePtr()) { + unsigned QueuePtrReg = Info.addQueuePtr(TRI); + MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } - if (Info->hasKernargSegmentPtr()) { - unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); - const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); - unsigned VReg = MRI.createGenericVirtualRegister(P2); + if (Info.hasKernargSegmentPtr()) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register InputPtrReg = Info.addKernargSegmentPtr(TRI); + const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + Register VReg = MRI.createGenericVirtualRegister(P4); MRI.addLiveIn(InputPtrReg, VReg); MIRBuilder.getMBB().addLiveIn(InputPtrReg); MIRBuilder.buildCopy(VReg, InputPtrReg); CCInfo.AllocateReg(InputPtrReg); } - if (Info->hasDispatchID()) { - unsigned DispatchIDReg = Info->addDispatchID(*TRI); - // FIXME: Need to add reg as live-in + if (Info.hasDispatchID()) { + unsigned DispatchIDReg = Info.addDispatchID(TRI); + MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info->hasFlatScratchInit()) { - unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); - // FIXME: Need to add reg as live-in + if (Info.hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. +} + +static void allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, + bool IsShader) { + const LLT S32 = LLT::scalar(32); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + if (Info.hasWorkGroupIDX()) { + Register Reg = Info.addWorkGroupIDX(); + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); + CCInfo.AllocateReg(Reg); + } + + if (Info.hasWorkGroupIDY()) { + Register Reg = Info.addWorkGroupIDY(); + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); + CCInfo.AllocateReg(Reg); + } + + if (Info.hasWorkGroupIDZ()) { + unsigned Reg = Info.addWorkGroupIDZ(); + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); + CCInfo.AllocateReg(Reg); + } + + if (Info.hasWorkGroupInfo()) { + unsigned Reg = Info.addWorkGroupInfo(); + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); + CCInfo.AllocateReg(Reg); + } + + if (Info.hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. 
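// Unlike the user SGPRs, which the runtime fills once before launch, the
// system SGPRs handled here (workgroup IDs, workgroup info, and the private
// segment wave byte offset just below) are written by hardware for each
// wavefront, which is why they are allocated only after every user SGPR.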
+ unsigned PrivateSegmentWaveByteOffsetReg; + + if (IsShader) { + PrivateSegmentWaveByteOffsetReg = + Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); + + // This is true if the scratch wave byte offset doesn't have a fixed + // location. + if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + } + } else + PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } +} + +bool AMDGPUCallLowering::lowerFormalArgumentsKernel( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef> VRegs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const GCNSubtarget *Subtarget = &MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo(); + const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const DataLayout &DL = F.getParent()->getDataLayout(); + + SmallVector ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + + allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info); + + unsigned i = 0; + const unsigned KernArgBaseAlign = 16; + const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F); + uint64_t ExplicitArgOffset = 0; + + // TODO: Align down to dword alignment and extract bits for extending loads. + for (auto &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + if (AllocSize == 0) + continue; + + unsigned ABIAlign = DL.getABITypeAlignment(ArgTy); + + uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize; + + ArrayRef OrigArgRegs = VRegs[i]; + Register ArgReg = + OrigArgRegs.size() == 1 + ? OrigArgRegs[0] + : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL)); + unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); + ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); + lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg); + if (OrigArgRegs.size() > 1) + unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder); + ++i; + } + + allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); + allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); + return true; +} + +bool AMDGPUCallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef> VRegs) const { // The infrastructure for normal calling convention lowering is essentially // useless for kernels. We want to avoid any kind of legalization or argument // splitting. - if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) { - unsigned i = 0; - const unsigned KernArgBaseAlign = 16; - const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F); - uint64_t ExplicitArgOffset = 0; - - // TODO: Align down to dword alignment and extract bits for extending loads. - for (auto &Arg : F.args()) { - Type *ArgTy = Arg.getType(); - unsigned AllocSize = DL.getTypeAllocSize(ArgTy); - if (AllocSize == 0) - continue; + if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) + return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs); - unsigned ABIAlign = DL.getABITypeAlignment(ArgTy); + // AMDGPU_GS and AMDGP_HS are not supported yet. 
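// The kernarg walk above packs arguments the same way the SelectionDAG path
// does: each argument lands at the next multiple of its ABI alignment inside
// the explicit kernarg segment, and the load alignment is whatever that
// offset guarantees relative to the 16-byte kernarg base. A standalone check
// of the offset math for a hypothetical (i32, double, i16) signature with
// BaseOffset = 0:
#include <cassert>
#include <cstdint>
static uint64_t alignUp(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }
int main() {
  uint64_t ExplicitArgOffset = 0;
  struct { uint64_t Size, Align; } Args[] = {{4, 4}, {8, 8}, {2, 2}};
  uint64_t Offsets[3];
  for (int I = 0; I != 3; ++I) {
    Offsets[I] = alignUp(ExplicitArgOffset, Args[I].Align);
    ExplicitArgOffset = Offsets[I] + Args[I].Size;
  }
  assert(Offsets[0] == 0 && Offsets[1] == 8 && Offsets[2] == 16);
  return 0;
}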
+ if (F.getCallingConv() == CallingConv::AMDGPU_GS || + F.getCallingConv() == CallingConv::AMDGPU_HS) + return false; + + MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo(); + const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const DataLayout &DL = F.getParent()->getDataLayout(); - uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset; - ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize; + bool IsShader = AMDGPU::isShader(F.getCallingConv()); - unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); - ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); - lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]); - ++i; - } + SmallVector ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); - return true; + if (Info->hasImplicitBufferPtr()) { + unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); + MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(ImplicitBufferPtrReg); } unsigned NumArgs = F.arg_size(); @@ -186,7 +395,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // We can only hanlde simple value types at the moment. ISD::ArgFlagsTy Flags; - ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()}; + assert(VRegs[i].size() == 1 && "Can't lower into more than one register"); + ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()}; setArgFlags(OrigArg, i + 1, DL, F); Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); @@ -239,11 +449,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) { if (Skipped.test(OrigArgIdx)) continue; - CCValAssign &VA = ArgLocs[i++]; - MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx]); - MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); - MIRBuilder.buildCopy(VRegs[OrigArgIdx], VA.getLocReg()); + assert(VRegs[OrigArgIdx].size() == 1 && + "Can't lower into more than 1 reg"); + CCValAssign &VA = ArgLocs[i++]; + MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]); + MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); + MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg()); } + + allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h index ed859716218e..3599659cac6a 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -1,9 +1,8 @@ //===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -23,20 +22,25 @@ namespace llvm { class AMDGPUTargetLowering; class AMDGPUCallLowering: public CallLowering { - unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, + Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, uint64_t Offset) const; void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, uint64_t Offset, unsigned Align, - unsigned DstReg) const; + Register DstReg) const; public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const override; + ArrayRef VRegs) const override; + + bool lowerFormalArgumentsKernel(MachineIRBuilder &MIRBuilder, + const Function &F, + ArrayRef> VRegs) const; + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef VRegs) const override; + ArrayRef> VRegs) const override; static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); }; diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index 367f120b5fa6..3688cd77542e 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -1,9 +1,8 @@ //===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -24,7 +23,16 @@ def CC_SI : CallingConv<[ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, - SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, + SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, + SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, + SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, + SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, + SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, + SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, + SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, + SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, + SGPR104, SGPR105 ]>>>, // We have no way of referring to the generated register tuples @@ -60,7 +68,16 @@ def RetCC_SI_Shader : CallingConv<[ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, - SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, + SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, + SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, + SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, + SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, + SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, + SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, + SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, + SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, + SGPR104, SGPR105 ]>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. @@ -93,12 +110,22 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs< (sequence "VGPR%u", 32, 255) >; -def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs< - (sequence "SGPR%u", 32, 103) +def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs< + (sequence "SGPR%u", 32, 105) +>; + +// Just to get the regmask, not for calling convention purposes. +def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs< + (sequence "VGPR%u", 0, 255) +>; + +// Just to get the regmask, not for calling convention purposes. 
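// On the "just to get the regmask" defs around this comment: TableGen emits a
// register mask (CSR_*_RegMask) for every CalleeSavedRegs list, and masks
// covering all VGPRs or all allocatable SGPRs are useful for modeling
// call-site clobbers even though they never describe a real save/restore
// convention.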
+def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
+  (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
 >;
 
 def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
-  (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+  (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105)
 >;
 
 // Calling convention for leaf functions
@@ -111,10 +138,12 @@ def CC_AMDGPU_Func : CallingConv<[
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
     VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
-  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
+  CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
   CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
   CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+  CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
   CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+  CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
   CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
   CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
 ]>;
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4dc1e67c573d..b750c6b5f6d2 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -62,6 +61,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   AssumptionCache *AC = nullptr;
   LegacyDivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
 
   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
@@ -134,6 +134,16 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   /// \returns True.
   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
 
+  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
+  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
+  bool isI24(Value *V, unsigned ScalarSize) const;
+  bool isU24(Value *V, unsigned ScalarSize) const;
+
+  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
+  /// SelectionDAG has an issue where an and asserting the bits are known
+  bool replaceMulWithMul24(BinaryOperator &I) const;
+
   /// Expands 24 bit div or rem.
   Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                         Value *Num, Value *Den,
@@ -393,6 +403,118 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
   return true;
 }
 
+unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
+                                               unsigned ScalarSize) const {
+  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
+  return ScalarSize - Known.countMinLeadingZeros();
+}
+
+unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
+                                             unsigned ScalarSize) const {
+  // In order for this to be a signed 24-bit value, bit 23, must
+  // be a sign bit.
+  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
+}
+
+bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
+  return ScalarSize >= 24 && // Types less than 24-bit should be treated
+                             // as unsigned 24-bit values.
+    numBitsSigned(V, ScalarSize) < 24;
+}
+
+bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
+  return numBitsUnsigned(V, ScalarSize) <= 24;
+}
+
+static void extractValues(IRBuilder<> &Builder,
+                          SmallVectorImpl<Value *> &Values, Value *V) {
+  VectorType *VT = dyn_cast<VectorType>(V->getType());
+  if (!VT) {
+    Values.push_back(V);
+    return;
+  }
+
+  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
+    Values.push_back(Builder.CreateExtractElement(V, I));
+}
+
+static Value *insertValues(IRBuilder<> &Builder,
+                           Type *Ty,
+                           SmallVectorImpl<Value *> &Values) {
+  if (Values.size() == 1)
+    return Values[0];
+
+  Value *NewVal = UndefValue::get(Ty);
+  for (int I = 0, E = Values.size(); I != E; ++I)
+    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
+
+  return NewVal;
+}
+
+bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
+  if (I.getOpcode() != Instruction::Mul)
+    return false;
+
+  Type *Ty = I.getType();
+  unsigned Size = Ty->getScalarSizeInBits();
+  if (Size <= 16 && ST->has16BitInsts())
+    return false;
+
+  // Prefer scalar if this could be s_mul_i32
+  if (DA->isUniform(&I))
+    return false;
+
+  Value *LHS = I.getOperand(0);
+  Value *RHS = I.getOperand(1);
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+
+  // TODO: Should this try to match mulhi24?
+  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
+    IntrID = Intrinsic::amdgcn_mul_u24;
+  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
+    IntrID = Intrinsic::amdgcn_mul_i24;
+  } else
+    return false;
+
+  SmallVector<Value *, 4> LHSVals;
+  SmallVector<Value *, 4> RHSVals;
+  SmallVector<Value *, 4> ResultVals;
+  extractValues(Builder, LHSVals, LHS);
+  extractValues(Builder, RHSVals, RHS);
+
+
+  IntegerType *I32Ty = Builder.getInt32Ty();
+  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
+  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
+    Value *LHS, *RHS;
+    if (IntrID == Intrinsic::amdgcn_mul_u24) {
+      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
+    } else {
+      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
+      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
+    }
+
+    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
+
+    if (IntrID == Intrinsic::amdgcn_mul_u24) {
+      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
+                                                     LHSVals[I]->getType()));
+    } else {
+      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
+                                                     LHSVals[I]->getType()));
+    }
+  }
+
+  I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -757,6 +879,9 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
       DA->isUniform(&I) && promoteUniformOpToI32(I))
     return true;
 
+  if (replaceMulWithMul24(I))
+    return true;
+
   bool Changed = false;
   Instruction::BinaryOps Opc = I.getOpcode();
   Type *Ty = I.getType();
@@ -807,7 +932,7 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
     Type *I32Ty = Builder.getInt32Ty();
     Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
     Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
-    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
+    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
     WidenLoad->copyMetadata(I);
 
     // If we have range metadata, we need to convert the type, and not make
@@ -883,6 +1008,7 @@ bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
 
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
+  DL = &Mod->getDataLayout();
   return false;
 }
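The mul24 rewrite added above hinges on proving, in the IR, that both multiply operands already fit in 24 bits. A minimal sketch of that test using the public ValueTracking helpers; the wrapper names fitsU24/fitsI24 are illustrative, not part of the patch:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// Unsigned case: at most 24 significant bits, i.e. the scalar width minus
// the provably-zero leading bits is <= 24.
static bool fitsU24(const Value *Op, unsigned ScalarSize,
                    const DataLayout &DL) {
  KnownBits Known = computeKnownBits(Op, DL);
  return ScalarSize - Known.countMinLeadingZeros() <= 24;
}

// Signed case: bit 23 must act as a sign bit, so after discounting the
// redundant copies of the sign bit fewer than 24 bits remain. Types
// narrower than 24 bits are handled as unsigned instead.
static bool fitsI24(const Value *Op, unsigned ScalarSize,
                    const DataLayout &DL) {
  return ScalarSize >= 24 && ScalarSize - ComputeNumSignBits(Op, DL) < 24;
}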
diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td
index 3c7d8a8fc550..ea3952c316e4 100644
--- a/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -1,9 +1,8 @@
 //===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -50,17 +49,12 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
 def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
 def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
 
-class SubtargetFeatureGeneration <string Value, string Subtarget,
-                                  list<SubtargetFeature> Implies> :
-        SubtargetFeature <Value, "Gen", Subtarget#"::"#Value,
-                          Value#" GPU generation", Implies>;
-
-def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
-  "DX10Clamp",
-  "true",
-  "clamp modifier clamps NaNs to 0.0"
->;
-
 def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
   "EnablePromoteAlloca",
   "true",
diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
index 6e2a981d3396..9ba04d113c70 100644
--- a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index e32ca9653b3a..e80797736363 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -1,9 +1,8 @@
 //===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //==-----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index ee836bf8a631..48b64488303e 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -1,9 +1,8 @@
 //===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td
index 59bb2a16e0f3..cad4c2ef404c 100644
--- a/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -1,9 +1,8 @@
 //===-- AMDGPUGIsel.td - AMDGPU GlobalISel Patterns---------*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // This files contains patterns that should only be used by GlobalISel. For
@@ -13,6 +12,10 @@
 
 include "AMDGPU.td"
 
+def p0 : PtrValueType<i64, 0>;
+def p1 : PtrValueType<i64, 1>;
+def p4 : PtrValueType<i64, 4>;
+
 def sd_vsrc0 : ComplexPattern<i32, 1, "SelectVSRC0">;
 def gi_vsrc0 :
     GIComplexOperandMatcher<s32, "selectVSRC0">,
     GIComplexPatternEquiv<sd_vsrc0>;
@@ -35,6 +38,33 @@ def gi_vop3omods :
     GIComplexOperandMatcher<s32, "selectVOP3OMods">,
     GIComplexPatternEquiv<VOP3OMods>;
 
+def gi_smrd_imm :
+    GIComplexOperandMatcher<s64, "selectSmrdImm">,
+    GIComplexPatternEquiv<SMRDImm>;
+
+def gi_smrd_imm32 :
+    GIComplexOperandMatcher<s64, "selectSmrdImm32">,
+    GIComplexPatternEquiv<SMRDImm32>;
+
+def gi_smrd_sgpr :
+    GIComplexOperandMatcher<s64, "selectSmrdSgpr">,
+    GIComplexPatternEquiv<SMRDSgpr>;
+
+def gi_flat_offset :
+    GIComplexOperandMatcher<s64, "selectFlatOffset">,
+    GIComplexPatternEquiv<FLATOffset>;
+def gi_flat_offset_signed :
+    GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
+    GIComplexPatternEquiv<FLATOffsetSigned>;
+
+def gi_mubuf_scratch_offset :
+    GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
+    GIComplexPatternEquiv<MUBUFScratchOffset>;
+def gi_mubuf_scratch_offen :
+    GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">,
+    GIComplexPatternEquiv<MUBUFScratchOffen>;
+
+
 class GISelSop2Pat <
   SDPatternOperator node,
   Instruction inst,
@@ -113,15 +143,6 @@ multiclass GISelVop2IntrPat <
 def : GISelSop2Pat <or, S_OR_B32, i32>;
 def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
 
-def : GISelSop2Pat <sra, S_ASHR_I32, i32>;
-let AddedComplexity = 100 in {
-let SubtargetPredicate = isSICI in {
-def : GISelVop2Pat <sra, V_ASHR_I32_e32, i32>;
-}
-def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>;
-}
-def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>;
-
 // FIXME: We can't re-use SelectionDAG patterns here because they match
 // against a custom SDNode and we would need to create a generic machine
 // instruction that is equivalent to the custom SDNode. This would also require
@@ -135,3 +156,11 @@ defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
 def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F16_e64, f16>;
 defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>;
 def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F16_e64, f16>;
+
+// Since GlobalISel is more flexible then SelectionDAG, I think we can get
+// away with adding patterns for integer types and not legalizing all
+// loads and stores to vector types.  This should help simplify the load/store
+// legalization.
+foreach Ty = [i64, p0, p1, p4] in {
+  defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
+}
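The new p0/p1/p4 defs let the same SMRD load patterns serve pointer-typed loads. In GlobalISel terms these are 64-bit pointers in address spaces 0, 1 and 4; a hedged sketch of the corresponding LLT values (illustrative only, not from the patch):

#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

// 64-bit scalars and 64-bit pointers occupy the same number of bits,
// which is what allows one S_LOAD_DWORDX2 pattern to serve both.
static void pointerTypesSketch() {
  LLT S64 = LLT::scalar(64);
  LLT P0 = LLT::pointer(0, 64); // flat
  LLT P1 = LLT::pointer(1, 64); // global
  LLT P4 = LLT::pointer(4, 64); // constant
  (void)S64; (void)P0; (void)P1; (void)P4;
}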
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 6eab59ab4e09..0a1f48231b18 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -1,9 +1,8 @@
 //===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
@@ -92,6 +91,28 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
   {&PartMappings[17], 1}
 };
 
+const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
+  /*32-bit op*/   {0, 32, SGPRRegBank},
+  /*2x32-bit op*/ {0, 32, SGPRRegBank},
+                  {32, 32, SGPRRegBank},
+/*<2x32-bit> op*/ {0, 64, SGPRRegBank},
+
+  /*32-bit op*/   {0, 32, VGPRRegBank},
+  /*2x32-bit op*/ {0, 32, VGPRRegBank},
+                  {32, 32, VGPRRegBank},
+};
+
+
+// For some instructions which can operate 64-bit only for the scalar version.
+const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
+  /*32-bit sgpr*/     {&SGPROnly64BreakDown[0], 1},
+  /*2 x 32-bit sgpr*/ {&SGPROnly64BreakDown[1], 2},
+  /*64-bit sgpr */    {&SGPROnly64BreakDown[3], 1},
+
+  /*32-bit vgpr*/     {&SGPROnly64BreakDown[4], 1},
+  /*2 x 32-bit vgpr*/ {&SGPROnly64BreakDown[5], 2}
+};
+
 enum ValueMappingIdx {
   SCCStartIdx = 0,
   SGPRStartIdx = 2,
@@ -128,5 +149,89 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
   return &ValMappings[Idx];
 }
 
+const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID,
+                                                                unsigned Size) {
+  if (Size != 64)
+    return getValueMapping(BankID, Size);
+
+  if (BankID == AMDGPU::VGPRRegBankID)
+    return &ValMappingsSGPR64OnlyVGPR32[4];
+
+  assert(BankID == AMDGPU::SGPRRegBankID);
+  return &ValMappingsSGPR64OnlyVGPR32[2];
+}
+
+const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
+  /* 256-bit load */   {0, 256, SGPRRegBank},
+  /* 512-bit load */   {0, 512, SGPRRegBank},
+  /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+                       {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+                       {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+                       {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+  /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+                        {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+                        {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+                        {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+                        {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank},
+                        {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank},
+                        {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank},
+                        {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank},
+  /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+                       {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+  /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+                       {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+                       {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank},
+                       {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank},
+
+  /* FIXME: The generic register bank select does not support complex
+   * break downs where the number of vector elements does not equal the
+   * number of breakdowns.
+   * FIXME: register bank select now tries to handle complex break downs,
+   * but it emits an illegal instruction:
+   * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128)
+   */
+  /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+  /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+                        {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank}
+};
+
+const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] {
+  /* 256-bit load */    {&LoadSGPROnlyBreakDown[0], 1},
+  /* 512-bit load */    {&LoadSGPROnlyBreakDown[1], 1},
+  /* <8 x i32> load */  {&LoadSGPROnlyBreakDown[2], 8},
+  /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16},
+  /* <4 x i64> load */  {&LoadSGPROnlyBreakDown[26], 4},
+  /* <8 x i64> load */  {&LoadSGPROnlyBreakDown[30], 8}
+};
+
+const RegisterBankInfo::ValueMapping *
+getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) {
+  unsigned Size = SizeTy.getSizeInBits();
+  if (Size < 256 || BankID == AMDGPU::SGPRRegBankID)
+    return getValueMapping(BankID, Size);
+
+  assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID);
+
+  // Default to using the non-split ValueMappings, we will use these if
+  // the register bank is SGPR or if we don't know how to handle the vector
+  // type.
+  unsigned Idx = Size == 256 ? 0 : 1;
+
+  // We need to split this load if it has a vgpr pointer.
+  if (BankID == AMDGPU::VGPRRegBankID) {
+    if (SizeTy == LLT::vector(8, 32))
+      Idx = 2;
+    else if (SizeTy == LLT::vector(16, 32))
+      Idx = 3;
+    else if (SizeTy == LLT::vector(4, 64))
+      Idx = 4;
+    else if (SizeTy == LLT::vector(8, 64))
+      Idx = 5;
+  }
+
+  return &ValMappingsLoadSGPROnly[Idx];
+}
+
+
 } // End AMDGPU namespace.
 } // End llvm namespace.
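getValueMappingLoadSGPROnly above is a pure table lookup keyed on the vector shape. A short sketch of how a caller would exercise it, assuming the AMDGPU helpers declared above are visible:

// A 256-bit <8 x s32> load on the VGPR bank selects the 8 x 32-bit
// breakdown (index 2), so the mapping reports eight partial registers.
const RegisterBankInfo::ValueMapping *VM =
    AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID,
                                        LLT::vector(8, 32));
assert(VM->NumBreakDowns == 8);

// Anything under 256 bits, or anything on the SGPR bank, falls back to
// the ordinary per-size mapping and stays unsplit.
const RegisterBankInfo::ValueMapping *SM =
    AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::SGPRRegBankID,
                                        LLT::vector(8, 32));
assert(SM->NumBreakDowns == 1);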
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index c38b0e61558b..b31de0af5018 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -1,9 +1,8 @@
 //===--- AMDGPUHSAMetadataStreamer.cpp --------------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -240,23 +239,7 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
 Kernel::DebugProps::Metadata
 MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF,
                                      const SIProgramInfo &ProgramInfo) const {
-  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
-  HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
-
-  if (!STM.debuggerSupported())
-    return HSADebugProps;
-
-  HSADebugProps.mDebuggerABIVersion.push_back(1);
-  HSADebugProps.mDebuggerABIVersion.push_back(0);
-
-  if (STM.debuggerEmitPrologue()) {
-    HSADebugProps.mPrivateSegmentBufferSGPR =
-        ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
-    HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
-        ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
-  }
-
-  return HSADebugProps;
+  return HSAMD::Kernel::DebugProps::Metadata();
 }
 
 void MetadataStreamerV2::emitVersion() {
@@ -452,6 +435,10 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
       emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
     }
   }
+
+  // Emit the pointer argument for multi-grid object.
+  if (HiddenArgNumBytes >= 56)
+    emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenMultiGridSyncArg);
 }
 
 bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
@@ -506,20 +493,16 @@ void MetadataStreamerV3::dump(StringRef HSAMetadataString) const {
 void MetadataStreamerV3::verify(StringRef HSAMetadataString) const {
   errs() << "AMDGPU HSA Metadata Parser Test: ";
 
-  std::shared_ptr<msgpack::Node> FromHSAMetadataString =
-      std::make_shared<msgpack::MapNode>();
+  msgpack::Document FromHSAMetadataString;
 
-  yaml::Input YIn(HSAMetadataString);
-  YIn >> FromHSAMetadataString;
-  if (YIn.error()) {
+  if (!FromHSAMetadataString.fromYAML(HSAMetadataString)) {
     errs() << "FAIL\n";
     return;
  }
 
   std::string ToHSAMetadataString;
   raw_string_ostream StrOS(ToHSAMetadataString);
-  yaml::Output YOut(StrOS);
-  YOut << FromHSAMetadataString;
+  FromHSAMetadataString.toYAML(StrOS);
 
   errs() << (HSAMetadataString == StrOS.str() ? "PASS" : "FAIL") << '\n';
"PASS" : "FAIL") << '\n'; if (HSAMetadataString != ToHSAMetadataString) { @@ -653,23 +636,23 @@ std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const { } } -std::shared_ptr +msgpack::ArrayDocNode MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const { - auto Dims = std::make_shared(); + auto Dims = HSAMetadataDoc->getArrayNode(); if (Node->getNumOperands() != 3) return Dims; for (auto &Op : Node->operands()) - Dims->push_back(std::make_shared( - mdconst::extract(Op)->getZExtValue())); + Dims.push_back(Dims.getDocument()->getNode( + uint64_t(mdconst::extract(Op)->getZExtValue()))); return Dims; } void MetadataStreamerV3::emitVersion() { - auto Version = std::make_shared(); - Version->push_back(std::make_shared(V3::VersionMajor)); - Version->push_back(std::make_shared(V3::VersionMinor)); - getRootMetadata("amdhsa.version") = std::move(Version); + auto Version = HSAMetadataDoc->getArrayNode(); + Version.push_back(Version.getDocument()->getNode(VersionMajor)); + Version.push_back(Version.getDocument()->getNode(VersionMinor)); + getRootMetadata("amdhsa.version") = Version; } void MetadataStreamerV3::emitPrintf(const Module &Mod) { @@ -677,16 +660,16 @@ void MetadataStreamerV3::emitPrintf(const Module &Mod) { if (!Node) return; - auto Printf = std::make_shared(); + auto Printf = HSAMetadataDoc->getArrayNode(); for (auto Op : Node->operands()) if (Op->getNumOperands()) - Printf->push_back(std::make_shared( - cast(Op->getOperand(0))->getString())); - getRootMetadata("amdhsa.printf") = std::move(Printf); + Printf.push_back(Printf.getDocument()->getNode( + cast(Op->getOperand(0))->getString(), /*Copy=*/true)); + getRootMetadata("amdhsa.printf") = Printf; } void MetadataStreamerV3::emitKernelLanguage(const Function &Func, - msgpack::MapNode &Kern) { + msgpack::MapDocNode Kern) { // TODO: What about other languages? 
   auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
   if (!Node || !Node->getNumOperands())
     return;
@@ -695,77 +678,50 @@ void MetadataStreamerV3::emitKernelLanguage(const Function &Func,
   if (Op0->getNumOperands() <= 1)
     return;
 
-  Kern[".language"] = std::make_shared<msgpack::ScalarNode>("OpenCL C");
-  auto LanguageVersion = std::make_shared<msgpack::ArrayNode>();
-  LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+  Kern[".language"] = Kern.getDocument()->getNode("OpenCL C");
+  auto LanguageVersion = Kern.getDocument()->getArrayNode();
+  LanguageVersion.push_back(Kern.getDocument()->getNode(
       mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()));
-  LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+  LanguageVersion.push_back(Kern.getDocument()->getNode(
      mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()));
-  Kern[".language_version"] = std::move(LanguageVersion);
+  Kern[".language_version"] = LanguageVersion;
 }
 
 void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
-                                         msgpack::MapNode &Kern) {
+                                         msgpack::MapDocNode Kern) {
   if (auto Node = Func.getMetadata("reqd_work_group_size"))
     Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node);
   if (auto Node = Func.getMetadata("work_group_size_hint"))
     Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node);
   if (auto Node = Func.getMetadata("vec_type_hint")) {
-    Kern[".vec_type_hint"] = std::make_shared<msgpack::ScalarNode>(getTypeName(
-        cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
-        mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()));
+    Kern[".vec_type_hint"] = Kern.getDocument()->getNode(
+        getTypeName(
+            cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
+            mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()),
+        /*Copy=*/true);
   }
   if (Func.hasFnAttribute("runtime-handle")) {
-    Kern[".device_enqueue_symbol"] = std::make_shared<msgpack::ScalarNode>(
-        Func.getFnAttribute("runtime-handle").getValueAsString().str());
+    Kern[".device_enqueue_symbol"] = Kern.getDocument()->getNode(
+        Func.getFnAttribute("runtime-handle").getValueAsString().str(),
+        /*Copy=*/true);
   }
 }
 
 void MetadataStreamerV3::emitKernelArgs(const Function &Func,
-                                        msgpack::MapNode &Kern) {
+                                        msgpack::MapDocNode Kern) {
   unsigned Offset = 0;
-  auto Args = std::make_shared<msgpack::ArrayNode>();
+  auto Args = HSAMetadataDoc->getArrayNode();
   for (auto &Arg : Func.args())
-    emitKernelArg(Arg, Offset, *Args);
-
-  emitHiddenKernelArgs(Func, Offset, *Args);
-
-  // TODO: What about other languages?
-  if (Func.getParent()->getNamedMetadata("opencl.ocl.version")) {
-    auto &DL = Func.getParent()->getDataLayout();
-    auto Int64Ty = Type::getInt64Ty(Func.getContext());
-
-    emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, *Args);
-    emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, *Args);
-    emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, *Args);
-
-    auto Int8PtrTy =
-        Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+    emitKernelArg(Arg, Offset, Args);
 
-    // Emit "printf buffer" argument if printf is used, otherwise emit dummy
-    // "none" argument.
-    if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
-      emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, *Args);
-    else
-      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
+  emitHiddenKernelArgs(Func, Offset, Args);
 
-    // Emit "default queue" and "completion action" arguments if enqueue kernel
-    // is used, otherwise emit dummy "none" arguments.
-    if (Func.hasFnAttribute("calls-enqueue-kernel")) {
-      emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, *Args);
-      emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, *Args);
-    } else {
-      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
-      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
-    }
-  }
-
-  Kern[".args"] = std::move(Args);
+  Kern[".args"] = Args;
 }
 
 void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
-                                       msgpack::ArrayNode &Args) {
+                                       msgpack::ArrayDocNode Args) {
   auto Func = Arg.getParent();
   auto ArgNo = Arg.getArgNo();
   const MDNode *Node;
@@ -822,36 +778,35 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
 
 void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
                                        StringRef ValueKind, unsigned &Offset,
-                                       msgpack::ArrayNode &Args,
+                                       msgpack::ArrayDocNode Args,
                                        unsigned PointeeAlign, StringRef Name,
                                        StringRef TypeName,
                                        StringRef BaseTypeName,
                                        StringRef AccQual, StringRef TypeQual) {
-  auto ArgPtr = std::make_shared<msgpack::MapNode>();
-  auto &Arg = *ArgPtr;
+  auto Arg = Args.getDocument()->getMapNode();
   if (!Name.empty())
-    Arg[".name"] = std::make_shared<msgpack::ScalarNode>(Name);
+    Arg[".name"] = Arg.getDocument()->getNode(Name, /*Copy=*/true);
   if (!TypeName.empty())
-    Arg[".type_name"] = std::make_shared<msgpack::ScalarNode>(TypeName);
+    Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true);
   auto Size = DL.getTypeAllocSize(Ty);
   auto Align = DL.getABITypeAlignment(Ty);
-  Arg[".size"] = std::make_shared<msgpack::ScalarNode>(Size);
+  Arg[".size"] = Arg.getDocument()->getNode(Size);
   Offset = alignTo(Offset, Align);
-  Arg[".offset"] = std::make_shared<msgpack::ScalarNode>(Offset);
+  Arg[".offset"] = Arg.getDocument()->getNode(Offset);
   Offset += Size;
-  Arg[".value_kind"] = std::make_shared<msgpack::ScalarNode>(ValueKind);
+  Arg[".value_kind"] = Arg.getDocument()->getNode(ValueKind, /*Copy=*/true);
   Arg[".value_type"] =
-      std::make_shared<msgpack::ScalarNode>(getValueType(Ty, BaseTypeName));
+      Arg.getDocument()->getNode(getValueType(Ty, BaseTypeName), /*Copy=*/true);
   if (PointeeAlign)
-    Arg[".pointee_align"] = std::make_shared<msgpack::ScalarNode>(PointeeAlign);
+    Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign);
 
   if (auto PtrTy = dyn_cast<PointerType>(Ty))
    if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace()))
-      Arg[".address_space"] = std::make_shared<msgpack::ScalarNode>(*Qualifier);
+      Arg[".address_space"] = Arg.getDocument()->getNode(*Qualifier, /*Copy=*/true);
 
   if (auto AQ = getAccessQualifier(AccQual))
-    Arg[".access"] = std::make_shared<msgpack::ScalarNode>(*AQ);
+    Arg[".access"] = Arg.getDocument()->getNode(*AQ, /*Copy=*/true);
 
   // TODO: Emit Arg[".actual_access"].
@@ -859,21 +814,21 @@ void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
   TypeQual.split(SplitTypeQuals, " ", -1, false);
   for (StringRef Key : SplitTypeQuals) {
     if (Key == "const")
-      Arg[".is_const"] = std::make_shared<msgpack::ScalarNode>(true);
+      Arg[".is_const"] = Arg.getDocument()->getNode(true);
     else if (Key == "restrict")
-      Arg[".is_restrict"] = std::make_shared<msgpack::ScalarNode>(true);
+      Arg[".is_restrict"] = Arg.getDocument()->getNode(true);
     else if (Key == "volatile")
-      Arg[".is_volatile"] = std::make_shared<msgpack::ScalarNode>(true);
+      Arg[".is_volatile"] = Arg.getDocument()->getNode(true);
     else if (Key == "pipe")
-      Arg[".is_pipe"] = std::make_shared<msgpack::ScalarNode>(true);
+      Arg[".is_pipe"] = Arg.getDocument()->getNode(true);
   }
 
-  Args.push_back(std::move(ArgPtr));
+  Args.push_back(Arg);
 }
 
 void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
                                               unsigned &Offset,
-                                              msgpack::ArrayNode &Args) {
+                                              msgpack::ArrayDocNode Args) {
   int HiddenArgNumBytes =
       getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
 
@@ -913,56 +868,58 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
       emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
     }
   }
+
+  // Emit the pointer argument for multi-grid object.
+  if (HiddenArgNumBytes >= 56)
+    emitKernelArg(DL, Int8PtrTy, "hidden_multigrid_sync_arg", Offset, Args);
 }
 
-std::shared_ptr<msgpack::MapNode>
+msgpack::MapDocNode
 MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
                                       const SIProgramInfo &ProgramInfo) const {
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
   const Function &F = MF.getFunction();
 
-  auto HSAKernelProps = std::make_shared<msgpack::MapNode>();
-  auto &Kern = *HSAKernelProps;
+  auto Kern = HSAMetadataDoc->getMapNode();
 
   unsigned MaxKernArgAlign;
-  Kern[".kernarg_segment_size"] = std::make_shared<msgpack::ScalarNode>(
+  Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode(
      STM.getKernArgSegmentSize(F, MaxKernArgAlign));
   Kern[".group_segment_fixed_size"] =
-      std::make_shared<msgpack::ScalarNode>(ProgramInfo.LDSSize);
+      Kern.getDocument()->getNode(ProgramInfo.LDSSize);
   Kern[".private_segment_fixed_size"] =
-      std::make_shared<msgpack::ScalarNode>(ProgramInfo.ScratchSize);
+      Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
   Kern[".kernarg_segment_align"] =
-      std::make_shared<msgpack::ScalarNode>(std::max(uint32_t(4), MaxKernArgAlign));
+      Kern.getDocument()->getNode(std::max(uint32_t(4), MaxKernArgAlign));
   Kern[".wavefront_size"] =
-      std::make_shared<msgpack::ScalarNode>(STM.getWavefrontSize());
-  Kern[".sgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumSGPR);
-  Kern[".vgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumVGPR);
+      Kern.getDocument()->getNode(STM.getWavefrontSize());
+  Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
+  Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
   Kern[".max_flat_workgroup_size"] =
-      std::make_shared<msgpack::ScalarNode>(MFI.getMaxFlatWorkGroupSize());
+      Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
   Kern[".sgpr_spill_count"] =
-      std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledSGPRs());
+      Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
   Kern[".vgpr_spill_count"] =
-      std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledVGPRs());
+      Kern.getDocument()->getNode(MFI.getNumSpilledVGPRs());
 
-  return HSAKernelProps;
+  return Kern;
 }
 
 bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
-  return TargetStreamer.EmitHSAMetadata(getHSAMetadataRoot(), true);
+  return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
 }
 
 void MetadataStreamerV3::begin(const Module &Mod) {
   emitVersion();
   emitPrintf(Mod);
-  getRootMetadata("amdhsa.kernels").reset(new msgpack::ArrayNode());
+  getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
 }
 
 void MetadataStreamerV3::end() {
   std::string HSAMetadataString;
   raw_string_ostream StrOS(HSAMetadataString);
-  yaml::Output YOut(StrOS);
-  YOut << HSAMetadataRoot;
+  HSAMetadataDoc->toYAML(StrOS);
 
   if (DumpHSAMetadata)
     dump(StrOS.str());
@@ -973,25 +930,24 @@ void MetadataStreamerV3::end() {
 void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
                                     const SIProgramInfo &ProgramInfo) {
   auto &Func = MF.getFunction();
-  auto KernelProps = getHSAKernelProps(MF, ProgramInfo);
+  auto Kern = getHSAKernelProps(MF, ProgramInfo);
 
   assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
          Func.getCallingConv() == CallingConv::SPIR_KERNEL);
 
-  auto &KernelsNode = getRootMetadata("amdhsa.kernels");
-  auto Kernels = cast<msgpack::ArrayNode>(KernelsNode.get());
+  auto Kernels =
      getRootMetadata("amdhsa.kernels").getArray(/*Convert=*/true);
 
   {
-    auto &Kern = *KernelProps;
-    Kern[".name"] = std::make_shared<msgpack::ScalarNode>(Func.getName());
-    Kern[".symbol"] = std::make_shared<msgpack::ScalarNode>(
-        (Twine(Func.getName()) + Twine(".kd")).str());
+    Kern[".name"] = Kern.getDocument()->getNode(Func.getName());
+    Kern[".symbol"] = Kern.getDocument()->getNode(
+        (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
     emitKernelLanguage(Func, Kern);
    emitKernelAttrs(Func, Kern);
     emitKernelArgs(Func, Kern);
   }
 
-  Kernels->push_back(std::move(KernelProps));
+  Kernels.push_back(Kern);
 }
 
 } // end namespace HSAMD
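The V3 streamer now owns a single msgpack::Document and builds nodes in place instead of allocating a shared_ptr tree. A self-contained sketch of the Document API as it is used above; the key names here are made up for illustration:

#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

void sketch(raw_ostream &OS) {
  msgpack::Document Doc;
  // The root converts to a map on first use, mirroring getRootMetadata().
  msgpack::MapDocNode Root = Doc.getRoot().getMap(/*Convert=*/true);

  msgpack::ArrayDocNode Version = Doc.getArrayNode();
  Version.push_back(Doc.getNode(uint64_t(1)));
  Version.push_back(Doc.getNode(uint64_t(0)));
  Root["example.version"] = Version;

  // Strings not owned by the document must be copied into it.
  Root["example.name"] = Doc.getNode("demo", /*Copy=*/true);

  Doc.toYAML(OS); // one-shot YAML emission, as in MetadataStreamerV3::end()
}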
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index afc09baf952d..2eecddbd7b01 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -1,9 +1,8 @@
 //===--- AMDGPUHSAMetadataStreamer.h ----------------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -19,7 +18,7 @@
 #include "AMDGPU.h"
 #include "AMDKernelCodeT.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/MsgPackTypes.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
 #include "llvm/Support/AMDGPUMetadata.h"
 
 namespace llvm {
@@ -52,8 +51,8 @@ public:
 
 class MetadataStreamerV3 final : public MetadataStreamer {
 private:
-  std::shared_ptr<msgpack::Node> HSAMetadataRoot =
-      std::make_shared<msgpack::MapNode>();
+  std::unique_ptr<msgpack::Document> HSAMetadataDoc =
+      llvm::make_unique<msgpack::Document>();
 
   void dump(StringRef HSAMetadataString) const;
 
@@ -70,41 +69,39 @@ private:
 
   std::string getTypeName(Type *Ty, bool Signed) const;
 
-  std::shared_ptr<msgpack::ArrayNode>
-  getWorkGroupDimensions(MDNode *Node) const;
+  msgpack::ArrayDocNode getWorkGroupDimensions(MDNode *Node) const;
 
-  std::shared_ptr<msgpack::MapNode>
-  getHSAKernelProps(const MachineFunction &MF,
-                    const SIProgramInfo &ProgramInfo) const;
+  msgpack::MapDocNode getHSAKernelProps(const MachineFunction &MF,
                                        const SIProgramInfo &ProgramInfo) const;
 
   void emitVersion();
 
   void emitPrintf(const Module &Mod);
 
-  void emitKernelLanguage(const Function &Func, msgpack::MapNode &Kern);
+  void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);
 
-  void emitKernelAttrs(const Function &Func, msgpack::MapNode &Kern);
+  void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern);
 
-  void emitKernelArgs(const Function &Func, msgpack::MapNode &Kern);
+  void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern);
 
   void emitKernelArg(const Argument &Arg, unsigned &Offset,
-                     msgpack::ArrayNode &Args);
+                     msgpack::ArrayDocNode Args);
 
   void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
-                     unsigned &Offset, msgpack::ArrayNode &Args,
+                     unsigned &Offset, msgpack::ArrayDocNode Args,
                      unsigned PointeeAlign = 0, StringRef Name = "",
                     StringRef TypeName = "", StringRef BaseTypeName = "",
                      StringRef AccQual = "", StringRef TypeQual = "");
 
   void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
-                            msgpack::ArrayNode &Args);
+                            msgpack::ArrayDocNode Args);
 
-  std::shared_ptr<msgpack::Node> &getRootMetadata(StringRef Key) {
-    return (*cast<msgpack::MapNode>(HSAMetadataRoot.get()))[Key];
+  msgpack::DocNode &getRootMetadata(StringRef Key) {
+    return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key];
   }
 
-  std::shared_ptr<msgpack::Node> &getHSAMetadataRoot() {
-    return HSAMetadataRoot;
+  msgpack::DocNode &getHSAMetadataRoot() {
+    return HSAMetadataDoc->getRoot();
   }
 
 public:
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index a0a045e72a58..ea730539f834 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //==-----------------------------------------------------------------------===//
 //
@@ -40,6 +39,9 @@
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/BasicBlock.h"
+#ifdef EXPENSIVE_CHECKS
+#include "llvm/IR/Dominators.h"
+#endif
 #include "llvm/IR/Instruction.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Support/Casting.h"
@@ -52,6 +54,8 @@
 #include <new>
 #include <vector>
 
+#define DEBUG_TYPE "isel"
+
 using namespace llvm;
 
 namespace llvm {
@@ -66,6 +70,57 @@ class R600InstrInfo;
 
 namespace {
 
+static bool isNullConstantOrUndef(SDValue V) {
+  if (V.isUndef())
+    return true;
+
+  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+  return Const != nullptr && Const->isNullValue();
+}
+
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+  // This is only used for packed vectors, where using 0 for undef should
+  // always be good.
+  if (N.isUndef()) {
+    Out = 0;
+    return true;
+  }
+
+  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    Out = C->getAPIntValue().getSExtValue();
+    return true;
+  }
+
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+    Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+// TODO: Handle undef as zero
+static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
+                                 bool Negate = false) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
+  uint32_t LHSVal, RHSVal;
+  if (getConstantValue(N->getOperand(0), LHSVal) &&
+      getConstantValue(N->getOperand(1), RHSVal)) {
+    SDLoc SL(N);
+    uint32_t K = Negate ?
+      (-LHSVal & 0xffff) | (-RHSVal << 16) :
+      (LHSVal & 0xffff) | (RHSVal << 16);
+    return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
+                              DAG.getTargetConstant(K, SL, MVT::i32));
+  }
+
+  return nullptr;
+}
+
+static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
+  return packConstantV2I16(N, DAG, true);
+}
+
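packConstantV2I16 above folds a constant two-element build_vector into a single S_MOV_B32; the packing itself is ordinary bit arithmetic, checked here in isolation as a standalone program:

#include <cassert>
#include <cstdint>

// Low element goes in bits [15:0], high element in bits [31:16]; the
// negate form packs the negated halves, as packNegConstantV2I16 does.
static uint32_t packV2I16(uint32_t Lo, uint32_t Hi, bool Negate = false) {
  return Negate ? ((-Lo & 0xffff) | (-Hi << 16))
                : ((Lo & 0xffff) | (Hi << 16));
}

int main() {
  assert(packV2I16(0x1234, 0xabcd) == 0xabcd1234);
  assert(packV2I16(1, 0, /*Negate=*/true) == 0x0000ffff); // -1 in low half
  return 0;
}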
 /// AMDGPU specific code to select AMDGPU machine instructions for
 /// SelectionDAG operations.
 class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -84,12 +139,18 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AMDGPUArgumentUsageInfo>();
-    AU.addRequired<AMDGPUPerfHintAnalysis>();
     AU.addRequired<LegacyDivergenceAnalysis>();
+#ifdef EXPENSIVE_CHECKS
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+#endif
     SelectionDAGISel::getAnalysisUsage(AU);
   }
 
+  bool matchLoadD16FromBuildVector(SDNode *N) const;
+
   bool runOnMachineFunction(MachineFunction &MF) override;
+  void PreprocessISelDAG() override;
   void Select(SDNode *N) override;
   StringRef getPassName() const override;
   void PostprocessISelDAG() override;
@@ -100,19 +161,24 @@ protected:
 
 private:
   std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
   bool isNoNanSrc(SDValue N) const;
-  bool isInlineImmediate(const SDNode *N) const;
+  bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
+  bool isNegInlineImmediate(const SDNode *N) const {
+    return isInlineImmediate(N, true);
+  }
+
   bool isVGPRImm(const SDNode *N) const;
   bool isUniformLoad(const SDNode *N) const;
   bool isUniformBr(const SDNode *N) const;
 
   MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
 
-  SDNode *glueCopyToM0(SDNode *N) const;
+  SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                        unsigned OffsetBits) const;
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -120,10 +186,10 @@ private:
   bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
-                   SDValue &TFE) const;
+                   SDValue &TFE, SDValue &DLC) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                          SDValue &SOffset, SDValue &Offset, SDValue &GLC,
-                         SDValue &SLC, SDValue &TFE) const;
+                         SDValue &SLC, SDValue &TFE, SDValue &DLC) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                          SDValue &SLC) const;
@@ -136,19 +202,19 @@ private:
 
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
-                         SDValue &TFE) const;
+                         SDValue &TFE, SDValue &DLC) const;
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                          SDValue &Offset, SDValue &SLC) const;
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                          SDValue &Offset) const;
 
-  bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
+  bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
                         SDValue &Offset, SDValue &SLC) const;
-  bool SelectFlatAtomicSigned(SDValue Addr, SDValue &VAddr,
+  bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
                               SDValue &Offset, SDValue &SLC) const;
 
   template <bool IsSigned>
-  bool SelectFlatOffset(SDValue Addr, SDValue &VAddr,
+  bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
                         SDValue &Offset, SDValue &SLC) const;
 
   bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
@@ -164,6 +230,7 @@ private:
   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
 
   bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
@@ -193,11 +260,13 @@ private:
   bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
   bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
 
-  bool SelectHi16Elt(SDValue In, SDValue &Src) const;
+  SDValue getHi16Elt(SDValue In) const;
 
   void SelectADD_SUB_I64(SDNode *N);
+  void SelectAddcSubb(SDNode *N);
   void SelectUADDO_USUBO(SDNode *N);
   void SelectDIV_SCALE(SDNode *N);
+  void SelectDIV_FMAS(SDNode *N);
   void SelectMAD_64_32(SDNode *N);
   void SelectFMA_W_CHAIN(SDNode *N);
   void SelectFMUL_W_CHAIN(SDNode *N);
@@ -210,6 +279,10 @@ private:
   void SelectBRCOND(SDNode *N);
   void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
+  void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+  void SelectDS_GWS(SDNode *N, unsigned IntrID);
+  void SelectINTRINSIC_W_CHAIN(SDNode *N);
+  void SelectINTRINSIC_VOID(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -235,11 +308,49 @@ public:
                           SDValue &Offset) override;
 
   bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void PreprocessISelDAG() override {}
+
 protected:
   // Include the pieces autogenerated from the target description.
 #include "R600GenDAGISel.inc"
 };
 
+static SDValue stripBitcast(SDValue Val) {
+  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+  In = stripBitcast(In);
+  if (In.getOpcode() != ISD::TRUNCATE)
+    return false;
+
+  SDValue Srl = In.getOperand(0);
+  if (Srl.getOpcode() == ISD::SRL) {
+    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+      if (ShiftAmt->getZExtValue() == 16) {
+        Out = stripBitcast(Srl.getOperand(0));
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// Look through operations that obscure just looking at the low 16-bits of the
+// same register.
+static SDValue stripExtractLoElt(SDValue In) {
+  if (In.getOpcode() == ISD::TRUNCATE) {
+    SDValue Src = In.getOperand(0);
+    if (Src.getValueType().getSizeInBits() == 32)
+      return stripBitcast(Src);
+  }
+
+  return In;
+}
+
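isExtractHiElt and stripExtractLoElt recognize, at the DAG level, reads of one half of a 32-bit register; the scalar identities behind them are simply:

#include <cstdint>

// (trunc (srl x, 16)) is exactly the high half of a dword...
static uint16_t hiHalf(uint32_t X) { return uint16_t(X >> 16); }

// ...and a plain trunc reads only the low half, so a 32-bit producer
// feeding a trunc can be looked through without changing the result.
static uint16_t loHalf(uint32_t X) { return uint16_t(X); }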
 } // end anonymous namespace
 
 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
@@ -247,6 +358,10 @@ INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+#ifdef EXPENSIVE_CHECKS
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+#endif
 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                     "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
 
@@ -265,10 +380,125 @@ FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
 }
 
 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+#ifdef EXPENSIVE_CHECKS
+  DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  for (auto &L : LI->getLoopsInPreorder()) {
+    assert(L->isLCSSAForm(DT));
+  }
+#endif
   Subtarget = &MF.getSubtarget<GCNSubtarget>();
   return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
+bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
+  assert(Subtarget->d16PreservesUnusedBits());
+  MVT VT = N->getValueType(0).getSimpleVT();
+  if (VT != MVT::v2i16 && VT != MVT::v2f16)
+    return false;
+
+  SDValue Lo = N->getOperand(0);
+  SDValue Hi = N->getOperand(1);
+
+  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
+
+  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
+  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
+  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
+
+  // Need to check for possible indirect dependencies on the other half of the
+  // vector to avoid introducing a cycle.
+  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
+    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+
+    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
+    SDValue Ops[] = {
+      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
+    };
+
+    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
+    if (LdHi->getMemoryVT() == MVT::i8) {
+      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
+        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
+    } else {
+      assert(LdHi->getMemoryVT() == MVT::i16);
+    }
+
+    SDValue NewLoadHi =
+      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
+                                  Ops, LdHi->getMemoryVT(),
+                                  LdHi->getMemOperand());
+
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
+    return true;
+  }
+
+  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
+  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
+  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
+  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
+  if (LdLo && Lo.hasOneUse()) {
+    SDValue TiedIn = getHi16Elt(Hi);
+    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
+      return false;
+
+    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
+    if (LdLo->getMemoryVT() == MVT::i8) {
+      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
+        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
+    } else {
+      assert(LdLo->getMemoryVT() == MVT::i16);
+    }
+
+    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
+
+    SDValue Ops[] = {
+      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
+    };
+
+    SDValue NewLoadLo =
+      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
+                                  Ops, LdLo->getMemoryVT(),
+                                  LdLo->getMemOperand());
+
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
+    return true;
+  }
+
+  return false;
+}
+
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+  if (!Subtarget->d16PreservesUnusedBits())
+    return;
+
+  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+
+  bool MadeChange = false;
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    if (N->use_empty())
+      continue;
+
+    switch (N->getOpcode()) {
+    case ISD::BUILD_VECTOR:
+      MadeChange |= matchLoadD16FromBuildVector(N);
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (MadeChange) {
+    CurDAG->RemoveDeadNodes();
+    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
+               CurDAG->dump(););
  }
+}
+
 bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
   if (TM.Options.NoNaNsFPMath)
     return true;
@@ -280,14 +510,26 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
   return CurDAG->isKnownNeverNaN(N);
 }
 
-bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
+                                           bool Negated) const {
+  if (N->isUndef())
+    return true;
+
   const SIInstrInfo *TII = Subtarget->getInstrInfo();
+  if (Negated) {
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+      return TII->isInlineConstant(-C->getAPIntValue());
+
+    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
 
-  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
-    return TII->isInlineConstant(C->getAPIntValue());
+  } else {
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+      return TII->isInlineConstant(C->getAPIntValue());
 
-  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
-    return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+  }
 
   return false;
 }
@@ -340,37 +582,48 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
   }
 }
 
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
-  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
-      !Subtarget->ldsRequiresM0Init())
-    return N;
-
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
   const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
+    *static_cast<const SITargetLowering*>(getTargetLowering());
 
-  // Write max value to m0 before each load operation
+  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
 
-  SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
-                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N),
                                 Val);
 
   SDValue Glue = M0.getValue(1);
 
   SmallVector<SDValue, 8> Ops;
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    Ops.push_back(N->getOperand(i));
-  }
+  Ops.push_back(M0); // Replace the chain.
+  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+    Ops.push_back(N->getOperand(i));
+
   Ops.push_back(Glue);
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
+  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
+  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+    if (Subtarget->ldsRequiresM0Init())
+      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
+    MachineFunction &MF = CurDAG->getMachineFunction();
+    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
+    return
+      glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
+  }
+  return N;
+}
+
 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                   EVT VT) const {
   SDNode *Lo = CurDAG->getMachineNode(
       AMDGPU::S_MOV_B32, DL, MVT::i32,
-      CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
+      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
   SDNode *Hi =
       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                             CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
   const SDValue Ops[] = {
       CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
       SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
@@ -385,31 +638,23 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
     return AMDGPU::SReg_32_XM0RegClassID;
   case 2:
     return AMDGPU::SReg_64RegClassID;
+  case 3:
+    return AMDGPU::SGPR_96RegClassID;
   case 4:
     return AMDGPU::SReg_128RegClassID;
+  case 5:
+    return AMDGPU::SGPR_160RegClassID;
  case 8:
    return AMDGPU::SReg_256RegClassID;
   case 16:
     return AMDGPU::SReg_512RegClassID;
+  case 32:
+    return AMDGPU::SReg_1024RegClassID;
   }
 
   llvm_unreachable("invalid vector size");
 }
 
-static bool getConstantValue(SDValue N, uint32_t &Out) {
-  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
-    Out = C->getAPIntValue().getZExtValue();
-    return true;
-  }
-
-  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
-    Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
-    return true;
-  }
-
-  return false;
-}
-
 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   EVT VT = N->getValueType(0);
   unsigned NumVectorElts = VT.getVectorNumElements();
@@ -423,12 +668,12 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
     return;
   }
 
-  assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
         "supported yet");
-  // 16 = Max Num Vector Elements
+  // 32 = Max Num Vector Elements
   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
   // 1 = Vector Register Class
-  SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
 
   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
   bool IsRegSeq = true;
@@ -470,10 +715,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   if (isa<AtomicSDNode>(N) ||
       (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
-       Opc == AMDGPUISD::ATOMIC_LOAD_FADD ||
+       Opc == ISD::ATOMIC_LOAD_FADD ||
        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
 
   switch (Opc) {
   default:
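The new ISD::ADDCARRY/SUBCARRY cases in the following hunk complete the carry chain that ISD::UADDO starts. In scalar terms, the pattern being selected is a wide add split into 32-bit halves:

#include <cstdint>

// A 64-bit add as two 32-bit adds: UADDO produces the low half plus a
// carry bit, and ADDCARRY consumes that bit for the high half. These map
// to V_ADD_I32 / V_ADDC_U32 in SelectUADDO_USUBO and SelectAddcSubb below.
static uint64_t add64Via32(uint32_t ALo, uint32_t AHi,
                           uint32_t BLo, uint32_t BHi) {
  uint32_t Lo = ALo + BLo;         // ISD::UADDO, result 0
  uint32_t Carry = Lo < ALo;       // ISD::UADDO, result 1 (overflow)
  uint32_t Hi = AHi + BHi + Carry; // ISD::ADDCARRY
  return (uint64_t(Hi) << 32) | Lo;
}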
@@ -491,6 +736,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     SelectADD_SUB_I64(N);
     return;
   }
+  case ISD::ADDCARRY:
+  case ISD::SUBCARRY:
+    if (N->getValueType(0) != MVT::i32)
+      break;
+
+    SelectAddcSubb(N);
+    return;
   case ISD::UADDO:
   case ISD::USUBO: {
     SelectUADDO_USUBO(N);
@@ -511,12 +763,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     unsigned NumVectorElts = VT.getVectorNumElements();
     if (VT.getScalarSizeInBits() == 16) {
       if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
-        uint32_t LHSVal, RHSVal;
-        if (getConstantValue(N->getOperand(0), LHSVal) &&
-            getConstantValue(N->getOperand(1), RHSVal)) {
-          uint32_t K = LHSVal | (RHSVal << 16);
-          CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
-                               CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
+          ReplaceNode(N, Packed);
           return;
         }
       }
@@ -571,7 +819,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::STORE:
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE: {
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
     break;
   }
 
@@ -606,6 +854,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     SelectDIV_SCALE(N);
     return;
   }
+  case AMDGPUISD::DIV_FMAS: {
+    SelectDIV_FMAS(N);
+    return;
+  }
   case AMDGPUISD::MAD_I64_I32:
   case AMDGPUISD::MAD_U64_U32: {
     SelectMAD_64_32(N);
@@ -649,6 +901,16 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
       SelectCode(N);
       return;
     }
+
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    SelectINTRINSIC_W_CHAIN(N);
+    return;
+  }
+  case ISD::INTRINSIC_VOID: {
+    SelectINTRINSIC_VOID(N);
+    return;
   }
   }
 
@@ -763,6 +1025,19 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
   ReplaceNode(N, RegSequence);
 }
 
+void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
+  SDLoc DL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue CI = N->getOperand(2);
+
+  unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
+                                                 : AMDGPU::V_SUBB_U32_e64;
+  CurDAG->SelectNodeTo(
+      N, Opc, N->getVTList(),
+      {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+}
+
 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
   // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
   // carry out despite the _i32 name. These were renamed in VI to _U32.
@@ -770,8 +1045,10 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
   unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::V_ADD_I32_e64
                                               : AMDGPU::V_SUB_I32_e64;
 
-  CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
-                       { N->getOperand(0), N->getOperand(1) });
+  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
+      {N->getOperand(0), N->getOperand(1),
+       CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
 }
 
 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
@@ -816,6 +1093,35 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
+void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
+  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
+  const SIRegisterInfo *TRI = ST->getRegisterInfo();
+
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+
+  assert(VT == MVT::f32 || VT == MVT::f64);
+
+  unsigned Opc
+    = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;
+
+  SDValue CarryIn = N->getOperand(3);
+  // V_DIV_FMAS implicitly reads VCC.
+  SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
+                                     TRI->getVCC(), CarryIn, SDValue());
+
+  SDValue Ops[10];
+
+  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
+
+  Ops[8] = VCC;
+  Ops[9] = VCC.getValue(1);
+
+  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
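SelectDIV_FMAS above has to thread the divide's carry value into the physical VCC register with a glued CopyToReg, because V_DIV_FMAS reads VCC implicitly rather than taking it as a normal operand. The idiom reduced to its core, with CurDAG, SL, VCCReg and CarryIn assumed from context:

// The trailing SDValue() requests a glue output. Passing the copy as the
// chain operand and its glue result (value #1) as the last operand keeps
// the CopyToReg immediately before the instruction that reads VCC, so no
// intervening node can clobber the register.
SDValue Copy = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
                                    VCCReg, CarryIn, SDValue());
SDValue Chain = Copy;              // operand: chain
SDValue Glue = Copy.getValue(1);   // operand: glue (must be last)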
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { @@ -829,13 +1135,13 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } -bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, +bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset, unsigned OffsetBits) const { if ((OffsetBits == 16 && !isUInt<16>(Offset)) || (OffsetBits == 8 && !isUInt<8>(Offset))) return false; - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS || + if (Subtarget->hasUsableDSOffset() || Subtarget->unsafeDSOffsetFoldingEnabled()) return true; @@ -871,13 +1177,20 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + SmallVector Opnds; + Opnds.push_back(Zero); + Opnds.push_back(Addr.getOperand(1)); + // FIXME: Select to VOP3 version for with-carry. - unsigned SubOp = Subtarget->hasAddNoCarry() ? - AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + unsigned SubOp = AMDGPU::V_SUB_I32_e32; + if (Subtarget->hasAddNoCarry()) { + SubOp = AMDGPU::V_SUB_U32_e64; + Opnds.push_back( + CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit + } - MachineSDNode *MachineSub - = CurDAG->getMachineNode(SubOp, DL, MVT::i32, - Zero, Addr.getOperand(1)); + MachineSDNode *MachineSub = + CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); Base = SDValue(MachineSub, 0); Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16); @@ -945,12 +1258,18 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { - unsigned SubOp = Subtarget->hasAddNoCarry() ? - AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + SmallVector Opnds; + Opnds.push_back(Zero); + Opnds.push_back(Addr.getOperand(1)); + unsigned SubOp = AMDGPU::V_SUB_I32_e32; + if (Subtarget->hasAddNoCarry()) { + SubOp = AMDGPU::V_SUB_U32_e64; + Opnds.push_back( + CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit + } MachineSDNode *MachineSub - = CurDAG->getMachineNode(SubOp, DL, MVT::i32, - Zero, Addr.getOperand(1)); + = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); Base = SDValue(MachineSub, 0); Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); @@ -989,7 +1308,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { + SDValue &TFE, SDValue &DLC) const { // Subtarget prefers to use flat instruction if (Subtarget->useFlatForGlobal()) return false; @@ -1001,6 +1320,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, if (!SLC.getNode()) SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); + DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1079,15 +1399,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const { + SDValue &SLC, SDValue &TFE, + SDValue &DLC) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. 
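// The isDSOffsetLegal signature change above keeps the same core test: the
// byte (or dword-slot) offset must fit the unsigned immediate field of the
// DS form being selected, 16 bits for one-address forms and 8 bits per slot
// for the two-address forms. Restated as a standalone predicate (the
// subtarget escape hatches are omitted):
#include <cstdint>

bool dsOffsetFitsField(uint64_t Offset, unsigned OffsetBits) {
  if (OffsetBits == 16)
    return Offset <= 0xFFFFu;
  if (OffsetBits == 8)
    return Offset <= 0xFFu;
  return false; // other widths are not used by DS instructions
}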
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (!Subtarget->hasAddr64()) return false; if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE)) + GLC, SLC, TFE, DLC)) return false; ConstantSDNode *C = cast(Addr64); @@ -1109,9 +1430,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE; + SDValue GLC, TFE, DLC; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC); } static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { @@ -1127,10 +1448,10 @@ std::pair AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); - // If we can resolve this to a frame index access, this is relative to the - // frame pointer SGPR. - return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(), - MVT::i32)); + // If we can resolve this to a frame index access, this will be relative to + // either the stack or frame pointer SGPR. + return std::make_pair( + TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)); } // If we don't know this private access is a local stack object, it needs to @@ -1236,13 +1557,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { + SDValue &TFE, SDValue &DLC) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast(Subtarget->getInstrInfo()); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE)) + GLC, SLC, TFE, DLC)) return false; if (!cast(Offen)->getSExtValue() && @@ -1264,57 +1585,42 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset ) const { - SDValue GLC, SLC, TFE; + SDValue GLC, SLC, TFE, DLC; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const { - SDValue GLC, TFE; + SDValue GLC, TFE, DLC; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); } template -bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, + SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - int64_t OffsetVal = 0; - - if (Subtarget->hasFlatInstOffsets() && - CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - int64_t COffsetVal = cast(N1)->getSExtValue(); - - if ((IsSigned && isInt<13>(COffsetVal)) || - (!IsSigned && isUInt<12>(COffsetVal))) { - Addr = N0; - OffsetVal = COffsetVal; - } - } - - VAddr = Addr; - Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16); - SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); - - return true; + return static_cast(getTargetLowering())-> + SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC); } -bool 
AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, + SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - return SelectFlatOffset(Addr, VAddr, Offset, SLC); + return SelectFlatOffset(N, Addr, VAddr, Offset, SLC); } -bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N, + SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - return SelectFlatOffset(Addr, VAddr, Offset, SLC); + return SelectFlatOffset(N, Addr, VAddr, Offset, SLC); } bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, @@ -1619,9 +1925,12 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { return; } + const GCNSubtarget *ST = static_cast(Subtarget); + const SIRegisterInfo *TRI = ST->getRegisterInfo(); + bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N); unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ; - unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC; + unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC(); SDLoc SL(N); if (!UseSCCBr) { @@ -1638,9 +1947,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { // the S_AND when is unnecessary. But it would be better to add a separate // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it // catches both cases. - Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1, - CurDAG->getRegister(AMDGPU::EXEC, MVT::i1), - Cond), + Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32 + : AMDGPU::S_AND_B64, + SL, MVT::i1, + CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO + : AMDGPU::EXEC, + MVT::i1), + Cond), 0); } @@ -1761,6 +2074,183 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { CurDAG->RemoveDeadNode(N); } +void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { + // The address is assumed to be uniform, so if it ends up in a VGPR, it will + // be copied to an SGPR with readfirstlane. + unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? 
+    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Ptr = N->getOperand(2);
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  MachineMemOperand *MMO = M->getMemOperand();
+  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+  SDValue Offset;
+  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
+    SDValue PtrBase = Ptr.getOperand(0);
+    SDValue PtrOffset = Ptr.getOperand(1);
+
+    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
+    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+      N = glueCopyToM0(N, PtrBase);
+      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
+    }
+  }
+
+  if (!Offset) {
+    N = glueCopyToM0(N, Ptr);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+  }
+
+  SDValue Ops[] = {
+    Offset,
+    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+    Chain,
+    N->getOperand(N->getNumOperands() - 1) // New glue
+  };
+
+  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
+static unsigned gwsIntrinToOpcode(unsigned IntrID) {
+  switch (IntrID) {
+  case Intrinsic::amdgcn_ds_gws_init:
+    return AMDGPU::DS_GWS_INIT;
+  case Intrinsic::amdgcn_ds_gws_barrier:
+    return AMDGPU::DS_GWS_BARRIER;
+  case Intrinsic::amdgcn_ds_gws_sema_v:
+    return AMDGPU::DS_GWS_SEMA_V;
+  case Intrinsic::amdgcn_ds_gws_sema_br:
+    return AMDGPU::DS_GWS_SEMA_BR;
+  case Intrinsic::amdgcn_ds_gws_sema_p:
+    return AMDGPU::DS_GWS_SEMA_P;
+  case Intrinsic::amdgcn_ds_gws_sema_release_all:
+    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
+  default:
+    llvm_unreachable("not a gws intrinsic");
+  }
+}
+
+void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
+  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
+      !Subtarget->hasGWSSemaReleaseAll()) {
+    // Let this error.
+    SelectCode(N);
+    return;
+  }
+
+  // Chain, intrinsic ID, vsrc, offset
+  const bool HasVSrc = N->getNumOperands() == 4;
+  assert(HasVSrc || N->getNumOperands() == 3);
+
+  SDLoc SL(N);
+  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
+  int ImmOffset = 0;
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  MachineMemOperand *MMO = M->getMemOperand();
+
+  // Don't worry if the offset ends up in a VGPR. Only one lane will have
+  // effect, so SIFixSGPRCopies will validly insert readfirstlane.
+
+  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+  // offset field) % 64. Some versions of the programming guide omit the m0
+  // part, or claim it's from offset 0.
+  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
+    // If we have a constant offset, try to use the default value for m0 as a
+    // base to possibly avoid setting it up.
+    glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
+    ImmOffset = ConstOffset->getZExtValue() + 1;
+  } else {
+    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
+      ImmOffset = BaseOffset.getConstantOperandVal(1);
+      BaseOffset = BaseOffset.getOperand(0);
+    }
+
+    // Prefer to do the shift in an SGPR since it should be possible to use m0
+    // as the result directly. If it's already an SGPR, it will be eliminated
+    // later.
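// A worked check of the resource-id arithmetic described in the comment
// above: ignoring the ISA-opaque base term, the hardware forms the id as
// (m0[21:16] + offset field) % 64. With the default m0 of all ones, bits
// [21:16] contribute 63, so encoding ConstOffset + 1 in the offset field
// yields exactly ConstOffset - which is why the constant path adds 1.
// A self-contained verification of that modular arithmetic:
#include <cassert>
#include <cstdint>

uint32_t gwsResourceId(uint32_t M0, uint32_t OffsetField) {
  return (((M0 >> 16) & 0x3Fu) + OffsetField) % 64u;
}

int main() {
  for (uint32_t Id = 0; Id < 63; ++Id)
    assert(gwsResourceId(~0u, Id + 1) == Id);
}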
+    SDNode *SGPROffset
+      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
+                               BaseOffset);
+    // Shift to offset in m0
+    SDNode *M0Base
+      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+                               SDValue(SGPROffset, 0),
+                               CurDAG->getTargetConstant(16, SL, MVT::i32));
+    glueCopyToM0(N, SDValue(M0Base, 0));
+  }
+
+  SDValue V0;
+  SDValue Chain = N->getOperand(0);
+  SDValue Glue;
+  if (HasVSrc) {
+    SDValue VSrc0 = N->getOperand(2);
+
+    // The manual doesn't mention this, but it seems only v0 works.
+    V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
+
+    SDValue CopyToV0 = CurDAG->getCopyToReg(
+      N->getOperand(0), SL, V0, VSrc0,
+      N->getOperand(N->getNumOperands() - 1));
+    Chain = CopyToV0;
+    Glue = CopyToV0.getValue(1);
+  }
+
+  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
+
+  // TODO: Can this just be removed from the instruction?
+  SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
+
+  const unsigned Opc = gwsIntrinToOpcode(IntrID);
+  SmallVector<SDValue, 5> Ops;
+  if (HasVSrc)
+    Ops.push_back(V0);
+  Ops.push_back(OffsetField);
+  Ops.push_back(GDS);
+  Ops.push_back(Chain);
+
+  if (HasVSrc)
+    Ops.push_back(Glue);
+
+  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  switch (IntrID) {
+  case Intrinsic::amdgcn_ds_append:
+  case Intrinsic::amdgcn_ds_consume: {
+    if (N->getValueType(0) != MVT::i32)
+      break;
+    SelectDSAppendConsume(N, IntrID);
+    return;
+  }
+  }
+
+  SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  switch (IntrID) {
+  case Intrinsic::amdgcn_ds_gws_init:
+  case Intrinsic::amdgcn_ds_gws_barrier:
+  case Intrinsic::amdgcn_ds_gws_sema_v:
+  case Intrinsic::amdgcn_ds_gws_sema_br:
+  case Intrinsic::amdgcn_ds_gws_sema_p:
+  case Intrinsic::amdgcn_ds_gws_sema_release_all:
+    SelectDS_GWS(N, IntrID);
+    return;
+  default:
+    break;
+  }
+
+  SelectCode(N);
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                             unsigned &Mods) const {
   Mods = 0;
@@ -1796,6 +2286,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
   return isNoNanSrc(Src);
 }
 
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
+                                            SDValue &SrcMods) const {
+  if (In.getValueType() == MVT::f32)
+    return SelectVOP3Mods(In, Src, SrcMods);
+  Src = In;
+  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+  return true;
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
   if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
     return false;
@@ -1833,41 +2332,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
   return true;
 }
 
-static SDValue stripBitcast(SDValue Val) {
-  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
-}
-
-// Figure out if this is really an extract of the high 16-bits of a dword.
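// SelectDSAppendConsume above splits a base-plus-constant pointer so the
// displacement rides in the 16-bit DS offset field when it fits, otherwise
// sending the whole address through M0 with a zero immediate. The
// decomposition reduced to plain integers - a sketch that keeps only the
// 16-bit fit test and omits the subtarget checks in isDSOffsetLegal:
#include <cstdint>
#include <utility>

std::pair<uint64_t, uint16_t> splitDsPtr(uint64_t Base, uint64_t Disp) {
  if (Disp <= 0xFFFFu)
    return {Base, static_cast<uint16_t>(Disp)}; // base -> M0, disp -> field
  return {Base + Disp, 0};                      // fold everything into M0
}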
-static bool isExtractHiElt(SDValue In, SDValue &Out) { - In = stripBitcast(In); - if (In.getOpcode() != ISD::TRUNCATE) - return false; - - SDValue Srl = In.getOperand(0); - if (Srl.getOpcode() == ISD::SRL) { - if (ConstantSDNode *ShiftAmt = dyn_cast(Srl.getOperand(1))) { - if (ShiftAmt->getZExtValue() == 16) { - Out = stripBitcast(Srl.getOperand(0)); - return true; - } - } - } - - return false; -} - -// Look through operations that obscure just looking at the low 16-bits of the -// same register. -static SDValue stripExtractLoElt(SDValue In) { - if (In.getOpcode() == ISD::TRUNCATE) { - SDValue Src = In.getOperand(0); - if (Src.getValueType().getSizeInBits() == 32) - return stripBitcast(Src); - } - - return In; -} - bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; @@ -2020,39 +2484,31 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, return true; } -// TODO: Can we identify things like v_mad_mixhi_f16? -bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const { - if (In.isUndef()) { - Src = In; - return true; - } +SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const { + if (In.isUndef()) + return CurDAG->getUNDEF(MVT::i32); if (ConstantSDNode *C = dyn_cast(In)) { SDLoc SL(In); - SDValue K = CurDAG->getTargetConstant(C->getZExtValue() << 16, SL, MVT::i32); - MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - SL, MVT::i32, K); - Src = SDValue(MovK, 0); - return true; + return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32); } if (ConstantFPSDNode *C = dyn_cast(In)) { SDLoc SL(In); - SDValue K = CurDAG->getTargetConstant( + return CurDAG->getConstant( C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32); - MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - SL, MVT::i32, K); - Src = SDValue(MovK, 0); - return true; } - return isExtractHiElt(In, Src); + SDValue Src; + if (isExtractHiElt(In, Src)) + return Src; + + return SDValue(); } bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return false; - } + assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn); + const SIRegisterInfo *SIRI = static_cast(Subtarget->getRegisterInfo()); const SIInstrInfo * SII = diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 6951c915b177..39016ed37193 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -21,7 +20,6 @@ #include "AMDGPU.h" #include "AMDGPUCallLowering.h" #include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" @@ -65,9 +63,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::v2f32: case MVT::v4i16: case MVT::v4f16: { - // Up to SGPR0-SGPR39 + // Up to SGPR0-SGPR105 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::SGPR_64RegClass, 20); + &AMDGPU::SGPR_64RegClass, 53); } default: return false; @@ -152,15 +150,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v2f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v3f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32); + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::v5f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); setOperationAction(ISD::LOAD, MVT::v16f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); + setOperationAction(ISD::LOAD, MVT::v32f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32); + setOperationAction(ISD::LOAD, MVT::i64, Promote); AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); @@ -237,15 +244,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v2f32, Promote); AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v3f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32); + setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::v5f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::STORE, MVT::v8f32, Promote); AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); setOperationAction(ISD::STORE, MVT::v16f32, Promote); AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); + setOperationAction(ISD::STORE, MVT::v32f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32); + setOperationAction(ISD::STORE, MVT::i64, Promote); AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); @@ -327,16 +343,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Expand to fneg + fadd. 
setOperationAction(ISD::FSUB, MVT::f64, Expand); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); @@ -394,7 +422,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); static const MVT::SimpleValueType VectorIntTypes[] = { - MVT::v2i32, MVT::v4i32 + MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32 }; for (MVT VT : VectorIntTypes) { @@ -436,7 +464,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, } static const MVT::SimpleValueType FloatVectorTypes[] = { - MVT::v2f32, MVT::v4f32 + MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32 }; for (MVT VT : FloatVectorTypes) { @@ -478,9 +506,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v2f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::SELECT, MVT::v3f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32); + setOperationAction(ISD::SELECT, MVT::v4f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::SELECT, MVT::v5f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32); + // There are no libcalls of any kind. for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) setLibcallName(static_cast(I), nullptr); @@ -499,6 +533,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // vector compares until that is fixed. setHasMultipleConditionRegisters(true); + setMinCmpXchgSizeInBits(32); + setSupportsUnalignedAtomics(false); + PredictableSelectIsExpensive = false; // We want to find all load dependencies for long chains of stores to enable @@ -592,6 +629,7 @@ static bool hasSourceMods(const SDNode *N) { case ISD::FDIV: case ISD::FREM: case ISD::INLINEASM: + case ISD::INLINEASM_BR: case AMDGPUISD::INTERP_P1: case AMDGPUISD::INTERP_P2: case AMDGPUISD::DIV_SCALE: @@ -640,7 +678,8 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { // The backend supports 32 and 64 bit floating point immediates. 
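// The SELECT promotions above (v3f32 -> v3i32, v5f32 -> v5i32, like the
// existing v2/v4 cases) are sound because select only routes bit patterns:
// performing it on the integer image of each float lane is value-preserving,
// including NaN payloads. A one-lane demonstration (C++20 std::bit_cast):
#include <bit>
#include <cstdint>

float selectViaInt(bool Cond, float T, float F) {
  uint32_t Bits =
      Cond ? std::bit_cast<uint32_t>(T) : std::bit_cast<uint32_t>(F);
  return std::bit_cast<float>(Bits);
}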
// FIXME: Why are we reporting vectors of FP immediates as legal? -bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { +bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { EVT ScalarVT = VT.getScalarType(); return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || (ScalarVT == MVT::f16 && Subtarget->has16BitInsts())); @@ -690,8 +729,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, return (OldSize < 32); } -bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, - EVT CastTy) const { +bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy, + const SelectionDAG &DAG, + const MachineMemOperand &MMO) const { assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits()); @@ -701,8 +741,12 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, unsigned LScalarSize = LoadTy.getScalarSizeInBits(); unsigned CastScalarSize = CastTy.getScalarSizeInBits(); - return (LScalarSize < CastScalarSize) || - (CastScalarSize >= 32); + if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32)) + return false; + + bool Fast = false; + return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy, + MMO, &Fast) && Fast; } // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also @@ -849,9 +893,6 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) { switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - llvm_unreachable("kernels should not be handled here"); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: @@ -864,8 +905,10 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::Fast: case CallingConv::Cold: return CC_AMDGPU_Func; + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: default: - report_fatal_error("Unsupported calling convention."); + report_fatal_error("Unsupported calling convention for call"); } } @@ -1010,9 +1053,10 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) MemVT = MemVT.getScalarType(); - if (MemVT.isExtended()) { - // This should really only happen if we have vec3 arguments - assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); + // Round up vec3/vec5 argument. + if (MemVT.isVector() && !MemVT.isPow2VectorType()) { + assert(MemVT.getVectorNumElements() == 3 || + MemVT.getVectorNumElements() == 5); MemVT = MemVT.getPow2VectorType(State.getContext()); } @@ -1372,6 +1416,41 @@ SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); } +// Split a vector type into two parts. The first part is a power of two vector. +// The second part is whatever is left over, and is a scalar if it would +// otherwise be a 1-vector. +std::pair +AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const { + EVT LoVT, HiVT; + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2); + LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts); + HiVT = NumElts - LoNumElts == 1 + ? EltVT + : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts); + return std::make_pair(LoVT, HiVT); +} + +// Split a vector value into two parts of types LoVT and HiVT. 
HiVT could be +// scalar. +std::pair +AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, + const EVT &LoVT, const EVT &HiVT, + SelectionDAG &DAG) const { + assert(LoVT.getVectorNumElements() + + (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= + N.getValueType().getVectorNumElements() && + "More vector elements requested than available!"); + auto IdxTy = getVectorIdxTy(DAG.getDataLayout()); + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, + DAG.getConstant(0, DL, IdxTy)); + SDValue Hi = DAG.getNode( + HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL, + HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy)); + return std::make_pair(Lo, Hi); +} + SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Load = cast(Op); @@ -1393,9 +1472,9 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, EVT LoMemVT, HiMemVT; SDValue Lo, Hi; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); - std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); + std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); + std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG); unsigned Size = LoMemVT.getStoreSize(); unsigned BaseAlign = Load->getAlignment(); @@ -1410,15 +1489,52 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); - SDValue Ops[] = { - DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), - DAG.getNode(ISD::TokenFactor, SL, MVT::Other, - LoLoad.getValue(1), HiLoad.getValue(1)) - }; + auto IdxTy = getVectorIdxTy(DAG.getDataLayout()); + SDValue Join; + if (LoVT == HiVT) { + // This is the case that the vector is power of two so was evenly split. + Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad); + } else { + Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad, + DAG.getConstant(0, SL, IdxTy)); + Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR + : ISD::INSERT_VECTOR_ELT, + SL, VT, Join, HiLoad, + DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy)); + } + + SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + LoLoad.getValue(1), HiLoad.getValue(1))}; return DAG.getMergeValues(Ops, SL); } +// Widen a vector load from vec3 to vec4. 
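// A worked check of the split arithmetic introduced in getSplitDestVTs
// above: the low half is the smallest power of two covering half the
// elements, so the odd vec3/vec5 types split into a legal power-of-two
// part plus a scalar remainder.
#include <cassert>
#include <utility>

unsigned powerOf2Ceil(unsigned N) { // models llvm::PowerOf2Ceil for N >= 1
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

std::pair<unsigned, unsigned> splitDestCounts(unsigned NumElts) {
  unsigned Lo = powerOf2Ceil((NumElts + 1) / 2);
  return {Lo, NumElts - Lo}; // a high count of 1 becomes a scalar
}

int main() {
  assert(splitDestCounts(3) == std::make_pair(2u, 1u)); // v3 -> v2 + elt
  assert(splitDestCounts(5) == std::make_pair(4u, 1u)); // v5 -> v4 + elt
  assert(splitDestCounts(8) == std::make_pair(4u, 4u)); // even split
}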
+SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op, + SelectionDAG &DAG) const { + LoadSDNode *Load = cast(Op); + EVT VT = Op.getValueType(); + assert(VT.getVectorNumElements() == 3); + SDValue BasePtr = Load->getBasePtr(); + EVT MemVT = Load->getMemoryVT(); + SDLoc SL(Op); + const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); + unsigned BaseAlign = Load->getAlignment(); + + EVT WideVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); + EVT WideMemVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4); + SDValue WideLoad = DAG.getExtLoad( + Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue, + WideMemVT, BaseAlign, Load->getMemOperand()->getFlags()); + return DAG.getMergeValues( + {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad, + DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))), + WideLoad.getValue(1)}, + SL); +} + SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast(Op); @@ -1439,9 +1555,9 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, EVT LoMemVT, HiMemVT; SDValue Lo, Hi; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); - std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); + std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); + std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); + std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG); SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); @@ -2788,6 +2904,54 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { return true; } +// Find a load or store from corresponding pattern root. +// Roots may be build_vector, bitconvert or their combinations. +static MemSDNode* findMemSDNode(SDNode *N) { + N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); + if (MemSDNode *MN = dyn_cast(N)) + return MN; + assert(isa(N)); + for (SDValue V : N->op_values()) + if (MemSDNode *MN = + dyn_cast(AMDGPUTargetLowering::stripBitcast(V))) + return MN; + llvm_unreachable("cannot find MemSDNode in the pattern!"); +} + +bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned, + SelectionDAG &DAG, + SDNode *N, + SDValue Addr, + SDValue &VAddr, + SDValue &Offset, + SDValue &SLC) const { + const GCNSubtarget &ST = + DAG.getMachineFunction().getSubtarget(); + int64_t OffsetVal = 0; + + if (ST.hasFlatInstOffsets() && + (!ST.hasFlatSegmentOffsetBug() || + findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && + DAG.isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + int64_t COffsetVal = cast(N1)->getSExtValue(); + + const SIInstrInfo *TII = ST.getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(), + IsSigned)) { + Addr = N0; + OffsetVal = COffsetVal; + } + } + + VAddr = Addr; + Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16); + SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1); + + return true; +} + // Replace load of an illegal type with a store of a bitcast to a friendlier // type. SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, @@ -2812,7 +2976,8 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, // Expand unaligned loads earlier than legalization. Due to visitation order // problems during legalization, the emitted instructions to pack and unpack // the bytes again are not eliminated in the case of an unaligned copy. 
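// The SelectFlatOffset rewrite above moves the base-plus-immediate folding
// behind SITargetLowering and the isLegalFLATOffset hook; the inline range
// checks it replaces were 13-bit signed for the signed (segment) forms and
// 12-bit unsigned otherwise. Those removed bounds as a standalone predicate
// (mirroring the old code, not the new per-subtarget hook):
#include <cstdint>

bool flatImmOffsetFits(int64_t Off, bool IsSigned) {
  if (IsSigned)
    return Off >= -4096 && Off <= 4095; // isInt<13>
  return Off >= 0 && Off <= 4095;       // isUInt<12>
}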
- if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { + if (!allowsMisalignedMemoryAccesses( + VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) { if (VT.isVector()) return scalarizeVectorLoad(LN, DAG); @@ -2864,7 +3029,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, // order problems during legalization, the emitted instructions to pack and // unpack the bytes again are not eliminated in the case of an unaligned // copy. - if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { + if (!allowsMisalignedMemoryAccesses( + VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) { if (VT.isVector()) return scalarizeVectorStore(SN, DAG); @@ -3049,30 +3215,44 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const { - if (N->getValueType(0) != MVT::i64) - return SDValue(); - - const ConstantSDNode *RHS = dyn_cast(N->getOperand(1)); + auto *RHS = dyn_cast(N->getOperand(1)); if (!RHS) return SDValue(); + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); unsigned ShiftAmt = RHS->getZExtValue(); + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1) + // this improves the ability to match BFE patterns in isel. + if (LHS.getOpcode() == ISD::AND) { + if (auto *Mask = dyn_cast(LHS.getOperand(1))) { + if (Mask->getAPIntValue().isShiftedMask() && + Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) { + return DAG.getNode( + ISD::AND, SL, VT, + DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)), + DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1))); + } + } + } + + if (VT != MVT::i64) + return SDValue(); + if (ShiftAmt < 32) return SDValue(); // srl i64:x, C for C >= 32 // => // build_pair (srl hi_32(x), C - 32), 0 - - SelectionDAG &DAG = DCI.DAG; - SDLoc SL(N); - SDValue One = DAG.getConstant(1, SL, MVT::i32); SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, - VecOp, One); + SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One); SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); @@ -3090,7 +3270,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine( SDValue Src = N->getOperand(0); // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) - if (Src.getOpcode() == ISD::BITCAST) { + if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) { SDValue Vec = Src.getOperand(0); if (Vec.getOpcode() == ISD::BUILD_VECTOR) { SDValue Elt0 = Vec.getOperand(0); @@ -3478,13 +3658,11 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, if (Cond.hasOneUse()) { // TODO: Look for multiple select uses. SelectionDAG &DAG = DCI.DAG; - if ((DAG.isConstantValueOfAnyType(True) || - DAG.isConstantValueOfAnyType(True)) && - (!DAG.isConstantValueOfAnyType(False) && - !DAG.isConstantValueOfAnyType(False))) { + if (DAG.isConstantValueOfAnyType(True) && + !DAG.isConstantValueOfAnyType(False)) { // Swap cmp + select pair to move constant to false input. // This will allow using VOPC cndmasks more often. 
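// A worked check of the new performSrlCombine fold above,
//   (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1),
// which is a plain bit identity whenever the mask is a shifted mask whose
// trailing-zero count equals the shift amount - exactly what the
// isShiftedMask()/countTrailingZeros() guard tests:
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t X = 0xDEADBEEFu, C1 = 0xFFu, C2 = 8u;
  uint32_t Lhs = (X & (C1 << C2)) >> C2; // srl(and(x, c1 << c2), c2)
  uint32_t Rhs = (X >> C2) & C1;         // and(srl(x, c2), c1)
  assert(Lhs == Rhs);
}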
- // select (setcc x, y), k, x -> select (setcc y, x) x, x + // select (setcc x, y), k, x -> select (setccinv x, y), x, k SDLoc SL(N); ISD::CondCode NewCC = getSetCCInverse(cast(CC)->get(), @@ -3594,6 +3772,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, RHS = RHS.getOperand(0); SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); + if (Res.getOpcode() != ISD::FADD) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -3613,6 +3793,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); + if (Res.getOpcode() != Opc) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -3640,6 +3822,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, RHS = RHS.getOperand(0); SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); + if (Res.getOpcode() != Opc) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -3668,6 +3852,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, unsigned Opposite = inverseMinMax(Opc); SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); + if (Res.getOpcode() != Opposite) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -3678,6 +3864,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags()); SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); + if (Res.getOpcode() != AMDGPUISD::FMED3) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -4051,9 +4239,19 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, const ArgDescriptor &Arg) const { assert(Arg && "Attempting to load missing argument"); - if (Arg.isRegister()) - return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL); - return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); + SDValue V = Arg.isRegister() ? 
+ CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) : + loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); + + if (!Arg.isMasked()) + return V; + + unsigned Mask = Arg.getMask(); + unsigned Shift = countTrailingZeros(Mask); + V = DAG.getNode(ISD::SRL, SL, VT, V, + DAG.getShiftAmountConstant(Shift, VT, SL)); + return DAG.getNode(ISD::AND, SL, VT, V, + DAG.getConstant(Mask >> Shift, SL, VT)); } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( @@ -4175,6 +4373,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) + NODE_NAME_CASE(LDS) NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; @@ -4185,24 +4384,38 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(INTERP_MOV) NODE_NAME_CASE(INTERP_P1) NODE_NAME_CASE(INTERP_P2) + NODE_NAME_CASE(INTERP_P1LL_F16) + NODE_NAME_CASE(INTERP_P1LV_F16) + NODE_NAME_CASE(INTERP_P2_F16) + NODE_NAME_CASE(LOAD_D16_HI) + NODE_NAME_CASE(LOAD_D16_LO) + NODE_NAME_CASE(LOAD_D16_HI_I8) + NODE_NAME_CASE(LOAD_D16_HI_U8) + NODE_NAME_CASE(LOAD_D16_LO_I8) + NODE_NAME_CASE(LOAD_D16_LO_U8) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) + NODE_NAME_CASE(DS_ORDERED_COUNT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) - NODE_NAME_CASE(ATOMIC_LOAD_FADD) NODE_NAME_CASE(ATOMIC_LOAD_FMIN) NODE_NAME_CASE(ATOMIC_LOAD_FMAX) NODE_NAME_CASE(BUFFER_LOAD) + NODE_NAME_CASE(BUFFER_LOAD_UBYTE) + NODE_NAME_CASE(BUFFER_LOAD_USHORT) + NODE_NAME_CASE(BUFFER_LOAD_BYTE) + NODE_NAME_CASE(BUFFER_LOAD_SHORT) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(SBUFFER_LOAD) NODE_NAME_CASE(BUFFER_STORE) + NODE_NAME_CASE(BUFFER_STORE_BYTE) + NODE_NAME_CASE(BUFFER_STORE_SHORT) NODE_NAME_CASE(BUFFER_STORE_FORMAT) NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) @@ -4216,6 +4429,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) + NODE_NAME_CASE(BUFFER_ATOMIC_FADD) + NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) + NODE_NAME_CASE(ATOMIC_FADD) + NODE_NAME_CASE(ATOMIC_PK_FADD) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } @@ -4367,6 +4584,23 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( } break; } + case AMDGPUISD::BUFFER_LOAD_UBYTE: { + Known.Zero.setHighBits(24); + break; + } + case AMDGPUISD::BUFFER_LOAD_USHORT: { + Known.Zero.setHighBits(16); + break; + } + case AMDGPUISD::LDS: { + auto GA = cast(Op.getOperand(0).getNode()); + unsigned Align = GA->getGlobal()->getAlignment(); + + Known.Zero.setHighBits(16); + if (Align) + Known.Zero.setLowBits(Log2_32(Align)); + break; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast(Op.getOperand(0))->getZExtValue(); switch (IID) { @@ -4412,6 +4646,14 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( case AMDGPUISD::CARRY: case AMDGPUISD::BORROW: return 31; + case AMDGPUISD::BUFFER_LOAD_BYTE: + return 25; + case AMDGPUISD::BUFFER_LOAD_SHORT: + return 17; + case AMDGPUISD::BUFFER_LOAD_UBYTE: + return 24; + case AMDGPUISD::BUFFER_LOAD_USHORT: + return 16; case 
AMDGPUISD::FP_TO_FP16: case AMDGPUISD::FP16_ZEXT: return 16; @@ -4519,7 +4761,12 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, TargetLowering::AtomicExpansionKind AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { - if (RMW->getOperation() == AtomicRMWInst::Nand) + switch (RMW->getOperation()) { + case AtomicRMWInst::Nand: + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: return AtomicExpansionKind::CmpXChg; - return AtomicExpansionKind::None; + default: + return AtomicExpansionKind::None; + } } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0d22cb2e3e20..fe7ad694943d 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -1,9 +1,8 @@ //===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -111,9 +110,23 @@ protected: SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const; SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const; + /// Split a vector type into two parts. The first part is a power of two + /// vector. The second part is whatever is left over, and is a scalar if it + /// would otherwise be a 1-vector. + std::pair getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const; + + /// Split a vector value into two parts of types LoVT and HiVT. HiVT could be + /// scalar. + std::pair splitVector(const SDValue &N, const SDLoc &DL, + const EVT &LoVT, const EVT &HighVT, + SelectionDAG &DAG) const; + /// Split a vector load into 2 loads of half the vector. SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; + /// Widen a vector load from vec3 to vec4. + SDValue WidenVectorLoad(SDValue Op, SelectionDAG &DAG) const; + /// Split a vector store into 2 stores of half the vector. SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; @@ -162,13 +175,15 @@ public: MVT getVectorIdxTy(const DataLayout &) const override; bool isSelectSupported(SelectSupportKind) const override; - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; bool ShouldShrinkFPConstant(EVT VT) const override; bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override; - bool isLoadBitCastBeneficial(EVT, EVT) const final; + bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, + const MachineMemOperand &MMO) const final; bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, @@ -212,15 +227,15 @@ public: const char* getTargetNodeName(unsigned Opcode) const override; - // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection - // for AMDGPU. - // A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036 - // 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on - // MergeConsecutiveStores() before Instruction Selection for all targets. - // Enough AMDGPU compiles go into an infinite loop ( MergeConsecutiveStores() - // merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores() - // re-merges, etc. 
) to warrant turning it off for now. - bool mergeStoresAfterLegalization() const override { return false; } + // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for + // AMDGPU. Commit r319036, + // (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6) + // turned on MergeConsecutiveStores() before Instruction Selection for all + // targets. Enough AMDGPU compiles go into an infinite loop ( + // MergeConsecutiveStores() merges two stores; LegalizeStoreOps() un-merges; + // MergeConsecutiveStores() re-merges, etc. ) to warrant turning it off for + // now. + bool mergeStoresAfterLegalization(EVT) const override { return false; } bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { return true; @@ -309,6 +324,10 @@ public: } AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + + bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N, + SDValue Addr, SDValue &VAddr, SDValue &Offset, + SDValue &SLC) const; }; namespace AMDGPUISD { @@ -463,28 +482,44 @@ enum NodeType : unsigned { INTERP_MOV, INTERP_P1, INTERP_P2, + INTERP_P1LL_F16, + INTERP_P1LV_F16, + INTERP_P2_F16, PC_ADD_REL_OFFSET, + LDS, KILL, DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, + LOAD_D16_HI, + LOAD_D16_LO, + LOAD_D16_HI_I8, + LOAD_D16_HI_U8, + LOAD_D16_LO_I8, + LOAD_D16_LO_U8, + STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, - TBUFFER_STORE_FORMAT_X3, TBUFFER_STORE_FORMAT_D16, TBUFFER_LOAD_FORMAT, TBUFFER_LOAD_FORMAT_D16, + DS_ORDERED_COUNT, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, - ATOMIC_LOAD_FADD, ATOMIC_LOAD_FMIN, ATOMIC_LOAD_FMAX, BUFFER_LOAD, + BUFFER_LOAD_UBYTE, + BUFFER_LOAD_USHORT, + BUFFER_LOAD_BYTE, + BUFFER_LOAD_SHORT, BUFFER_LOAD_FORMAT, BUFFER_LOAD_FORMAT_D16, SBUFFER_LOAD, BUFFER_STORE, + BUFFER_STORE_BYTE, + BUFFER_STORE_SHORT, BUFFER_STORE_FORMAT, BUFFER_STORE_FORMAT_D16, BUFFER_ATOMIC_SWAP, @@ -498,6 +533,10 @@ enum NodeType : unsigned { BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, BUFFER_ATOMIC_CMPSWAP, + BUFFER_ATOMIC_FADD, + BUFFER_ATOMIC_PK_FADD, + ATOMIC_FADD, + ATOMIC_PK_FADD, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp index 945c9acd379a..f4df20b8f03e 100644 --- a/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -1,9 +1,8 @@ //===- AMDGPUInline.cpp - Code to perform simple function inlining --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -40,7 +39,7 @@ using namespace llvm; #define DEBUG_TYPE "inline" static cl::opt -ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200), +ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500), cl::desc("Cost of alloca argument")); // If the amount of scratch memory to eliminate exceeds our ability to allocate @@ -50,6 +49,12 @@ static cl::opt ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost")); +// Inliner constraint to achieve reasonable compilation time +static cl::opt +MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300), + cl::desc("Maximum BB number allowed in a function after inlining" + " (compile time constraint)")); + namespace { class AMDGPUInliner : public LegacyInlinerBase { @@ -112,7 +117,8 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { Callee->hasFnAttribute(Attribute::InlineHint); if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres && !Caller->hasFnAttribute(Attribute::MinSize)) - Thres = Params.HintThreshold.getValue(); + Thres = Params.HintThreshold.getValue() * + TTIWP->getTTI(*Callee).getInliningThresholdMultiplier(); const DataLayout &DL = Caller->getParent()->getDataLayout(); if (!Callee) @@ -124,10 +130,11 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { uint64_t AllocaSize = 0; SmallPtrSet AIVisited; for (Value *PtrArg : CS.args()) { - Type *Ty = PtrArg->getType(); - if (!Ty->isPointerTy() || - Ty->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + PointerType *Ty = dyn_cast(PtrArg->getType()); + if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS && + Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) continue; + PtrArg = GetUnderlyingObject(PtrArg, DL); if (const AllocaInst *AI = dyn_cast(PtrArg)) { if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) @@ -170,7 +177,6 @@ static bool isWrapperOnlyCall(CallSite CS) { InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); - TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); if (!Callee || Callee->isDeclaration()) return llvm::InlineCost::getNever("undefined callee"); @@ -178,13 +184,15 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { if (CS.isNoInline()) return llvm::InlineCost::getNever("noinline"); + TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); if (!TTI.areInlineCompatible(Caller, Callee)) return llvm::InlineCost::getNever("incompatible"); if (CS.hasFnAttr(Attribute::AlwaysInline)) { - if (isInlineViable(*Callee)) + auto IsViable = isInlineViable(*Callee); + if (IsViable) return llvm::InlineCost::getAlways("alwaysinline viable"); - return llvm::InlineCost::getNever("alwaysinline unviable"); + return llvm::InlineCost::getNever(IsViable.message); } if (isWrapperOnlyCall(CS)) @@ -206,6 +214,15 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { return ACT->getAssumptionCache(F); }; - return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache, - None, PSI, RemarksEnabled ? &ORE : nullptr); + auto IC = llvm::getInlineCost(cast(*CS.getInstruction()), Callee, + LocalParams, TTI, GetAssumptionCache, None, PSI, + RemarksEnabled ? 
&ORE : nullptr); + + if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) { + // Single BB does not increase total BB amount, thus subtract 1 + size_t Size = Caller->size() + Callee->size() - 1; + if (MaxBB && Size > MaxBB) + return llvm::InlineCost::getNever("max number of bb exceeded"); + } + return IC; } diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 07aa7c2cc8ad..9951cbf2326e 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 2f8166da0d33..698189e14c21 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -1,9 +1,8 @@ //===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 82644be26563..4a8446955496 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -1,9 +1,8 @@ //===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -51,27 +50,21 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4, def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def AMDGPUIfOp : SDTypeProfile<1, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>] + [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>] >; def AMDGPUElseOp : SDTypeProfile<1, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>] + [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>] >; def AMDGPULoopOp : SDTypeProfile<0, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>] + [SDTCisVT<0, i1>, SDTCisVT<1, OtherVT>] >; def AMDGPUIfBreakOp : SDTypeProfile<1, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>] ->; - -def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, - [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] + [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>] >; -def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; - //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -96,7 +89,8 @@ def AMDGPUcall : SDNode<"AMDGPUISD::CALL", SDNPVariadic] >; -def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET, +def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", + SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; @@ -205,14 +199,8 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; // out = (src1 > src0) ? 1 : 0 def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; -// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own -// nodes in TargetSelectionDAG.td. -def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>; - -def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>; - def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc - SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> + SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; @@ -251,7 +239,8 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; // Special case divide FMA with scale and flags (src0 = Quotient, // src1 = Denominator, src2 = Numerator). -def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp, + [SDNPOptInGlue]>; // Single or double precision division fixup. 
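// The SDTypeProfile edits above retype the structured control-flow operands
// (if/else/loop/if-break) from i64 to i1: the lane mask becomes the target's
// abstract boolean, and its physical width (32-bit on wave32, 64-bit on
// wave64) is resolved later by register-bank selection. The operations are
// width-agnostic; a sketch of the if-break combine, under the assumption
// that it ORs the break condition into the saved mask:
#include <cstdint>

template <typename Mask> Mask ifBreak(Mask Cond, Mask Saved) {
  return Cond | Saved; // accumulate lanes that have left the loop
}

template uint32_t ifBreak<uint32_t>(uint32_t, uint32_t); // wave32
template uint64_t ifBreak<uint64_t>(uint64_t, uint64_t); // wave64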
// Special case divide fixup and flags(src0 = Quotient, src1 = @@ -370,6 +359,17 @@ def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", SDTypeProfile<1, 4, [SDTCisFP<0>]>, [SDNPInGlue]>; +def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16", + SDTypeProfile<1, 7, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16", + SDTypeProfile<1, 9, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16", + SDTypeProfile<1, 8, [SDTCisFP<0>]>, + [SDNPInGlue]>; def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, [SDNPHasChain, SDNPSideEffect]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8eb49d49b2e0..901a2eaa8829 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1,9 +1,8 @@ //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -18,10 +17,11 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" -#include "SIMachineFunctionInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -35,6 +35,7 @@ #define DEBUG_TYPE "amdgpu-isel" using namespace llvm; +using namespace MIPatternMatch; #define GET_GLOBALISEL_IMPL #define AMDGPUSubtarget GCNSubtarget @@ -60,11 +61,101 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector( const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } +static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return Reg == AMDGPU::SCC; + + auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); + const TargetRegisterClass *RC = + RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); + if (RC) { + // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the + // context of the register bank has been lost.
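+ // e.g. in wave64 a virtual register constrained to SReg_32_XM0 with an s1 + // LLT is treated as SCC below, while a 64-bit lane mask (SReg_64) is not.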
+ if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID) + return false; + const LLT Ty = MRI.getType(Reg); + return Ty.isValid() && Ty.getSizeInBits() == 1; + } + + const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); + return RB->getID() == AMDGPU::SCCRegBankID; +} + +bool AMDGPUInstructionSelector::isVCC(Register Reg, + const MachineRegisterInfo &MRI) const { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return Reg == TRI.getVCC(); + + auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); + const TargetRegisterClass *RC = + RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); + if (RC) { + const LLT Ty = MRI.getType(Reg); + return RC->hasSuperClassEq(TRI.getBoolRC()) && + Ty.isValid() && Ty.getSizeInBits() == 1; + } + + const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); + return RB->getID() == AMDGPU::VCCRegBankID; +} + bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { + const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); I.setDesc(TII.get(TargetOpcode::COPY)); + + const MachineOperand &Src = I.getOperand(1); + MachineOperand &Dst = I.getOperand(0); + Register DstReg = Dst.getReg(); + Register SrcReg = Src.getReg(); + + if (isVCC(DstReg, MRI)) { + if (SrcReg == AMDGPU::SCC) { + const TargetRegisterClass *RC + = TRI.getConstrainedRegClassForOperand(Dst, MRI); + if (!RC) + return true; + return RBI.constrainGenericRegister(DstReg, *RC, MRI); + } + + if (!isVCC(SrcReg, MRI)) { + // TODO: Should probably leave the copy and let copyPhysReg expand it. + if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI)) + return false; + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) + .addImm(0) + .addReg(SrcReg); + + if (!MRI.getRegClassOrNull(SrcReg)) + MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI)); + I.eraseFromParent(); + return true; + } + + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(Dst, MRI); + if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI)) + return false; + + // Don't constrain the source register to a class so the def instruction + // handles it (unless it's undef). + // + // FIXME: This is a hack. When selecting the def, we need to know + // specifically that the result is VCCRegBank, and not just an SGPR + // with size 1. An SReg_32 with size 1 is ambiguous with wave32. + if (Src.isUndef()) { + const TargetRegisterClass *SrcRC = + TRI.getConstrainedRegClassForOperand(Src, MRI); + if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + return false; + } + + return true; + } + for (const MachineOperand &MO : I.operands()) { if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) continue; @@ -78,15 +169,54 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const Register DefReg = I.getOperand(0).getReg(); + const LLT DefTy = MRI.getType(DefReg); + + // TODO: Verify this doesn't have insane operands (i.e.
VGPR to SGPR copy) + + const RegClassOrRegBank &RegClassOrBank = + MRI.getRegClassOrRegBank(DefReg); + + const TargetRegisterClass *DefRC + = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); + if (!DefRC) { + if (!DefTy.isValid()) { + LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); + return false; + } + + const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); + if (RB.getID() == AMDGPU::SCCRegBankID) { + LLVM_DEBUG(dbgs() << "illegal scc phi\n"); + return false; + } + + DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI); + if (!DefRC) { + LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); + return false; + } + } + + I.setDesc(TII.get(TargetOpcode::PHI)); + return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); +} + MachineOperand AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, + const TargetRegisterClass &SubRC, unsigned SubIdx) const { MachineInstr *MI = MO.getParent(); MachineBasicBlock *BB = MO.getParent()->getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register DstReg = MRI.createVirtualRegister(&SubRC); if (MO.isReg()) { unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); @@ -118,51 +248,273 @@ static int64_t getConstant(const MachineInstr *MI) { return MI->getOperand(1).getCImm()->getSExtValue(); } -bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const { +static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { + switch (Opc) { + case AMDGPU::G_AND: + return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; + case AMDGPU::G_OR: + return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; + case AMDGPU::G_XOR: + return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; + default: + llvm_unreachable("not a bit op"); + } +} + +bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); - unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + MachineOperand &Dst = I.getOperand(0); + MachineOperand &Src0 = I.getOperand(1); + MachineOperand &Src1 = I.getOperand(2); + Register DstReg = Dst.getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + + const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + if (DstRB->getID() == AMDGPU::VCCRegBankID) { + const TargetRegisterClass *RC = TRI.getBoolRC(); + unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), + RC == &AMDGPU::SReg_64RegClass); + I.setDesc(TII.get(InstOpc)); + + // FIXME: Hack to avoid turning the register bank into a register class. + // The selector for G_ICMP relies on seeing that the register bank for the + // result is VCC. In wave32, if we constrain the registers to SReg_32 here, + // it will be ambiguous whether it's a scalar or vector bool. + if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg())) + MRI.setRegClass(Src0.getReg(), RC); + if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg())) + MRI.setRegClass(Src1.getReg(), RC); + + return RBI.constrainGenericRegister(DstReg, *RC, MRI); + } - if (Size != 64) - return false; + // TODO: Should this allow an SCC bank result, and produce a copy from SCC for + // the result?
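+ // For a plain SGPR bank result the same scalar opcodes apply; roughly, + // %dst:sgpr(s64) = G_XOR %a, %b selects to %dst:sreg_64 = S_XOR_B64 %a, %b.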
+ if (DstRB->getID() == AMDGPU::SGPRRegBankID) { + unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); + I.setDesc(TII.get(InstOpc)); - DebugLoc DL = I.getDebugLoc(); + const TargetRegisterClass *RC + = TRI.getConstrainedRegClassForOperand(Dst, MRI); + if (!RC) + return false; + return RBI.constrainGenericRegister(DstReg, *RC, MRI) && + RBI.constrainGenericRegister(Src0.getReg(), *RC, MRI) && + RBI.constrainGenericRegister(Src1.getReg(), *RC, MRI); + } - MachineOperand Lo1(getSubOperand64(I.getOperand(1), AMDGPU::sub0)); - MachineOperand Lo2(getSubOperand64(I.getOperand(2), AMDGPU::sub0)); + return false; +} - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) - .add(Lo1) - .add(Lo2); +bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register DstReg = I.getOperand(0).getReg(); + const DebugLoc &DL = I.getDebugLoc(); + unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; + const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; - MachineOperand Hi1(getSubOperand64(I.getOperand(1), AMDGPU::sub1)); - MachineOperand Hi2(getSubOperand64(I.getOperand(2), AMDGPU::sub1)); + if (Size == 32) { + if (IsSALU) { + const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; + MachineInstr *Add = + BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) + .add(I.getOperand(1)) + .add(I.getOperand(2)); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + } - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) - .add(Hi1) - .add(Hi2); + if (STI.hasAddNoCarry()) { + const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; + I.setDesc(TII.get(Opc)); + I.addOperand(*MF, MachineOperand::CreateImm(0)); + I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } - BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg()) - .addReg(DstLo) - .addImm(AMDGPU::sub0) - .addReg(DstHi) - .addImm(AMDGPU::sub1); + const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; - for (MachineOperand &MO : I.explicit_operands()) { - if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - continue; - RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI); + Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass()); + MachineInstr *Add + = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) + .addDef(UnusedCarry, RegState::Dead) + .add(I.getOperand(1)) + .add(I.getOperand(2)) + .addImm(0); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); } + assert(!Sub && "illegal sub should not reach here"); + + const TargetRegisterClass &RC + = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; + const TargetRegisterClass &HalfRC + = IsSALU ? 
AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; + + MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); + MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); + MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); + MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); + + Register DstLo = MRI.createVirtualRegister(&HalfRC); + Register DstHi = MRI.createVirtualRegister(&HalfRC); + + if (IsSALU) { + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) + .add(Lo1) + .add(Lo2); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) + .add(Hi1) + .add(Hi2); + } else { + const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); + Register CarryReg = MRI.createVirtualRegister(CarryRC); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) + .addDef(CarryReg) + .add(Lo1) + .add(Lo2) + .addImm(0); + MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) + .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead) + .add(Hi1) + .add(Hi2) + .addReg(CarryReg, RegState::Kill) + .addImm(0); + + if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) + return false; + } + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(DstLo) + .addImm(AMDGPU::sub0) + .addReg(DstHi) + .addImm(AMDGPU::sub1); + + + if (!RBI.constrainGenericRegister(DstReg, RC, MRI)) + return false; + + I.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + assert(I.getOperand(2).getImm() % 32 == 0); + unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32); + const DebugLoc &DL = I.getDebugLoc(); + MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), + I.getOperand(0).getReg()) + .addReg(I.getOperand(1).getReg(), 0, SubReg); + + for (const MachineOperand &MO : Copy->operands()) { + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(MO, MRI); + if (!RC) + continue; + RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + } I.eraseFromParent(); return true; } +bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { + MachineBasicBlock *BB = MI.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + + const unsigned SrcSize = SrcTy.getSizeInBits(); + if (SrcSize < 32) + return false; + + const DebugLoc &DL = MI.getDebugLoc(); + const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI); + const unsigned DstSize = DstTy.getSizeInBits(); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI); + if (!DstRC) + return false; + + ArrayRef SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); + MachineInstrBuilder MIB = + BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); + for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { + MachineOperand &Src = MI.getOperand(I + 1); + MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); + MIB.addImm(SubRegs[I]); + + const TargetRegisterClass *SrcRC + = TRI.getConstrainedRegClassForOperand(Src, MRI); + if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI)) + return false; + } + + if 
(!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) + return false; + + MI.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { + MachineBasicBlock *BB = MI.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const int NumDst = MI.getNumOperands() - 1; + + MachineOperand &Src = MI.getOperand(NumDst); + + Register SrcReg = Src.getReg(); + Register DstReg0 = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg0); + LLT SrcTy = MRI.getType(SrcReg); + + const unsigned DstSize = DstTy.getSizeInBits(); + const unsigned SrcSize = SrcTy.getSizeInBits(); + const DebugLoc &DL = MI.getDebugLoc(); + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + + const TargetRegisterClass *SrcRC = + TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI); + if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + return false; + + const unsigned SrcFlags = getUndefRegState(Src.isUndef()); + + // Note we could have mixed SGPR and VGPR destination banks for an SGPR + // source, and this relies on the fact that the same subregister indices are + // used for both. + ArrayRef SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); + for (int I = 0, E = NumDst; I != E; ++I) { + MachineOperand &Dst = MI.getOperand(I); + BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) + .addReg(SrcReg, SrcFlags, SubRegs[I]); + + const TargetRegisterClass *DstRC = + TRI.getConstrainedRegClassForOperand(Dst, MRI); + if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI)) + return false; + } + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { - return selectG_ADD(I); + return selectG_ADD_SUB(I); } bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { @@ -170,47 +522,200 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const MachineOperand &MO = I.getOperand(0); - const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); - if (RC) + + // FIXME: Interface for getConstrainedRegClassForOperand needs work. The + // regbank check here is to know why getConstrainedRegClassForOperand failed. 
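+ // If neither a register class nor a register bank is attached, the operand + // is still fully generic and a plain IMPLICIT_DEF is safe to emit.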
+ const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI); + if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) || + (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) { + I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + return true; + } + + return false; +} + +bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32); + DebugLoc DL = I.getDebugLoc(); + MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG)) + .addDef(I.getOperand(0).getReg()) + .addReg(I.getOperand(1).getReg()) + .addReg(I.getOperand(2).getReg()) + .addImm(SubReg); + + for (const MachineOperand &MO : Ins->operands()) { + if (!MO.isReg()) + continue; + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(MO, MRI); + if (!RC) + continue; RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); - I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + } + I.eraseFromParent(); return true; } -bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { - unsigned IntrinsicID = I.getOperand(1).getIntrinsicID(); - +bool AMDGPUInstructionSelector::selectG_INTRINSIC( + MachineInstr &I, CodeGenCoverage &CoverageInfo) const { + unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID(); switch (IntrinsicID) { - default: - break; case Intrinsic::maxnum: case Intrinsic::minnum: case Intrinsic::amdgcn_cvt_pkrtz: return selectImpl(I, CoverageInfo); - - case Intrinsic::amdgcn_kernarg_segment_ptr: { - MachineFunction *MF = I.getParent()->getParent(); + case Intrinsic::amdgcn_if_break: { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const ArgDescriptor *InputPtrReg; - const TargetRegisterClass *RC; - const DebugLoc &DL = I.getDebugLoc(); - - std::tie(InputPtrReg, RC) - = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); - if (!InputPtrReg) - report_fatal_error("missing kernarg segment ptr"); - BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY)) + // FIXME: Manually selecting to avoid dealing with the SReg_1 trick + // SelectionDAG uses for wave32 vs wave64. + BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) .add(I.getOperand(0)) - .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister())); + .add(I.getOperand(2)) + .add(I.getOperand(3)); + + Register DstReg = I.getOperand(0).getReg(); + Register Src0Reg = I.getOperand(2).getReg(); + Register Src1Reg = I.getOperand(3).getReg(); + I.eraseFromParent(); + + for (Register Reg : { DstReg, Src0Reg, Src1Reg }) { + if (!MRI.getRegClassOrNull(Reg)) + MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); + } + + return true; } + default: + return selectImpl(I, CoverageInfo); + } +} + +static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { + if (Size != 32 && Size != 64) + return -1; + switch (P) { + default: + llvm_unreachable("Unknown condition code!"); + case CmpInst::ICMP_NE: + return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; + case CmpInst::ICMP_EQ: + return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; + case CmpInst::ICMP_SGT: + return Size == 32 ?
AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; + case CmpInst::ICMP_SGE: + return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; + case CmpInst::ICMP_SLT: + return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; + case CmpInst::ICMP_SLE: + return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; + case CmpInst::ICMP_UGT: + return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; + case CmpInst::ICMP_UGE: + return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; + case CmpInst::ICMP_ULT: + return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; + case CmpInst::ICMP_ULE: + return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; } - return false; +} + +int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, + unsigned Size) const { + if (Size == 64) { + if (!STI.hasScalarCompareEq64()) + return -1; + + switch (P) { + case CmpInst::ICMP_NE: + return AMDGPU::S_CMP_LG_U64; + case CmpInst::ICMP_EQ: + return AMDGPU::S_CMP_EQ_U64; + default: + return -1; + } + } + + if (Size != 32) + return -1; + + switch (P) { + case CmpInst::ICMP_NE: + return AMDGPU::S_CMP_LG_U32; + case CmpInst::ICMP_EQ: + return AMDGPU::S_CMP_EQ_U32; + case CmpInst::ICMP_SGT: + return AMDGPU::S_CMP_GT_I32; + case CmpInst::ICMP_SGE: + return AMDGPU::S_CMP_GE_I32; + case CmpInst::ICMP_SLT: + return AMDGPU::S_CMP_LT_I32; + case CmpInst::ICMP_SLE: + return AMDGPU::S_CMP_LE_I32; + case CmpInst::ICMP_UGT: + return AMDGPU::S_CMP_GT_U32; + case CmpInst::ICMP_UGE: + return AMDGPU::S_CMP_GE_U32; + case CmpInst::ICMP_ULT: + return AMDGPU::S_CMP_LT_U32; + case CmpInst::ICMP_ULE: + return AMDGPU::S_CMP_LE_U32; + default: + llvm_unreachable("Unknown condition code!"); + } +} + +bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const DebugLoc &DL = I.getDebugLoc(); + + unsigned SrcReg = I.getOperand(2).getReg(); + unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI); + + auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); + + unsigned CCReg = I.getOperand(0).getReg(); + if (isSCC(CCReg, MRI)) { + int Opcode = getS_CMPOpcode(Pred, Size); + if (Opcode == -1) + return false; + MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) + .addReg(AMDGPU::SCC); + bool Ret = + constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && + RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI); + I.eraseFromParent(); + return Ret; + } + + int Opcode = getV_CMPOpcode(Pred, Size); + if (Opcode == -1) + return false; + + MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), + I.getOperand(0).getReg()) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), + *TRI.getBoolRC(), MRI); + bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; } static MachineInstr * @@ -232,8 +737,7 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, } bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( - MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { + MachineInstr &I, CodeGenCoverage &CoverageInfo) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = 
MF->getRegInfo(); @@ -272,8 +776,72 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( I.eraseFromParent(); return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); } + case Intrinsic::amdgcn_end_cf: { + // FIXME: Manually selecting to avoid dealing with the SReg_1 trick + // SelectionDAG uses for wave32 vs wave64. + BuildMI(*BB, &I, I.getDebugLoc(), + TII.get(AMDGPU::SI_END_CF)) + .add(I.getOperand(1)); + + Register Reg = I.getOperand(1).getReg(); + I.eraseFromParent(); + + if (!MRI.getRegClassOrNull(Reg)) + MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); + return true; } - return false; + default: + return selectImpl(I, CoverageInfo); + } +} + +bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const DebugLoc &DL = I.getDebugLoc(); + + unsigned DstReg = I.getOperand(0).getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + assert(Size <= 32 || Size == 64); + const MachineOperand &CCOp = I.getOperand(1); + unsigned CCReg = CCOp.getReg(); + if (isSCC(CCReg, MRI)) { + unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : + AMDGPU::S_CSELECT_B32; + MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(CCReg); + + // The generic constrainSelectedInstRegOperands doesn't work for the scc + // register bank, because it does not cover the register class we use to + // represent it, so the register class has to be set manually here. + if (!MRI.getRegClassOrNull(CCReg)) + MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI)); + MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + + bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | + constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; + } + + // Wide VGPR select should have been split in RegBankSelect. + if (Size > 32) + return false; + + MachineInstr *Select = + BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) + .add(I.getOperand(3)) + .addImm(0) + .add(I.getOperand(2)) + .add(I.getOperand(1)); + + bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; } bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { @@ -281,10 +849,16 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = I.getDebugLoc(); + unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI); + if (PtrSize != 64) { + LLVM_DEBUG(dbgs() << "Unhandled address space\n"); + return false; + } + unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); unsigned Opcode; - // FIXME: Select store instruction based on address space + // FIXME: Remove this when integers > s32 are naturally selected.
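+ // Only the store widths matched below are handled; any other width fails + // selection here (the opcode choice still ignores the address space).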
switch (StoreSize) { default: return false; @@ -307,7 +881,8 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { .add(I.getOperand(0)) .addImm(0) // offset .addImm(0) // glc - .addImm(0); // slc + .addImm(0) // slc + .addImm(0); // dlc // Now that we selected an opcode, we need to constrain the register @@ -318,6 +893,218 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { return Ret; } +static int sizeToSubRegIndex(unsigned Size) { + switch (Size) { + case 32: + return AMDGPU::sub0; + case 64: + return AMDGPU::sub0_sub1; + case 96: + return AMDGPU::sub0_sub1_sub2; + case 128: + return AMDGPU::sub0_sub1_sub2_sub3; + case 256: + return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; + default: + if (Size < 32) + return AMDGPU::sub0; + if (Size > 256) + return -1; + return sizeToSubRegIndex(PowerOf2Ceil(Size)); + } +} + +bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned DstReg = I.getOperand(0).getReg(); + unsigned SrcReg = I.getOperand(1).getReg(); + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + if (!DstTy.isScalar()) + return false; + + const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI); + if (SrcRB != DstRB) + return false; + + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = SrcTy.getSizeInBits(); + + const TargetRegisterClass *SrcRC + = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI); + const TargetRegisterClass *DstRC + = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI); + + if (SrcSize > 32) { + int SubRegIdx = sizeToSubRegIndex(DstSize); + if (SubRegIdx == -1) + return false; + + // Deal with weird cases where the class only partially supports the subreg + // index. + SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); + if (!SrcRC) + return false; + + I.getOperand(1).setSubReg(SubRegIdx); + } + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); + return false; + } + + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; +} + +/// \returns true if a bitmask for \p Size bits will be an inline immediate. +static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { + Mask = maskTrailingOnes(Size); + int SignedMask = static_cast(Mask); + return SignedMask >= -16 && SignedMask <= 64; +} + +bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { + bool Signed = I.getOpcode() == AMDGPU::G_SEXT; + const DebugLoc &DL = I.getDebugLoc(); + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + const LLT S1 = LLT::scalar(1); + const unsigned SrcSize = SrcTy.getSizeInBits(); + const unsigned DstSize = DstTy.getSizeInBits(); + if (!DstTy.isScalar()) + return false; + + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + + if (SrcBank->getID() == AMDGPU::SCCRegBankID) { + if (SrcTy != S1 || DstSize > 64) // Invalid + return false; + + unsigned Opcode = + DstSize > 32 ? 
AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + const TargetRegisterClass *DstRC = + DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass; + + // FIXME: Create an extra copy to avoid incorrectly constraining the result + // of the scc producer. + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg) + .addReg(SrcReg); + BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(TmpReg); + + // The instruction operands are backwards from what you would expect. + BuildMI(MBB, I, DL, TII.get(Opcode), DstReg) + .addImm(0) + .addImm(Signed ? -1 : 1); + return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + } + + if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) { + if (SrcTy != S1) // Invalid + return false; + + MachineInstr *ExtI = + BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) // src0_modifiers + .addImm(0) // src0 + .addImm(0) // src1_modifiers + .addImm(Signed ? -1 : 1) // src1 + .addUse(SrcReg); + return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + } + + if (I.getOpcode() == AMDGPU::G_ANYEXT) + return selectCOPY(I); + + if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { + // 64-bit should have been split up in RegBankSelect + + // Try to use an and with a mask if it will save code size. + unsigned Mask; + if (!Signed && shouldUseAndMask(SrcSize, Mask)) { + MachineInstr *ExtI = + BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) + .addImm(Mask) + .addReg(SrcReg); + return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + } + + const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; + MachineInstr *ExtI = + BuildMI(MBB, I, DL, TII.get(BFE), DstReg) + .addReg(SrcReg) + .addImm(0) // Offset + .addImm(SrcSize); // Width + return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + } + + if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { + if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI)) + return false; + + if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { + const unsigned SextOpc = SrcSize == 8 ? + AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; + BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) + .addReg(SrcReg); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + } + + const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; + const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; + + // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. + if (DstSize > 32 && SrcSize <= 32) { + // We need a 64-bit register source, but the high bits don't matter. 
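+ // i.e. widen with an undef high half via REG_SEQUENCE and use the 64-bit + // BFE; e.g. SrcSize == 16 packs the immediate as 16 << 16 (offset 0, + // width 16).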
+ unsigned ExtReg + = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned UndefReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); + BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) + .addReg(SrcReg) + .addImm(AMDGPU::sub0) + .addReg(UndefReg) + .addImm(AMDGPU::sub1); + + BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) + .addReg(ExtReg) + .addImm(SrcSize << 16); + + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); + } + + unsigned Mask; + if (!Signed && shouldUseAndMask(SrcSize, Mask)) { + BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) + .addReg(SrcReg) + .addImm(Mask); + } else { + BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) + .addReg(SrcReg) + .addImm(SrcSize << 16); + } + + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + } + + return false; +} + bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); @@ -423,7 +1210,7 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, getAddrModeInfo(*PtrMI, MRI, AddrInfo); } -static bool isInstrUniform(const MachineInstr &MI) { +bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { if (!MI.hasOneMemOperand()) return false; @@ -445,52 +1232,6 @@ static bool isInstrUniform(const MachineInstr &MI) { return I && I->getMetadata("amdgpu.uniform"); } -static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) { - - if (LoadSize == 32) - return BaseOpcode; - - switch (BaseOpcode) { - case AMDGPU::S_LOAD_DWORD_IMM: - switch (LoadSize) { - case 64: - return AMDGPU::S_LOAD_DWORDX2_IMM; - case 128: - return AMDGPU::S_LOAD_DWORDX4_IMM; - case 256: - return AMDGPU::S_LOAD_DWORDX8_IMM; - case 512: - return AMDGPU::S_LOAD_DWORDX16_IMM; - } - break; - case AMDGPU::S_LOAD_DWORD_IMM_ci: - switch (LoadSize) { - case 64: - return AMDGPU::S_LOAD_DWORDX2_IMM_ci; - case 128: - return AMDGPU::S_LOAD_DWORDX4_IMM_ci; - case 256: - return AMDGPU::S_LOAD_DWORDX8_IMM_ci; - case 512: - return AMDGPU::S_LOAD_DWORDX16_IMM_ci; - } - break; - case AMDGPU::S_LOAD_DWORD_SGPR: - switch (LoadSize) { - case 64: - return AMDGPU::S_LOAD_DWORDX2_SGPR; - case 128: - return AMDGPU::S_LOAD_DWORDX4_SGPR; - case 256: - return AMDGPU::S_LOAD_DWORDX8_SGPR; - case 512: - return AMDGPU::S_LOAD_DWORDX16_SGPR; - } - break; - } - llvm_unreachable("Invalid base smrd opcode or size"); -} - bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef AddrInfo) const { for (const GEPInfo &GEPInfo : AddrInfo) { if (!GEPInfo.VgprParts.empty()) @@ -499,125 +1240,77 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef AddrInfo) const { return false; } -bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, - ArrayRef AddrInfo) const { - - if (!I.hasOneMemOperand()) - return false; - - if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS && - (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT) - return false; - - if (!isInstrUniform(I)) - return false; - - if (hasVgprParts(AddrInfo)) - return false; +bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { + // TODO: Can/should we insert m0 initialization here for DS instructions and + // call the normal selector? 
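+ // For now G_LOAD is left to the imported tablegen patterns (see the G_LOAD + // case in select()), so this manual path is deliberately a stub.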
+ return false; +} +bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); - const GCNSubtarget &Subtarget = MF->getSubtarget(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned DstReg = I.getOperand(0).getReg(); + MachineOperand &CondOp = I.getOperand(0); + Register CondReg = CondOp.getReg(); const DebugLoc &DL = I.getDebugLoc(); - unsigned Opcode; - unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); - - if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) { - - const GEPInfo &GEPInfo = AddrInfo[0]; - - unsigned PtrReg = GEPInfo.SgprParts[0]; - int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm); - if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) { - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addImm(EncodedImm) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); - } + unsigned BrOpcode; + Register CondPhysReg; + const TargetRegisterClass *ConstrainRC; + + // In SelectionDAG, we inspect the IR block for uniformity metadata to decide + // whether the branch is uniform when selecting the instruction. In + // GlobalISel, we should push that decision into RegBankSelect. Assume for now + // RegBankSelect knows what it's doing if the branch condition is scc, even + // though it currently does not. + if (isSCC(CondReg, MRI)) { + CondPhysReg = AMDGPU::SCC; + BrOpcode = AMDGPU::S_CBRANCH_SCC1; + ConstrainRC = &AMDGPU::SReg_32_XM0RegClass; + } else if (isVCC(CondReg, MRI)) { + // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? + // We sort of know that a VCC producer based on the register bank, that ands + // inactive lanes with 0. What if there was a logical operation with vcc + // producers in different blocks/with different exec masks? + // FIXME: Should scc->vcc copies and with exec? 
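+ // The vcc path below copies the condition into the physical VCC register + // and branches on it, roughly: $vcc = COPY %cond; S_CBRANCH_VCCNZ %bb.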
+ CondPhysReg = TRI.getVCC(); + BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; + ConstrainRC = TRI.getBoolRC(); + } else + return false; - if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS && - isUInt<32>(EncodedImm)) { - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize); - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addImm(EncodedImm) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); - } + if (!MRI.getRegClassOrNull(CondReg)) + MRI.setRegClass(CondReg, ConstrainRC); - if (isUInt<32>(GEPInfo.Imm)) { - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg) - .addImm(GEPInfo.Imm); - - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addReg(OffsetReg) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); - } - } + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) + .addReg(CondReg); + BuildMI(*BB, &I, DL, TII.get(BrOpcode)) + .addMBB(I.getOperand(1).getMBB()); - unsigned PtrReg = I.getOperand(1).getReg(); - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addImm(0) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + I.eraseFromParent(); + return true; } - -bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - DebugLoc DL = I.getDebugLoc(); - unsigned DstReg = I.getOperand(0).getReg(); - unsigned PtrReg = I.getOperand(1).getReg(); - unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); - unsigned Opcode; - - SmallVector AddrInfo; - - getAddrModeInfo(I, MRI, AddrInfo); - - if (selectSMRD(I, AddrInfo)) { - I.eraseFromParent(); - return true; - } - switch (LoadSize) { - default: - llvm_unreachable("Load size not supported\n"); - case 32: - Opcode = AMDGPU::FLAT_LOAD_DWORD; - break; - case 64: - Opcode = AMDGPU::FLAT_LOAD_DWORDX2; - break; - } + Register DstReg = I.getOperand(0).getReg(); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; + I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); + if (IsVGPR) + I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); - MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) - .add(I.getOperand(0)) - .addReg(PtrReg) - .addImm(0) // offset - .addImm(0) // glc - .addImm(0); // slc - - bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); - I.eraseFromParent(); - return Ret; + return RBI.constrainGenericRegister( + DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI); } bool AMDGPUInstructionSelector::select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { + if (I.isPHI()) + return selectPHI(I); if (!isPreISelGenericOpcode(I.getOpcode())) { if (I.isCopy()) @@ -626,28 +1319,75 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, } switch (I.getOpcode()) { - default: + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: + if (selectG_AND_OR_XOR(I)) + return true; return selectImpl(I, CoverageInfo); case TargetOpcode::G_ADD: - return selectG_ADD(I); + case TargetOpcode::G_SUB: + if (selectG_ADD_SUB(I)) + return true; + LLVM_FALLTHROUGH; + default: + return selectImpl(I, CoverageInfo); case TargetOpcode::G_INTTOPTR: case TargetOpcode::G_BITCAST: return selectCOPY(I); case TargetOpcode::G_CONSTANT: case TargetOpcode::G_FCONSTANT: return selectG_CONSTANT(I); + case TargetOpcode::G_EXTRACT: + return selectG_EXTRACT(I); + case TargetOpcode::G_MERGE_VALUES: + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_CONCAT_VECTORS: + return selectG_MERGE_VALUES(I); + case TargetOpcode::G_UNMERGE_VALUES: + return selectG_UNMERGE_VALUES(I); case TargetOpcode::G_GEP: return selectG_GEP(I); case TargetOpcode::G_IMPLICIT_DEF: return selectG_IMPLICIT_DEF(I); + case TargetOpcode::G_INSERT: + return selectG_INSERT(I); case TargetOpcode::G_INTRINSIC: return selectG_INTRINSIC(I, CoverageInfo); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo); + case TargetOpcode::G_ICMP: + if (selectG_ICMP(I)) + return true; + return selectImpl(I, CoverageInfo); case TargetOpcode::G_LOAD: - return selectG_LOAD(I); + return selectImpl(I, CoverageInfo); + case TargetOpcode::G_SELECT: + return selectG_SELECT(I); case TargetOpcode::G_STORE: + if (selectImpl(I, CoverageInfo)) + return true; return selectG_STORE(I); + case TargetOpcode::G_TRUNC: + return selectG_TRUNC(I); + case TargetOpcode::G_SEXT: + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_ANYEXT: + if (selectG_SZA_EXT(I)) { + I.eraseFromParent(); + return true; + } + + return false; + case TargetOpcode::G_BRCOND: + return selectG_BRCOND(I); + case TargetOpcode::G_FRAME_INDEX: + return selectG_FRAME_INDEX(I); + case TargetOpcode::G_FENCE: + // FIXME: Tablegen importer doesn't handle the imm operands correctly, and + // is checking for G_CONSTANT + I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE)); + return true; } return false; } @@ -660,6 +1400,26 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { } +std::pair +AMDGPUInstructionSelector::selectVOP3ModsImpl( + Register Src, const MachineRegisterInfo &MRI) const { + unsigned Mods = 0; + MachineInstr *MI = MRI.getVRegDef(Src); + + if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { + Src = MI->getOperand(1).getReg(); + Mods |= SISrcMods::NEG; + MI = MRI.getVRegDef(Src); + } + + if (MI && MI->getOpcode() == AMDGPU::G_FABS) { + Src = MI->getOperand(1).getReg(); + Mods |= SISrcMods::ABS; + } + + return std::make_pair(Src, Mods); +} + /// /// This will select either an SGPR or VGPR operand and will save us from /// having to write an extra tablegen pattern. 
@@ -672,11 +1432,18 @@ AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src0_mods - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod }}; } InstructionSelector::ComplexRendererFns @@ -690,8 +1457,274 @@ AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + SmallVector AddrInfo; + getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + + if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + return None; + + const GEPInfo &GEPInfo = AddrInfo[0]; + + if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm)) + return None; + + unsigned PtrReg = GEPInfo.SgprParts[0]; + int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); } + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + SmallVector AddrInfo; + getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + + if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + return None; + + const GEPInfo &GEPInfo = AddrInfo[0]; + unsigned PtrReg = GEPInfo.SgprParts[0]; + int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); + if (!isUInt<32>(EncodedImm)) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); } + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + SmallVector AddrInfo; + getAddrModeInfo(*MI, MRI, AddrInfo); + + // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, + // then we can select all ptr + 32-bit offsets not just immediate offsets. 
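+ // e.g. a uniform load at ptr + imm materializes the offset with S_MOV_B32 + // and feeds the S_LOAD_*_SGPR form (sgpr base plus sgpr offset).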
+ if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + return None; + + const GEPInfo &GEPInfo = AddrInfo[0]; + if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm)) + return None; + + // If we make it this far we have a load with an 32-bit immediate offset. + // It is OK to select this using a sgpr offset, because we have already + // failed trying to select this load into one of the _IMM variants since + // the _IMM Patterns are considered before the _SGPR patterns. + unsigned PtrReg = GEPInfo.SgprParts[0]; + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } + }}; +} + +template +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + InstructionSelector::ComplexRendererFns Default = {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc + }}; + + if (!STI.hasFlatInstOffsets()) + return Default; + + const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg()); + if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP) + return Default; + + Optional Offset = + getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI); + if (!Offset.hasValue()) + return Default; + + unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); + if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) + return Default; + + Register BasePtr = OpDef->getOperand(1).getReg(); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { + return selectFlatOffsetImpl(Root); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { + return selectFlatOffsetImpl(Root); +} + +// FIXME: Implement +static bool signBitIsZero(const MachineOperand &Op, + const MachineRegisterInfo &MRI) { + return false; +} + +static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { + auto PSV = PtrInfo.V.dyn_cast(); + return PSV && PSV->isStack(); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIMachineFunctionInfo *Info = MF->getInfo(); + + int64_t Offset = 0; + if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) { + Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // TODO: Should this be inside the render function? The iterator seems to + // move. 
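+ // Split the constant: everything above the 12-bit MUBUF immediate goes + // into a V_MOV_B32 used as vaddr; the low 12 bits stay in the offset field.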
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), + HighBits) + .addImm(Offset & ~4095); + + return {{[=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(Info->getScratchRSrcReg()); + }, + [=](MachineInstrBuilder &MIB) { // vaddr + MIB.addReg(HighBits); + }, + [=](MachineInstrBuilder &MIB) { // soffset + const MachineMemOperand *MMO = *MI->memoperands_begin(); + const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); + + Register SOffsetReg = isStackPtrRelative(PtrInfo) + ? Info->getStackPtrOffsetReg() + : Info->getScratchWaveOffsetReg(); + MIB.addReg(SOffsetReg); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(Offset & 4095); + }}}; + } + + assert(Offset == 0); + + // Try to fold a frame index directly into the MUBUF vaddr field, and any + // offsets. + Optional FI; + Register VAddr = Root.getReg(); + if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) { + if (isBaseWithConstantOffset(Root, MRI)) { + const MachineOperand &LHS = RootDef->getOperand(1); + const MachineOperand &RHS = RootDef->getOperand(2); + const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); + const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); + if (LHSDef && RHSDef) { + int64_t PossibleOffset = + RHSDef->getOperand(1).getCImm()->getSExtValue(); + if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && + (!STI.privateMemoryResourceIsRangeChecked() || + signBitIsZero(LHS, MRI))) { + if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) + FI = LHSDef->getOperand(1).getIndex(); + else + VAddr = LHS.getReg(); + Offset = PossibleOffset; + } + } + } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { + FI = RootDef->getOperand(1).getIndex(); + } + } + + // If we don't know this private access is a local stack object, it needs to + // be relative to the entry point's scratch wave offset register. + // TODO: Should split large offsets that don't fit like above. + // TODO: Don't use scratch wave offset just because the offset didn't fit. + Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg() + : Info->getScratchWaveOffsetReg(); + + return {{[=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(Info->getScratchRSrcReg()); + }, + [=](MachineInstrBuilder &MIB) { // vaddr + if (FI.hasValue()) + MIB.addFrameIndex(FI.getValue()); + else + MIB.addReg(VAddr); + }, + [=](MachineInstrBuilder &MIB) { // soffset + MIB.addReg(SOffset); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(Offset); + }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFScratchOffset( + MachineOperand &Root) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + int64_t Offset = 0; + if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) || + !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) + return {}; + + const MachineFunction *MF = MBB->getParent(); + const SIMachineFunctionInfo *Info = MF->getInfo(); + const MachineMemOperand *MMO = *MI->memoperands_begin(); + const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); + + Register SOffsetReg = isStackPtrRelative(PtrInfo) + ? 
Info->getStackPtrOffsetReg() + : Info->getScratchWaveOffsetReg(); + return {{ + [=](MachineInstrBuilder &MIB) { + MIB.addReg(Info->getScratchRSrcReg()); + }, // rsrc + [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset }}; } diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 449431adc561..4f489ddfb23d 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -1,9 +1,8 @@ //===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -18,7 +17,9 @@ #include "AMDGPUArgumentUsageInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/IR/InstrTypes.h" namespace { #define GET_GLOBALISEL_PREDICATE_BITSET @@ -58,24 +59,45 @@ private: GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } }; + bool isInstrUniform(const MachineInstr &MI) const; + bool isVCC(Register Reg, const MachineRegisterInfo &MRI) const; + /// tblgen-erated 'select' implementation. bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; - MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const; + MachineOperand getSubOperand64(MachineOperand &MO, + const TargetRegisterClass &SubRC, + unsigned SubIdx) const; bool selectCOPY(MachineInstr &I) const; + bool selectPHI(MachineInstr &I) const; + bool selectG_TRUNC(MachineInstr &I) const; + bool selectG_SZA_EXT(MachineInstr &I) const; bool selectG_CONSTANT(MachineInstr &I) const; - bool selectG_ADD(MachineInstr &I) const; + bool selectG_AND_OR_XOR(MachineInstr &I) const; + bool selectG_ADD_SUB(MachineInstr &I) const; + bool selectG_EXTRACT(MachineInstr &I) const; + bool selectG_MERGE_VALUES(MachineInstr &I) const; + bool selectG_UNMERGE_VALUES(MachineInstr &I) const; bool selectG_GEP(MachineInstr &I) const; bool selectG_IMPLICIT_DEF(MachineInstr &I) const; + bool selectG_INSERT(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const; + bool selectG_ICMP(MachineInstr &I) const; bool hasVgprParts(ArrayRef AddrInfo) const; void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl &AddrInfo) const; bool selectSMRD(MachineInstr &I, ArrayRef AddrInfo) const; bool selectG_LOAD(MachineInstr &I) const; + bool selectG_SELECT(MachineInstr &I) const; bool selectG_STORE(MachineInstr &I) const; + bool selectG_BRCOND(MachineInstr &I) const; + bool selectG_FRAME_INDEX(MachineInstr &I) const; + + std::pair + selectVOP3ModsImpl(Register Src, const MachineRegisterInfo &MRI) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; @@ -90,6 +112,27 @@ private: InstructionSelector::ComplexRendererFns 
selectVOP3Mods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdImm(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdImm32(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdSgpr(MachineOperand &Root) const; + + template + InstructionSelector::ComplexRendererFns + selectFlatOffsetImpl(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectFlatOffset(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectFlatOffsetSigned(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectMUBUFScratchOffen(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectMUBUFScratchOffset(MachineOperand &Root) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index eb8f2002ff2d..61bc415c839d 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -1,9 +1,8 @@ //===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,6 +11,18 @@ // //===----------------------------------------------------------------------===// +class AddressSpacesImpl { + int Flat = 0; + int Global = 1; + int Region = 2; + int Local = 3; + int Constant = 4; + int Private = 5; +} + +def AddrSpaces : AddressSpacesImpl; + + class AMDGPUInst pattern = []> : Instruction { field bit isRegisterLoad = 0; @@ -66,17 +77,15 @@ class ILFormat pattern> def TruePredicate : Predicate<"true">; -// Exists to help track down where SubtargetPredicate isn't set rather -// than letting tablegen crash with an unhelpful error. 
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">; - class PredicateControl { - Predicate SubtargetPredicate = InvalidPred; + Predicate SubtargetPredicate = TruePredicate; list AssemblerPredicates = []; Predicate AssemblerPredicate = TruePredicate; + Predicate WaveSizePredicate = TruePredicate; list OtherPredicates = []; list Predicates = !listconcat([SubtargetPredicate, - AssemblerPredicate], + AssemblerPredicate, + WaveSizePredicate], AssemblerPredicates, OtherPredicates); } @@ -326,6 +335,10 @@ def TEX_SHADOW_ARRAY : PatLeaf< // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// +class AddressSpaceList AS> { + list AddrSpaces = AS; +} + class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; }]>; @@ -344,21 +357,25 @@ class StoreHi16 : PatFrag < (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr) >; -class PrivateAddress : CodePatPred<[{ - return cast(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; -}]>; +def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>; +def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>; +def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>; -class ConstantAddress : CodePatPred<[{ - return cast(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; -}]>; +def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, + AddrSpaces.Global, + AddrSpaces.Constant ]>; +def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>; + +def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>; +def StoreAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>; + +def LoadAddress_local : AddressSpaceList<[ AddrSpaces.Local ]>; +def StoreAddress_local : AddressSpaceList<[ AddrSpaces.Local ]>; + +def LoadAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>; +def StoreAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>; -class LocalAddress : CodePatPred<[{ - return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; -class GlobalAddress : CodePatPred<[{ - return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; class GlobalLoadAddress : CodePatPred<[{ auto AS = cast(N)->getAddressSpace(); @@ -372,86 +389,126 @@ class FlatLoadAddress : CodePatPred<[{ AS == AMDGPUAS::CONSTANT_ADDRESS; }]>; -class FlatStoreAddress : CodePatPred<[{ - const auto AS = cast(N)->getAddressSpace(); - return AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::GLOBAL_ADDRESS; +class GlobalAddress : CodePatPred<[{ + return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; -class AZExtLoadBase : PatFrag<(ops node:$ptr), - (ld_node node:$ptr), [{ - LoadSDNode *L = cast(N); - return L->getExtensionType() == ISD::ZEXTLOAD || - L->getExtensionType() == ISD::EXTLOAD; +class PrivateAddress : CodePatPred<[{ + return cast(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; }]>; -def az_extload : AZExtLoadBase ; - -def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; +class LocalAddress : CodePatPred<[{ + return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; -def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; +class RegionAddress : CodePatPred<[{ + return cast(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; }]>; -def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; +class 
FlatStoreAddress : CodePatPred<[{ + const auto AS = cast(N)->getAddressSpace(); + return AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::GLOBAL_ADDRESS; }]>; -class PrivateLoad : LoadFrag , PrivateAddress; +// TODO: Remove these when stores to new PatFrag format. class PrivateStore : StoreFrag , PrivateAddress; - -class LocalLoad : LoadFrag , LocalAddress; class LocalStore : StoreFrag , LocalAddress; - -class GlobalLoad : LoadFrag, GlobalLoadAddress; +class RegionStore : StoreFrag , RegionAddress; class GlobalStore : StoreFrag, GlobalAddress; - -class FlatLoad : LoadFrag , FlatLoadAddress; class FlatStore : StoreFrag , FlatStoreAddress; -class ConstantLoad : LoadFrag , ConstantAddress; +foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { +let AddressSpaces = !cast("LoadAddress_"#as).AddrSpaces in { -def load_private : PrivateLoad ; -def az_extloadi8_private : PrivateLoad ; -def sextloadi8_private : PrivateLoad ; -def az_extloadi16_private : PrivateLoad ; -def sextloadi16_private : PrivateLoad ; +def load_#as : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; +} -def store_private : PrivateStore ; -def truncstorei8_private : PrivateStore; -def truncstorei16_private : PrivateStore ; -def store_hi16_private : StoreHi16 , PrivateAddress; -def truncstorei8_hi16_private : StoreHi16, PrivateAddress; +def extloadi8_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} + +def extloadi16_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} +def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} + +def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} + +def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} + +def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} + +def atomic_load_32_#as : PatFrag<(ops node:$ptr), (atomic_load_32 node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i32; +} -def load_global : GlobalLoad ; -def sextloadi8_global : GlobalLoad ; -def az_extloadi8_global : GlobalLoad ; -def sextloadi16_global : GlobalLoad ; -def az_extloadi16_global : GlobalLoad ; -def atomic_load_global : GlobalLoad; +def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i64; +} + +def store_#as : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} + +// truncstore fragments. +def truncstore_#as : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 1; +} + +// TODO: We don't really need the truncstore here. We can use +// unindexedstore with MemoryVT directly, which will save an +// unnecessary check that the memory size is less than the value type +// in the generated matcher table. +def truncstorei8_#as : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i8; +} + +def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i16; +} + +defm atomic_store_#as : binary_atomic_op; + +} // End let AddressSpaces = ... 
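The effect of the AddrSpaces wrapper above is worth spelling out: each fragment instantiated by the foreach only matches memory nodes whose address space is in the listed set. A minimal C++ sketch of the membership test the generated predicate is assumed to reduce to (the helper name is hypothetical, not LLVM API):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

// Hypothetical helper, not the generated code: the AddrSpaces list on a
// PatFrag effectively becomes this membership test in the tblgen-erated
// predicate, replacing the hand-written CodePatPred classes it supersedes.
static bool matchesAddressSpace(const llvm::MemSDNode *N,
                                llvm::ArrayRef<unsigned> AddrSpaces) {
  return llvm::is_contained(AddrSpaces, N->getAddressSpace());
}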
+} // End foreach AddrSpace + + +def store_hi16_private : StoreHi16 , PrivateAddress; +def truncstorei8_hi16_private : StoreHi16, PrivateAddress; -def store_global : GlobalStore ; -def truncstorei8_global : GlobalStore ; -def truncstorei16_global : GlobalStore ; def store_atomic_global : GlobalStore; def truncstorei8_hi16_global : StoreHi16 , GlobalAddress; def truncstorei16_hi16_global : StoreHi16 , GlobalAddress; -def load_local : LocalLoad ; -def az_extloadi8_local : LocalLoad ; -def sextloadi8_local : LocalLoad ; -def az_extloadi16_local : LocalLoad ; -def sextloadi16_local : LocalLoad ; -def atomic_load_32_local : LocalLoad; -def atomic_load_64_local : LocalLoad; - -def store_local : LocalStore ; -def truncstorei8_local : LocalStore ; -def truncstorei16_local : LocalStore ; def store_local_hi16 : StoreHi16 , LocalAddress; def truncstorei8_local_hi16 : StoreHi16, LocalAddress; def atomic_store_local : LocalStore ; @@ -472,34 +529,24 @@ def store_align16_local : Aligned16Bytes < (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) >; -def load_flat : FlatLoad ; -def az_extloadi8_flat : FlatLoad ; -def sextloadi8_flat : FlatLoad ; -def az_extloadi16_flat : FlatLoad ; -def sextloadi16_flat : FlatLoad ; -def atomic_load_flat : FlatLoad; - -def store_flat : FlatStore ; -def truncstorei8_flat : FlatStore ; -def truncstorei16_flat : FlatStore ; def atomic_store_flat : FlatStore ; def truncstorei8_hi16_flat : StoreHi16, FlatStoreAddress; def truncstorei16_hi16_flat : StoreHi16, FlatStoreAddress; -def constant_load : ConstantLoad; -def sextloadi8_constant : ConstantLoad ; -def az_extloadi8_constant : ConstantLoad ; -def sextloadi16_constant : ConstantLoad ; -def az_extloadi16_constant : ConstantLoad ; - - class local_binary_atomic_op : PatFrag<(ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), [{ return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; +class region_binary_atomic_op : + PatFrag<(ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; +}]>; + + def atomic_swap_local : local_binary_atomic_op; def atomic_load_add_local : local_binary_atomic_op; def atomic_load_sub_local : local_binary_atomic_op; @@ -524,13 +571,22 @@ class AtomicCmpSwapLocal : PatFrag< return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; +class AtomicCmpSwapRegion : PatFrag< + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast(N); + return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; +}]>; + def atomic_cmp_swap_local : AtomicCmpSwapLocal ; +class global_binary_atomic_op_frag : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + multiclass global_binary_atomic_op { - def "" : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + def "" : global_binary_atomic_op_frag; def _noret : PatFrag< (ops node:$ptr, node:$value), @@ -585,7 +641,6 @@ int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding int FP16_ONE = 0x3C00; int FP16_NEG_ONE = 0xBC00; -int V2FP16_ONE = 0x3C003C00; int FP32_ONE = 0x3f800000; int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; @@ -626,9 +681,7 @@ class Extract_Element { - let SubtargetPredicate = TruePredicate; -} +>; /* Insert element pattern 
*/ class Insert_Element { - let SubtargetPredicate = TruePredicate; -} +>; // XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer // can handle COPY instructions. @@ -811,7 +862,7 @@ multiclass IntMed3Pat { - // This matches 16 permutations of + // This matches 16 permutations of // min(max(a, b), max(min(a, b), c)) def : AMDGPUPat < (min (max_oneuse vt:$src0, vt:$src1), @@ -819,7 +870,7 @@ multiclass IntMed3Pat; - // This matches 16 permutations of + // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) def : AMDGPUPat < (max (min_oneuse vt:$src0, vt:$src1), @@ -827,7 +878,7 @@ multiclass IntMed3Pat; } - + // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp deleted file mode 100644 index 02108ca3ddd7..000000000000 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// AMDGPU Implementation of the IntrinsicInfo class. -// -//===-----------------------------------------------------------------------===// - -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" - -using namespace llvm; - -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() - : TargetIntrinsicInfo() {} - -static const char *const IntrinsicNameTable[] = { -#define GET_INTRINSIC_NAME_TABLE -#include "AMDGPUGenIntrinsicImpl.inc" -#undef GET_INTRINSIC_NAME_TABLE -}; - -namespace { -#define GET_INTRINSIC_ATTRIBUTES -#include "AMDGPUGenIntrinsicImpl.inc" -#undef GET_INTRINSIC_ATTRIBUTES -} - -StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID, - ArrayRef Tys) const { - if (IntrID < Intrinsic::num_intrinsics) - return StringRef(); - - assert(IntrID < SIIntrinsic::num_AMDGPU_intrinsics && - "Invalid intrinsic ID"); - - return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; -} - -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned NumTys) const { - return getName(IntrID, makeArrayRef(Tys, NumTys)).str(); -} - -FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID, - ArrayRef Tys) const { - // FIXME: Re-use Intrinsic::getType machinery - llvm_unreachable("unhandled intrinsic"); -} - -unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, - unsigned Len) const { - StringRef Name(NameData, Len); - if (!Name.startswith("llvm.")) - return 0; // All intrinsics start with 'llvm.' - - // Look for a name match in our table. If the intrinsic is not overloaded, - // require an exact match. If it is overloaded, require a prefix match. The - // AMDGPU enum enum starts at Intrinsic::num_intrinsics. - int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name); - if (Idx >= 0) { - bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]); - return IsPrefixMatch == isOverloaded(Idx + 1) - ? 
Intrinsic::num_intrinsics + Idx - : 0; - } - - return 0; -} - -bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { -// Overload Table -#define GET_INTRINSIC_OVERLOAD_TABLE -#include "AMDGPUGenIntrinsicImpl.inc" -#undef GET_INTRINSIC_OVERLOAD_TABLE -} - -Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - ArrayRef Tys) const { - FunctionType *FTy = getType(M->getContext(), IntrID, Tys); - Function *F - = cast(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); - - AttributeList AS = - getAttributes(M->getContext(), static_cast(IntrID)); - F->setAttributes(AS); - return F; -} - -Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - Type **Tys, - unsigned NumTys) const { - return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys)); -} diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h deleted file mode 100644 index a1a094dded23..000000000000 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ /dev/null @@ -1,58 +0,0 @@ -//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// Interface for the AMDGPU Implementation of the Intrinsic Info class. -// -//===-----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H - -#include "llvm/IR/Intrinsics.h" -#include "llvm/Target/TargetIntrinsicInfo.h" - -namespace llvm { -class TargetMachine; - -namespace SIIntrinsic { -enum ID { - last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, -#define GET_INTRINSIC_ENUM_VALUES -#include "AMDGPUGenIntrinsicEnums.inc" -#undef GET_INTRINSIC_ENUM_VALUES - , num_AMDGPU_intrinsics -}; - -} // end namespace AMDGPUIntrinsic - -class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo { -public: - AMDGPUIntrinsicInfo(); - - StringRef getName(unsigned IntrId, ArrayRef Tys = None) const; - - std::string getName(unsigned IntrId, Type **Tys = nullptr, - unsigned NumTys = 0) const override; - - unsigned lookupName(const char *Name, unsigned Len) const override; - bool isOverloaded(unsigned IID) const override; - Function *getDeclaration(Module *M, unsigned ID, - Type **Tys = nullptr, - unsigned NumTys = 0) const override; - - Function *getDeclaration(Module *M, unsigned ID, - ArrayRef = None) const; - - FunctionType *getType(LLVMContext &Context, unsigned ID, - ArrayRef Tys = None) const; -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index ef85c1040545..670f6225fbf7 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1,9 +1,8 @@ //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -15,17 +14,93 @@ #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPUTargetMachine.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" +#define DEBUG_TYPE "amdgpu-legalinfo" + using namespace llvm; using namespace LegalizeActions; +using namespace LegalizeMutations; +using namespace LegalityPredicates; + + +static LegalityPredicate isMultiple32(unsigned TypeIdx, + unsigned MaxSize = 512) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const LLT EltTy = Ty.getScalarType(); + return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; + }; +} + +static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + return Ty.isVector() && + Ty.getNumElements() % 2 != 0 && + Ty.getElementType().getSizeInBits() < 32; + }; +} -AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, - const GCNTargetMachine &TM) { +static LegalizeMutation oneMoreElement(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const LLT EltTy = Ty.getElementType(); + return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); + }; +} + +static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const LLT EltTy = Ty.getElementType(); + unsigned Size = Ty.getSizeInBits(); + unsigned Pieces = (Size + 63) / 64; + unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; + return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); + }; +} + +static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { + return [=](const LegalityQuery &Query) { + const LLT QueryTy = Query.Types[TypeIdx]; + return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; + }; +} + +static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT QueryTy = Query.Types[TypeIdx]; + return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; + }; +} + +// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of +// v2s16. 
+static LegalityPredicate isRegisterType(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + if (Ty.isVector()) { + const int EltSize = Ty.getElementType().getSizeInBits(); + return EltSize == 32 || EltSize == 64 || + (EltSize == 16 && Ty.getNumElements() % 2 == 0) || + EltSize == 128 || EltSize == 256; + } + + return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512; + }; +} + +AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, + const GCNTargetMachine &TM) + : ST(ST_) { using namespace TargetOpcode; auto GetAddrSpacePtr = [&TM](unsigned AS) { @@ -33,13 +108,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, }; const LLT S1 = LLT::scalar(1); + const LLT S8 = LLT::scalar(8); + const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); + const LLT S128 = LLT::scalar(128); + const LLT S256 = LLT::scalar(256); const LLT S512 = LLT::scalar(512); const LLT V2S16 = LLT::vector(2, 16); const LLT V4S16 = LLT::vector(4, 16); - const LLT V8S16 = LLT::vector(8, 16); const LLT V2S32 = LLT::vector(2, 32); const LLT V3S32 = LLT::vector(3, 32); @@ -79,156 +157,428 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, const LLT CodePtr = FlatPtr; - const LLT AddrSpaces[] = { - GlobalPtr, - ConstantPtr, - LocalPtr, - FlatPtr, - PrivatePtr + const std::initializer_list AddrSpaces64 = { + GlobalPtr, ConstantPtr, FlatPtr + }; + + const std::initializer_list AddrSpaces32 = { + LocalPtr, PrivatePtr + }; + + const std::initializer_list FPTypesBase = { + S32, S64 + }; + + const std::initializer_list FPTypes16 = { + S32, S64, S16 + }; + + const std::initializer_list FPTypesPK16 = { + S32, S64, S16, V2S16 }; setAction({G_BRCOND, S1}, Legal); - setAction({G_ADD, S32}, Legal); - setAction({G_ASHR, S32}, Legal); - setAction({G_SUB, S32}, Legal); - setAction({G_MUL, S32}, Legal); + // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more + // elements for v3s16 + getActionDefinitionsBuilder(G_PHI) + .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) + .legalFor(AllS32Vectors) + .legalFor(AllS64Vectors) + .legalFor(AddrSpaces64) + .legalFor(AddrSpaces32) + .clampScalar(0, S32, S256) + .widenScalarToNextPow2(0, 32) + .clampMaxNumElements(0, S32, 16) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .legalIf(isPointer(0)); - // FIXME: 64-bit ones only legal for scalar + if (ST.has16BitInsts()) { + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + .legalFor({S32, S16}) + .clampScalar(0, S16, S32) + .scalarize(0); + } else { + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + .legalFor({S32}) + .clampScalar(0, S32, S32) + .scalarize(0); + } + + getActionDefinitionsBuilder({G_UMULH, G_SMULH}) + .legalFor({S32}) + .clampScalar(0, S32, S32) + .scalarize(0); + + // Report legal for any types we can handle anywhere. For the cases only legal + // on the SALU, RegBankSelect will be able to re-legalize. 
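A note on reading these rule chains: rules apply in the order they are chained, and getAction reports the first action that fires. Against the G_UMULH/G_SMULH definitions above, a 16-bit query falls past legalFor({S32}) and hits the clamp. A minimal sketch, assuming a fully built LegalizerInfo (the helper name is hypothetical):

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

// With only .legalFor({S32}) and .clampScalar(0, S32, S32), a 16-bit
// G_UMULH is not legal; the clamp reports WidenScalar to s32.
static bool umulhS16Widens(const LegalizerInfo &LI) {
  const LLT S16 = LLT::scalar(16);
  LegalizeActionStep Step = LI.getAction({TargetOpcode::G_UMULH, {S16}});
  return Step.Action == LegalizeActions::WidenScalar &&
         Step.NewType == LLT::scalar(32);
}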
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) - .legalFor({S32, S1, S64, V2S32}); + .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) + .clampScalar(0, S32, S64) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0)) + .widenScalarToNextPow2(0) + .scalarize(0); getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) - .legalFor({{S32, S1}}); + .legalFor({{S32, S1}}) + .clampScalar(0, S32, S32); - setAction({G_BITCAST, V2S16}, Legal); - setAction({G_BITCAST, 1, S32}, Legal); + getActionDefinitionsBuilder(G_BITCAST) + .legalForCartesianProduct({S32, V2S16}) + .legalForCartesianProduct({S64, V2S32, V4S16}) + .legalForCartesianProduct({V2S64, V4S32}) + // Don't worry about the size constraint. + .legalIf(all(isPointer(0), isPointer(1))); - setAction({G_BITCAST, S32}, Legal); - setAction({G_BITCAST, 1, V2S16}, Legal); - - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64}); + if (ST.has16BitInsts()) { + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64, S16}) + .clampScalar(0, S16, S64); + } else { + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64}) + .clampScalar(0, S32, S64); + } - // G_IMPLICIT_DEF is a no-op so we can make it legal for any value type that - // can fit in a register. - // FIXME: We need to legalize several more operations before we can add - // a test case for size > 512. getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalIf([=](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() <= 512; - }) - .clampScalar(0, S1, S512); + .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr, + ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampScalarOrElt(0, S32, S512) + .legalIf(isMultiple32(0)) + .widenScalarToNextPow2(0, 32) + .clampMaxNumElements(0, S32, 16); - getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({S1, S32, S64}); // FIXME: i1 operands to intrinsics should always be legal, but other i1 // values may not be legal. We need to figure out how to distinguish // between these two scenarios. 
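For the G_BITCAST rule above, legalForCartesianProduct({S32, V2S16}) marks every ordered (Type0, Type1) pair drawn from the set as legal, so s32 to v2s16 casts are accepted in both directions. A toy model of the first two product sets (plain C++, not the LLVM API):

#include <initializer_list>
#include <string>

// Toy membership test over type names; stands in for the real LLT sets.
static bool inSet(std::initializer_list<const char *> Set,
                  const std::string &T) {
  for (const char *S : Set)
    if (T == S)
      return true;
  return false;
}

// Mirrors the first two G_BITCAST products above: {s32, v2s16} and
// {s64, v2s32, v4s16}; a pair is legal iff both types come from one set.
static bool bitcastPairLegal(const std::string &T0, const std::string &T1) {
  return (inSet({"s32", "v2s16"}, T0) && inSet({"s32", "v2s16"}, T1)) ||
         (inSet({"s64", "v2s32", "v4s16"}, T0) &&
          inSet({"s64", "v2s32", "v4s16"}, T1));
}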
- setAction({G_CONSTANT, S1}, Legal); + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({S1, S32, S64, GlobalPtr, + LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) + .clampScalar(0, S32, S64) + .widenScalarToNextPow2(0) + .legalIf(isPointer(0)); setAction({G_FRAME_INDEX, PrivatePtr}, Legal); - getActionDefinitionsBuilder( - { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA}) + auto &FPOpActions = getActionDefinitionsBuilder( + { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE}) .legalFor({S32, S64}); - getActionDefinitionsBuilder(G_FPTRUNC) - .legalFor({{S32, S64}}); + if (ST.has16BitInsts()) { + if (ST.hasVOP3PInsts()) + FPOpActions.legalFor({S16, V2S16}); + else + FPOpActions.legalFor({S16}); + } - // Use actual fsub instruction - setAction({G_FSUB, S32}, Legal); + auto &MinNumMaxNum = getActionDefinitionsBuilder({ + G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); + + if (ST.hasVOP3PInsts()) { + MinNumMaxNum.customFor(FPTypesPK16) + .clampMaxNumElements(0, S16, 2) + .clampScalar(0, S16, S64) + .scalarize(0); + } else if (ST.has16BitInsts()) { + MinNumMaxNum.customFor(FPTypes16) + .clampScalar(0, S16, S64) + .scalarize(0); + } else { + MinNumMaxNum.customFor(FPTypesBase) + .clampScalar(0, S32, S64) + .scalarize(0); + } - // Must use fadd + fneg - setAction({G_FSUB, S64}, Lower); + // TODO: Implement + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); - setAction({G_FCMP, S1}, Legal); - setAction({G_FCMP, 1, S32}, Legal); - setAction({G_FCMP, 1, S64}, Legal); + if (ST.hasVOP3PInsts()) + FPOpActions.clampMaxNumElements(0, S16, 2); + FPOpActions + .scalarize(0) + .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); - setAction({G_ZEXT, S64}, Legal); - setAction({G_ZEXT, 1, S32}, Legal); + if (ST.has16BitInsts()) { + getActionDefinitionsBuilder(G_FSQRT) + .legalFor({S32, S64, S16}) + .scalarize(0) + .clampScalar(0, S16, S64); + } else { + getActionDefinitionsBuilder(G_FSQRT) + .legalFor({S32, S64}) + .scalarize(0) + .clampScalar(0, S32, S64); + } - setAction({G_SEXT, S64}, Legal); - setAction({G_SEXT, 1, S32}, Legal); + getActionDefinitionsBuilder(G_FPTRUNC) + .legalFor({{S32, S64}, {S16, S32}}) + .scalarize(0); - setAction({G_ANYEXT, S64}, Legal); - setAction({G_ANYEXT, 1, S32}, Legal); + getActionDefinitionsBuilder(G_FPEXT) + .legalFor({{S64, S32}, {S32, S16}}) + .lowerFor({{S64, S16}}) // FIXME: Implement + .scalarize(0); - setAction({G_FPTOSI, S32}, Legal); - setAction({G_FPTOSI, 1, S32}, Legal); + // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 
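The TODO above concerns the G_FCOPYSIGN lowering requested just below: copysign expands to plain bit operations, which the AMDGPU selector is expected to match to V_BFI_B32. A hedged sketch of that bit pattern for f32 payloads:

#include <cstdint>

// Take the magnitude bits from the first operand and the sign bit from
// the second; this is the select-by-mask shape V_BFI_B32 implements.
static uint32_t fcopysign32(uint32_t Mag, uint32_t Sign) {
  const uint32_t SignMask = UINT32_C(1) << 31;
  return (Mag & ~SignMask) | (Sign & SignMask);
}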
+ getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); - setAction({G_SITOFP, S32}, Legal); - setAction({G_SITOFP, 1, S32}, Legal); + getActionDefinitionsBuilder(G_FSUB) + // Use actual fsub instruction + .legalFor({S32}) + // Must use fadd + fneg + .lowerFor({S64, S16, V2S16}) + .scalarize(0) + .clampScalar(0, S32, S64); - setAction({G_UITOFP, S32}, Legal); - setAction({G_UITOFP, 1, S32}, Legal); + getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) + .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, + {S32, S1}, {S64, S1}, {S16, S1}, + // FIXME: Hack + {S64, LLT::scalar(33)}, + {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) + .scalarize(0); - setAction({G_FPTOUI, S32}, Legal); - setAction({G_FPTOUI, 1, S32}, Legal); + getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) + .legalFor({{S32, S32}, {S64, S32}}) + .lowerFor({{S32, S64}}) + .customFor({{S64, S64}}) + .scalarize(0); - setAction({G_FPOW, S32}, Legal); - setAction({G_FEXP2, S32}, Legal); - setAction({G_FLOG2, S32}, Legal); + getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) + .legalFor({{S32, S32}, {S32, S64}}) + .scalarize(0); - getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND}) - .legalFor({S32, S64}); + getActionDefinitionsBuilder(G_INTRINSIC_ROUND) + .legalFor({S32, S64}) + .scalarize(0); - for (LLT PtrTy : AddrSpaces) { - LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits()); - setAction({G_GEP, PtrTy}, Legal); - setAction({G_GEP, 1, IdxTy}, Legal); + if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) + .legalFor({S32, S64}) + .clampScalar(0, S32, S64) + .scalarize(0); + } else { + getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) + .legalFor({S32}) + .customFor({S64}) + .clampScalar(0, S32, S64) + .scalarize(0); } + getActionDefinitionsBuilder(G_GEP) + .legalForCartesianProduct(AddrSpaces64, {S64}) + .legalForCartesianProduct(AddrSpaces32, {S32}) + .scalarize(0); + setAction({G_BLOCK_ADDR, CodePtr}, Legal); - setAction({G_ICMP, S1}, Legal); - setAction({G_ICMP, 1, S32}, Legal); + auto &CmpBuilder = + getActionDefinitionsBuilder(G_ICMP) + .legalForCartesianProduct( + {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) + .legalFor({{S1, S32}, {S1, S64}}); + if (ST.has16BitInsts()) { + CmpBuilder.legalFor({{S1, S16}}); + } + + CmpBuilder + .widenScalarToNextPow2(1) + .clampScalar(1, S32, S64) + .scalarize(0) + .legalIf(all(typeIs(0, S1), isPointer(1))); + + getActionDefinitionsBuilder(G_FCMP) + .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) + .widenScalarToNextPow2(1) + .clampScalar(1, S32, S64) + .scalarize(0); + + // FIXME: fexp, flog2, flog10 needs to be custom lowered. + getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, + G_FLOG, G_FLOG2, G_FLOG10}) + .legalFor({S32}) + .scalarize(0); + + // The 64-bit versions produce 32-bit results, but only on the SALU. 
+ getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, + G_CTTZ, G_CTTZ_ZERO_UNDEF, + G_CTPOP}) + .legalFor({{S32, S32}, {S32, S64}}) + .clampScalar(0, S32, S32) + .clampScalar(1, S32, S64) + .scalarize(0) + .widenScalarToNextPow2(0, 32) + .widenScalarToNextPow2(1, 32); + + // TODO: Expand for > s32 + getActionDefinitionsBuilder(G_BSWAP) + .legalFor({S32}) + .clampScalar(0, S32, S32) + .scalarize(0); + + if (ST.has16BitInsts()) { + if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, V2S16}) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampMaxNumElements(0, S16, 2) + .clampScalar(0, S16, S32) + .widenScalarToNextPow2(0) + .scalarize(0); + } else { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16}) + .widenScalarToNextPow2(0) + .clampScalar(0, S16, S32) + .scalarize(0); + } + } else { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32}) + .clampScalar(0, S32, S32) + .widenScalarToNextPow2(0) + .scalarize(0); + } - setAction({G_CTLZ, S32}, Legal); - setAction({G_CTLZ_ZERO_UNDEF, S32}, Legal); - setAction({G_CTTZ, S32}, Legal); - setAction({G_CTTZ_ZERO_UNDEF, S32}, Legal); - setAction({G_BSWAP, S32}, Legal); - setAction({G_CTPOP, S32}, Legal); + auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx0].getSizeInBits() < + Query.Types[TypeIdx1].getSizeInBits(); + }; + }; + + auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx0].getSizeInBits() > + Query.Types[TypeIdx1].getSizeInBits(); + }; + }; getActionDefinitionsBuilder(G_INTTOPTR) - .legalIf([](const LegalityQuery &Query) { - return true; - }); + // List the common cases + .legalForCartesianProduct(AddrSpaces64, {S64}) + .legalForCartesianProduct(AddrSpaces32, {S32}) + .scalarize(0) + // Accept any address space as long as the size matches + .legalIf(sameSize(0, 1)) + .widenScalarIf(smallerThan(1, 0), + [](const LegalityQuery &Query) { + return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); + }) + .narrowScalarIf(greaterThan(1, 0), + [](const LegalityQuery &Query) { + return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); + }); getActionDefinitionsBuilder(G_PTRTOINT) - .legalIf([](const LegalityQuery &Query) { - return true; - }); + // List the common cases + .legalForCartesianProduct(AddrSpaces64, {S64}) + .legalForCartesianProduct(AddrSpaces32, {S32}) + .scalarize(0) + // Accept any address space as long as the size matches + .legalIf(sameSize(0, 1)) + .widenScalarIf(smallerThan(0, 1), + [](const LegalityQuery &Query) { + return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); + }) + .narrowScalarIf( + greaterThan(0, 1), + [](const LegalityQuery &Query) { + return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); + }); + + if (ST.hasFlatAddressSpace()) { + getActionDefinitionsBuilder(G_ADDRSPACE_CAST) + .scalarize(0) + .custom(); + } + // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we + // handle some operations by just promoting the register during + // selection. There are also d16 loads on GFX9+ which preserve the high bits. 
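The load/store rules that follow lean on the predicate/mutation pairing used throughout this file: a LegalityPredicate decides whether a rule fires, and a LegalizeMutation returns the (type index, new type) to apply. A minimal sketch mirroring the narrowScalarIf pair below (helper names are hypothetical):

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include <utility>

using namespace llvm;

// Fires when the memory access is narrower than the register type.
static LegalityPredicate memNarrowerThanReg(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return Query.MMODescrs[0].SizeInBits <
           Query.Types[TypeIdx].getSizeInBits();
  };
}

// Requests that the flagged type be rewritten to s32.
static LegalizeMutation changeToS32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return std::make_pair(TypeIdx, LLT::scalar(32));
  };
}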
getActionDefinitionsBuilder({G_LOAD, G_STORE})
-    .legalIf([=, &ST](const LegalityQuery &Query) {
+    .narrowScalarIf([](const LegalityQuery &Query) {
+          unsigned Size = Query.Types[0].getSizeInBits();
+          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+          return (Size > 32 && MemSize < Size);
+        },
+        [](const LegalityQuery &Query) {
+          return std::make_pair(0, LLT::scalar(32));
+        })
+    .fewerElementsIf([=](const LegalityQuery &Query) {
+          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+          return (MemSize == 96) &&
+                 Query.Types[0].isVector() &&
+                 !ST.hasDwordx3LoadStores();
+        },
+        [=](const LegalityQuery &Query) {
+          return std::make_pair(0, V2S32);
+        })
+    .legalIf([=](const LegalityQuery &Query) {
       const LLT &Ty0 = Query.Types[0];
+      unsigned Size = Ty0.getSizeInBits();
+      unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+      if (Size < 32 || (Size > 32 && MemSize < Size))
+        return false;
+
+      if (Ty0.isVector() && Size != MemSize)
+        return false;
+
       // TODO: Decompose private loads into 4-byte components.
       // TODO: Illegal flat loads on SI
-      switch (Ty0.getSizeInBits()) {
+      switch (MemSize) {
+      case 8:
+      case 16:
+        return Size == 32;
       case 32:
       case 64:
       case 128:
         return true;
       case 96:
-        // XXX hasLoadX3
-        return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);
+        return ST.hasDwordx3LoadStores();
       case 256:
       case 512:
-        // TODO: constant loads
+        // TODO: Possibly support loads of i256 and i512. This will require
+        // adding i256 and i512 types to MVT in order to be able to use
+        // TableGen.
+        // TODO: Add support for other vector types, this will require
+        // defining more value mappings for the new types.
+        return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
+                                  Ty0.getScalarType().getSizeInBits() == 64);
+
       default:
         return false;
       }
-    });
+    })
+    .clampScalar(0, S32, S64);
+
+  // FIXME: Handle alignment requirements.
+  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
+                       .legalForTypesWithMemDesc({
+                           {S32, GlobalPtr, 8, 8},
+                           {S32, GlobalPtr, 16, 8},
+                           {S32, LocalPtr, 8, 8},
+                           {S32, LocalPtr, 16, 8},
+                           {S32, PrivatePtr, 8, 8},
+                           {S32, PrivatePtr, 16, 8}});
+  if (ST.hasFlatAddressSpace()) {
+    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
+                                       {S32, FlatPtr, 16, 8}});
+  }
+
+  ExtLoads.clampScalar(0, S32, S32)
+          .widenScalarToNextPow2(0)
+          .unsupportedIfMemSizeNotPow2()
+          .lower();
+
   auto &Atomics = getActionDefinitionsBuilder(
     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
@@ -240,84 +590,805 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
   }

-  setAction({G_SELECT, S32}, Legal);
-  setAction({G_SELECT, 1, S1}, Legal);
+  // TODO: Pointer types, any 32-bit or 64-bit vector
+  getActionDefinitionsBuilder(G_SELECT)
+    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
+          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
+          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
+    .clampScalar(0, S16, S64)
+    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
+    .scalarize(1)
+    .clampMaxNumElements(0, S32, 2)
+    .clampMaxNumElements(0, LocalPtr, 2)
+    .clampMaxNumElements(0, PrivatePtr, 2)
+    .scalarize(0)
+    .widenScalarToNextPow2(0)
+    .legalIf(all(isPointer(0), typeIs(1, S1)));

-  setAction({G_SHL, S32}, Legal);
+  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
+  // be more flexible with the shift amount type.
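One clarification on the extending-load rules above: each legalForTypesWithMemDesc entry is assumed to read {result type, pointer type, memory size in bits, minimum alignment in bits}, so {S32, GlobalPtr, 8, 8} is a byte-sized extending load from the global address space into a 32-bit register. A toy mirror of that tuple (not the LLVM type):

// Toy mirror of the builder's tuple, with the assumed field order.
struct ExtLoadDesc {
  const char *ResultTy;      // register type produced, e.g. "s32"
  const char *PtrTy;         // address space of the pointer operand
  unsigned MemSizeInBits;    // width actually read from memory
  unsigned AlignInBits;      // minimum alignment of the access
};
static const ExtLoadDesc ZExtLoadS8Global = {"s32", "GlobalPtr", 8, 8};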
+ auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) + .legalFor({{S32, S32}, {S64, S32}}); + if (ST.has16BitInsts()) { + if (ST.hasVOP3PInsts()) { + Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) + .clampMaxNumElements(0, S16, 2); + } else + Shifts.legalFor({{S16, S32}, {S16, S16}}); - - // FIXME: When RegBankSelect inserts copies, it will only create new - // registers with scalar types. This means we can end up with - // G_LOAD/G_STORE/G_GEP instruction with scalar types for their pointer - // operands. In assert builds, the instruction selector will assert - // if it sees a generic instruction which isn't legal, so we need to - // tell it that scalar types are legal for pointer operands - setAction({G_GEP, S64}, Legal); + Shifts.clampScalar(1, S16, S32); + Shifts.clampScalar(0, S16, S64); + Shifts.widenScalarToNextPow2(0, 16); + } else { + // Make sure we legalize the shift amount type first, as the general + // expansion for the shifted type will produce much worse code if it hasn't + // been truncated already. + Shifts.clampScalar(1, S32, S32); + Shifts.clampScalar(0, S32, S64); + Shifts.widenScalarToNextPow2(0, 32); + } + Shifts.scalarize(0); for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { + unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; + unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; + unsigned IdxTypeIdx = 2; + getActionDefinitionsBuilder(Op) - .legalIf([=](const LegalityQuery &Query) { - const LLT &VecTy = Query.Types[1]; - const LLT &IdxTy = Query.Types[2]; - return VecTy.getSizeInBits() % 32 == 0 && - VecTy.getSizeInBits() <= 512 && - IdxTy.getSizeInBits() == 32; - }); + .customIf([=](const LegalityQuery &Query) { + const LLT EltTy = Query.Types[EltTypeIdx]; + const LLT VecTy = Query.Types[VecTypeIdx]; + const LLT IdxTy = Query.Types[IdxTypeIdx]; + return (EltTy.getSizeInBits() == 16 || + EltTy.getSizeInBits() % 32 == 0) && + VecTy.getSizeInBits() % 32 == 0 && + VecTy.getSizeInBits() <= 512 && + IdxTy.getSizeInBits() == 32; + }) + .clampScalar(EltTypeIdx, S32, S64) + .clampScalar(VecTypeIdx, S32, S64) + .clampScalar(IdxTypeIdx, S32, S32); } - // FIXME: Doesn't handle extract of illegal sizes. - getActionDefinitionsBuilder({G_EXTRACT, G_INSERT}) - .legalIf([=](const LegalityQuery &Query) { - const LLT &Ty0 = Query.Types[0]; - const LLT &Ty1 = Query.Types[1]; - return (Ty0.getSizeInBits() % 32 == 0) && - (Ty1.getSizeInBits() % 32 == 0); + getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) + .unsupportedIf([=](const LegalityQuery &Query) { + const LLT &EltTy = Query.Types[1].getElementType(); + return Query.Types[0] != EltTy; }); + for (unsigned Op : {G_EXTRACT, G_INSERT}) { + unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; + unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; + + // FIXME: Doesn't handle extract of illegal sizes. 
+ getActionDefinitionsBuilder(Op) + .legalIf([=](const LegalityQuery &Query) { + const LLT BigTy = Query.Types[BigTyIdx]; + const LLT LitTy = Query.Types[LitTyIdx]; + return (BigTy.getSizeInBits() % 32 == 0) && + (LitTy.getSizeInBits() % 16 == 0); + }) + .widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT BigTy = Query.Types[BigTyIdx]; + return (BigTy.getScalarSizeInBits() < 16); + }, + LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) + .widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT LitTy = Query.Types[LitTyIdx]; + return (LitTy.getScalarSizeInBits() < 16); + }, + LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) + .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) + .widenScalarToNextPow2(BigTyIdx, 32); + + } + getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalForCartesianProduct(AllS32Vectors, {S32}) - .legalForCartesianProduct(AllS64Vectors, {S64}) - .clampNumElements(0, V16S32, V16S32) - .clampNumElements(0, V2S64, V8S64) - .minScalarSameAs(1, 0); + .legalForCartesianProduct(AllS32Vectors, {S32}) + .legalForCartesianProduct(AllS64Vectors, {S64}) + .clampNumElements(0, V16S32, V16S32) + .clampNumElements(0, V2S64, V8S64) + .minScalarSameAs(1, 0) + .legalIf(isRegisterType(0)) + .minScalarOrElt(0, S32); - // TODO: Support any combination of v2s32 getActionDefinitionsBuilder(G_CONCAT_VECTORS) - .legalFor({{V4S32, V2S32}, - {V8S32, V2S32}, - {V8S32, V4S32}, - {V4S64, V2S64}, - {V4S16, V2S16}, - {V8S16, V2S16}, - {V8S16, V4S16}}); + .legalIf(isRegisterType(0)); // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; + auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { + const LLT &Ty = Query.Types[TypeIdx]; + if (Ty.isVector()) { + const LLT &EltTy = Ty.getElementType(); + if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) + return true; + if (!isPowerOf2_32(EltTy.getSizeInBits())) + return true; + } + return false; + }; + getActionDefinitionsBuilder(Op) + .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) + // Clamp the little scalar to s8-s256 and make it a power of 2. It's not + // worth considering the multiples of 64 since 2*192 and 2*384 are not + // valid. + .clampScalar(LitTyIdx, S16, S256) + .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) + + // Break up vectors with weird elements into scalars + .fewerElementsIf( + [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, + scalarize(0)) + .fewerElementsIf( + [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, + scalarize(1)) + .clampScalar(BigTyIdx, S32, S512) + .widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT &Ty = Query.Types[BigTyIdx]; + return !isPowerOf2_32(Ty.getSizeInBits()) && + Ty.getSizeInBits() % 16 != 0; + }, + [=](const LegalityQuery &Query) { + // Pick the next power of 2, or a multiple of 64 over 128. + // Whichever is smaller. 
+ const LLT &Ty = Query.Types[BigTyIdx]; + unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); + if (NewSizeInBits >= 256) { + unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); + if (RoundedTo < NewSizeInBits) + NewSizeInBits = RoundedTo; + } + return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); + }) .legalIf([=](const LegalityQuery &Query) { const LLT &BigTy = Query.Types[BigTyIdx]; const LLT &LitTy = Query.Types[LitTyIdx]; - return BigTy.getSizeInBits() % 32 == 0 && - LitTy.getSizeInBits() % 32 == 0 && + + if (BigTy.isVector() && BigTy.getSizeInBits() < 32) + return false; + if (LitTy.isVector() && LitTy.getSizeInBits() < 32) + return false; + + return BigTy.getSizeInBits() % 16 == 0 && + LitTy.getSizeInBits() % 16 == 0 && BigTy.getSizeInBits() <= 512; }) // Any vectors left are the wrong size. Scalarize them. - .fewerElementsIf([](const LegalityQuery &Query) { return true; }, - [](const LegalityQuery &Query) { - return std::make_pair( - 0, Query.Types[0].getElementType()); - }) - .fewerElementsIf([](const LegalityQuery &Query) { return true; }, - [](const LegalityQuery &Query) { - return std::make_pair( - 1, Query.Types[1].getElementType()); - }); - + .scalarize(0) + .scalarize(1); } computeTables(); verify(*ST.getInstrInfo()); } + +bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_ADDRSPACE_CAST: + return legalizeAddrSpaceCast(MI, MRI, MIRBuilder); + case TargetOpcode::G_FRINT: + return legalizeFrint(MI, MRI, MIRBuilder); + case TargetOpcode::G_FCEIL: + return legalizeFceil(MI, MRI, MIRBuilder); + case TargetOpcode::G_INTRINSIC_TRUNC: + return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder); + case TargetOpcode::G_SITOFP: + return legalizeITOFP(MI, MRI, MIRBuilder, true); + case TargetOpcode::G_UITOFP: + return legalizeITOFP(MI, MRI, MIRBuilder, false); + case TargetOpcode::G_FMINNUM: + case TargetOpcode::G_FMAXNUM: + case TargetOpcode::G_FMINNUM_IEEE: + case TargetOpcode::G_FMAXNUM_IEEE: + return legalizeMinNumMaxNum(MI, MRI, MIRBuilder); + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + return legalizeExtractVectorElt(MI, MRI, MIRBuilder); + case TargetOpcode::G_INSERT_VECTOR_ELT: + return legalizeInsertVectorElt(MI, MRI, MIRBuilder); + default: + return false; + } + + llvm_unreachable("expected switch to return"); +} + +Register AMDGPULegalizerInfo::getSegmentAperture( + unsigned AS, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MachineFunction &MF = MIRBuilder.getMF(); + const GCNSubtarget &ST = MF.getSubtarget(); + const LLT S32 = LLT::scalar(32); + + if (ST.hasApertureRegs()) { + // FIXME: Use inline constants (src_{shared, private}_base) instead of + // getreg. + unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? + AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : + AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; + unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 
+ AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : + AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; + unsigned Encoding = + AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | + Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | + WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; + + Register ApertureReg = MRI.createGenericVirtualRegister(S32); + Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32) + .addDef(GetReg) + .addImm(Encoding); + MRI.setType(GetReg, S32); + + auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1); + MIRBuilder.buildInstr(TargetOpcode::G_SHL) + .addDef(ApertureReg) + .addUse(GetReg) + .addUse(ShiftAmt.getReg(0)); + + return ApertureReg; + } + + Register QueuePtr = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + + // FIXME: Placeholder until we can track the input registers. + MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef); + + // Offset into amd_queue_t for group_segment_aperture_base_hi / + // private_segment_aperture_base_hi. + uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; + + // FIXME: Don't use undef + Value *V = UndefValue::get(PointerType::get( + Type::getInt8Ty(MF.getFunction().getContext()), + AMDGPUAS::CONSTANT_ADDRESS)); + + MachinePointerInfo PtrInfo(V, StructOffset); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + 4, + MinAlign(64, StructOffset)); + + Register LoadResult = MRI.createGenericVirtualRegister(S32); + Register LoadAddr; + + MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); + MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO); + return LoadResult; +} + +bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MachineFunction &MF = MIRBuilder.getMF(); + + MIRBuilder.setInstr(MI); + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + unsigned DestAS = DstTy.getAddressSpace(); + unsigned SrcAS = SrcTy.getAddressSpace(); + + // TODO: Avoid reloading from the queue ptr for each cast, or at least each + // vector element. + assert(!DstTy.isVector()); + + const AMDGPUTargetMachine &TM + = static_cast(MF.getTarget()); + + const GCNSubtarget &ST = MF.getSubtarget(); + if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { + MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST)); + return true; + } + + if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { + assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || + DestAS == AMDGPUAS::PRIVATE_ADDRESS); + unsigned NullVal = TM.getNullPointerValue(DestAS); + + auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal); + auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0); + + Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); + + // Extract low 32-bits of the pointer. 
+ MIRBuilder.buildExtract(PtrLo32, Src, 0); + + Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); + MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); + MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); + + MI.eraseFromParent(); + return true; + } + + assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS || + SrcAS == AMDGPUAS::PRIVATE_ADDRESS); + + auto SegmentNull = + MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); + auto FlatNull = + MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); + + Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder); + + Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); + MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); + + Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); + + // Coerce the type of the low half of the result so we can use merge_values. + Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32)); + MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT) + .addDef(SrcAsInt) + .addUse(Src); + + // TODO: Should we allow mismatched types but matching sizes in merges to + // avoid the ptrtoint? + MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); + MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFrint( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MIRBuilder.setInstr(MI); + + Register Src = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(Src); + assert(Ty.isScalar() && Ty.getSizeInBits() == 64); + + APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); + APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); + + auto C1 = MIRBuilder.buildFConstant(Ty, C1Val); + auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src); + + // TODO: Should this propagate fast-math-flags? + auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign); + auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign); + + auto C2 = MIRBuilder.buildFConstant(Ty, C2Val); + auto Fabs = MIRBuilder.buildFAbs(Ty, Src); + + auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); + MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFceil( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + + const LLT S1 = LLT::scalar(1); + const LLT S64 = LLT::scalar(64); + + Register Src = MI.getOperand(1).getReg(); + assert(MRI.getType(Src) == S64); + + // result = trunc(src) + // if (src > 0.0 && src != result) + // result += 1.0 + + auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); + + const auto Zero = B.buildFConstant(S64, 0.0); + const auto One = B.buildFConstant(S64, 1.0); + auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); + auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); + auto And = B.buildAnd(S1, Lt0, NeTrunc); + auto Add = B.buildSelect(S64, And, One, Zero); + + // TODO: Should this propagate fast-math-flags? 
+ B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); + return true; +} + +static MachineInstrBuilder extractF64Exponent(unsigned Hi, + MachineIRBuilder &B) { + const unsigned FractBits = 52; + const unsigned ExpBits = 11; + LLT S32 = LLT::scalar(32); + + auto Const0 = B.buildConstant(S32, FractBits - 32); + auto Const1 = B.buildConstant(S32, ExpBits); + + auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) + .addUse(Const0.getReg(0)) + .addUse(Const1.getReg(0)); + + return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); +} + +bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + + const LLT S1 = LLT::scalar(1); + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + + Register Src = MI.getOperand(1).getReg(); + assert(MRI.getType(Src) == S64); + + // TODO: Should this use extract since the low half is unused? + auto Unmerge = B.buildUnmerge({S32, S32}, Src); + Register Hi = Unmerge.getReg(1); + + // Extract the upper half, since this is where we will find the sign and + // exponent. + auto Exp = extractF64Exponent(Hi, B); + + const unsigned FractBits = 52; + + // Extract the sign bit. + const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); + auto SignBit = B.buildAnd(S32, Hi, SignBitMask); + + const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); + + const auto Zero32 = B.buildConstant(S32, 0); + + // Extend back to 64-bits. + auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); + + auto Shr = B.buildAShr(S64, FractMask, Exp); + auto Not = B.buildNot(S64, Shr); + auto Tmp0 = B.buildAnd(S64, Src, Not); + auto FiftyOne = B.buildConstant(S32, FractBits - 1); + + auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); + auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); + + auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); + B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); + return true; +} + +bool AMDGPULegalizerInfo::legalizeITOFP( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool Signed) const { + B.setInstr(MI); + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + const LLT S64 = LLT::scalar(64); + const LLT S32 = LLT::scalar(32); + + assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); + + auto Unmerge = B.buildUnmerge({S32, S32}, Src); + + auto CvtHi = Signed ? + B.buildSITOFP(S64, Unmerge.getReg(1)) : + B.buildUITOFP(S64, Unmerge.getReg(1)); + + auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); + + auto ThirtyTwo = B.buildConstant(S32, 32); + auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) + .addUse(CvtHi.getReg(0)) + .addUse(ThirtyTwo.getReg(0)); + + // TODO: Should this propagate fast-math-flags? 
+  B.buildFAdd(Dst, LdExp, CvtLo);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
+  MachineInstr &MI, MachineRegisterInfo &MRI,
+  MachineIRBuilder &B) const {
+  MachineFunction &MF = B.getMF();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
+                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
+
+  // With ieee_mode disabled, the instructions have the correct behavior
+  // already for G_FMINNUM/G_FMAXNUM
+  if (!MFI->getMode().IEEE)
+    return !IsIEEEOp;
+
+  if (IsIEEEOp)
+    return true;
+
+  MachineIRBuilder HelperBuilder(MI);
+  GISelObserverWrapper DummyObserver;
+  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
+  HelperBuilder.setMBB(*MI.getParent());
+  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
+}
+
+bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
+  MachineInstr &MI, MachineRegisterInfo &MRI,
+  MachineIRBuilder &B) const {
+  // TODO: Should move some of this into LegalizerHelper.
+
+  // TODO: Promote dynamic indexing of s16 to s32
+  // TODO: Dynamic s64 indexing is only legal for SGPR.
+  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
+  if (!IdxVal) // Dynamic case will be selected to register indexing.
+    return true;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register Vec = MI.getOperand(1).getReg();
+
+  LLT VecTy = MRI.getType(Vec);
+  LLT EltTy = VecTy.getElementType();
+  assert(EltTy == MRI.getType(Dst));
+
+  B.setInstr(MI);
+
+  if (IdxVal.getValue() < VecTy.getNumElements())
+    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
+  else
+    B.buildUndef(Dst);
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
+  MachineInstr &MI, MachineRegisterInfo &MRI,
+  MachineIRBuilder &B) const {
+  // TODO: Should move some of this into LegalizerHelper.
+
+  // TODO: Promote dynamic indexing of s16 to s32
+  // TODO: Dynamic s64 indexing is only legal for SGPR.
+  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
+  if (!IdxVal) // Dynamic case will be selected to register indexing.
+    return true;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register Vec = MI.getOperand(1).getReg();
+  Register Ins = MI.getOperand(2).getReg();
+
+  LLT VecTy = MRI.getType(Vec);
+  LLT EltTy = VecTy.getElementType();
+  assert(EltTy == MRI.getType(Ins));
+
+  B.setInstr(MI);
+
+  if (IdxVal.getValue() < VecTy.getNumElements())
+    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
+  else
+    B.buildUndef(Dst);
+
+  MI.eraseFromParent();
+  return true;
+}
+
+// Return the use branch instruction, otherwise null if the usage is invalid.
+static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
+                                       MachineRegisterInfo &MRI) {
+  Register CondDef = MI.getOperand(0).getReg();
+  if (!MRI.hasOneNonDBGUse(CondDef))
+    return nullptr;
+
+  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
+  return UseMI.getParent() == MI.getParent() &&
+         UseMI.getOpcode() == AMDGPU::G_BRCOND ?
+         &UseMI : nullptr;
+}
+
+Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
+                                                Register Reg, LLT Ty) const {
+  Register LiveIn = MRI.getLiveInVirtReg(Reg);
+  if (LiveIn)
+    return LiveIn;
+
+  Register NewReg = MRI.createGenericVirtualRegister(Ty);
+  MRI.addLiveIn(Reg, NewReg);
+  return NewReg;
+}
+
+bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
+                                         const ArgDescriptor *Arg) const {
+  if (!Arg->isRegister())
+    return false; // TODO: Handle these
+
+  assert(Arg->getRegister() != 0);
+  assert(Arg->getRegister().isPhysical());
+
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  LLT Ty = MRI.getType(DstReg);
+  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
+
+  if (Arg->isMasked()) {
+    // TODO: Should we try to emit this once in the entry block?
+    const LLT S32 = LLT::scalar(32);
+    const unsigned Mask = Arg->getMask();
+    const unsigned Shift = countTrailingZeros(Mask);
+
+    auto ShiftAmt = B.buildConstant(S32, Shift);
+    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
+    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
+  } else
+    B.buildCopy(DstReg, LiveIn);
+
+  // Insert the argument copy if it doesn't already exist.
+  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
+  if (!MRI.getVRegDef(LiveIn)) {
+    MachineBasicBlock &EntryMBB = B.getMF().front();
+    EntryMBB.addLiveIn(Arg->getRegister());
+    B.setInsertPt(EntryMBB, EntryMBB.begin());
+    B.buildCopy(LiveIn, Arg->getRegister());
+  }
+
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
+  MachineInstr &MI,
+  MachineRegisterInfo &MRI,
+  MachineIRBuilder &B,
+  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+  B.setInstr(MI);
+
+  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+
+  const ArgDescriptor *Arg;
+  const TargetRegisterClass *RC;
+  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
+  if (!Arg) {
+    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
+    return false;
+  }
+
+  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
+    MI.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
+                                                 MachineRegisterInfo &MRI,
+                                                 MachineIRBuilder &B) const {
+  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+  if (!MFI->isEntryFunction()) {
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+  }
+
+  B.setInstr(MI);
+
+  uint64_t Offset =
+    ST.getTargetLowering()->getImplicitParameterOffset(
+      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
+
+  const ArgDescriptor *Arg;
+  const TargetRegisterClass *RC;
+  std::tie(Arg, RC)
+    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+  if (!Arg)
+    return false;
+
+  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
+  if (!loadInputValue(KernargPtrReg, B, Arg))
+    return false;
+
+  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
+                                            MachineRegisterInfo &MRI,
+                                            MachineIRBuilder &B) const {
+  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
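+  // For example, the i1 result of amdgcn.if must have exactly one non-debug
+  // use, and that use must be a G_BRCOND in the same block (checked by
+  // verifyCFIntrinsic above); the intrinsic and the branch are then folded
+  // into a single SI_IF pseudo (SI_LOOP for amdgcn.loop) whose mask operands
+  // are constrained to the wave mask register class.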
+  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+  case Intrinsic::amdgcn_if: {
+    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
+      const SIRegisterInfo *TRI
+        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+
+      B.setInstr(*BrCond);
+      Register Def = MI.getOperand(1).getReg();
+      Register Use = MI.getOperand(3).getReg();
+      B.buildInstr(AMDGPU::SI_IF)
+        .addDef(Def)
+        .addUse(Use)
+        .addMBB(BrCond->getOperand(1).getMBB());
+
+      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
+      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
+      MI.eraseFromParent();
+      BrCond->eraseFromParent();
+      return true;
+    }
+
+    return false;
+  }
+  case Intrinsic::amdgcn_loop: {
+    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
+      const SIRegisterInfo *TRI
+        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+
+      B.setInstr(*BrCond);
+      Register Reg = MI.getOperand(2).getReg();
+      B.buildInstr(AMDGPU::SI_LOOP)
+        .addUse(Reg)
+        .addMBB(BrCond->getOperand(1).getMBB());
+      MI.eraseFromParent();
+      BrCond->eraseFromParent();
+      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
+      return true;
+    }
+
+    return false;
+  }
+  case Intrinsic::amdgcn_kernarg_segment_ptr:
+    return legalizePreloadedArgIntrin(
+      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+  case Intrinsic::amdgcn_implicitarg_ptr:
+    return legalizeImplicitArgPtr(MI, MRI, B);
+  case Intrinsic::amdgcn_workitem_id_x:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+  case Intrinsic::amdgcn_workitem_id_y:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+  case Intrinsic::amdgcn_workitem_id_z:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+  case Intrinsic::amdgcn_workgroup_id_x:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+  case Intrinsic::amdgcn_workgroup_id_y:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+  case Intrinsic::amdgcn_workgroup_id_z:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_dispatch_ptr:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
+  case Intrinsic::amdgcn_queue_ptr:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
+  case Intrinsic::amdgcn_implicit_buffer_ptr:
+    return legalizePreloadedArgIntrin(
+      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
+  case Intrinsic::amdgcn_dispatch_id:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
+  default:
+    return true;
+  }
+
+  return true;
+}
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 1cbd37c42c4b..3f1cc1d265dd 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -1,9 +1,8 @@
 //===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -16,6 +15,7 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "AMDGPUArgumentUsageInfo.h" namespace llvm { @@ -25,9 +25,51 @@ class GCNSubtarget; /// This class provides the information for the target register banks. class AMDGPULegalizerInfo : public LegalizerInfo { + const GCNSubtarget &ST; + public: AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM); + + bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const override; + + Register getSegmentAperture(unsigned AddrSpace, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + + bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, bool Signed) const; + bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + + Register getLiveInRegister(MachineRegisterInfo &MRI, + Register Reg, LLT Ty) const; + + bool loadInputValue(Register DstReg, MachineIRBuilder &B, + const ArgDescriptor *Arg) const; + bool legalizePreloadedArgIntrin( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + + bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; + }; } // End llvm namespace. #endif diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 14e880042691..ce0a9db7c7f4 100644 --- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1,9 +1,8 @@ //===- AMDGPULibCalls.cpp -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -16,6 +15,7 @@
 #include "AMDGPU.h"
 #include "AMDGPULibFunc.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/ADT/StringSet.h"
@@ -23,6 +23,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
@@ -30,6 +31,7 @@
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <cmath>
 #include <vector>
@@ -66,6 +68,8 @@ private:
 
   typedef llvm::AMDGPULibFunc FuncInfo;
 
+  const TargetMachine *TM;
+
   // -fuse-native.
   bool AllNative = false;
 
@@ -73,7 +77,7 @@ private:
   // Return a pointer (pointer expr) to the function if function definition with
   // "FuncName" exists. It may create a new function prototype in pre-link mode.
-  Constant *getFunction(Module *M, const FuncInfo& fInfo);
+  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
 
   // Replace a normal function with its native version.
   bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo);
@@ -135,12 +139,15 @@ private:
   // __read_pipe/__write_pipe
   bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
 
+  // llvm.amdgcn.wavefrontsize
+  bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
+
   // Get insertion point at entry.
   BasicBlock::iterator getEntryIns(CallInst * UI);
   // Insert an Alloc instruction.
   AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
   // Get a scalar native builtin single argument FP function
-  Constant* getNativeFunction(Module* M, const FuncInfo &FInfo);
+  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
 
 protected:
   CallInst *CI;
@@ -153,6 +160,8 @@ protected:
   }
 
 public:
+  AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
+
   bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
 
   void initNativeFuncs();
@@ -167,15 +176,16 @@ namespace {
 
   class AMDGPUSimplifyLibCalls : public FunctionPass {
 
-  AMDGPULibCalls Simplifier;
-
   const TargetOptions Options;
 
+  AMDGPULibCalls Simplifier;
+
   public:
    static char ID; // Pass identification
 
-   AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())
-     : FunctionPass(ID), Options(Opt) {
+   AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
+                          const TargetMachine *TM = nullptr)
+     : FunctionPass(ID), Options(Opt), Simplifier(TM) {
      initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
    }
@@ -217,19 +227,19 @@ INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative",
                 false, false)
 
 template <typename IRB>
-static CallInst *CreateCallEx(IRB &B, Value *Callee, Value *Arg,
+static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
                               const Twine &Name = "") {
   CallInst *R = B.CreateCall(Callee, Arg, Name);
-  if (Function* F = dyn_cast<Function>(Callee))
+  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
     R->setCallingConv(F->getCallingConv());
   return R;
 }
 
 template <typename IRB>
-static CallInst *CreateCallEx2(IRB &B, Value *Callee, Value *Arg1, Value *Arg2,
-                               const Twine &Name = "") {
+static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
+                               Value *Arg2, const Twine &Name = "") {
   CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
-  if (Function* F = dyn_cast<Function>(Callee))
+  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
   return R;
 }
@@ -472,7 +482,7 @@ static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
   return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
 }
 
-Constant *AMDGPULibCalls::getFunction(Module *M, const FuncInfo& fInfo) {
+FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
   // If we are doing PreLinkOpt, the function is external. So it is safe to
   // use getOrInsertFunction() at this stage.
 
@@ -519,11 +529,11 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
     nf.setPrefix(AMDGPULibFunc::NATIVE);
     nf.setId(AMDGPULibFunc::EI_SIN);
-    Constant *sinExpr = getFunction(M, nf);
+    FunctionCallee sinExpr = getFunction(M, nf);
 
     nf.setPrefix(AMDGPULibFunc::NATIVE);
     nf.setId(AMDGPULibFunc::EI_COS);
-    Constant *cosExpr = getFunction(M, nf);
+    FunctionCallee cosExpr = getFunction(M, nf);
     if (sinExpr && cosExpr) {
       Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
       Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
@@ -555,7 +565,7 @@ bool AMDGPULibCalls::useNative(CallInst *aCI) {
     return sincosUseNative(aCI, FInfo);
 
   FInfo.setPrefix(AMDGPULibFunc::NATIVE);
-  Constant *F = getFunction(aCI->getModule(), FInfo);
+  FunctionCallee F = getFunction(aCI->getModule(), FInfo);
   if (!F)
     return false;
 
@@ -613,7 +623,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
   auto *FTy = FunctionType::get(Callee->getReturnType(),
                                 ArrayRef<Type *>(ArgTys), false);
   AMDGPULibFunc NewLibFunc(Name, FTy);
-  auto *F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
+  FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
   if (!F)
     return false;
 
@@ -640,14 +650,6 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
   // Ignore indirect calls.
   if (Callee == 0) return false;
 
-  FuncInfo FInfo;
-  if (!parseFunctionName(Callee->getName(), &FInfo))
-    return false;
-
-  // Further check the number of arguments to see if they match.
-  if (CI->getNumArgOperands() != FInfo.getNumArgs())
-    return false;
-
   BasicBlock *BB = CI->getParent();
   LLVMContext &Context = CI->getParent()->getContext();
   IRBuilder<> B(Context);
@@ -659,6 +661,21 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
   if (const FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI))
     B.setFastMathFlags(FPOp->getFastMathFlags());
 
+  switch (Callee->getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::amdgcn_wavefrontsize:
+    return !EnablePreLink && fold_wavefrontsize(CI, B);
+  }
+
+  FuncInfo FInfo;
+  if (!parseFunctionName(Callee->getName(), &FInfo))
+    return false;
+
+  // Further check the number of arguments to see if they match.
+  if (CI->getNumArgOperands() != FInfo.getNumArgs())
+    return false;
+
   if (TDOFold(CI, FInfo))
     return true;
 
@@ -795,7 +812,7 @@ bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
 
   AMDGPULibFunc nf = FInfo;
   nf.setPrefix(AMDGPULibFunc::NATIVE);
-  if (Constant *FPExpr = getFunction(M, nf)) {
+  if (FunctionCallee FPExpr = getFunction(M, nf)) {
     LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
 
     CI->setCalledFunction(FPExpr);
@@ -848,7 +865,7 @@ bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
 namespace llvm {
 static double log2(double V) {
-#if _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L
+#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
   return ::log2(V);
 #else
   return log(V) / 0.693147180559945309417;
@@ -934,9 +951,10 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
   if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
     // pow[r](x, [-]0.5) = sqrt(x)
     bool issqrt = CF->isExactlyValue(0.5);
-    if (Constant *FPExpr = getFunction(M,
-        AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
-                             : AMDGPULibFunc::EI_RSQRT, FInfo))) {
+    if (FunctionCallee FPExpr =
+            getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
                                                 : AMDGPULibFunc::EI_RSQRT,
+                                         FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
                        << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
      Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
@@ -1003,8 +1021,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
   // powr ---> exp2(y * log2(x))
   // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
-  Constant *ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2,
-                                                   FInfo));
+  FunctionCallee ExpExpr =
+      getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
   if (!ExpExpr)
     return false;
 
@@ -1090,8 +1108,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
 
   Value *nval;
   if (needabs) {
-    Constant *AbsExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS,
-                                                     FInfo));
+    FunctionCallee AbsExpr =
+        getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS, FInfo));
     if (!AbsExpr)
       return false;
     nval = CreateCallEx(B, AbsExpr, opr0, "__fabs");
@@ -1099,8 +1117,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
     nval = cnval ? cnval : opr0;
   }
   if (needlog) {
-    Constant *LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2,
-                                                     FInfo));
+    FunctionCallee LogExpr =
+        getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
     if (!LogExpr)
       return false;
     nval = CreateCallEx(B,LogExpr, nval, "__log2");
@@ -1159,8 +1177,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
     std::vector<Type*> ParamsTys;
     ParamsTys.push_back(opr0->getType());
     Module *M = CI->getModule();
-    if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT,
-                                                        FInfo))) {
+    if (FunctionCallee FPExpr =
+            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
      Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
      replaceCall(nval);
@@ -1168,8 +1186,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
   } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
     Module *M = CI->getModule();
-    if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT,
-                                                        FInfo))) {
+    if (FunctionCallee FPExpr =
+            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
      Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
      replaceCall(nval);
@@ -1186,8 +1204,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
     std::vector<Type*> ParamsTys;
     ParamsTys.push_back(opr0->getType());
     Module *M = CI->getModule();
-    if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT,
-                                                        FInfo))) {
+    if (FunctionCallee FPExpr =
+            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
@@ -1243,7 +1261,8 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
 }
 
 // Get a scalar native builtin single argument FP function
-Constant* AMDGPULibCalls::getNativeFunction(Module* M, const FuncInfo& FInfo) {
+FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
+                                                 const FuncInfo &FInfo) {
   if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
     return nullptr;
   FuncInfo nf = FInfo;
@@ -1256,8 +1275,8 @@ bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
   if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) &&
       (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) {
-    if (Constant *FPExpr = getNativeFunction(
-        CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
+    if (FunctionCallee FPExpr = getNativeFunction(
+            CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
      Value *opr0 = CI->getArgOperand(0);
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
                        << "sqrt(" << *opr0 << ")\n");
@@ -1334,7 +1353,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   // function.
   AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
   nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
-  Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
+  FunctionCallee Fsincos = getFunction(M, nf);
   if (!Fsincos) return false;
 
   BasicBlock::iterator ItOld = B.GetInsertPoint();
@@ -1342,7 +1361,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   B.SetInsertPoint(UI);
 
   Value *P = Alloc;
-  Type *PTy = Fsincos->getFunctionType()->getParamType(1);
+  Type *PTy = Fsincos.getFunctionType()->getParamType(1);
   // The allocaInst allocates the memory in private address space. This needs
  // to be bitcasted to point to the address space of cos pointer type.
  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
@@ -1356,12 +1375,12 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   if (!isSin) { // CI->cos, UI->sin
     B.SetInsertPoint(&*ItOld);
     UI->replaceAllUsesWith(&*Call);
-    Instruction *Reload = B.CreateLoad(Alloc);
+    Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
     CI->replaceAllUsesWith(Reload);
     UI->eraseFromParent();
     CI->eraseFromParent();
   } else { // CI->sin, UI->cos
-    Instruction *Reload = B.CreateLoad(Alloc);
+    Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
     UI->replaceAllUsesWith(Reload);
     CI->replaceAllUsesWith(Call);
     UI->eraseFromParent();
@@ -1370,6 +1389,29 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   return true;
 }
 
+bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
+  if (!TM)
+    return false;
+
+  StringRef CPU = TM->getTargetCPU();
+  StringRef Features = TM->getTargetFeatureString();
+  if ((CPU.empty() || CPU.equals_lower("generic")) &&
+      (Features.empty() ||
+       Features.find_lower("wavefrontsize") == StringRef::npos))
+    return false;
+
+  Function *F = CI->getParent()->getParent();
+  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F);
+  unsigned N = ST.getWavefrontSize();
+
+  LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
+                    << N << "\n");
+
+  CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N));
+  CI->eraseFromParent();
+  return true;
+}
+
 // Get insertion point at entry.
 BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
   Function * Func = UI->getParent()->getParent();
@@ -1679,8 +1721,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
 }
 
 // Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {
-  return new AMDGPUSimplifyLibCalls(Opt);
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
+                                                     const TargetMachine *TM) {
+  return new AMDGPUSimplifyLibCalls(Opt, TM);
 }
 
 FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 4fc3fe0f105b..a5bac25701a0 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPULibFunc.cpp -------------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -64,6 +63,8 @@ struct ManglingRule {
 
   int getNumLeads() const { return (Lead[0] ? 1 : 0) + (Lead[1] ? 1 : 0); }
 
   unsigned getNumArgs() const;
+
+  static StringMap<int> buildManglingRulesMap();
 };
 
 // Information about library functions with unmangled names.
@@ -77,16 +78,7 @@ class UnmangledFuncInfo {
   // Number of entries in Table.
   static const unsigned TableSize;
 
-  // Map function name to index.
-  class NameMap : public StringMap<unsigned> {
-  public:
-    NameMap() {
-      for (unsigned I = 0; I != TableSize; ++I)
-        (*this)[Table[I].Name] = I;
-    }
-  };
-  friend class NameMap;
-  static NameMap Map;
+  static StringMap<unsigned> buildNameMap();
 
 public:
   using ID = AMDGPULibFunc::EFuncId;
@@ -102,7 +94,8 @@ public:
            static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED);
   }
   static ID toFuncId(unsigned Index) {
-    assert(Index < TableSize && "Invalid unmangled library function");
+    assert(Index < TableSize &&
+           "Invalid unmangled library function");
     return static_cast<ID>(
         Index + 1 + static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED));
   }
@@ -350,18 +343,7 @@ const UnmangledFuncInfo UnmangledFuncInfo::Table[] = {
 };
 
 const unsigned UnmangledFuncInfo::TableSize =
-    sizeof(UnmangledFuncInfo::Table) / sizeof(UnmangledFuncInfo::Table[0]);
-
-UnmangledFuncInfo::NameMap UnmangledFuncInfo::Map;
-
-static const struct ManglingRulesMap : public StringMap<int> {
-  ManglingRulesMap()
-    : StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) {
-    int Id = 0;
-    for (auto Rule : manglingRules)
-      insert({ Rule.Name, Id++ });
-  }
-} manglingRulesMap;
+    array_lengthof(UnmangledFuncInfo::Table);
 
 static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id,
                                        const AMDGPULibFunc::Param (&Leads)[2]) {
@@ -569,7 +551,17 @@ static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) {
   return Pfx;
 }
 
+StringMap<int> ManglingRule::buildManglingRulesMap() {
+  StringMap<int> Map(array_lengthof(manglingRules));
+  int Id = 0;
+  for (auto Rule : manglingRules)
+    Map.insert({Rule.Name, Id++});
+  return Map;
+}
+
 bool AMDGPUMangledLibFunc::parseUnmangledName(StringRef FullName) {
+  static const StringMap<int> manglingRulesMap =
+      ManglingRule::buildManglingRulesMap();
   FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(FullName));
   return FuncId != EI_NONE;
 }
@@ -961,8 +953,8 @@ Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) {
     return nullptr;
   }
 
-Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
-                                             const AMDGPULibFunc &fInfo) {
+FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M,
+                                                  const AMDGPULibFunc &fInfo) {
   std::string const FuncName = fInfo.mangle();
   Function *F = dyn_cast_or_null<Function>(
     M->getValueSymbolTable().lookup(FuncName));
@@ -988,7 +980,7 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
     }
   }
 
-  Constant *C = nullptr;
+  FunctionCallee C;
   if (hasPtr) {
     // Do not set extra attributes for functions with pointer arguments.
     C = M->getOrInsertFunction(FuncName, FuncTy);
@@ -1002,10 +994,18 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
     C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
   }
 
-  return cast<Function>(C);
+  return C;
+}
+
+StringMap<unsigned> UnmangledFuncInfo::buildNameMap() {
+  StringMap<unsigned> Map;
+  for (unsigned I = 0; I != TableSize; ++I)
+    Map[Table[I].Name] = I;
+  return Map;
 }
 
 bool UnmangledFuncInfo::lookup(StringRef Name, ID &Id) {
+  static const StringMap<unsigned> Map = buildNameMap();
   auto Loc = Map.find(Name);
   if (Loc != Map.end()) {
     Id = toFuncId(Loc->second);
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h
index fe062384800a..2354ed7df205 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -1,9 +1,8 @@
 //===-- AMDGPULibFunc.h ----------------------------------------*- C++ -*--===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -394,8 +393,8 @@ public: } static Function *getFunction(llvm::Module *M, const AMDGPULibFunc &fInfo); - static Function *getOrInsertFunction(llvm::Module *M, - const AMDGPULibFunc &fInfo); + static FunctionCallee getOrInsertFunction(llvm::Module *M, + const AMDGPULibFunc &fInfo); static bool parse(StringRef MangledName, AMDGPULibFunc &Ptr); private: diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 2cec8fe53283..15032969890e 100644 --- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -1,9 +1,8 @@ //===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 743dc7a0d00b..5dd5b3691e0a 100644 --- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -1,9 +1,8 @@ //===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -110,8 +109,9 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { // modes on SI to know the high bits are 0 so pointer adds don't wrap. We // can't represent this with range metadata because it's only allowed for // integer types. - if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && - ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) + if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) && + !ST.hasUsableDSOffset()) continue; // FIXME: We can replace this with equivalent alias.scope/noalias @@ -132,6 +132,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { KernArgBaseAlign); Value *ArgPtr; + Type *AdjustedArgTy; if (DoShiftOpt) { // FIXME: Handle aggregate types // Since we don't have sub-dword scalar loads, avoid doing an extload by // loading earlier than the argument address, and extracting the relevant @@ -139,30 +140,27 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { // // Additionally widen any sub-dword load to i32 even if suitably aligned, // so that CSE between different argument loads works easily. 
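   // For example, an i16 argument at byte offset 6 has its offset aligned
   // down to 4; the whole i32 at offset 4 is loaded instead, and the
   // argument's bits are extracted from it, avoiding a 2-byte extending load.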
-    ArgPtr = Builder.CreateConstInBoundsGEP1_64(
-      KernArgSegment,
-      AlignDownOffset,
-      Arg.getName() + ".kernarg.offset.align.down");
-    ArgPtr = Builder.CreateBitCast(ArgPtr,
-                                   Builder.getInt32Ty()->getPointerTo(AS),
-                                   ArgPtr->getName() + ".cast");
+    ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+        Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
+        Arg.getName() + ".kernarg.offset.align.down");
+    AdjustedArgTy = Builder.getInt32Ty();
   } else {
     ArgPtr = Builder.CreateConstInBoundsGEP1_64(
-      KernArgSegment,
-      EltOffset,
-      Arg.getName() + ".kernarg.offset");
-    ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
-                                   ArgPtr->getName() + ".cast");
+      Builder.getInt8Ty(), KernArgSegment, EltOffset,
+      Arg.getName() + ".kernarg.offset");
+    AdjustedArgTy = ArgTy;
   }
 
   if (IsV3 && Size >= 32) {
     V4Ty = VectorType::get(VT->getVectorElementType(), 4);
     // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
-    ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
+    AdjustedArgTy = V4Ty;
   }
 
-  LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
+  ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
+                                 ArgPtr->getName() + ".cast");
+  LoadInst *Load =
+      Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
   Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
 
   MDBuilder MDB(Ctx);
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index a43dcef4cf0b..00e12f808783 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index f6bdbf5e9be2..ae4c32c258a7 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -1,9 +1,8 @@
 //===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,7 +15,7 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" -#include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600AsmPrinter.h" #include "SIInstrInfo.h" @@ -91,6 +90,10 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) { return MCSymbolRefExpr::VK_AMDGPU_REL32_LO; case SIInstrInfo::MO_REL32_HI: return MCSymbolRefExpr::VK_AMDGPU_REL32_HI; + case SIInstrInfo::MO_ABS32_LO: + return MCSymbolRefExpr::VK_AMDGPU_ABS32_LO; + case SIInstrInfo::MO_ABS32_HI: + return MCSymbolRefExpr::VK_AMDGPU_ABS32_HI; } } @@ -101,17 +104,22 @@ const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr( = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx); const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx); - assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 && - ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4); + // FIXME: The first half of this assert should be removed. This should + // probably be PC relative instead of using the source block symbol, and + // therefore the indirect branch expansion should use a bundle. + assert( + skipDebugInstructionsForward(SrcBB.begin(), SrcBB.end())->getOpcode() == + AMDGPU::S_GETPC_B64 && + ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4); // s_getpc_b64 returns the address of next instruction. const MCConstantExpr *One = MCConstantExpr::create(4, Ctx); SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx); - if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD) + if (MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_FORWARD) return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx); - assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD); + assert(MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_BACKWARD); return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx); } @@ -142,10 +150,13 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, SmallString<128> SymbolName; AP.getNameWithPrefix(SymbolName, GV); MCSymbol *Sym = Ctx.getOrCreateSymbol(SymbolName); - const MCExpr *SymExpr = + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx); - const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr, - MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + int64_t Offset = MO.getOffset(); + if (Offset != 0) { + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(Offset, Ctx), Ctx); + } MCOp = MCOperand::createExpr(Expr); return true; } @@ -321,14 +332,13 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { } #endif - if (STI.dumpCode()) { - // Disassemble instruction/operands to text. 
+  if (DumpCodeInstEmitter) {
+    // Disassemble instruction/operands to text
     DisasmLines.resize(DisasmLines.size() + 1);
     std::string &DisasmLine = DisasmLines.back();
     raw_string_ostream DisasmStream(DisasmLine);
 
-    AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
-                                  *STI.getInstrInfo(),
+    AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *STI.getInstrInfo(),
                                   *STI.getRegisterInfo());
     InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI);
 
@@ -337,10 +347,8 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     SmallVector<char, 16> CodeBytes;
     raw_svector_ostream CodeStream(CodeBytes);
 
-    auto &ObjStreamer = static_cast<MCObjectStreamer &>(*OutStreamer);
-    MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
-    InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups,
-                                  MF->getSubtarget());
+    DumpCodeInstEmitter->encodeInstruction(
+        TmpInst, CodeStream, Fixups, MF->getSubtarget());
 
     HexLines.resize(HexLines.size() + 1);
     std::string &HexLine = HexLines.back();
     raw_string_ostream HexStream(HexLine);
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 6f44e2dbb2d5..237490957058 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -1,9 +1,8 @@
 //===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 13b4b50149ce..0d3a1f1a769f 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPUMachineFunctionInfo.cpp ---------------------------------------=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -30,13 +29,13 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   // except reserved size is not correctly aligned.
   const Function &F = MF.getFunction();
 
-  if (auto *Resolver = MF.getMMI().getResolver()) {
-    if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
-          Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
-      MemoryBound = PHA->isMemoryBound(&F);
-      WaveLimiter = PHA->needsWaveLimiter(&F);
-    }
-  }
+  Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound");
+  MemoryBound = MemBoundAttr.isStringAttribute() &&
+                MemBoundAttr.getValueAsString() == "true";
+
+  Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
+  WaveLimiter = WaveLimitAttr.isStringAttribute() &&
+                WaveLimitAttr.getValueAsString() == "true";
 
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 8d6b871bc03e..52987e2fa411 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -1,9 +1,8 @@
 //===-- AMDGPUMachineFunctionInfo.h -------------------------------*- C++ -*-=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index 7b9f673c418c..4d9f08b3af01 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -1,9 +1,8 @@
 //===--- AMDGPUMachineModuleInfo.cpp ----------------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -24,6 +23,16 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
   AgentSSID = CTX.getOrInsertSyncScopeID("agent");
   WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
   WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
+  SystemOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("one-as");
+  AgentOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("agent-one-as");
+  WorkgroupOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("workgroup-one-as");
+  WavefrontOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("wavefront-one-as");
+  SingleThreadOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("singlethread-one-as");
 }
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 1219ab26fb69..2b0b8b42acfe 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -1,9 +1,8 @@
 //===--- AMDGPUMachineModuleInfo.h ------------------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -30,12 +29,22 @@ private:
   // All supported memory/synchronization scopes can be found here:
   //   http://llvm.org/docs/AMDGPUUsage.html#memory-scopes
 
-  /// Agent synchronization scope ID.
+  /// Agent synchronization scope ID (cross address space).
   SyncScope::ID AgentSSID;
-  /// Workgroup synchronization scope ID.
+  /// Workgroup synchronization scope ID (cross address space).
   SyncScope::ID WorkgroupSSID;
-  /// Wavefront synchronization scope ID.
+  /// Wavefront synchronization scope ID (cross address space).
   SyncScope::ID WavefrontSSID;
+  /// System synchronization scope ID (single address space).
+  SyncScope::ID SystemOneAddressSpaceSSID;
+  /// Agent synchronization scope ID (single address space).
+  SyncScope::ID AgentOneAddressSpaceSSID;
+  /// Workgroup synchronization scope ID (single address space).
+  SyncScope::ID WorkgroupOneAddressSpaceSSID;
+  /// Wavefront synchronization scope ID (single address space).
+  SyncScope::ID WavefrontOneAddressSpaceSSID;
+  /// Single thread synchronization scope ID (single address space).
+  SyncScope::ID SingleThreadOneAddressSpaceSSID;
 
   /// In AMDGPU target synchronization scopes are inclusive, meaning a
   /// larger synchronization scope is inclusive of a smaller synchronization
@@ -44,35 +53,70 @@ private:
   /// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not
   /// supported by the AMDGPU target.
   Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const {
-    if (SSID == SyncScope::SingleThread)
+    if (SSID == SyncScope::SingleThread ||
+        SSID == getSingleThreadOneAddressSpaceSSID())
      return 0;
-    else if (SSID == getWavefrontSSID())
+    else if (SSID == getWavefrontSSID() ||
+             SSID == getWavefrontOneAddressSpaceSSID())
      return 1;
-    else if (SSID == getWorkgroupSSID())
+    else if (SSID == getWorkgroupSSID() ||
+             SSID == getWorkgroupOneAddressSpaceSSID())
      return 2;
-    else if (SSID == getAgentSSID())
+    else if (SSID == getAgentSSID() ||
+             SSID == getAgentOneAddressSpaceSSID())
      return 3;
-    else if (SSID == SyncScope::System)
+    else if (SSID == SyncScope::System ||
+             SSID == getSystemOneAddressSpaceSSID())
      return 4;
 
     return None;
   }
 
+  /// \returns True if \p SSID is restricted to single address space, false
+  /// otherwise
+  bool isOneAddressSpace(SyncScope::ID SSID) const {
+    return SSID == getSingleThreadOneAddressSpaceSSID() ||
+        SSID == getWavefrontOneAddressSpaceSSID() ||
+        SSID == getWorkgroupOneAddressSpaceSSID() ||
+        SSID == getAgentOneAddressSpaceSSID() ||
+        SSID == getSystemOneAddressSpaceSSID();
+  }
+
 public:
   AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI);
 
-  /// \returns Agent synchronization scope ID.
+  /// \returns Agent synchronization scope ID (cross address space).
   SyncScope::ID getAgentSSID() const {
     return AgentSSID;
   }
-  /// \returns Workgroup synchronization scope ID.
+  /// \returns Workgroup synchronization scope ID (cross address space).
   SyncScope::ID getWorkgroupSSID() const {
     return WorkgroupSSID;
   }
-  /// \returns Wavefront synchronization scope ID.
+  /// \returns Wavefront synchronization scope ID (cross address space).
   SyncScope::ID getWavefrontSSID() const {
     return WavefrontSSID;
   }
+  /// \returns System synchronization scope ID (single address space).
+ SyncScope::ID getSystemOneAddressSpaceSSID() const { + return SystemOneAddressSpaceSSID; + } + /// \returns Agent synchronization scope ID (single address space). + SyncScope::ID getAgentOneAddressSpaceSSID() const { + return AgentOneAddressSpaceSSID; + } + /// \returns Workgroup synchronization scope ID (single address space). + SyncScope::ID getWorkgroupOneAddressSpaceSSID() const { + return WorkgroupOneAddressSpaceSSID; + } + /// \returns Wavefront synchronization scope ID (single address space). + SyncScope::ID getWavefrontOneAddressSpaceSSID() const { + return WavefrontOneAddressSpaceSSID; + } + /// \returns Single thread synchronization scope ID (single address space). + SyncScope::ID getSingleThreadOneAddressSpaceSSID() const { + return SingleThreadOneAddressSpaceSSID; + } /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization @@ -88,7 +132,11 @@ public: if (!AIO || !BIO) return None; - return AIO.getValue() > BIO.getValue(); + bool IsAOneAddressSpace = isOneAddressSpace(A); + bool IsBOneAddressSpace = isOneAddressSpace(B); + + return AIO.getValue() >= BIO.getValue() && + (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace); } }; diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp index 5e0b7d429022..8c11230f411a 100644 --- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -1,9 +1,8 @@ //===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/lib/Target/AMDGPU/AMDGPUMacroFusion.h index 844958580a65..da4b3cf8bc24 100644 --- a/lib/Target/AMDGPU/AMDGPUMacroFusion.h +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.h @@ -1,9 +1,8 @@ //===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 7bd8533a0ccf..f7231471c107 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -1,9 +1,8 @@ //===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -120,11 +119,11 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { auto T = ArrayType::get(Type::getInt64Ty(C), 2); auto *GV = new GlobalVariable( M, T, - /*IsConstant=*/false, GlobalValue::ExternalLinkage, + /*isConstant=*/false, GlobalValue::ExternalLinkage, /*Initializer=*/Constant::getNullValue(T), RuntimeHandle, /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, - /*IsExternallyInitialized=*/false); + /*isExternallyInitialized=*/false); LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); for (auto U : F.users()) { diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h index 2feff14d34a1..8b69f51c1a0d 100644 --- a/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -1,9 +1,8 @@ //===-- AMDGPUNoteType.h - AMDGPU ELF PT_NOTE section info-------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index e53a8fe7c074..9613d5a843b3 100644 --- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -1,9 +1,8 @@ //===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -18,6 +17,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -72,7 +72,7 @@ public: const TargetLowering *TLI_) : FIM(FIM_), DL(nullptr), TLI(TLI_) {} - void runOnFunction(Function &F); + bool runOnFunction(Function &F); private: struct MemAccessInfo { @@ -101,7 +101,7 @@ private: const TargetLowering *TLI; - void visit(const Function &F); + AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F); static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F); static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F); @@ -203,12 +203,8 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { return false; } -void AMDGPUPerfHint::visit(const Function &F) { - auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo())); - if (!FIP.second) - return; - - AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second; +AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { + AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F]; LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n'); @@ -234,10 +230,10 @@ void AMDGPUPerfHint::visit(const Function &F) { if (&F == Callee) // Handle immediate recursion continue; - visit(*Callee); auto Loc = FIM.find(Callee); + if (Loc == FIM.end()) + continue; - assert(Loc != FIM.end() && "No func info"); FI.MemInstCount += Loc->second.MemInstCount; FI.InstCount += Loc->second.InstCount; FI.IAMInstCount += Loc->second.IAMInstCount; @@ -257,36 +253,39 @@ void AMDGPUPerfHint::visit(const Function &F) { } } } -} -void AMDGPUPerfHint::runOnFunction(Function &F) { - if (FIM.find(&F) != FIM.end()) - return; + return &FI; +} +bool AMDGPUPerfHint::runOnFunction(Function &F) { const Module &M = *F.getParent(); DL = &M.getDataLayout(); - visit(F); - auto Loc = FIM.find(&F); + if (F.hasFnAttribute("amdgpu-wave-limiter") && + F.hasFnAttribute("amdgpu-memory-bound")) + return false; + + const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F); - assert(Loc != FIM.end() && "No func info"); - LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount + LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount << '\n' - << " IAMInst: " << Loc->second.IAMInstCount << '\n' - << " LSMInst: " << Loc->second.LSMInstCount << '\n' - << " TotalInst: " << Loc->second.InstCount << '\n'); - - auto &FI = Loc->second; + << " IAMInst: " << Info->IAMInstCount << '\n' + << " LSMInst: " << Info->LSMInstCount << '\n' + << " TotalInst: " << Info->InstCount << '\n'); - if (isMemBound(FI)) { + if (isMemBound(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); NumMemBound++; + F.addFnAttr("amdgpu-memory-bound", "true"); } - if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) { + if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n"); NumLimitWave++; + F.addFnAttr("amdgpu-wave-limiter", "true"); } + + return true; } bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { @@ -365,17 +364,27 @@ bool AMDGPUPerfHint::MemAccessInfo::isLargeStride( } } // namespace -bool 
AMDGPUPerfHintAnalysis::runOnFunction(Function &F) { +bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) { auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); if (!TPC) return false; const TargetMachine &TM = TPC->getTM<TargetMachine>(); - const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F); - AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering()); - Analyzer.runOnFunction(F); - return false; + bool Changed = false; + for (CallGraphNode *I : SCC) { + Function *F = I->getFunction(); + if (!F || F->isDeclaration()) + continue; + + const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F); + AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering()); + + if (Analyzer.runOnFunction(*F)) + Changed = true; + } + + return Changed; } bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const { diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index be7f37cb6815..9599e09fbd96 100644 --- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -1,9 +1,8 @@ -//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===// +//===- AMDGPUPerfHintAnalysis.h ---- analysis of memory traffic -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -163,12 +162,16 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { bool SufficientLDS = hasSufficientLocalMem(F); bool Changed = false; BasicBlock &EntryBB = *F.begin(); - for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) { - AllocaInst *AI = dyn_cast<AllocaInst>(I); - ++I; - if (AI) - Changed |= handleAlloca(*AI, SufficientLDS); + SmallVector<AllocaInst *, 16> Allocas; + for (Instruction &I : EntryBB) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) + Allocas.push_back(AI); + } + + for (AllocaInst *AI : Allocas) { + if (handleAlloca(*AI, SufficientLDS)) + Changed = true; } return Changed; @@ -245,11 +248,11 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { // We could do a single 64-bit load here, but it's likely that the basic // 32-bit and extract sequence is already present, and it is probably easier // to CSE this. The loads should be mergable later anyway. - Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1); - LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4); + Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1); + LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4); - Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2); - LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4); + Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2); + LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4); MDNode *MD = MDNode::get(Mod->getContext(), None); LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); @@ -427,7 +430,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); - Value *VecValue = Builder.CreateLoad(BitCast); + Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); Inst->replaceAllUsesWith(ExtractElement); Inst->eraseFromParent(); @@ -442,7 +445,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { Value *Ptr = SI->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); - Value *VecValue = Builder.CreateLoad(BitCast); + Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *NewVecValue = Builder.CreateInsertElement(VecValue, SI->getValueOperand(), Index); @@ -919,7 +922,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { ); CallInst *NewCall = Builder.CreateCall( - ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)}); + ObjectSize, + {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)}); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); continue; diff --git a/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp new file mode 100644 index 000000000000..7a7addd0f5cf --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -0,0 +1,336 @@ +//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass propagates attributes from kernels to the non-entry +/// functions. Most of the library functions were not compiled for a specific +/// ABI, yet they will be correctly compiled if proper attributes are +/// propagated from the caller. +/// +/// The pass analyzes the call graph and propagates ABI target features through +/// it. +/// +/// It can run in two modes: as a function or module pass. A function pass +/// simply propagates attributes. A module pass clones functions if there are +/// callers with different ABI. If a function is cloned all call sites will +/// be updated to use the correct clone. +/// +/// A function pass is limited in functionality but can run early in the +/// pipeline. A module pass is more powerful but has to run late, so it misses +/// library folding opportunities. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include <string> + +#define DEBUG_TYPE "amdgpu-propagate-attributes" + +using namespace llvm; + +namespace llvm { +extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1]; +} + +namespace { + +class AMDGPUPropagateAttributes { + const FeatureBitset TargetFeatures = { + AMDGPU::FeatureWavefrontSize16, + AMDGPU::FeatureWavefrontSize32, + AMDGPU::FeatureWavefrontSize64 + }; + + class Clone { + public: + Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) : + FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {} + + FeatureBitset FeatureMask; + Function *OrigF; + Function *NewF; + }; + + const TargetMachine *TM; + + // Clone functions as needed or just set attributes. + bool AllowClone; + + // Attribute propagation roots. + SmallSet<Function *, 32> Roots; + + // Clones of functions with their attributes. + SmallVector<Clone, 32> Clones; + + // Find a clone with required features. + Function *findFunction(const FeatureBitset &FeaturesNeeded, + Function *OrigF); + + // Clone function F and set NewFeatures on the clone. + // The clone takes the name of the original function. + Function *cloneWithFeatures(Function &F, + const FeatureBitset &NewFeatures); + + // Set new function's features in place. + void setFeatures(Function &F, const FeatureBitset &NewFeatures); + + std::string getFeatureString(const FeatureBitset &Features) const; + + // Propagate attributes from Roots. + bool process(); + +public: + AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) : + TM(TM), AllowClone(AllowClone) {} + + // Use F as a root and propagate its attributes. + bool process(Function &F); + + // Propagate attributes starting from kernel functions. + bool process(Module &M); +}; + +// Allows propagating attributes early, but no cloning is allowed as it must +// be a function pass to run before any optimizations. +// TODO: We should only need one instance of the module pass, but that needs +// to be in the linker pipeline, which is currently not possible.
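+// A minimal sketch of the intended effect (an illustration only, assuming a +// wave32 kernel calling a library helper that was built without explicit +// wavefront-size features): +// +//   define amdgpu_kernel void @kern() #0 { +//     call void @helper() +//     ret void +//   } +//   define void @helper() #1 { ret void } +// +//   attributes #0 = { "target-features"="+wavefrontsize32" } +//   attributes #1 = { "target-features"="" } +// +// After propagation @helper (or, in the module-pass mode, a clone of it) also +// carries "+wavefrontsize32", so caller and callee are compiled for the same +// ABI.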
+class AMDGPUPropagateAttributesEarly : public FunctionPass { + const TargetMachine *TM; + +public: + static char ID; // Pass identification + + AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) : + FunctionPass(ID), TM(TM) { + initializeAMDGPUPropagateAttributesEarlyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; + +// Allows propagating attributes with cloning, but does that late in the +// pipeline. +class AMDGPUPropagateAttributesLate : public ModulePass { + const TargetMachine *TM; + +public: + static char ID; // Pass identification + + AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) : + ModulePass(ID), TM(TM) { + initializeAMDGPUPropagateAttributesLatePass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // end anonymous namespace. + +char AMDGPUPropagateAttributesEarly::ID = 0; +char AMDGPUPropagateAttributesLate::ID = 0; + +INITIALIZE_PASS(AMDGPUPropagateAttributesEarly, + "amdgpu-propagate-attributes-early", + "Early propagate attributes from kernels to functions", + false, false) +INITIALIZE_PASS(AMDGPUPropagateAttributesLate, + "amdgpu-propagate-attributes-late", + "Late propagate attributes from kernels to functions", + false, false) + +Function * +AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded, + Function *OrigF) { + // TODO: search for clone's clones. + for (Clone &C : Clones) + if (C.OrigF == OrigF && FeaturesNeeded == C.FeatureMask) + return C.NewF; + + return nullptr; +} + +bool AMDGPUPropagateAttributes::process(Module &M) { + for (auto &F : M.functions()) + if (AMDGPU::isEntryFunctionCC(F.getCallingConv())) + Roots.insert(&F); + + return process(); +} + +bool AMDGPUPropagateAttributes::process(Function &F) { + Roots.insert(&F); + return process(); +} + +bool AMDGPUPropagateAttributes::process() { + bool Changed = false; + SmallSet<Function *, 32> NewRoots; + SmallSet<Function *, 32> Replaced; + + if (Roots.empty()) + return false; + Module &M = *(*Roots.begin())->getParent(); + + do { + Roots.insert(NewRoots.begin(), NewRoots.end()); + NewRoots.clear(); + + for (auto &F : M.functions()) { + if (F.isDeclaration() || Roots.count(&F)) + continue; + + const FeatureBitset &CalleeBits = + TM->getSubtargetImpl(F)->getFeatureBits(); + SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace; + + for (User *U : F.users()) { + Instruction *I = dyn_cast<Instruction>(U); + if (!I) + continue; + CallBase *CI = dyn_cast<CallBase>(I); + if (!CI) + continue; + Function *Caller = CI->getCaller(); + if (!Caller) + continue; + if (!Roots.count(Caller)) + continue; + + const FeatureBitset &CallerBits = + TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures; + + if (CallerBits == (CalleeBits & TargetFeatures)) { + NewRoots.insert(&F); + continue; + } + + Function *NewF = findFunction(CallerBits, &F); + if (!NewF) { + FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) | + CallerBits); + if (!AllowClone) { + // This may set different features on different iterations if + // there is a contradiction in callers' attributes. In this case + // we rely on a second pass running on Module, which is allowed + // to clone.
+ setFeatures(F, NewFeatures); + NewRoots.insert(&F); + Changed = true; + break; + } + + NewF = cloneWithFeatures(F, NewFeatures); + Clones.push_back(Clone(CallerBits, &F, NewF)); + NewRoots.insert(NewF); + } + + ToReplace.push_back(std::make_pair(CI, NewF)); + Replaced.insert(&F); + + Changed = true; + } + + while (!ToReplace.empty()) { + auto R = ToReplace.pop_back_val(); + R.first->setCalledFunction(R.second); + } + } + } while (!NewRoots.empty()); + + for (Function *F : Replaced) { + if (F->use_empty()) + F->eraseFromParent(); + } + + return Changed; +} + +Function * +AMDGPUPropagateAttributes::cloneWithFeatures(Function &F, + const FeatureBitset &NewFeatures) { + LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n'); + + ValueToValueMapTy dummy; + Function *NewF = CloneFunction(&F, dummy); + setFeatures(*NewF, NewFeatures); + + // Swap names. If this is the only clone it will retain the name of the + // now-dead value. + if (F.hasName()) { + std::string NewName = NewF->getName(); + NewF->takeName(&F); + F.setName(NewName); + + // The name has changed, so it does not need an external symbol. + F.setVisibility(GlobalValue::DefaultVisibility); + F.setLinkage(GlobalValue::InternalLinkage); + } + + return NewF; +} + +void AMDGPUPropagateAttributes::setFeatures(Function &F, + const FeatureBitset &NewFeatures) { + std::string NewFeatureStr = getFeatureString(NewFeatures); + + LLVM_DEBUG(dbgs() << "Set features " + << getFeatureString(NewFeatures & TargetFeatures) + << " on " << F.getName() << '\n'); + + F.removeFnAttr("target-features"); + F.addFnAttr("target-features", NewFeatureStr); +} + +std::string +AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const +{ + std::string Ret; + for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) { + if (Features[KV.Value]) + Ret += (StringRef("+") + KV.Key + ",").str(); + else if (TargetFeatures[KV.Value]) + Ret += (StringRef("-") + KV.Key + ",").str(); + } + Ret.pop_back(); // Remove last comma.
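+ + // With FeatureWavefrontSize64 set and the other wavefront-size features + // clear, the result looks something like + // "-wavefrontsize16,-wavefrontsize32,+wavefrontsize64", interleaved with + // any other enabled features, each prefixed with '+' (the exact string + // depends on the subtarget's full feature bitset).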
+ return Ret; +} + +bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) { + if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + return false; + + return AMDGPUPropagateAttributes(TM, false).process(F); +} + +bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { + if (!TM) + return false; + + return AMDGPUPropagateAttributes(TM, true).process(M); +} + +FunctionPass +*llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) { + return new AMDGPUPropagateAttributesEarly(TM); +} + +ModulePass +*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) { + return new AMDGPUPropagateAttributesLate(TM); +} diff --git a/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp b/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp deleted file mode 100644 index 36d88f52910d..000000000000 --- a/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp +++ /dev/null @@ -1,353 +0,0 @@ -//===-- AMDGPURegAsmNames.inc - Register asm names ----------*- C++ -*-----===// - -#ifdef AMDGPU_REG_ASM_NAMES - -static const char *const VGPR32RegNames[] = { - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31", "v32", "v33", "v34", "v35", - "v36", "v37", "v38", "v39", "v40", "v41", "v42", "v43", "v44", - "v45", "v46", "v47", "v48", "v49", "v50", "v51", "v52", "v53", - "v54", "v55", "v56", "v57", "v58", "v59", "v60", "v61", "v62", - "v63", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71", - "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "v80", - "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", - "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", - "v99", "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", - "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116", - "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124", "v125", - "v126", "v127", "v128", "v129", "v130", "v131", "v132", "v133", "v134", - "v135", "v136", "v137", "v138", "v139", "v140", "v141", "v142", "v143", - "v144", "v145", "v146", "v147", "v148", "v149", "v150", "v151", "v152", - "v153", "v154", "v155", "v156", "v157", "v158", "v159", "v160", "v161", - "v162", "v163", "v164", "v165", "v166", "v167", "v168", "v169", "v170", - "v171", "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", - "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188", - "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196", "v197", - "v198", "v199", "v200", "v201", "v202", "v203", "v204", "v205", "v206", - "v207", "v208", "v209", "v210", "v211", "v212", "v213", "v214", "v215", - "v216", "v217", "v218", "v219", "v220", "v221", "v222", "v223", "v224", - "v225", "v226", "v227", "v228", "v229", "v230", "v231", "v232", "v233", - "v234", "v235", "v236", "v237", "v238", "v239", "v240", "v241", "v242", - "v243", "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", - "v252", "v253", "v254", "v255" -}; - -static const char *const SGPR32RegNames[] = { - "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", - "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", - "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", - "s30", "s31", "s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39", - "s40", "s41", "s42", "s43", "s44", "s45", "s46", "s47", "s48", "s49", - "s50", "s51", "s52", "s53", "s54", "s55", "s56", "s57", "s58", "s59", - "s60", "s61", 
"s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69", - "s70", "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79", - "s80", "s81", "s82", "s83", "s84", "s85", "s86", "s87", "s88", "s89", - "s90", "s91", "s92", "s93", "s94", "s95", "s96", "s97", "s98", "s99", - "s100", "s101", "s102", "s103" -}; - -static const char *const VGPR64RegNames[] = { - "v[0:1]", "v[1:2]", "v[2:3]", "v[3:4]", "v[4:5]", - "v[5:6]", "v[6:7]", "v[7:8]", "v[8:9]", "v[9:10]", - "v[10:11]", "v[11:12]", "v[12:13]", "v[13:14]", "v[14:15]", - "v[15:16]", "v[16:17]", "v[17:18]", "v[18:19]", "v[19:20]", - "v[20:21]", "v[21:22]", "v[22:23]", "v[23:24]", "v[24:25]", - "v[25:26]", "v[26:27]", "v[27:28]", "v[28:29]", "v[29:30]", - "v[30:31]", "v[31:32]", "v[32:33]", "v[33:34]", "v[34:35]", - "v[35:36]", "v[36:37]", "v[37:38]", "v[38:39]", "v[39:40]", - "v[40:41]", "v[41:42]", "v[42:43]", "v[43:44]", "v[44:45]", - "v[45:46]", "v[46:47]", "v[47:48]", "v[48:49]", "v[49:50]", - "v[50:51]", "v[51:52]", "v[52:53]", "v[53:54]", "v[54:55]", - "v[55:56]", "v[56:57]", "v[57:58]", "v[58:59]", "v[59:60]", - "v[60:61]", "v[61:62]", "v[62:63]", "v[63:64]", "v[64:65]", - "v[65:66]", "v[66:67]", "v[67:68]", "v[68:69]", "v[69:70]", - "v[70:71]", "v[71:72]", "v[72:73]", "v[73:74]", "v[74:75]", - "v[75:76]", "v[76:77]", "v[77:78]", "v[78:79]", "v[79:80]", - "v[80:81]", "v[81:82]", "v[82:83]", "v[83:84]", "v[84:85]", - "v[85:86]", "v[86:87]", "v[87:88]", "v[88:89]", "v[89:90]", - "v[90:91]", "v[91:92]", "v[92:93]", "v[93:94]", "v[94:95]", - "v[95:96]", "v[96:97]", "v[97:98]", "v[98:99]", "v[99:100]", - "v[100:101]", "v[101:102]", "v[102:103]", "v[103:104]", "v[104:105]", - "v[105:106]", "v[106:107]", "v[107:108]", "v[108:109]", "v[109:110]", - "v[110:111]", "v[111:112]", "v[112:113]", "v[113:114]", "v[114:115]", - "v[115:116]", "v[116:117]", "v[117:118]", "v[118:119]", "v[119:120]", - "v[120:121]", "v[121:122]", "v[122:123]", "v[123:124]", "v[124:125]", - "v[125:126]", "v[126:127]", "v[127:128]", "v[128:129]", "v[129:130]", - "v[130:131]", "v[131:132]", "v[132:133]", "v[133:134]", "v[134:135]", - "v[135:136]", "v[136:137]", "v[137:138]", "v[138:139]", "v[139:140]", - "v[140:141]", "v[141:142]", "v[142:143]", "v[143:144]", "v[144:145]", - "v[145:146]", "v[146:147]", "v[147:148]", "v[148:149]", "v[149:150]", - "v[150:151]", "v[151:152]", "v[152:153]", "v[153:154]", "v[154:155]", - "v[155:156]", "v[156:157]", "v[157:158]", "v[158:159]", "v[159:160]", - "v[160:161]", "v[161:162]", "v[162:163]", "v[163:164]", "v[164:165]", - "v[165:166]", "v[166:167]", "v[167:168]", "v[168:169]", "v[169:170]", - "v[170:171]", "v[171:172]", "v[172:173]", "v[173:174]", "v[174:175]", - "v[175:176]", "v[176:177]", "v[177:178]", "v[178:179]", "v[179:180]", - "v[180:181]", "v[181:182]", "v[182:183]", "v[183:184]", "v[184:185]", - "v[185:186]", "v[186:187]", "v[187:188]", "v[188:189]", "v[189:190]", - "v[190:191]", "v[191:192]", "v[192:193]", "v[193:194]", "v[194:195]", - "v[195:196]", "v[196:197]", "v[197:198]", "v[198:199]", "v[199:200]", - "v[200:201]", "v[201:202]", "v[202:203]", "v[203:204]", "v[204:205]", - "v[205:206]", "v[206:207]", "v[207:208]", "v[208:209]", "v[209:210]", - "v[210:211]", "v[211:212]", "v[212:213]", "v[213:214]", "v[214:215]", - "v[215:216]", "v[216:217]", "v[217:218]", "v[218:219]", "v[219:220]", - "v[220:221]", "v[221:222]", "v[222:223]", "v[223:224]", "v[224:225]", - "v[225:226]", "v[226:227]", "v[227:228]", "v[228:229]", "v[229:230]", - "v[230:231]", "v[231:232]", "v[232:233]", "v[233:234]", "v[234:235]", - "v[235:236]", "v[236:237]", 
"v[237:238]", "v[238:239]", "v[239:240]", - "v[240:241]", "v[241:242]", "v[242:243]", "v[243:244]", "v[244:245]", - "v[245:246]", "v[246:247]", "v[247:248]", "v[248:249]", "v[249:250]", - "v[250:251]", "v[251:252]", "v[252:253]", "v[253:254]", "v[254:255]" -}; - -static const char *const VGPR96RegNames[] = { - "v[0:2]", "v[1:3]", "v[2:4]", "v[3:5]", "v[4:6]", - "v[5:7]", "v[6:8]", "v[7:9]", "v[8:10]", "v[9:11]", - "v[10:12]", "v[11:13]", "v[12:14]", "v[13:15]", "v[14:16]", - "v[15:17]", "v[16:18]", "v[17:19]", "v[18:20]", "v[19:21]", - "v[20:22]", "v[21:23]", "v[22:24]", "v[23:25]", "v[24:26]", - "v[25:27]", "v[26:28]", "v[27:29]", "v[28:30]", "v[29:31]", - "v[30:32]", "v[31:33]", "v[32:34]", "v[33:35]", "v[34:36]", - "v[35:37]", "v[36:38]", "v[37:39]", "v[38:40]", "v[39:41]", - "v[40:42]", "v[41:43]", "v[42:44]", "v[43:45]", "v[44:46]", - "v[45:47]", "v[46:48]", "v[47:49]", "v[48:50]", "v[49:51]", - "v[50:52]", "v[51:53]", "v[52:54]", "v[53:55]", "v[54:56]", - "v[55:57]", "v[56:58]", "v[57:59]", "v[58:60]", "v[59:61]", - "v[60:62]", "v[61:63]", "v[62:64]", "v[63:65]", "v[64:66]", - "v[65:67]", "v[66:68]", "v[67:69]", "v[68:70]", "v[69:71]", - "v[70:72]", "v[71:73]", "v[72:74]", "v[73:75]", "v[74:76]", - "v[75:77]", "v[76:78]", "v[77:79]", "v[78:80]", "v[79:81]", - "v[80:82]", "v[81:83]", "v[82:84]", "v[83:85]", "v[84:86]", - "v[85:87]", "v[86:88]", "v[87:89]", "v[88:90]", "v[89:91]", - "v[90:92]", "v[91:93]", "v[92:94]", "v[93:95]", "v[94:96]", - "v[95:97]", "v[96:98]", "v[97:99]", "v[98:100]", "v[99:101]", - "v[100:102]", "v[101:103]", "v[102:104]", "v[103:105]", "v[104:106]", - "v[105:107]", "v[106:108]", "v[107:109]", "v[108:110]", "v[109:111]", - "v[110:112]", "v[111:113]", "v[112:114]", "v[113:115]", "v[114:116]", - "v[115:117]", "v[116:118]", "v[117:119]", "v[118:120]", "v[119:121]", - "v[120:122]", "v[121:123]", "v[122:124]", "v[123:125]", "v[124:126]", - "v[125:127]", "v[126:128]", "v[127:129]", "v[128:130]", "v[129:131]", - "v[130:132]", "v[131:133]", "v[132:134]", "v[133:135]", "v[134:136]", - "v[135:137]", "v[136:138]", "v[137:139]", "v[138:140]", "v[139:141]", - "v[140:142]", "v[141:143]", "v[142:144]", "v[143:145]", "v[144:146]", - "v[145:147]", "v[146:148]", "v[147:149]", "v[148:150]", "v[149:151]", - "v[150:152]", "v[151:153]", "v[152:154]", "v[153:155]", "v[154:156]", - "v[155:157]", "v[156:158]", "v[157:159]", "v[158:160]", "v[159:161]", - "v[160:162]", "v[161:163]", "v[162:164]", "v[163:165]", "v[164:166]", - "v[165:167]", "v[166:168]", "v[167:169]", "v[168:170]", "v[169:171]", - "v[170:172]", "v[171:173]", "v[172:174]", "v[173:175]", "v[174:176]", - "v[175:177]", "v[176:178]", "v[177:179]", "v[178:180]", "v[179:181]", - "v[180:182]", "v[181:183]", "v[182:184]", "v[183:185]", "v[184:186]", - "v[185:187]", "v[186:188]", "v[187:189]", "v[188:190]", "v[189:191]", - "v[190:192]", "v[191:193]", "v[192:194]", "v[193:195]", "v[194:196]", - "v[195:197]", "v[196:198]", "v[197:199]", "v[198:200]", "v[199:201]", - "v[200:202]", "v[201:203]", "v[202:204]", "v[203:205]", "v[204:206]", - "v[205:207]", "v[206:208]", "v[207:209]", "v[208:210]", "v[209:211]", - "v[210:212]", "v[211:213]", "v[212:214]", "v[213:215]", "v[214:216]", - "v[215:217]", "v[216:218]", "v[217:219]", "v[218:220]", "v[219:221]", - "v[220:222]", "v[221:223]", "v[222:224]", "v[223:225]", "v[224:226]", - "v[225:227]", "v[226:228]", "v[227:229]", "v[228:230]", "v[229:231]", - "v[230:232]", "v[231:233]", "v[232:234]", "v[233:235]", "v[234:236]", - "v[235:237]", "v[236:238]", "v[237:239]", "v[238:240]", "v[239:241]", - 
"v[240:242]", "v[241:243]", "v[242:244]", "v[243:245]", "v[244:246]", - "v[245:247]", "v[246:248]", "v[247:249]", "v[248:250]", "v[249:251]", - "v[250:252]", "v[251:253]", "v[252:254]", "v[253:255]" -}; - -static const char *const VGPR128RegNames[] = { - "v[0:3]", "v[1:4]", "v[2:5]", "v[3:6]", "v[4:7]", - "v[5:8]", "v[6:9]", "v[7:10]", "v[8:11]", "v[9:12]", - "v[10:13]", "v[11:14]", "v[12:15]", "v[13:16]", "v[14:17]", - "v[15:18]", "v[16:19]", "v[17:20]", "v[18:21]", "v[19:22]", - "v[20:23]", "v[21:24]", "v[22:25]", "v[23:26]", "v[24:27]", - "v[25:28]", "v[26:29]", "v[27:30]", "v[28:31]", "v[29:32]", - "v[30:33]", "v[31:34]", "v[32:35]", "v[33:36]", "v[34:37]", - "v[35:38]", "v[36:39]", "v[37:40]", "v[38:41]", "v[39:42]", - "v[40:43]", "v[41:44]", "v[42:45]", "v[43:46]", "v[44:47]", - "v[45:48]", "v[46:49]", "v[47:50]", "v[48:51]", "v[49:52]", - "v[50:53]", "v[51:54]", "v[52:55]", "v[53:56]", "v[54:57]", - "v[55:58]", "v[56:59]", "v[57:60]", "v[58:61]", "v[59:62]", - "v[60:63]", "v[61:64]", "v[62:65]", "v[63:66]", "v[64:67]", - "v[65:68]", "v[66:69]", "v[67:70]", "v[68:71]", "v[69:72]", - "v[70:73]", "v[71:74]", "v[72:75]", "v[73:76]", "v[74:77]", - "v[75:78]", "v[76:79]", "v[77:80]", "v[78:81]", "v[79:82]", - "v[80:83]", "v[81:84]", "v[82:85]", "v[83:86]", "v[84:87]", - "v[85:88]", "v[86:89]", "v[87:90]", "v[88:91]", "v[89:92]", - "v[90:93]", "v[91:94]", "v[92:95]", "v[93:96]", "v[94:97]", - "v[95:98]", "v[96:99]", "v[97:100]", "v[98:101]", "v[99:102]", - "v[100:103]", "v[101:104]", "v[102:105]", "v[103:106]", "v[104:107]", - "v[105:108]", "v[106:109]", "v[107:110]", "v[108:111]", "v[109:112]", - "v[110:113]", "v[111:114]", "v[112:115]", "v[113:116]", "v[114:117]", - "v[115:118]", "v[116:119]", "v[117:120]", "v[118:121]", "v[119:122]", - "v[120:123]", "v[121:124]", "v[122:125]", "v[123:126]", "v[124:127]", - "v[125:128]", "v[126:129]", "v[127:130]", "v[128:131]", "v[129:132]", - "v[130:133]", "v[131:134]", "v[132:135]", "v[133:136]", "v[134:137]", - "v[135:138]", "v[136:139]", "v[137:140]", "v[138:141]", "v[139:142]", - "v[140:143]", "v[141:144]", "v[142:145]", "v[143:146]", "v[144:147]", - "v[145:148]", "v[146:149]", "v[147:150]", "v[148:151]", "v[149:152]", - "v[150:153]", "v[151:154]", "v[152:155]", "v[153:156]", "v[154:157]", - "v[155:158]", "v[156:159]", "v[157:160]", "v[158:161]", "v[159:162]", - "v[160:163]", "v[161:164]", "v[162:165]", "v[163:166]", "v[164:167]", - "v[165:168]", "v[166:169]", "v[167:170]", "v[168:171]", "v[169:172]", - "v[170:173]", "v[171:174]", "v[172:175]", "v[173:176]", "v[174:177]", - "v[175:178]", "v[176:179]", "v[177:180]", "v[178:181]", "v[179:182]", - "v[180:183]", "v[181:184]", "v[182:185]", "v[183:186]", "v[184:187]", - "v[185:188]", "v[186:189]", "v[187:190]", "v[188:191]", "v[189:192]", - "v[190:193]", "v[191:194]", "v[192:195]", "v[193:196]", "v[194:197]", - "v[195:198]", "v[196:199]", "v[197:200]", "v[198:201]", "v[199:202]", - "v[200:203]", "v[201:204]", "v[202:205]", "v[203:206]", "v[204:207]", - "v[205:208]", "v[206:209]", "v[207:210]", "v[208:211]", "v[209:212]", - "v[210:213]", "v[211:214]", "v[212:215]", "v[213:216]", "v[214:217]", - "v[215:218]", "v[216:219]", "v[217:220]", "v[218:221]", "v[219:222]", - "v[220:223]", "v[221:224]", "v[222:225]", "v[223:226]", "v[224:227]", - "v[225:228]", "v[226:229]", "v[227:230]", "v[228:231]", "v[229:232]", - "v[230:233]", "v[231:234]", "v[232:235]", "v[233:236]", "v[234:237]", - "v[235:238]", "v[236:239]", "v[237:240]", "v[238:241]", "v[239:242]", - "v[240:243]", "v[241:244]", "v[242:245]", "v[243:246]", 
"v[244:247]", - "v[245:248]", "v[246:249]", "v[247:250]", "v[248:251]", "v[249:252]", - "v[250:253]", "v[251:254]", "v[252:255]" -}; - -static const char *const VGPR256RegNames[] = { - "v[0:7]", "v[1:8]", "v[2:9]", "v[3:10]", "v[4:11]", - "v[5:12]", "v[6:13]", "v[7:14]", "v[8:15]", "v[9:16]", - "v[10:17]", "v[11:18]", "v[12:19]", "v[13:20]", "v[14:21]", - "v[15:22]", "v[16:23]", "v[17:24]", "v[18:25]", "v[19:26]", - "v[20:27]", "v[21:28]", "v[22:29]", "v[23:30]", "v[24:31]", - "v[25:32]", "v[26:33]", "v[27:34]", "v[28:35]", "v[29:36]", - "v[30:37]", "v[31:38]", "v[32:39]", "v[33:40]", "v[34:41]", - "v[35:42]", "v[36:43]", "v[37:44]", "v[38:45]", "v[39:46]", - "v[40:47]", "v[41:48]", "v[42:49]", "v[43:50]", "v[44:51]", - "v[45:52]", "v[46:53]", "v[47:54]", "v[48:55]", "v[49:56]", - "v[50:57]", "v[51:58]", "v[52:59]", "v[53:60]", "v[54:61]", - "v[55:62]", "v[56:63]", "v[57:64]", "v[58:65]", "v[59:66]", - "v[60:67]", "v[61:68]", "v[62:69]", "v[63:70]", "v[64:71]", - "v[65:72]", "v[66:73]", "v[67:74]", "v[68:75]", "v[69:76]", - "v[70:77]", "v[71:78]", "v[72:79]", "v[73:80]", "v[74:81]", - "v[75:82]", "v[76:83]", "v[77:84]", "v[78:85]", "v[79:86]", - "v[80:87]", "v[81:88]", "v[82:89]", "v[83:90]", "v[84:91]", - "v[85:92]", "v[86:93]", "v[87:94]", "v[88:95]", "v[89:96]", - "v[90:97]", "v[91:98]", "v[92:99]", "v[93:100]", "v[94:101]", - "v[95:102]", "v[96:103]", "v[97:104]", "v[98:105]", "v[99:106]", - "v[100:107]", "v[101:108]", "v[102:109]", "v[103:110]", "v[104:111]", - "v[105:112]", "v[106:113]", "v[107:114]", "v[108:115]", "v[109:116]", - "v[110:117]", "v[111:118]", "v[112:119]", "v[113:120]", "v[114:121]", - "v[115:122]", "v[116:123]", "v[117:124]", "v[118:125]", "v[119:126]", - "v[120:127]", "v[121:128]", "v[122:129]", "v[123:130]", "v[124:131]", - "v[125:132]", "v[126:133]", "v[127:134]", "v[128:135]", "v[129:136]", - "v[130:137]", "v[131:138]", "v[132:139]", "v[133:140]", "v[134:141]", - "v[135:142]", "v[136:143]", "v[137:144]", "v[138:145]", "v[139:146]", - "v[140:147]", "v[141:148]", "v[142:149]", "v[143:150]", "v[144:151]", - "v[145:152]", "v[146:153]", "v[147:154]", "v[148:155]", "v[149:156]", - "v[150:157]", "v[151:158]", "v[152:159]", "v[153:160]", "v[154:161]", - "v[155:162]", "v[156:163]", "v[157:164]", "v[158:165]", "v[159:166]", - "v[160:167]", "v[161:168]", "v[162:169]", "v[163:170]", "v[164:171]", - "v[165:172]", "v[166:173]", "v[167:174]", "v[168:175]", "v[169:176]", - "v[170:177]", "v[171:178]", "v[172:179]", "v[173:180]", "v[174:181]", - "v[175:182]", "v[176:183]", "v[177:184]", "v[178:185]", "v[179:186]", - "v[180:187]", "v[181:188]", "v[182:189]", "v[183:190]", "v[184:191]", - "v[185:192]", "v[186:193]", "v[187:194]", "v[188:195]", "v[189:196]", - "v[190:197]", "v[191:198]", "v[192:199]", "v[193:200]", "v[194:201]", - "v[195:202]", "v[196:203]", "v[197:204]", "v[198:205]", "v[199:206]", - "v[200:207]", "v[201:208]", "v[202:209]", "v[203:210]", "v[204:211]", - "v[205:212]", "v[206:213]", "v[207:214]", "v[208:215]", "v[209:216]", - "v[210:217]", "v[211:218]", "v[212:219]", "v[213:220]", "v[214:221]", - "v[215:222]", "v[216:223]", "v[217:224]", "v[218:225]", "v[219:226]", - "v[220:227]", "v[221:228]", "v[222:229]", "v[223:230]", "v[224:231]", - "v[225:232]", "v[226:233]", "v[227:234]", "v[228:235]", "v[229:236]", - "v[230:237]", "v[231:238]", "v[232:239]", "v[233:240]", "v[234:241]", - "v[235:242]", "v[236:243]", "v[237:244]", "v[238:245]", "v[239:246]", - "v[240:247]", "v[241:248]", "v[242:249]", "v[243:250]", "v[244:251]", - "v[245:252]", "v[246:253]", "v[247:254]", 
"v[248:255]" -}; - -static const char *const VGPR512RegNames[] = { - "v[0:15]", "v[1:16]", "v[2:17]", "v[3:18]", "v[4:19]", - "v[5:20]", "v[6:21]", "v[7:22]", "v[8:23]", "v[9:24]", - "v[10:25]", "v[11:26]", "v[12:27]", "v[13:28]", "v[14:29]", - "v[15:30]", "v[16:31]", "v[17:32]", "v[18:33]", "v[19:34]", - "v[20:35]", "v[21:36]", "v[22:37]", "v[23:38]", "v[24:39]", - "v[25:40]", "v[26:41]", "v[27:42]", "v[28:43]", "v[29:44]", - "v[30:45]", "v[31:46]", "v[32:47]", "v[33:48]", "v[34:49]", - "v[35:50]", "v[36:51]", "v[37:52]", "v[38:53]", "v[39:54]", - "v[40:55]", "v[41:56]", "v[42:57]", "v[43:58]", "v[44:59]", - "v[45:60]", "v[46:61]", "v[47:62]", "v[48:63]", "v[49:64]", - "v[50:65]", "v[51:66]", "v[52:67]", "v[53:68]", "v[54:69]", - "v[55:70]", "v[56:71]", "v[57:72]", "v[58:73]", "v[59:74]", - "v[60:75]", "v[61:76]", "v[62:77]", "v[63:78]", "v[64:79]", - "v[65:80]", "v[66:81]", "v[67:82]", "v[68:83]", "v[69:84]", - "v[70:85]", "v[71:86]", "v[72:87]", "v[73:88]", "v[74:89]", - "v[75:90]", "v[76:91]", "v[77:92]", "v[78:93]", "v[79:94]", - "v[80:95]", "v[81:96]", "v[82:97]", "v[83:98]", "v[84:99]", - "v[85:100]", "v[86:101]", "v[87:102]", "v[88:103]", "v[89:104]", - "v[90:105]", "v[91:106]", "v[92:107]", "v[93:108]", "v[94:109]", - "v[95:110]", "v[96:111]", "v[97:112]", "v[98:113]", "v[99:114]", - "v[100:115]", "v[101:116]", "v[102:117]", "v[103:118]", "v[104:119]", - "v[105:120]", "v[106:121]", "v[107:122]", "v[108:123]", "v[109:124]", - "v[110:125]", "v[111:126]", "v[112:127]", "v[113:128]", "v[114:129]", - "v[115:130]", "v[116:131]", "v[117:132]", "v[118:133]", "v[119:134]", - "v[120:135]", "v[121:136]", "v[122:137]", "v[123:138]", "v[124:139]", - "v[125:140]", "v[126:141]", "v[127:142]", "v[128:143]", "v[129:144]", - "v[130:145]", "v[131:146]", "v[132:147]", "v[133:148]", "v[134:149]", - "v[135:150]", "v[136:151]", "v[137:152]", "v[138:153]", "v[139:154]", - "v[140:155]", "v[141:156]", "v[142:157]", "v[143:158]", "v[144:159]", - "v[145:160]", "v[146:161]", "v[147:162]", "v[148:163]", "v[149:164]", - "v[150:165]", "v[151:166]", "v[152:167]", "v[153:168]", "v[154:169]", - "v[155:170]", "v[156:171]", "v[157:172]", "v[158:173]", "v[159:174]", - "v[160:175]", "v[161:176]", "v[162:177]", "v[163:178]", "v[164:179]", - "v[165:180]", "v[166:181]", "v[167:182]", "v[168:183]", "v[169:184]", - "v[170:185]", "v[171:186]", "v[172:187]", "v[173:188]", "v[174:189]", - "v[175:190]", "v[176:191]", "v[177:192]", "v[178:193]", "v[179:194]", - "v[180:195]", "v[181:196]", "v[182:197]", "v[183:198]", "v[184:199]", - "v[185:200]", "v[186:201]", "v[187:202]", "v[188:203]", "v[189:204]", - "v[190:205]", "v[191:206]", "v[192:207]", "v[193:208]", "v[194:209]", - "v[195:210]", "v[196:211]", "v[197:212]", "v[198:213]", "v[199:214]", - "v[200:215]", "v[201:216]", "v[202:217]", "v[203:218]", "v[204:219]", - "v[205:220]", "v[206:221]", "v[207:222]", "v[208:223]", "v[209:224]", - "v[210:225]", "v[211:226]", "v[212:227]", "v[213:228]", "v[214:229]", - "v[215:230]", "v[216:231]", "v[217:232]", "v[218:233]", "v[219:234]", - "v[220:235]", "v[221:236]", "v[222:237]", "v[223:238]", "v[224:239]", - "v[225:240]", "v[226:241]", "v[227:242]", "v[228:243]", "v[229:244]", - "v[230:245]", "v[231:246]", "v[232:247]", "v[233:248]", "v[234:249]", - "v[235:250]", "v[236:251]", "v[237:252]", "v[238:253]", "v[239:254]", - "v[240:255]" -}; - -static const char *const SGPR64RegNames[] = { - "s[0:1]", "s[2:3]", "s[4:5]", "s[6:7]", "s[8:9]", "s[10:11]", - "s[12:13]", "s[14:15]", "s[16:17]", "s[18:19]", "s[20:21]", "s[22:23]", - "s[24:25]", 
"s[26:27]", "s[28:29]", "s[30:31]", "s[32:33]", "s[34:35]", - "s[36:37]", "s[38:39]", "s[40:41]", "s[42:43]", "s[44:45]", "s[46:47]", - "s[48:49]", "s[50:51]", "s[52:53]", "s[54:55]", "s[56:57]", "s[58:59]", - "s[60:61]", "s[62:63]", "s[64:65]", "s[66:67]", "s[68:69]", "s[70:71]", - "s[72:73]", "s[74:75]", "s[76:77]", "s[78:79]", "s[80:81]", "s[82:83]", - "s[84:85]", "s[86:87]", "s[88:89]", "s[90:91]", "s[92:93]", "s[94:95]", - "s[96:97]", "s[98:99]", "s[100:101]", "s[102:103]" -}; - -static const char *const SGPR128RegNames[] = { - "s[0:3]", "s[4:7]", "s[8:11]", "s[12:15]", "s[16:19]", "s[20:23]", - "s[24:27]", "s[28:31]", "s[32:35]", "s[36:39]", "s[40:43]", "s[44:47]", - "s[48:51]", "s[52:55]", "s[56:59]", "s[60:63]", "s[64:67]", "s[68:71]", - "s[72:75]", "s[76:79]", "s[80:83]", "s[84:87]", "s[88:91]", "s[92:95]", - "s[96:99]", "s[100:103]" -}; - -static const char *const SGPR256RegNames[] = { - "s[0:7]", "s[4:11]", "s[8:15]", "s[12:19]", "s[16:23]", - "s[20:27]", "s[24:31]", "s[28:35]", "s[32:39]", "s[36:43]", - "s[40:47]", "s[44:51]", "s[48:55]", "s[52:59]", "s[56:63]", - "s[60:67]", "s[64:71]", "s[68:75]", "s[72:79]", "s[76:83]", - "s[80:87]", "s[84:91]", "s[88:95]", "s[92:99]", "s[96:103]" -}; - -static const char *const SGPR512RegNames[] = { - "s[0:15]", "s[4:19]", "s[8:23]", "s[12:27]", "s[16:31]", "s[20:35]", - "s[24:39]", "s[28:43]", "s[32:47]", "s[36:51]", "s[40:55]", "s[44:59]", - "s[48:63]", "s[52:67]", "s[56:71]", "s[60:75]", "s[64:79]", "s[68:83]", - "s[72:87]", "s[76:91]", "s[80:95]", "s[84:99]", "s[88:103]" -}; - -#endif diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 7a760dcf7a90..815cbc5e26ee 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1,9 +1,8 @@ //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -14,9 +13,13 @@ #include "AMDGPURegisterBankInfo.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -31,6 +34,56 @@ using namespace llvm; +namespace { + +// Observer to apply a register bank to new registers created by LegalizerHelper. +class ApplyRegBankMapping final : public GISelChangeObserver { +private: + MachineRegisterInfo &MRI; + const RegisterBank *NewBank; + SmallVector NewInsts; + +public: + ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB) + : MRI(MRI_), NewBank(RB) {} + + ~ApplyRegBankMapping() { + for (MachineInstr *MI : NewInsts) + applyBank(*MI); + } + + /// Set any registers that don't have a set register class or bank to SALU. 
+ void applyBank(MachineInstr &MI) { + for (MachineOperand &Op : MI.operands()) { + if (!Op.isReg()) + continue; + + Register Reg = Op.getReg(); + if (MRI.getRegClassOrRegBank(Reg)) + continue; + + const RegisterBank *RB = NewBank; + // FIXME: This might not be enough to detect when SCC should be used. + if (MRI.getType(Reg) == LLT::scalar(1)) + RB = (NewBank == &AMDGPU::SGPRRegBank ? + &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank); + + MRI.setRegBank(Reg, *RB); + } + } + + void erasingInstr(MachineInstr &MI) override {} + + void createdInstr(MachineInstr &MI) override { + // At this point, the instruction was just inserted and has no operands. + NewInsts.push_back(&MI); + } + + void changingInstr(MachineInstr &MI) override {} + void changedInstr(MachineInstr &MI) override {} +}; + +} AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) : AMDGPUGenRegisterBankInfo(), TRI(static_cast<const SIRegisterInfo *>(&TRI)) { @@ -52,43 +105,62 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) } -static bool isConstant(const MachineOperand &MO, int64_t &C) { - const MachineFunction *MF = MO.getParent()->getParent()->getParent(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const MachineInstr *Def = MRI.getVRegDef(MO.getReg()); - if (!Def) - return false; - - if (Def->getOpcode() == AMDGPU::G_CONSTANT) { - C = Def->getOperand(1).getCImm()->getSExtValue(); - return true; - } - - if (Def->getOpcode() == AMDGPU::COPY) - return isConstant(Def->getOperand(1), C); - - return false; -} - unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, const RegisterBank &Src, unsigned Size) const { + // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? if (Dst.getID() == AMDGPU::SGPRRegBankID && Src.getID() == AMDGPU::VGPRRegBankID) { return std::numeric_limits<unsigned>::max(); } - // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by - // the valu. - if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID && + // Bool values are tricky, because the meaning is based on context. The SCC + // and VCC banks are for the natural scalar and vector conditions produced by + // a compare. + // + // Legalization doesn't know about the necessary context, so an s1 use may + // have been a truncate from an arbitrary value, in which case a copy (lowered + // as a compare with 0) needs to be inserted. + if (Size == 1 && + (Dst.getID() == AMDGPU::SCCRegBankID || + Dst.getID() == AMDGPU::SGPRRegBankID) && (Src.getID() == AMDGPU::SGPRRegBankID || Src.getID() == AMDGPU::VGPRRegBankID || Src.getID() == AMDGPU::VCCRegBankID)) return std::numeric_limits<unsigned>::max(); + if (Dst.getID() == AMDGPU::SCCRegBankID && + Src.getID() == AMDGPU::VCCRegBankID) + return std::numeric_limits<unsigned>::max(); + return RegisterBankInfo::copyCost(Dst, Src, Size); } +unsigned AMDGPURegisterBankInfo::getBreakDownCost( + const ValueMapping &ValMapping, + const RegisterBank *CurBank) const { + // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to + // VGPR. + // FIXME: Is there a better way to do this? + if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) + return 10; // This is expensive.
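+ + // The shape checked for below is a 64-bit value split into two 32-bit + // pieces, { StartIdx 0, Length 32 } and { StartIdx 32, Length 32 }, both in + // the same bank (for instance an SGPR pointer being moved to VGPRs).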
+ + assert(ValMapping.NumBreakDowns == 2 && + ValMapping.BreakDown[0].Length == 32 && + ValMapping.BreakDown[0].StartIdx == 0 && + ValMapping.BreakDown[1].Length == 32 && + ValMapping.BreakDown[1].StartIdx == 32 && + ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); + + // 32-bit extract of a 64-bit value is just access of a subregister, so free. + // TODO: Cost of 0 hits assert, though it's not clear it's what we really + // want. + + // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR + // alignment restrictions, but this probably isn't important. + return 1; +} + const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( const TargetRegisterClass &RC) const { @@ -98,6 +170,163 @@ const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( return getRegBank(AMDGPU::VGPRRegBankID); } +template <unsigned NumOps> +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::addMappingFromTable( + const MachineInstr &MI, const MachineRegisterInfo &MRI, + const std::array<unsigned, NumOps> RegSrcOpIdx, + ArrayRef<OpRegBankEntry<NumOps>> Table) const { + + InstructionMappings AltMappings; + + SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); + + unsigned Sizes[NumOps]; + for (unsigned I = 0; I < NumOps; ++I) { + Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); + Sizes[I] = getSizeInBits(Reg, MRI, *TRI); + } + + for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { + unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); + Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); + } + + unsigned MappingID = 0; + for (const auto &Entry : Table) { + for (unsigned I = 0; I < NumOps; ++I) { + int OpIdx = RegSrcOpIdx[I]; + Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); + } + + AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, + getOperandsMapping(Operands), + Operands.size())); + } + + return AltMappings; +} + +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( + const MachineInstr &MI, const MachineRegisterInfo &MRI) const { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + case Intrinsic::amdgcn_readlane: { + static const OpRegBankEntry<3> Table[2] = { + // Perfectly legal. + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, + + // Need a readfirstlane for the index. + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } + }; + + const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; + return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + case Intrinsic::amdgcn_writelane: { + static const OpRegBankEntry<4> Table[4] = { + // Perfectly legal.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + + // Need readfirstlane of first op + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, + + // Need readfirstlane of second op + { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, + + // Need readfirstlane of both ops + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } + }; + + // dst, value, lane, old value + const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } }; + return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + default: + return RegisterBankInfo::getInstrAlternativeMappings(MI); + } +} + +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( + const MachineInstr &MI, const MachineRegisterInfo &MRI) const { + + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + case Intrinsic::amdgcn_buffer_load: { + static const OpRegBankEntry<3> Table[4] = { + // Perfectly legal. + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + + // Waterfall loop needed for rsrc. In the worst case this will execute + // approximately an extra 10 * wavesize + 2 instructions. + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 } + }; + + // rsrc, voffset, offset + const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } }; + return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + case Intrinsic::amdgcn_s_buffer_load: { + static const OpRegBankEntry<2> Table[4] = { + // Perfectly legal. + { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, + + // Only need 1 register in loop + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, + + // Have to waterfall the resource. + { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, + + // Have to waterfall the resource, and the offset. + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } + }; + + // rsrc, offset + const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } }; + return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: { + // VGPR = M0, VGPR + static const OpRegBankEntry<3> Table[2] = { + // Perfectly legal. + { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + + // Need a readfirstlane for m0 + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } + }; + + const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; + return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + case Intrinsic::amdgcn_s_sendmsg: + case Intrinsic::amdgcn_s_sendmsghalt: { + static const OpRegBankEntry<1> Table[2] = { + // Perfectly legal.
+ { { AMDGPU::SGPRRegBankID }, 1 }, + + // Need readlane + { { AMDGPU::VGPRRegBankID }, 3 } + }; + + const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } }; + return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + default: + return RegisterBankInfo::getInstrAlternativeMappings(MI); + } +} + +static bool isInstrUniform(const MachineInstr &MI) { + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand *MMO = *MI.memoperands_begin(); + return AMDGPUInstrInfo::isUniformMMO(MMO); +} + RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappings( const MachineInstr &MI) const { @@ -108,31 +337,102 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( InstructionMappings AltMappings; switch (MI.getOpcode()) { - case TargetOpcode::G_LOAD: { + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: { unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); - // FIXME: Should we be hard coding the size for these mappings? - const InstructionMapping &SSMapping = getInstructionMapping( + + if (Size == 1) { + // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. + const InstructionMapping &SCCMapping = getInstructionMapping( 1, 1, getOperandsMapping( - {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), - 2); // Num Operands + {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), + 3); // Num Operands + AltMappings.push_back(&SCCMapping); + + const InstructionMapping &SGPRMapping = getInstructionMapping( + 1, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), + 3); // Num Operands + AltMappings.push_back(&SGPRMapping); + + const InstructionMapping &VCCMapping0 = getInstructionMapping( + 2, 10, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), + 3); // Num Operands + AltMappings.push_back(&VCCMapping0); + return AltMappings; + } + + if (Size != 64) + break; + + const InstructionMapping &SSMapping = getInstructionMapping( + 1, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), + 3); // Num Operands AltMappings.push_back(&SSMapping); + const InstructionMapping &VVMapping = getInstructionMapping( + 2, 2, getOperandsMapping( + {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), + 3); // Num Operands + AltMappings.push_back(&VVMapping); + + const InstructionMapping &SVMapping = getInstructionMapping( + 3, 3, getOperandsMapping( + {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), + 3); // Num Operands + AltMappings.push_back(&SVMapping); + + // SGPR in LHS is slightly preferable, so make it VS more expensive than + // SV.
+ const InstructionMapping &VSMapping = getInstructionMapping( + 3, 4, getOperandsMapping( + {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}), + 3); // Num Operands + AltMappings.push_back(&VSMapping); + break; + } + case TargetOpcode::G_LOAD: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); + // FIXME: Should we be hard coding the size for these mappings? + if (isInstrUniform(MI)) { + const InstructionMapping &SSMapping = getInstructionMapping( + 1, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.push_back(&SSMapping); + } + const InstructionMapping &VVMapping = getInstructionMapping( 2, 1, getOperandsMapping( - {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), 2); // Num Operands AltMappings.push_back(&VVMapping); - // FIXME: Should this be the pointer-size (64-bits) or the size of the - // register that will hold the bufffer resourc (128-bits). - const InstructionMapping &VSMapping = getInstructionMapping( - 3, 1, getOperandsMapping( - {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), - 2); // Num Operands - AltMappings.push_back(&VSMapping); + // It may be possible to have a vgpr = load sgpr mapping here, because + // the mubuf instructions support this kind of load, but probably for only + // gfx7 and older. However, the addressing mode matching in the instruction + // selector should be able to do a better job of detecting and selecting + // these kinds of loads from the vgpr = load vgpr mapping. return AltMappings; @@ -184,15 +484,32 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AltMappings.push_back(&SSMapping); const InstructionMapping &VVMapping = getInstructionMapping(2, 1, - getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 4); // Num Operands AltMappings.push_back(&VVMapping); return AltMappings; } + case TargetOpcode::G_SMIN: + case TargetOpcode::G_SMAX: + case TargetOpcode::G_UMIN: + case TargetOpcode::G_UMAX: { + static const OpRegBankEntry<3> Table[4] = { + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, + + // Scalar requires cmp+select, and extends if 16-bit. 
+ // FIXME: Should there be separate costs for 32 and 16-bit + { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 } + }; + + const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } }; + return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } case TargetOpcode::G_UADDE: case TargetOpcode::G_USUBE: case TargetOpcode::G_SADDE: @@ -234,23 +551,816 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AltMappings.push_back(&VMapping); return AltMappings; } + case AMDGPU::G_INTRINSIC: + return getInstrAlternativeMappingsIntrinsic(MI, MRI); + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: + return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); default: break; } return RegisterBankInfo::getInstrAlternativeMappings(MI); } -void AMDGPURegisterBankInfo::applyMappingImpl( - const OperandsMapper &OpdMapper) const { - return applyDefaultMapping(OpdMapper); +void AMDGPURegisterBankInfo::split64BitValueForMapping( + MachineIRBuilder &B, + SmallVector<Register, 2> &Regs, + LLT HalfTy, + Register Reg) const { + assert(HalfTy.getSizeInBits() == 32); + MachineRegisterInfo *MRI = B.getMRI(); + Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); + Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); + const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI); + MRI->setRegBank(LoLHS, *Bank); + MRI->setRegBank(HiLHS, *Bank); + + Regs.push_back(LoLHS); + Regs.push_back(HiLHS); + + B.buildInstr(AMDGPU::G_UNMERGE_VALUES) + .addDef(LoLHS) + .addDef(HiLHS) + .addUse(Reg); } +/// Replace the current type each register in \p Regs has with \p NewTy +static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, + LLT NewTy) { + for (Register Reg : Regs) { + assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()); + MRI.setType(Reg, NewTy); + } +} + +static LLT getHalfSizedType(LLT Ty) { + if (Ty.isVector()) { + assert(Ty.getNumElements() % 2 == 0); + return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType()); + } + + assert(Ty.getSizeInBits() % 2 == 0); + return LLT::scalar(Ty.getSizeInBits() / 2); +} + +/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If +/// any of the required SGPR operands are VGPRs, perform a waterfall loop to +/// execute the instruction for each unique combination of values in all lanes +/// in the wave. The block will be split such that the rest of the instructions +/// are moved to a new block. +/// +/// Essentially performs this loop: +// +/// Save Execution Mask +/// For (Lane : Wavefront) { +/// Enable Lane, Disable all other lanes +/// SGPR = read SGPR value for current lane from VGPR +/// VGPRResult[Lane] = use_op SGPR +/// } +/// Restore Execution Mask +/// +/// There is additional complexity in comparing the values to identify the +/// unique values used. +void AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const { + MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + MachineBasicBlock::iterator I(MI); + + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + // Use a set to avoid extra readfirstlanes in the case where multiple operands + // are the same register.
+  SmallSet SGPROperandRegs;
+  for (unsigned Op : OpIndices) {
+    assert(MI.getOperand(Op).isUse());
+    Register Reg = MI.getOperand(Op).getReg();
+    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
+    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
+      SGPROperandRegs.insert(Reg);
+  }
+
+  // No operands need to be replaced, so no need to loop.
+  if (SGPROperandRegs.empty())
+    return;
+
+  MachineIRBuilder B(MI);
+  SmallVector ResultRegs;
+  SmallVector InitResultRegs;
+  SmallVector PhiRegs;
+  for (MachineOperand &Def : MI.defs()) {
+    LLT ResTy = MRI.getType(Def.getReg());
+    const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
+    ResultRegs.push_back(Def.getReg());
+    Register InitReg = B.buildUndef(ResTy).getReg(0);
+    Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
+    InitResultRegs.push_back(InitReg);
+    PhiRegs.push_back(PhiReg);
+    MRI.setRegBank(PhiReg, *DefBank);
+    MRI.setRegBank(InitReg, *DefBank);
+  }
+
+  Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+  // Don't bother using generic instructions/registers for the exec mask.
+  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
+    .addDef(InitSaveExecReg);
+
+  Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block before this
+  // instruction.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RestoreExecBB);
+  MF->insert(MBBI, RemainderBB);
+
+  LoopBB->addSuccessor(RestoreExecBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+  MBB.addSuccessor(LoopBB);
+  RestoreExecBB->addSuccessor(RemainderBB);
+
+  B.setInsertPt(*LoopBB, LoopBB->end());
+
+  B.buildInstr(TargetOpcode::PHI)
+    .addDef(PhiExec)
+    .addReg(InitSaveExecReg)
+    .addMBB(&MBB)
+    .addReg(NewExec)
+    .addMBB(LoopBB);
+
+  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
+    B.buildInstr(TargetOpcode::G_PHI)
+      .addDef(std::get<2>(Result))
+      .addReg(std::get<0>(Result)) // Initial value / implicit_def
+      .addMBB(&MBB)
+      .addReg(std::get<1>(Result)) // Mid-loop value.
+      .addMBB(LoopBB);
+  }
+
+  // Move the instruction into the loop.
+  LoopBB->splice(LoopBB->end(), &MBB, I);
+  I = std::prev(LoopBB->end());
+
+  B.setInstr(*I);
+
+  Register CondReg;
+
+  for (MachineOperand &Op : MI.uses()) {
+    if (!Op.isReg())
+      continue;
+
+    assert(!Op.isDef());
+    if (SGPROperandRegs.count(Op.getReg())) {
+      LLT OpTy = MRI.getType(Op.getReg());
+      unsigned OpSize = OpTy.getSizeInBits();
+
+      // Can only do a readlane of 32-bit pieces.
+      if (OpSize == 32) {
+        // Avoid extra copies in the simple case of one 32-bit register.
+        Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+        MRI.setType(CurrentLaneOpReg, OpTy);
+
+        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+        // Read the next variant <- also loop target.
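+        // v_readfirstlane_b32 copies this operand's value in the first active
+        // lane into a scalar register; the v_cmp_eq below then selects every
+        // lane that holds that same value.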
+        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
+          .addReg(Op.getReg());
+
+        Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+        bool First = CondReg == AMDGPU::NoRegister;
+        if (First)
+          CondReg = NewCondReg;
+
+        // Compare the value just read against this operand's value in every
+        // lane.
+        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+          .addDef(NewCondReg)
+          .addReg(CurrentLaneOpReg)
+          .addReg(Op.getReg());
+        Op.setReg(CurrentLaneOpReg);
+
+        if (!First) {
+          Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+          // If there are multiple operands to consider, AND the conditions
+          // together.
+          B.buildInstr(AMDGPU::S_AND_B64)
+            .addDef(AndReg)
+            .addReg(NewCondReg)
+            .addReg(CondReg);
+          CondReg = AndReg;
+        }
+      } else {
+        LLT S32 = LLT::scalar(32);
+        SmallVector ReadlanePieces;
+
+        // The compares can be done as 64-bit, but the extract needs to be done
+        // in 32-bit pieces.
+
+        bool Is64 = OpSize % 64 == 0;
+
+        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
+        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
+                                          : AMDGPU::V_CMP_EQ_U32_e64;
+
+        // Insert the unmerge before the loop.
+        B.setMBB(MBB);
+        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+        B.setInstr(*I);
+
+        unsigned NumPieces = Unmerge->getNumOperands() - 1;
+        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
+          unsigned UnmergePiece = Unmerge.getReg(PieceIdx);
+
+          Register CurrentLaneOpReg;
+          if (Is64) {
+            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
+            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+
+            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
+            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
+            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+
+            // Read the next variant <- also loop target.
+            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    CurrentLaneOpRegLo)
+              .addReg(UnmergePiece, 0, AMDGPU::sub0);
+
+            // Read the next variant <- also loop target.
+            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    CurrentLaneOpRegHi)
+              .addReg(UnmergePiece, 0, AMDGPU::sub1);
+
+            CurrentLaneOpReg =
+              B.buildMerge(LLT::scalar(64),
+                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
+              .getReg(0);
+
+            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
+
+            if (OpTy.getScalarSizeInBits() == 64) {
+              // We need to produce a 64-bit element vector, so use the
+              // merged pieces.
+              ReadlanePieces.push_back(CurrentLaneOpReg);
+            } else {
+              // 32-bit element type.
+              ReadlanePieces.push_back(CurrentLaneOpRegLo);
+              ReadlanePieces.push_back(CurrentLaneOpRegHi);
+            }
+          } else {
+            CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
+            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
+
+            // Read the next variant <- also loop target.
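+            // v_readfirstlane_b32 only moves 32 bits at a time, so each
+            // unmerged piece is read back individually here.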
+            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    CurrentLaneOpReg)
+              .addReg(UnmergePiece);
+            ReadlanePieces.push_back(CurrentLaneOpReg);
+          }
+
+          Register NewCondReg
+            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+          bool First = CondReg == AMDGPU::NoRegister;
+          if (First)
+            CondReg = NewCondReg;
+
+          B.buildInstr(CmpOp)
+            .addDef(NewCondReg)
+            .addReg(CurrentLaneOpReg)
+            .addReg(UnmergePiece);
+
+          if (!First) {
+            Register AndReg
+              = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+            // If there are multiple operands to consider, AND the conditions
+            // together.
+            B.buildInstr(AMDGPU::S_AND_B64)
+              .addDef(AndReg)
+              .addReg(NewCondReg)
+              .addReg(CondReg);
+            CondReg = AndReg;
+          }
+        }
+
+        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
+        // BUILD_VECTOR.
+        if (OpTy.isVector()) {
+          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
+          Op.setReg(Merge.getReg(0));
+        } else {
+          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
+          Op.setReg(Merge.getReg(0));
+        }
+
+        MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
+      }
+    }
+  }
+
+  B.setInsertPt(*LoopBB, LoopBB->end());
+
+  // Update EXEC, save the original EXEC value to VCC.
+  B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
+    .addDef(NewExec)
+    .addReg(CondReg, RegState::Kill);
+
+  MRI.setSimpleHint(NewExec, CondReg);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+  B.buildInstr(AMDGPU::S_XOR_B64_term)
+    .addDef(AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(NewExec);
+
+  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+  // s_cbranch_scc0?
+
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
+    .addMBB(LoopBB);
+
+  // Save the EXEC mask before the loop.
+  BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
+    .addReg(AMDGPU::EXEC);
+
+  // Restore the EXEC mask after the loop.
+  B.setMBB(*RestoreExecBB);
+  B.buildInstr(AMDGPU::S_MOV_B64_term)
+    .addDef(AMDGPU::EXEC)
+    .addReg(SaveExecReg);
+}
+
+// Legalize an operand that must be an SGPR by inserting a readfirstlane.
+void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
+    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
+  Register Reg = MI.getOperand(OpIdx).getReg();
+  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+  if (Bank != &AMDGPU::VGPRRegBank)
+    return;
+
+  MachineIRBuilder B(MI);
+  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
+    .addDef(SGPR)
+    .addReg(Reg);
+
+  const TargetRegisterClass *Constrained =
+      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
+  (void)Constrained;
+  assert(Constrained && "Failed to constrain readfirstlane src reg");
+
+  MI.getOperand(OpIdx).setReg(SGPR);
+}
+
+// When regbankselect repairs registers, it will insert a repair instruction
+// which defines the repaired register. Then it calls applyMapping and expects
+// that the targets will either delete or rewrite the instructions that
+// originally wrote to the repaired registers. Because of this, we end up in a
+// situation where we have 2 instructions defining the same registers.
+static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
+                                     Register Reg,
+                                     const MachineInstr &MI) {
+  // Is there some way we can assert that there are exactly 2 def instructions?
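+  // One possible sketch of such a check (not exhaustive):
+  //   assert(std::distance(MRI.def_instructions(Reg).begin(),
+  //                        MRI.def_instructions(Reg).end()) == 2);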
+ for (MachineInstr &Other : MRI.def_instructions(Reg)) { + if (&Other != &MI) + return &Other; + } + + return nullptr; +} + +bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI) const { + Register DstReg = MI.getOperand(0).getReg(); + const LLT LoadTy = MRI.getType(DstReg); + unsigned LoadSize = LoadTy.getSizeInBits(); + const unsigned MaxNonSmrdLoadSize = 128; + // 128-bit loads are supported for all instruction types. + if (LoadSize <= MaxNonSmrdLoadSize) return false; - const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPUInstrInfo::isUniformMMO(MMO); + SmallVector DefRegs(OpdMapper.getVRegs(0)); + SmallVector SrcRegs(OpdMapper.getVRegs(1)); + + // If the pointer is an SGPR, we have nothing to do. + if (SrcRegs.empty()) + return false; + + assert(LoadSize % MaxNonSmrdLoadSize == 0); + + // We want to get the repair instruction now, because it will help us + // determine which instruction the legalizer inserts that will also + // write to DstReg. + MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI); + + // RegBankSelect only emits scalar types, so we need to reset the pointer + // operand to a pointer type. + Register BasePtrReg = SrcRegs[0]; + LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); + MRI.setType(BasePtrReg, PtrTy); + + MachineIRBuilder B(MI); + + unsigned SplitElts = + MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits(); + const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType()); + ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank); + GISelObserverWrapper Observer(&O); + B.setChangeObserver(Observer); + LegalizerHelper Helper(B.getMF(), Observer, B); + if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) + return false; + + // At this point, the legalizer has split the original load into smaller + // loads. At the end of lowering, it inserts an instruction (LegalizedInst) + // that combines the outputs of the lower loads and writes it to DstReg. + // The register bank selector has also added the RepairInst which writes to + // DstReg as well. + + MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst); + + // Replace the output of the LegalizedInst with a temporary register, since + // RepairInst already defines DstReg. + Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg)); + LegalizedInst->getOperand(0).setReg(TmpReg); + B.setInsertPt(*RepairInst->getParent(), RepairInst); + + for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) { + Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); + B.buildConstant(IdxReg, DefIdx); + MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID)); + B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg); + } + + MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); + return true; +} + +// For cases where only a single copy is inserted for matching register banks. 
+// Replace the register in the instruction operand.
+static void substituteSimpleCopyRegs(
+  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
+  SmallVector SrcReg(OpdMapper.getVRegs(OpIdx));
+  if (!SrcReg.empty()) {
+    assert(SrcReg.size() == 1);
+    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
+  }
+}
+
+void AMDGPURegisterBankInfo::applyMappingImpl(
+    const OperandsMapper &OpdMapper) const {
+  MachineInstr &MI = OpdMapper.getMI();
+  unsigned Opc = MI.getOpcode();
+  MachineRegisterInfo &MRI = OpdMapper.getMRI();
+  switch (Opc) {
+  case AMDGPU::G_SELECT: {
+    Register DstReg = MI.getOperand(0).getReg();
+    LLT DstTy = MRI.getType(DstReg);
+    if (DstTy.getSizeInBits() != 64)
+      break;
+
+    LLT HalfTy = getHalfSizedType(DstTy);
+
+    SmallVector DefRegs(OpdMapper.getVRegs(0));
+    SmallVector Src0Regs(OpdMapper.getVRegs(1));
+    SmallVector Src1Regs(OpdMapper.getVRegs(2));
+    SmallVector Src2Regs(OpdMapper.getVRegs(3));
+
+    // All inputs are SGPRs, nothing special to do.
+    if (DefRegs.empty()) {
+      assert(Src1Regs.empty() && Src2Regs.empty());
+      break;
+    }
+
+    MachineIRBuilder B(MI);
+    if (Src0Regs.empty())
+      Src0Regs.push_back(MI.getOperand(1).getReg());
+    else {
+      assert(Src0Regs.size() == 1);
+    }
+
+    if (Src1Regs.empty())
+      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
+    else {
+      setRegsToType(MRI, Src1Regs, HalfTy);
+    }
+
+    if (Src2Regs.empty())
+      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
+    else
+      setRegsToType(MRI, Src2Regs, HalfTy);
+
+    setRegsToType(MRI, DefRegs, HalfTy);
+
+    B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
+    B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);
+
+    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
+    MI.eraseFromParent();
+    return;
+  }
+  case AMDGPU::G_AND:
+  case AMDGPU::G_OR:
+  case AMDGPU::G_XOR: {
+    // 64-bit AND is only available on the SALU, so split into 2 32-bit ops if
+    // there is a VGPR input.
+    Register DstReg = MI.getOperand(0).getReg();
+    LLT DstTy = MRI.getType(DstReg);
+    if (DstTy.getSizeInBits() != 64)
+      break;
+
+    LLT HalfTy = getHalfSizedType(DstTy);
+    SmallVector DefRegs(OpdMapper.getVRegs(0));
+    SmallVector Src0Regs(OpdMapper.getVRegs(1));
+    SmallVector Src1Regs(OpdMapper.getVRegs(2));
+
+    // All inputs are SGPRs, nothing special to do.
+    if (DefRegs.empty()) {
+      assert(Src0Regs.empty() && Src1Regs.empty());
+      break;
+    }
+
+    assert(DefRegs.size() == 2);
+    assert(Src0Regs.size() == Src1Regs.size() &&
+           (Src0Regs.empty() || Src0Regs.size() == 2));
+
+    // Depending on where the source registers came from, the generic code may
+    // have decided to split the inputs already or not. If not, we still need
+    // to extract the values.
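+    // Roughly, a 64-bit VALU op is rebuilt here as (sketch):
+    //   (lo0, hi0) = G_UNMERGE_VALUES %src0
+    //   (lo1, hi1) = G_UNMERGE_VALUES %src1
+    //   %dst_lo = G_AND %lo0, %lo1
+    //   %dst_hi = G_AND %hi0, %hi1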
+ MachineIRBuilder B(MI); + + if (Src0Regs.empty()) + split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); + else + setRegsToType(MRI, Src0Regs, HalfTy); + + if (Src1Regs.empty()) + split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); + else + setRegsToType(MRI, Src1Regs, HalfTy); + + setRegsToType(MRI, DefRegs, HalfTy); + + B.buildInstr(Opc) + .addDef(DefRegs[0]) + .addUse(Src0Regs[0]) + .addUse(Src1Regs[0]); + + B.buildInstr(Opc) + .addDef(DefRegs[1]) + .addUse(Src0Regs[1]) + .addUse(Src1Regs[1]); + + MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); + MI.eraseFromParent(); + return; + } + case AMDGPU::G_ADD: + case AMDGPU::G_SUB: + case AMDGPU::G_MUL: { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy != LLT::scalar(16)) + break; + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + if (DstBank == &AMDGPU::VGPRRegBank) + break; + + // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. + MachineFunction *MF = MI.getParent()->getParent(); + MachineIRBuilder B(MI); + ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); + GISelObserverWrapper Observer(&ApplySALU); + LegalizerHelper Helper(*MF, Observer, B); + + if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != + LegalizerHelper::Legalized) + llvm_unreachable("widen scalar should have succeeded"); + return; + } + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: { + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + if (DstBank == &AMDGPU::VGPRRegBank) + break; + + MachineFunction *MF = MI.getParent()->getParent(); + MachineIRBuilder B(MI); + ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); + GISelObserverWrapper Observer(&ApplySALU); + LegalizerHelper Helper(*MF, Observer, B); + + // Turn scalar min/max into a compare and select. + LLT Ty = MRI.getType(DstReg); + LLT S32 = LLT::scalar(32); + LLT S16 = LLT::scalar(16); + + if (Ty == S16) { + // Need to widen to s32, and expand as cmp + select. + if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) + llvm_unreachable("widenScalar should have succeeded"); + + // FIXME: This is relying on widenScalar leaving MI in place. + if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized) + llvm_unreachable("lower should have succeeded"); + } else { + if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized) + llvm_unreachable("lower should have succeeded"); + } + + return; + } + case AMDGPU::G_SEXT: + case AMDGPU::G_ZEXT: { + Register SrcReg = MI.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + bool Signed = Opc == AMDGPU::G_SEXT; + + MachineIRBuilder B(MI); + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy.isScalar() && + SrcBank != &AMDGPU::SGPRRegBank && + SrcBank != &AMDGPU::SCCRegBank && + SrcBank != &AMDGPU::VCCRegBank && + // FIXME: Should handle any type that round to s64 when irregular + // breakdowns supported. + DstTy.getSizeInBits() == 64 && + SrcTy.getSizeInBits() <= 32) { + const LLT S32 = LLT::scalar(32); + SmallVector DefRegs(OpdMapper.getVRegs(0)); + + // Extend to 32-bit, and then extend the low half. + if (Signed) { + // TODO: Should really be buildSExtOrCopy + B.buildSExtOrTrunc(DefRegs[0], SrcReg); + + // Replicate sign bit from 32-bit extended part. 
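+        // i.e. hi = ashr(lo, 31), so every bit of the high half is a copy of
+        // the 32-bit result's sign bit.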
+ auto ShiftAmt = B.buildConstant(S32, 31); + MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); + B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt); + } else { + B.buildZExtOrTrunc(DefRegs[0], SrcReg); + B.buildConstant(DefRegs[1], 0); + } + + MRI.setRegBank(DstReg, *SrcBank); + MI.eraseFromParent(); + return; + } + + if (SrcTy != LLT::scalar(1)) + return; + + if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) { + SmallVector DefRegs(OpdMapper.getVRegs(0)); + + const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ? + &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank; + + unsigned DstSize = DstTy.getSizeInBits(); + // 64-bit select is SGPR only + const bool UseSel64 = DstSize > 32 && + SrcBank->getID() == AMDGPU::SCCRegBankID; + + // TODO: Should s16 select be legal? + LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); + auto True = B.buildConstant(SelType, Signed ? -1 : 1); + auto False = B.buildConstant(SelType, 0); + + MRI.setRegBank(True.getReg(0), *DstBank); + MRI.setRegBank(False.getReg(0), *DstBank); + MRI.setRegBank(DstReg, *DstBank); + + if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) { + B.buildSelect(DefRegs[0], SrcReg, True, False); + B.buildCopy(DefRegs[1], DefRegs[0]); + } else if (DstSize < 32) { + auto Sel = B.buildSelect(SelType, SrcReg, True, False); + MRI.setRegBank(Sel.getReg(0), *DstBank); + B.buildTrunc(DstReg, Sel); + } else { + B.buildSelect(DstReg, SrcReg, True, False); + } + + MI.eraseFromParent(); + return; + } + + // Fixup the case with an s1 src that isn't a condition register. Use shifts + // instead of introducing a compare to avoid an unnecessary condition + // register (and since there's no scalar 16-bit compares). + auto Ext = B.buildAnyExt(DstTy, SrcReg); + auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1); + auto Shl = B.buildShl(DstTy, Ext, ShiftAmt); + + if (MI.getOpcode() == AMDGPU::G_SEXT) + B.buildAShr(DstReg, Shl, ShiftAmt); + else + B.buildLShr(DstReg, Shl, ShiftAmt); + + MRI.setRegBank(DstReg, *SrcBank); + MRI.setRegBank(Ext.getReg(0), *SrcBank); + MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); + MRI.setRegBank(Shl.getReg(0), *SrcBank); + MI.eraseFromParent(); + return; + } + case AMDGPU::G_EXTRACT_VECTOR_ELT: + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + case AMDGPU::G_INTRINSIC: { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + case Intrinsic::amdgcn_s_buffer_load: { + // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS + executeInWaterfallLoop(MI, MRI, { 2, 3 }); + return; + } + case Intrinsic::amdgcn_readlane: { + substituteSimpleCopyRegs(OpdMapper, 2); + + assert(empty(OpdMapper.getVRegs(0))); + assert(empty(OpdMapper.getVRegs(3))); + + // Make sure the index is an SGPR. It doesn't make sense to run this in a + // waterfall loop, so assume it's a uniform value. 
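+      // A readfirstlane of a value that is already uniform just changes the
+      // bank, so this is safe under that assumption.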
+ constrainOpWithReadfirstlane(MI, MRI, 3); // Index + return; + } + case Intrinsic::amdgcn_writelane: { + assert(empty(OpdMapper.getVRegs(0))); + assert(empty(OpdMapper.getVRegs(2))); + assert(empty(OpdMapper.getVRegs(3))); + + substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val + constrainOpWithReadfirstlane(MI, MRI, 2); // Source value + constrainOpWithReadfirstlane(MI, MRI, 3); // Index + return; + } + default: + break; + } + break; + } + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + case Intrinsic::amdgcn_buffer_load: { + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + } + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: { + // This is only allowed to execute with 1 lane, so readfirstlane is safe. + assert(empty(OpdMapper.getVRegs(0))); + substituteSimpleCopyRegs(OpdMapper, 3); + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } + case Intrinsic::amdgcn_s_sendmsg: + case Intrinsic::amdgcn_s_sendmsghalt: { + // FIXME: Should this use a waterfall loop? + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } + default: + break; + } + break; + } + case AMDGPU::G_LOAD: { + if (applyMappingWideLoad(MI, OpdMapper, MRI)) + return; + break; + } + default: + break; + } + + return applyDefaultMapping(OpdMapper); } bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { @@ -259,7 +1369,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { if (!MI.getOperand(i).isReg()) continue; - unsigned Reg = MI.getOperand(i).getReg(); + Register Reg = MI.getOperand(i).getReg(); if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { if (Bank->getID() == AMDGPU::VGPRRegBankID) return false; @@ -299,7 +1409,7 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { if (MI.getOperand(OpdIdx).isIntrinsicID()) OpdsMapping[OpdIdx++] = nullptr; - unsigned Reg1 = MI.getOperand(OpdIdx).getReg(); + Register Reg1 = MI.getOperand(OpdIdx).getReg(); unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI); unsigned DefaultBankID = Size1 == 1 ? @@ -309,7 +1419,11 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1); for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) { - unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI); + const MachineOperand &MO = MI.getOperand(OpdIdx); + if (!MO.isReg()) + continue; + + unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI); unsigned BankID = Size == 1 ? 
AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size); } @@ -325,7 +1439,11 @@ AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { SmallVector OpdsMapping(MI.getNumOperands()); for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - unsigned Size = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); + const MachineOperand &Op = MI.getOperand(I); + if (!Op.isReg()) + continue; + + unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } @@ -340,6 +1458,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector OpdsMapping(MI.getNumOperands()); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); const ValueMapping *ValMapping; @@ -350,7 +1469,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); } else { - ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); // FIXME: What would happen if we used SGPRRegBankID here? PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); } @@ -366,7 +1485,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { } unsigned -AMDGPURegisterBankInfo::getRegBankID(unsigned Reg, +AMDGPURegisterBankInfo::getRegBankID(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, unsigned Default) const { @@ -383,13 +1502,81 @@ AMDGPURegisterBankInfo::getRegBankID(unsigned Reg, /// const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { - const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (MI.isRegSequence()) { + // If any input is a VGPR, the result must be a VGPR. The default handling + // assumes any copy between banks is legal. + unsigned BankID = AMDGPU::SGPRRegBankID; + + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI); + // It doesn't make sense to use vcc or scc banks here, so just ignore + // them. + if (OpBank != AMDGPU::SGPRRegBankID) { + BankID = AMDGPU::VGPRRegBankID; + break; + } + } + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + + const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); + return getInstructionMapping( + 1, /*Cost*/ 1, + /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); + } + + // The default handling is broken and doesn't handle illegal SGPR->VGPR copies + // properly. + // + // TODO: There are additional exec masking dependencies to analyze. + if (MI.getOpcode() == TargetOpcode::G_PHI) { + // TODO: Generate proper invalid bank enum. + int ResultBank = -1; + + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + unsigned Reg = MI.getOperand(I).getReg(); + const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); + + // FIXME: Assuming VGPR for any undetermined inputs. 
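+      // Any VGPR input forces a VGPR result, since a value that may differ
+      // per lane cannot be held in an SGPR.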
+      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
+        ResultBank = AMDGPU::VGPRRegBankID;
+        break;
+      }
+
+      unsigned OpBank = Bank->getID();
+      // scc, scc -> sgpr
+      if (OpBank == AMDGPU::SCCRegBankID) {
+        // There's only one SCC register, so a phi requires copying to SGPR.
+        OpBank = AMDGPU::SGPRRegBankID;
+      } else if (OpBank == AMDGPU::VCCRegBankID) {
+        // vcc, vcc -> vcc
+        // vcc, sgpr -> vgpr
+        if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
+          ResultBank = AMDGPU::VGPRRegBankID;
+          break;
+        }
+      }
+
+      ResultBank = OpBank;
+    }
+
+    assert(ResultBank != -1);
+
+    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+
+    const ValueMapping &ValMap =
+        getValueMapping(0, Size, getRegBank(ResultBank));
+    return getInstructionMapping(
+        1, /*Cost*/ 1,
+        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
+  }
+
+  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
   if (Mapping.isValid())
     return Mapping;

-  const MachineFunction &MF = *MI.getParent()->getParent();
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
   SmallVector OpdsMapping(MI.getNumOperands());

   switch (MI.getOpcode()) {
@@ -401,18 +1588,86 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_XOR: {
     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     if (Size == 1) {
-      OpdsMapping[0] = OpdsMapping[1] =
-        OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+      const RegisterBank *DstBank
+        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
+
+      unsigned TargetBankID = -1;
+      unsigned BankLHS = -1;
+      unsigned BankRHS = -1;
+      if (DstBank) {
+        TargetBankID = DstBank->getID();
+        if (DstBank == &AMDGPU::VCCRegBank) {
+          TargetBankID = AMDGPU::VCCRegBankID;
+          BankLHS = AMDGPU::VCCRegBankID;
+          BankRHS = AMDGPU::VCCRegBankID;
+        } else if (DstBank == &AMDGPU::SCCRegBank) {
+          TargetBankID = AMDGPU::SCCRegBankID;
+          BankLHS = AMDGPU::SGPRRegBankID;
+          BankRHS = AMDGPU::SGPRRegBankID;
+        } else {
+          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+                                 AMDGPU::SGPRRegBankID);
+          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+                                 AMDGPU::SGPRRegBankID);
+        }
+      } else {
+        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+                               AMDGPU::VCCRegBankID);
+        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+                               AMDGPU::VCCRegBankID);
+
+        // Both inputs should be true booleans to produce a boolean result.
+        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
+          TargetBankID = AMDGPU::VGPRRegBankID;
+        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
+          TargetBankID = AMDGPU::VCCRegBankID;
+          BankLHS = AMDGPU::VCCRegBankID;
+          BankRHS = AMDGPU::VCCRegBankID;
+        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
+          TargetBankID = AMDGPU::SGPRRegBankID;
+        } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
+          // The operation must be done on a 32-bit register, but it will set
+          // scc. The result type could interchangeably be SCC or SGPR, since
+          // both values will be produced.
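+          // (e.g. s_and_b32 writes its SGPR destination and also sets SCC to
+          // whether the result is nonzero.)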
+ TargetBankID = AMDGPU::SCCRegBankID; + BankLHS = AMDGPU::SGPRRegBankID; + BankRHS = AMDGPU::SGPRRegBankID; + } + } + + OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); + OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); + break; + } + + if (Size == 64) { + + if (isSALUMapping(MI)) { + OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); + OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; + } else { + OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); + unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); + + unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); + } + break; } LLVM_FALLTHROUGH; } + case AMDGPU::G_GEP: case AMDGPU::G_ADD: case AMDGPU::G_SUB: case AMDGPU::G_MUL: case AMDGPU::G_SHL: + case AMDGPU::G_LSHR: + case AMDGPU::G_ASHR: case AMDGPU::G_UADDO: case AMDGPU::G_SADDO: case AMDGPU::G_USUBO: @@ -421,6 +1676,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: + case AMDGPU::G_UMULH: + case AMDGPU::G_SMULH: + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); LLVM_FALLTHROUGH; @@ -431,11 +1692,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FPTOUI: case AMDGPU::G_FMUL: case AMDGPU::G_FMA: + case AMDGPU::G_FSQRT: case AMDGPU::G_SITOFP: case AMDGPU::G_UITOFP: case AMDGPU::G_FPTRUNC: + case AMDGPU::G_FPEXT: case AMDGPU::G_FEXP2: case AMDGPU::G_FLOG2: + case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_INTRINSIC_TRUNC: case AMDGPU::G_INTRINSIC_ROUND: return getDefaultMappingVOP(MI); @@ -473,7 +1737,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = nullptr; break; } - case AMDGPU::G_MERGE_VALUES: { + case AMDGPU::G_MERGE_VALUES: + case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_CONCAT_VECTORS: { unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -502,8 +1768,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_TRUNC: { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); unsigned Bank = getRegBankID(Src, MRI, *TRI); unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); @@ -514,23 +1780,35 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ZEXT: case AMDGPU::G_SEXT: case AMDGPU::G_ANYEXT: { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); - unsigned SrcBank = getRegBankID(Src, MRI, *TRI, - SrcSize == 1 ? 
AMDGPU::SGPRRegBankID : - AMDGPU::VGPRRegBankID); - unsigned DstBank = SrcBank; - if (SrcSize == 1) { - if (SrcBank == AMDGPU::SGPRRegBankID) - DstBank = AMDGPU::VGPRRegBankID; - else - DstBank = AMDGPU::SGPRRegBankID; - } - - OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); - OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank, SrcSize); + + unsigned DstBank; + const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); + assert(SrcBank); + switch (SrcBank->getID()) { + case AMDGPU::SCCRegBankID: + case AMDGPU::SGPRRegBankID: + DstBank = AMDGPU::SGPRRegBankID; + break; + default: + DstBank = AMDGPU::VGPRRegBankID; + break; + } + + // TODO: Should anyext be split into 32-bit part as well? + if (MI.getOpcode() == AMDGPU::G_ANYEXT) { + OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize); + } else { + // Scalar extend can use 64-bit BFE, but VGPRs require extending to + // 32-bits, and then to 64. + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); + OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), + SrcSize); + } break; } case AMDGPU::G_FCMP: { @@ -542,16 +1820,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); break; } - case AMDGPU::G_GEP: { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isReg()) - continue; - - unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits(); - OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); - } - break; - } case AMDGPU::G_STORE: { assert(MI.getOperand(0).isReg()); unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -571,57 +1839,55 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_ICMP: { + auto Pred = static_cast(MI.getOperand(1).getPredicate()); unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); - unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID && - Op3Bank == AMDGPU::SGPRRegBankID ? - AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; + + bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID && + Op3Bank == AMDGPU::SGPRRegBankID && + (Size == 32 || (Size == 64 && + (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && + MF.getSubtarget().hasScalarCompareEq64())); + + unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; + OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1); OpdsMapping[1] = nullptr; // Predicate Operand. OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size); break; } - - case AMDGPU::G_EXTRACT_VECTOR_ELT: { - unsigned IdxOp = 2; - int64_t Imm; - // XXX - Do we really need to fully handle these? The constant case should - // be legalized away before RegBankSelect? - - unsigned OutputBankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ? + unsigned OutputBankID = isSALUMapping(MI) ? 
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; - + unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits()); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(1).getReg()).getSizeInBits()); + + OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); + OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); // The index can be either if the source vector is VGPR. - OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits()); + OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); break; } case AMDGPU::G_INSERT_VECTOR_ELT: { - // XXX - Do we really need to fully handle these? The constant case should - // be legalized away before RegBankSelect? - - int64_t Imm; - - unsigned IdxOp = MI.getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT ? 2 : 3; - unsigned BankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ? - AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; - - + unsigned OutputBankID = isSALUMapping(MI) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; - // TODO: Can do SGPR indexing, which would obviate the need for the - // isConstant check. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); - OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); - } + unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); + unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); + OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); + OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); + // The index can be either if the source vector is VGPR. 
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); break; } case AMDGPU::G_UNMERGE_VALUES: { @@ -637,14 +1903,70 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_INTRINSIC: { - switch (MI.getOperand(1).getIntrinsicID()) { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { default: return getInvalidInstructionMapping(); case Intrinsic::maxnum: case Intrinsic::minnum: + case Intrinsic::amdgcn_div_fmas: + case Intrinsic::amdgcn_trig_preop: + case Intrinsic::amdgcn_sin: + case Intrinsic::amdgcn_cos: + case Intrinsic::amdgcn_log_clamp: + case Intrinsic::amdgcn_rcp: + case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_rsq: + case Intrinsic::amdgcn_rsq_legacy: + case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_ldexp: + case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_frexp_exp: + case Intrinsic::amdgcn_fract: case Intrinsic::amdgcn_cvt_pkrtz: + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: + case Intrinsic::amdgcn_fmed3: + case Intrinsic::amdgcn_cubeid: + case Intrinsic::amdgcn_cubema: + case Intrinsic::amdgcn_cubesc: + case Intrinsic::amdgcn_cubetc: + case Intrinsic::amdgcn_sffbh: + case Intrinsic::amdgcn_fmad_ftz: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_ubfe: + case Intrinsic::amdgcn_sbfe: + case Intrinsic::amdgcn_lerp: + case Intrinsic::amdgcn_sad_u8: + case Intrinsic::amdgcn_msad_u8: + case Intrinsic::amdgcn_sad_hi_u8: + case Intrinsic::amdgcn_sad_u16: + case Intrinsic::amdgcn_qsad_pk_u16_u8: + case Intrinsic::amdgcn_mqsad_pk_u16_u8: + case Intrinsic::amdgcn_mqsad_u32_u8: + case Intrinsic::amdgcn_cvt_pk_u8_f32: + case Intrinsic::amdgcn_alignbit: + case Intrinsic::amdgcn_alignbyte: + case Intrinsic::amdgcn_fdot2: + case Intrinsic::amdgcn_sdot2: + case Intrinsic::amdgcn_udot2: + case Intrinsic::amdgcn_sdot4: + case Intrinsic::amdgcn_udot4: + case Intrinsic::amdgcn_sdot8: + case Intrinsic::amdgcn_udot8: + case Intrinsic::amdgcn_fdiv_fast: + case Intrinsic::amdgcn_wwm: + case Intrinsic::amdgcn_wqm: return getDefaultMappingVOP(MI); - case Intrinsic::amdgcn_kernarg_segment_ptr: { + case Intrinsic::amdgcn_ds_permute: + case Intrinsic::amdgcn_ds_bpermute: + case Intrinsic::amdgcn_update_dpp: + return getDefaultMappingAllVGPR(MI); + case Intrinsic::amdgcn_kernarg_segment_ptr: + case Intrinsic::amdgcn_s_getpc: + case Intrinsic::amdgcn_groupstaticsize: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; @@ -652,16 +1974,142 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wqm_vote: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = OpdsMapping[2] - = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); + break; + } + case Intrinsic::amdgcn_s_buffer_load: { + // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS + Register RSrc = MI.getOperand(2).getReg(); // SGPR + Register Offset = MI.getOperand(3).getReg(); // SGPR/imm + + unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); + unsigned Size3 = MRI.getType(Offset).getSizeInBits(); + + unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); + unsigned OffsetBank = 
getRegBankID(Offset, MRI, *TRI); + + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0); + OpdsMapping[1] = nullptr; // intrinsic id + + // Lie and claim everything is legal, even though some need to be + // SGPRs. applyMapping will have to deal with it as a waterfall loop. + OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc + OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3); + OpdsMapping[4] = nullptr; + break; + } + case Intrinsic::amdgcn_div_scale: { + unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); + + unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); + OpdsMapping[3] = AMDGPU::getValueMapping( + getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize); + OpdsMapping[4] = AMDGPU::getValueMapping( + getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize); + + break; + } + case Intrinsic::amdgcn_class: { + Register Src0Reg = MI.getOperand(2).getReg(); + Register Src1Reg = MI.getOperand(3).getReg(); + unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); + unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); + OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI), + Src0Size); + OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI), + Src1Size); + break; + } + case Intrinsic::amdgcn_icmp: + case Intrinsic::amdgcn_fcmp: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + // This is not VCCRegBank because this is not used in boolean contexts. + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); + unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize); + OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize); + break; + } + case Intrinsic::amdgcn_readlane: { + // This must be an SGPR, but accept a VGPR. 
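+      // A divergent VGPR index is legalized in applyMappingImpl via
+      // constrainOpWithReadfirstlane rather than rejected here.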
+ unsigned IdxReg = MI.getOperand(3).getReg(); + unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); + unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + LLVM_FALLTHROUGH; + } + case Intrinsic::amdgcn_readfirstlane: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); + break; + } + case Intrinsic::amdgcn_writelane: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SrcReg = MI.getOperand(2).getReg(); + unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); + unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); + unsigned IdxReg = MI.getOperand(3).getReg(); + unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); + unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + + // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted + // to legalize. + OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); + break; + } + case Intrinsic::amdgcn_if_break: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } } break; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - switch (MI.getOperand(0).getIntrinsicID()) { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { default: return getInvalidInstructionMapping(); + case Intrinsic::amdgcn_s_getreg: + case Intrinsic::amdgcn_s_memtime: + case Intrinsic::amdgcn_s_memrealtime: + case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + case Intrinsic::amdgcn_ds_append: + case Intrinsic::amdgcn_ds_consume: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + return getDefaultMappingAllVGPR(MI); + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + break; + } case Intrinsic::amdgcn_exp_compr: OpdsMapping[0] = nullptr; // IntrinsicID // FIXME: These are immediate values which can't be read from registers. 
@@ -688,24 +2136,82 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; + case Intrinsic::amdgcn_buffer_load: { + Register RSrc = MI.getOperand(2).getReg(); // SGPR + Register VIndex = MI.getOperand(3).getReg(); // VGPR + Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm + + unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); + unsigned Size3 = MRI.getType(VIndex).getSizeInBits(); + unsigned Size4 = MRI.getType(Offset).getSizeInBits(); + + unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); + unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); + + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); + OpdsMapping[1] = nullptr; // intrinsic id + + // Lie and claim everything is legal, even though some need to be + // SGPRs. applyMapping will have to deal with it as a waterfall loop. + OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3); + OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4); + OpdsMapping[5] = nullptr; + OpdsMapping[6] = nullptr; + break; + } + case Intrinsic::amdgcn_s_sendmsg: + case Intrinsic::amdgcn_s_sendmsghalt: { + // This must be an SGPR, but accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); + break; + } + case Intrinsic::amdgcn_end_cf: { + unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } } break; } case AMDGPU::G_SELECT: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - unsigned Op1Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, + unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, AMDGPU::SGPRRegBankID); - unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); - bool SGPRSrcs = Op1Bank == AMDGPU::SCCRegBankID && - Op2Bank == AMDGPU::SGPRRegBankID && + unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && Op3Bank == AMDGPU::SGPRRegBankID; - unsigned Bank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; - Op1Bank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; - OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); - OpdsMapping[1] = AMDGPU::getValueMapping(Op1Bank, 1); - OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); - OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); + + unsigned CondBankDefault = SGPRSrcs ? + AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; + unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, + CondBankDefault); + if (CondBank == AMDGPU::SGPRRegBankID) + CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; + else if (CondBank == AMDGPU::VGPRRegBankID) + CondBank = AMDGPU::VCCRegBankID; + + unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ? 
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + + assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID); + + if (Size == 64) { + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); + OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); + OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); + OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); + } else { + OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); + OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); + } + break; } @@ -737,6 +2243,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } } - return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), + return getInstructionMapping(/*ID*/1, /*Cost*/1, + getOperandsMapping(OpdsMapping), MI.getNumOperands()); } + diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index d29f4bc79a51..f3a96e2a6128 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -1,9 +1,8 @@ //===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -14,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS @@ -22,6 +22,8 @@ namespace llvm { +class LLT; +class MachineIRBuilder; class SIRegisterInfo; class TargetRegisterInfo; @@ -36,16 +38,53 @@ protected: class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const SIRegisterInfo *TRI; + void executeInWaterfallLoop(MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef OpIndices) const; + + void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI, + unsigned OpIdx) const; + bool applyMappingWideLoad(MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI) const; + /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; const RegisterBankInfo::InstructionMapping & getInstrMappingForLoad(const MachineInstr &MI) const; - unsigned getRegBankID(unsigned Reg, const MachineRegisterInfo &MRI, + unsigned getRegBankID(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, unsigned Default = AMDGPU::VGPRRegBankID) const; + /// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p + /// Regs. This appropriately sets the regbank of the new registers. 
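+  /// For example, an s64 value becomes two s32 halves via G_UNMERGE_VALUES.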
+  void split64BitValueForMapping(MachineIRBuilder &B,
+                                 SmallVector &Regs,
+                                 LLT HalfTy,
+                                 Register Reg) const;
+
+  template <unsigned NumOps>
+  struct OpRegBankEntry {
+    int8_t RegBanks[NumOps];
+    int16_t Cost;
+  };
+
+  template <unsigned NumOps>
+  InstructionMappings
+  addMappingFromTable(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+                      const std::array<unsigned, NumOps> RegSrcOpIdx,
+                      ArrayRef<OpRegBankEntry<NumOps>> Table) const;
+
+  RegisterBankInfo::InstructionMappings
+  getInstrAlternativeMappingsIntrinsic(
+      const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
+
+  RegisterBankInfo::InstructionMappings
+  getInstrAlternativeMappingsIntrinsicWSideEffects(
+      const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
+
   bool isSALUMapping(const MachineInstr &MI) const;
   const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
   const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
@@ -57,6 +96,9 @@ public:
   unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
                     unsigned Size) const override;

+  unsigned getBreakDownCost(const ValueMapping &ValMapping,
+                            const RegisterBank *CurBank = nullptr) const override;
+
   const RegisterBank &
   getRegBankFromRegClass(const TargetRegisterClass &RC) const override;

diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 570379a820e1..9555694fb106 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -1,9 +1,8 @@
 //=- AMDGPURegisterBank.td - Describe the AMDGPU Banks -------*- tablegen -*-=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

@@ -15,7 +14,7 @@ def VGPRRegBank : RegisterBank<"VGPR",
   [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
 >;

-def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS]>;
+def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>;

 // It is helpful to distinguish conditions from ordinary SGPRs.
 def VCCRegBank : RegisterBank <"VCC", [SReg_64]>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 50f859addc2b..7cffdf1a4dcf 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -32,7 +31,10 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, - AMDGPU::sub15 + AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24, + AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29, + AMDGPU::sub30, AMDGPU::sub31 }; assert(Channel < array_lengthof(SubRegs)); @@ -83,7 +85,18 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, } } -unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const SIFrameLowering *TFI = + MF.getSubtarget<GCNSubtarget>().getFrameLowering(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - return FuncInfo->getFrameOffsetReg(); + return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() + : FuncInfo->getStackPtrOffsetReg(); +} + +const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { + return CSR_AMDGPU_AllVGPRs_RegMask; +} + +const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { + return CSR_AMDGPU_AllAllocatableSRegs_RegMask; } diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 922d974f2ebd..3453a8c1b0b3 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -1,9 +1,8 @@ //===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td index ceabae524414..ab71b7aa8a57 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -1,9 +1,8 @@ //===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,7 +12,7 @@ let Namespace = "AMDGPU" in { -foreach Index = 0-15 in { +foreach Index = 0-31 in { def sub#Index : SubRegIndex<32, !shl(Index, 5)>; } diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index efe501cb73c2..4f095087a57f 100644 --- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -1,9 +1,8 @@ //===- AMDGPURewriteOutArgumentsPass.cpp - Create struct returns ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 9dbd7751b4d8..f8703c36127a 100644 --- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -1,9 +1,8 @@ //===-- AMDGPUSearchableTables.td - ------------------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -49,6 +48,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -70,8 +71,59 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; + +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : 
SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; foreach intr = AMDGPUImageDimAtomicIntrinsics in def : SourceOfDivergence; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index ed0cc70c3d9a..1eb9b83456c5 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -41,12 +40,17 @@ using namespace llvm; #undef AMDGPUSubtarget #include "R600GenSubtargetInfo.inc" +static cl::opt<bool> DisablePowerSched( + "amdgpu-disable-power-sched", + cl::desc("Disable scheduling to minimize MAI power bursts"), + cl::init(false)); + GCNSubtarget::~GCNSubtarget() = default; R600Subtarget & R600Subtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { - SmallString<256> FullFS("+promote-alloca,+dx10-clamp,"); + SmallString<256> FullFS("+promote-alloca,"); FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); @@ -65,7 +69,7 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT, GCNSubtarget & GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS) { + StringRef GPU, StringRef FS) { // Determine default and user-specified characteristics // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be // enabled, but some instructions do not respect them and they run at the @@ -78,10 +82,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // Similarly we want enable-prt-strict-null to be on by default and not to // unset everything else if it is disabled - SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,"); + // Assuming ECC is enabled is the conservative default. + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. - FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,"; + FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; // FIXME: I don't think Evergreen has any useful support for // denormals, but should be checked. Should we issue a warning somewhere @@ -94,6 +99,16 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS + // Disable mutually exclusive bits. + if (FS.find_lower("+wavefrontsize") != StringRef::npos) { + if (FS.find_lower("wavefrontsize16") == StringRef::npos) + FullFS += "-wavefrontsize16,"; + if (FS.find_lower("wavefrontsize32") == StringRef::npos) + FullFS += "-wavefrontsize32,"; + if (FS.find_lower("wavefrontsize64") == StringRef::npos) + FullFS += "-wavefrontsize64,"; + } + FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); @@ -124,8 +139,25 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, HasMovrel = true; } + // Don't crash on invalid devices.
+ if (WavefrontSize == 0) + WavefrontSize = 64; + HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + if (DoesNotSupportXNACK && EnableXNACK) { + ToggleFeature(AMDGPU::FeatureXNACK); + EnableXNACK = false; + } + + // ECC is on by default, but turn it off if the hardware doesn't support it + // anyway. This matters for the gfx9 targets that have d16 loads but don't + // support ECC. + if (DoesNotSupportSRAMECC && EnableSRAMECC) { + ToggleFeature(AMDGPU::FeatureSRAMECC); + EnableSRAMECC = false; + } + return *this; } @@ -152,8 +184,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, AMDGPUGenSubtargetInfo(TT, GPU, FS), AMDGPUSubtarget(TT), TargetTriple(TT), - Gen(SOUTHERN_ISLANDS), - IsaVersion(ISAVersion0_0_0), + Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS), InstrItins(getInstrItineraryForCPU(GPU)), LDSBankCount(0), MaxPrivateElementSize(0), @@ -162,7 +193,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HalfRate64Ops(false), FP64FP16Denormals(false), - DX10Clamp(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), CodeObjectV3(false), @@ -171,11 +201,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasApertureRegs(false), EnableXNACK(false), + DoesNotSupportXNACK(false), + EnableCuMode(false), TrapHandler(false), - DebuggerInsertNops(false), - DebuggerEmitPrologue(false), - EnableHugePrivateBuffer(false), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), @@ -186,8 +215,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FP64(false), GCN3Encoding(false), CIInsts(false), - VIInsts(false), + GFX8Insts(false), GFX9Insts(false), + GFX10Insts(false), + GFX7GFX8GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), HasIntClamp(false), @@ -202,19 +233,47 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWAMac(false), HasSDWAOutModsVOPC(false), HasDPP(false), + HasDPP8(false), HasR128A16(false), + HasNSAEncoding(false), HasDLInsts(false), - HasDotInsts(false), + HasDot1Insts(false), + HasDot2Insts(false), + HasDot3Insts(false), + HasDot4Insts(false), + HasDot5Insts(false), + HasDot6Insts(false), + HasMAIInsts(false), + HasPkFmacF16Inst(false), + HasAtomicFaddInsts(false), EnableSRAMECC(false), + DoesNotSupportSRAMECC(false), + HasNoSdstCMPX(false), + HasVscnt(false), + HasRegisterBanking(false), + HasVOP3Literal(false), + HasNoDataDepHazard(false), FlatAddressSpace(false), FlatInstOffsets(false), FlatGlobalInsts(false), FlatScratchInsts(false), + ScalarFlatScratchInsts(false), AddNoCarryInsts(false), HasUnpackedD16VMem(false), + LDSMisalignedBug(false), ScalarizeGlobal(false), + HasVcmpxPermlaneHazard(false), + HasVMEMtoScalarWriteHazard(false), + HasSMEMtoVectorWriteHazard(false), + HasInstFwdPrefetchBug(false), + HasVcmpxExecWARHazard(false), + HasLdsBranchVmemWARHazard(false), + HasNSAtoVMEMBug(false), + HasOffset3fBug(false), + HasFlatSegmentOffsetBug(false), + FeatureDisable(false), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), @@ -226,12 +285,34 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } +unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { + if (getGeneration() < GFX10) + return 1; + + switch (Opcode) { + case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHL_B64: + case AMDGPU::V_LSHRREV_B64: +
case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHR_B64: + case AMDGPU::V_ASHRREV_I64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHR_I64: + return 1; + } + + return 2; +} + unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, const Function &F) const { if (NWaves == 1) return getLocalMemorySize(); unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + if (!WorkGroupsPerCu) + return 0; unsigned MaxWaves = getMaxWavesPerEU(); return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } @@ -240,6 +321,8 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &F) const { unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + if (!WorkGroupsPerCu) + return 0; unsigned MaxWaves = getMaxWavesPerEU(); unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); @@ -260,7 +343,8 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: - return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4); + return std::make_pair(getWavefrontSize() * 2, + std::max(getWavefrontSize() * 4, 256u)); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_LS: case CallingConv::AMDGPU_HS: @@ -280,12 +364,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( std::pair<unsigned, unsigned> Default = getDefaultFlatWorkGroupSize(F.getCallingConv()); - // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa - // starts using "amdgpu-flat-work-group-size" attribute. - Default.second = AMDGPU::getIntegerAttribute( - F, "amdgpu-max-work-group-size", Default.second); - Default.first = std::min(Default.first, Default.second); - // Requested minimum/maximum flat work group sizes. std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( F, "amdgpu-flat-work-group-size", Default); @@ -319,10 +397,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( getMaxWavesPerEU(FlatWorkGroupSizes.second); bool RequestedFlatWorkGroupSize = false; - // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa - // starts using "amdgpu-flat-work-group-size" attribute.
- if (F.hasFnAttribute("amdgpu-max-work-group-size") || - F.hasFnAttribute("amdgpu-flat-work-group-size")) { + if (F.hasFnAttribute("amdgpu-flat-work-group-size")) { Default.first = MinImpliedByFlatWorkGroupSize; RequestedFlatWorkGroupSize = true; } @@ -460,7 +535,6 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, FMA(false), CaymanISA(false), CFALUBug(false), - DX10Clamp(false), HasVertexCache(false), R600ALUInst(false), FP64(false), @@ -486,7 +560,14 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackLaneMasks = true; } +bool GCNSubtarget::hasMadF16() const { + return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1; +} + unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 10; + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) return 10; @@ -533,6 +614,9 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. + if (MFI.hasFlatScratchInit()) { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). @@ -631,9 +715,7 @@ struct MemOpClusterMutation : ScheduleDAGMutation { MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} - void apply(ScheduleDAGInstrs *DAGInstrs) override { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - + void apply(ScheduleDAGInstrs *DAG) override { SUnit *SUa = nullptr; // Search for two consecutive memory operations and link them // to prevent the scheduler from moving them apart. @@ -674,11 +756,130 @@ struct MemOpClusterMutation : ScheduleDAGMutation { } } }; + +struct FillMFMAShadowMutation : ScheduleDAGMutation { + const SIInstrInfo *TII; + + ScheduleDAGMI *DAG; + + FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {} + + bool isSALU(const SUnit *SU) const { + const MachineInstr *MI = SU->getInstr(); + return MI && TII->isSALU(*MI) && !MI->isTerminator(); + } + + bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const { + if (Pred->NodeNum < Succ->NodeNum) + return true; + + SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred}); + + for (unsigned I = 0; I < Succs.size(); ++I) { + for (const SDep &SI : Succs[I]->Succs) { + const SUnit *SU = SI.getSUnit(); + if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end()) + Succs.push_back(SU); + } + } + + SmallPtrSet<const SUnit*, 32> Visited; + while (!Preds.empty()) { + const SUnit *SU = Preds.pop_back_val(); + if (llvm::find(Succs, SU) != Succs.end()) + return false; + Visited.insert(SU); + for (const SDep &SI : SU->Preds) + if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit())) + Preds.push_back(SI.getSUnit()); + } + + return true; + } + + // Link as many SALU instructions in a chain as possible. Return the size + // of the chain. Links up to MaxChain instructions.
+ unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain, + SmallPtrSetImpl<SUnit *> &Visited) const { + SmallVector<SUnit *, 8> Worklist({To}); + unsigned Linked = 0; + + while (!Worklist.empty() && MaxChain-- > 0) { + SUnit *SU = Worklist.pop_back_val(); + if (!Visited.insert(SU).second) + continue; + + LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From); + dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n'); + + if (SU->addPred(SDep(From, SDep::Artificial), false)) + ++Linked; + + for (SDep &SI : From->Succs) { + SUnit *SUv = SI.getSUnit(); + if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU)) + SUv->addPred(SDep(SU, SDep::Artificial), false); + } + + for (SDep &SI : SU->Succs) { + SUnit *Succ = SI.getSUnit(); + if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ)) + Worklist.push_back(Succ); + } + } + + return Linked; + } + + void apply(ScheduleDAGInstrs *DAGInstrs) override { + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasMAIInsts() || DisablePowerSched) + return; + DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAG->SUnits.empty()) + return; + + // Scan for MFMA long latency instructions and try to add a dependency + // of available SALU instructions to give them a chance to fill the MFMA + // shadow. It is desirable to fill the MFMA shadow with SALU instructions + // rather than VALU to prevent power consumption bursts and throttling. + auto LastSALU = DAG->SUnits.begin(); + auto E = DAG->SUnits.end(); + SmallPtrSet<SUnit*, 32> Visited; + for (SUnit &SU : DAG->SUnits) { + MachineInstr &MAI = *SU.getInstr(); + if (!TII->isMAI(MAI) || + MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 || + MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32) + continue; + + unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1; + + LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU); + dbgs() << "Need " << Lat + << " instructions to cover latency.\n"); + + // Find up to Lat independent scalar instructions as early as + // possible such that they can be scheduled after this MFMA. + for ( ; Lat && LastSALU != E; ++LastSALU) { + if (Visited.count(&*LastSALU)) + continue; + + if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU)) + continue; + + Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited); + } + } + } +}; } // namespace void GCNSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); + Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo)); } const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 5584759e5580..78c3b823946d 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -1,9 +1,8 @@ //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //==-----------------------------------------------------------------------===// // @@ -56,7 +55,8 @@ public: SOUTHERN_ISLANDS = 4, SEA_ISLANDS = 5, VOLCANIC_ISLANDS = 6, - GFX9 = 7 + GFX9 = 7, + GFX10 = 8 }; private: @@ -246,26 +246,6 @@ public: class GCNSubtarget : public AMDGPUGenSubtargetInfo, public AMDGPUSubtarget { public: - enum { - ISAVersion0_0_0, - ISAVersion6_0_0, - ISAVersion6_0_1, - ISAVersion7_0_0, - ISAVersion7_0_1, - ISAVersion7_0_2, - ISAVersion7_0_3, - ISAVersion7_0_4, - ISAVersion8_0_1, - ISAVersion8_0_2, - ISAVersion8_0_3, - ISAVersion8_1_0, - ISAVersion9_0_0, - ISAVersion9_0_2, - ISAVersion9_0_4, - ISAVersion9_0_6, - ISAVersion9_0_9, - }; - enum TrapHandlerAbi { TrapHandlerAbiNone = 0, TrapHandlerAbiHsa = 1 @@ -297,7 +277,6 @@ protected: // Basic subtarget description. Triple TargetTriple; unsigned Gen; - unsigned IsaVersion; InstrItineraryData InstrItins; int LDSBankCount; unsigned MaxPrivateElementSize; @@ -308,7 +287,6 @@ protected: // Dynamically set bits that enable features. bool FP64FP16Denormals; - bool DX10Clamp; bool FlatForGlobal; bool AutoWaitcntBeforeBarrier; bool CodeObjectV3; @@ -316,12 +294,11 @@ protected: bool UnalignedBufferAccess; bool HasApertureRegs; bool EnableXNACK; + bool DoesNotSupportXNACK; + bool EnableCuMode; bool TrapHandler; - bool DebuggerInsertNops; - bool DebuggerEmitPrologue; // Used as options. - bool EnableHugePrivateBuffer; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; @@ -336,8 +313,10 @@ protected: bool IsGCN; bool GCN3Encoding; bool CIInsts; - bool VIInsts; + bool GFX8Insts; bool GFX9Insts; + bool GFX10Insts; + bool GFX7GFX8GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; bool HasIntClamp; @@ -352,23 +331,51 @@ protected: bool HasSDWAMac; bool HasSDWAOutModsVOPC; bool HasDPP; + bool HasDPP8; bool HasR128A16; + bool HasNSAEncoding; bool HasDLInsts; - bool HasDotInsts; + bool HasDot1Insts; + bool HasDot2Insts; + bool HasDot3Insts; + bool HasDot4Insts; + bool HasDot5Insts; + bool HasDot6Insts; + bool HasMAIInsts; + bool HasPkFmacF16Inst; + bool HasAtomicFaddInsts; bool EnableSRAMECC; + bool DoesNotSupportSRAMECC; + bool HasNoSdstCMPX; + bool HasVscnt; + bool HasRegisterBanking; + bool HasVOP3Literal; + bool HasNoDataDepHazard; bool FlatAddressSpace; bool FlatInstOffsets; bool FlatGlobalInsts; bool FlatScratchInsts; + bool ScalarFlatScratchInsts; bool AddNoCarryInsts; bool HasUnpackedD16VMem; bool R600ALUInst; bool CaymanISA; bool CFALUBug; + bool LDSMisalignedBug; bool HasVertexCache; short TexVTXClauseSize; bool ScalarizeGlobal; + bool HasVcmpxPermlaneHazard; + bool HasVMEMtoScalarWriteHazard; + bool HasSMEMtoVectorWriteHazard; + bool HasInstFwdPrefetchBug; + bool HasVcmpxExecWARHazard; + bool HasLdsBranchVmemWARHazard; + bool HasNSAtoVMEMBug; + bool HasOffset3fBug; + bool HasFlatSegmentOffsetBug; + // Dummy feature to use for assembler in tablegen. bool FeatureDisable; @@ -378,6 +385,9 @@ private: SITargetLowering TLInfo; SIFrameLowering FrameLowering; + // See COMPUTE_TMPRING_SIZE.WAVESIZE, a 13-bit field in units of 256 dwords. + static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); + public: GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM); @@ -437,6 +447,11 @@ public: return Log2_32(WavefrontSize); } + /// Return the number of high bits known to be zero for a frame index.
+ unsigned getKnownHighZeroBitsForFrameIndex() const { + return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); + } + int getLDSBankCount() const { return LDSBankCount; } @@ -445,6 +460,8 @@ public: return MaxPrivateElementSize; } + unsigned getConstantBusLimit(unsigned Opcode) const; + bool hasIntClamp() const { return HasIntClamp; } @@ -473,6 +490,12 @@ public: return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); } + // Return true if the target only has the reverse operand versions of VALU + // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). + bool hasOnlyRevVALUShifts() const { + return getGeneration() >= VOLCANIC_ISLANDS; + } + bool hasBFE() const { return true; } @@ -525,14 +548,48 @@ public: return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; } - bool enableHugePrivateBuffer() const { - return EnableHugePrivateBuffer; + /// True if the offset field of DS instructions works as expected. On SI, the + /// offset uses a 16-bit adder and does not always wrap properly. + bool hasUsableDSOffset() const { + return getGeneration() >= SEA_ISLANDS; } bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; } + /// Condition output from div_scale is usable. + bool hasUsableDivScaleConditionOutput() const { + return getGeneration() != SOUTHERN_ISLANDS; + } + + /// Extra wait hazard is needed in some cases before + /// s_cbranch_vccnz/s_cbranch_vccz. + bool hasReadVCCZBug() const { + return getGeneration() <= SEA_ISLANDS; + } + + /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR + /// was written by a VALU instruction. + bool hasSMRDReadVALUDefHazard() const { + return getGeneration() == SOUTHERN_ISLANDS; + } + + /// A read of an SGPR by a VMEM instruction requires 5 wait states when the + /// SGPR was written by a VALU Instruction. + bool hasVMEMReadSGPRVALUDefHazard() const { + return getGeneration() >= VOLCANIC_ISLANDS; + } + + bool hasRFEHazards() const { + return getGeneration() >= VOLCANIC_ISLANDS; + } + + /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. + unsigned getSetRegWaitStates() const { + return getGeneration() <= SEA_ISLANDS ? 1 : 2; + } + bool dumpCode() const { return DumpCode; } @@ -554,14 +611,6 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool enableDX10Clamp() const { - return DX10Clamp; - } - - bool enableIEEEBit(const MachineFunction &MF) const { - return AMDGPU::isCompute(MF.getFunction().getCallingConv()); - } - bool useFlatForGlobal() const { return FlatForGlobal; } @@ -572,6 +621,11 @@ public: return CIInsts && EnableDS128; } + /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 + bool haveRoundOpsF64() const { + return CIInsts; + } + /// \returns If MUBUF instructions always perform range checking, even for /// buffer resources used for private memory access. 
bool privateMemoryResourceIsRangeChecked() const { @@ -613,10 +667,18 @@ public: return EnableXNACK; } + bool isCuModeEnabled() const { + return EnableCuMode; + } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } + bool hasFlatScrRegister() const { + return hasFlatAddressSpace(); + } + bool hasFlatInstOffsets() const { return FlatInstOffsets; } @@ -629,6 +691,14 @@ public: return FlatScratchInsts; } + bool hasScalarFlatScratchInsts() const { + return ScalarFlatScratchInsts; + } + + bool hasFlatSegmentOffsetBug() const { + return HasFlatSegmentOffsetBug; + } + bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; } @@ -637,12 +707,34 @@ public: return getGeneration() >= GFX9; } + bool d16PreservesUnusedBits() const { + return hasD16LoadStore() && !isSRAMECCEnabled(); + } + + bool hasD16Images() const { + return getGeneration() >= VOLCANIC_ISLANDS; + } + /// Return if most LDS instructions have an m0 use that requires m0 to be /// initialized. bool ldsRequiresM0Init() const { return getGeneration() < GFX9; } + // True if the hardware rewinds and replays GWS operations if a wave is + // preempted. + // + // If this is false, a GWS operation requires testing if a nack set the + // MEM_VIOL bit, and repeating if so. + bool hasGWSAutoReplay() const { + return getGeneration() >= GFX9; + } + + /// \returns true if the target has the ds_gws_sema_release_all instruction. + bool hasGWSSemaReleaseAll() const { + return CIInsts; + } + bool hasAddNoCarry() const { return AddNoCarryInsts; } @@ -680,22 +772,74 @@ public: return HasSDWAOutModsVOPC; } - bool vmemWriteNeedsExpWaitcnt() const { - return getGeneration() < SEA_ISLANDS; - } - bool hasDLInsts() const { return HasDLInsts; } - bool hasDotInsts() const { - return HasDotInsts; + bool hasDot1Insts() const { + return HasDot1Insts; + } + + bool hasDot2Insts() const { + return HasDot2Insts; + } + + bool hasDot3Insts() const { + return HasDot3Insts; + } + + bool hasDot4Insts() const { + return HasDot4Insts; + } + + bool hasDot5Insts() const { + return HasDot5Insts; + } + + bool hasDot6Insts() const { + return HasDot6Insts; + } + + bool hasMAIInsts() const { + return HasMAIInsts; + } + + bool hasPkFmacF16Inst() const { + return HasPkFmacF16Inst; + } + + bool hasAtomicFaddInsts() const { + return HasAtomicFaddInsts; } bool isSRAMECCEnabled() const { return EnableSRAMECC; } + bool hasNoSdstCMPX() const { + return HasNoSdstCMPX; + } + + bool hasVscnt() const { + return HasVscnt; + } + + bool hasRegisterBanking() const { + return HasRegisterBanking; + } + + bool hasVOP3Literal() const { + return HasVOP3Literal; + } + + bool hasNoDataDepHazard() const { + return HasNoDataDepHazard; + } + + bool vmemWriteNeedsExpWaitcnt() const { + return getGeneration() < SEA_ISLANDS; + } + // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspective of an arbitrary workitem, this // is 4-byte aligned.
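The subtarget changes above introduce MaxWaveScratchSize and getKnownHighZeroBitsForFrameIndex(), and this comment pins down the scratch allocation granularity. A self-contained sketch of that arithmetic for a wave64 target follows (illustrative only; countLeadingZeros32 stands in for llvm::countLeadingZeros):

#include <cstdint>

// Count leading zero bits of a 32-bit value (stand-in for llvm::countLeadingZeros).
constexpr unsigned countLeadingZeros32(uint32_t V) {
  unsigned N = 0;
  for (uint32_t Bit = 1u << 31; Bit != 0 && (V & Bit) == 0; Bit >>= 1)
    ++N;
  return N;
}

// COMPUTE_TMPRING_SIZE.WAVESIZE is a 13-bit field in units of 256 dwords
// (1024 bytes), so the largest per-wave scratch allocation is:
constexpr uint32_t MaxWaveScratchSize = (256 * 4) * ((1u << 13) - 1); // 8,387,584 bytes

// Scratch is interleaved across the lanes of a wave, so a single workitem
// addresses at most MaxWaveScratchSize / 64 bytes; a frame index therefore
// fits in 32 - 15 = 17 bits, and the top 15 bits are known zero.
constexpr unsigned WavefrontSizeLog2 = 6; // log2(64)
constexpr unsigned KnownZeroHighBits =
    countLeadingZeros32(MaxWaveScratchSize) + WavefrontSizeLog2; // 9 + 6
static_assert(KnownZeroHighBits == 15, "wave64 frame indexes fit in 17 bits");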
@@ -792,29 +936,34 @@ public: return HasScalarAtomics; } + bool hasLDSFPAtomics() const { + return GFX8Insts; + } bool hasDPP() const { return HasDPP; } + bool hasDPP8() const { + return HasDPP8; + } + bool hasR128A16() const { return HasR128A16; } - bool enableSIScheduler() const { - return EnableSIScheduler; + bool hasOffset3fBug() const { + return HasOffset3fBug; } - bool debuggerSupported() const { - return debuggerInsertNops() && debuggerEmitPrologue(); + bool hasNSAEncoding() const { + return HasNSAEncoding; } - bool debuggerInsertNops() const { - return DebuggerInsertNops; - } + bool hasMadF16() const; - bool debuggerEmitPrologue() const { - return DebuggerEmitPrologue; + bool enableSIScheduler() const { + return EnableSIScheduler; } bool loadStoreOptEnabled() const { @@ -835,15 +984,48 @@ public: } bool hasSMovFedHazard() const { - return getGeneration() >= AMDGPUSubtarget::GFX9; + return getGeneration() == AMDGPUSubtarget::GFX9; } bool hasReadM0MovRelInterpHazard() const { - return getGeneration() >= AMDGPUSubtarget::GFX9; + return getGeneration() == AMDGPUSubtarget::GFX9; } bool hasReadM0SendMsgHazard() const { - return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && + getGeneration() <= AMDGPUSubtarget::GFX9; + } + + bool hasVcmpxPermlaneHazard() const { + return HasVcmpxPermlaneHazard; + } + + bool hasVMEMtoScalarWriteHazard() const { + return HasVMEMtoScalarWriteHazard; + } + + bool hasSMEMtoVectorWriteHazard() const { + return HasSMEMtoVectorWriteHazard; + } + + bool hasLDSMisalignedBug() const { + return LDSMisalignedBug && !EnableCuMode; + } + + bool hasInstFwdPrefetchBug() const { + return HasInstFwdPrefetchBug; + } + + bool hasVcmpxExecWARHazard() const { + return HasVcmpxExecWARHazard; + } + + bool hasLdsBranchVmemWARHazard() const { + return HasLdsBranchVmemWARHazard; + } + + bool hasNSAtoVMEMBug() const { + return HasNSAtoVMEMBug; } /// Return the maximum number of waves per SIMD for kernels using \p SGPRs @@ -957,6 +1139,14 @@ public: std::vector> &Mutations) const override; + bool isWave32() const { + return WavefrontSize == 32; + } + + const TargetRegisterClass *getBoolRC() const { + return getRegisterInfo()->getBoolRC(); + } + /// \returns Maximum number of work groups per compute unit supported by the /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { @@ -994,7 +1184,6 @@ private: bool FMA; bool CaymanISA; bool CFALUBug; - bool DX10Clamp; bool HasVertexCache; bool R600ALUInst; bool FP64; diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e8cefdbf74b9..0ea8db04c298 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,11 +24,14 @@ #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" #include "R600MachineScheduler.h" +#include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" @@ -67,6 +69,11 @@ EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(false)); +static cl::opt<bool> +OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, + cl::desc("Run pre-RA exec mask optimizations"), + cl::init(true)); + static cl::opt<bool> EnableR600IfConvert( "r600-if-convert", cl::desc("Use if conversion pass"), @@ -109,7 +116,7 @@ static cl::opt<bool> EnableSDWAPeephole( static cl::opt<bool> EnableDPPCombine( "amdgpu-dpp-combine", cl::desc("Enable DPP combiner"), - cl::init(false)); + cl::init(true)); // Enable address space based alias analysis static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, @@ -123,11 +130,11 @@ static cl::opt<bool, true> LateCFGStructurize( cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -static cl::opt<bool, true> EnableAMDGPUFunctionCalls( +static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt( "amdgpu-function-calls", cl::desc("Enable AMDGPU function call support"), cl::location(AMDGPUTargetMachine::EnableFunctionCalls), - cl::init(false), + cl::init(true), cl::Hidden); // Enable lib calls simplifications @@ -143,6 +150,12 @@ static cl::opt<bool> EnableLowerKernelArguments( cl::init(true), cl::Hidden); +static cl::opt<bool> EnableRegReassign( + "amdgpu-reassign-regs", + cl::desc("Enable register reassign optimizations on gfx10+"), + cl::init(true), + cl::Hidden); + // Enable atomic optimization static cl::opt<bool> EnableAtomicOptimizations( "amdgpu-atomic-optimizations", @@ -157,6 +170,18 @@ static cl::opt<bool> EnableSIModeRegisterPass( cl::init(true), cl::Hidden); +// Option is used in lit tests to prevent deadcoding of patterns inspected.
+static cl::opt +EnableDCEInRA("amdgpu-dce-in-ra", + cl::init(true), cl::Hidden, + cl::desc("Enable machine DCE inside regalloc")); + +static cl::opt EnableScalarIRPasses( + "amdgpu-scalar-ir-passes", + cl::desc("Enable scalar IR passes"), + cl::init(true), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -172,6 +197,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUDAGToDAGISelPass(*PR); initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); + initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFixupVectorISelPass(*PR); @@ -192,6 +218,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); + initializeAMDGPUPropagateAttributesEarlyPass(*PR); + initializeAMDGPUPropagateAttributesLatePass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); @@ -201,9 +229,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIMemoryLegalizerPass(*PR); - initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); - initializeSIFixWWMLivenessPass(*PR); + initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); @@ -211,6 +238,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUInlinerPass(*PR); + initializeGCNRegBankReassignPass(*PR); + initializeGCNNSAReassignPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -295,10 +324,11 @@ static StringRef computeDataLayout(const Triple &TT) { } // 32-bit private, local, and region pointers. 64-bit global, constant and - // flat. + // flat, non-integral buffer fat pointers. return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + "-ni:7"; } LLVM_READNONE @@ -306,8 +336,9 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { if (!GPU.empty()) return GPU; + // Need to default to a target with flat support for HSA. if (TT.getArch() == Triple::amdgcn) - return "generic"; + return TT.getOS() == Triple::AMDHSA ? 
"generic-hsa" : "generic"; return "r600"; } @@ -363,24 +394,25 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { bool EnableOpt = getOptLevel() > CodeGenOpt::None; bool Internalize = InternalizeSymbols; - bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls; + bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls; bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; - if (EnableAMDGPUFunctionCalls) { + if (EnableFunctionCalls) { delete Builder.Inliner; Builder.Inliner = createAMDGPUFunctionInliningPass(); } Builder.addExtension( PassManagerBuilder::EP_ModuleOptimizerEarly, - [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &, - legacy::PassManagerBase &PM) { + [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); } PM.add(createAMDGPUUnifyMetadataPass()); + PM.add(createAMDGPUPropagateAttributesLatePass(this)); if (Internalize) { PM.add(createInternalizePass(mustPreserveGV)); PM.add(createGlobalDCEPass()); @@ -392,15 +424,16 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { const auto &Opt = Options; Builder.addExtension( PassManagerBuilder::EP_EarlyAsPossible, - [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &, - legacy::PassManagerBase &PM) { + [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); } + PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); PM.add(llvm::createAMDGPUUseNativeCallsPass()); if (LibCallSimplify) - PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt)); + PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this)); }); Builder.addExtension( @@ -428,6 +461,11 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool JIT) : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { setRequiresStructuredCFG(true); + + // Override the default since calls aren't supported for r600. 
+ if (EnableFunctionCalls && + EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0) + EnableFunctionCalls = false; } const R600Subtarget *R600TargetMachine::getSubtargetImpl( @@ -528,8 +566,14 @@ public: bool addPreISel() override; bool addInstSelector() override; bool addGCPasses() override; + + std::unique_ptr getCSEConfig() const override; }; +std::unique_ptr AMDGPUPassConfig::getCSEConfig() const { + return getStandardCSEConfigForOpt(TM->getOptLevel()); +} + class R600PassConfig final : public AMDGPUPassConfig { public: R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) @@ -572,9 +616,10 @@ public: bool addLegalizeMachineIR() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; - void addFastRegAlloc(FunctionPass *RegAllocPass) override; - void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; + void addFastRegAlloc() override; + void addOptimizedRegAlloc() override; void addPreRegAlloc() override; + bool addPreRewrite() override; void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; @@ -614,12 +659,16 @@ void AMDGPUPassConfig::addIRPasses() { disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); - addPass(createAtomicExpandPass()); - // This must occur before inlining, as the inliner will not look through // bitcast calls. addPass(createAMDGPUFixFunctionBitcastsPass()); + // A call to propagate attributes pass in the backend in case opt was not run. + addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); + + addPass(createAtomicExpandPass()); + + addPass(createAMDGPULowerIntrinsicsPass()); // Function calls are not supported, so make sure we inline everything. @@ -652,7 +701,8 @@ void AMDGPUPassConfig::addIRPasses() { if (EnableSROA) addPass(createSROAPass()); - addStraightLineScalarOptimizationPasses(); + if (EnableScalarIRPasses) + addStraightLineScalarOptimizationPasses(); if (EnableAMDGPUAliasAnalysis) { addPass(createAMDGPUAAWrapperPass()); @@ -678,15 +728,20 @@ void AMDGPUPassConfig::addIRPasses() { // %1 = shl %a, 2 // // but EarlyCSE can do neither of them. - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses) addEarlyCSEOrGVNPass(); } void AMDGPUPassConfig::addCodeGenPrepare() { + if (TM->getTargetTriple().getArch() == Triple::amdgcn) + addPass(createAMDGPUAnnotateKernelFeaturesPass()); + if (TM->getTargetTriple().getArch() == Triple::amdgcn && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + addPass(&AMDGPUPerfHintAnalysisID); + TargetPassConfig::addCodeGenPrepare(); if (EnableLoadStoreVectorizer) @@ -700,7 +755,8 @@ bool AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel())); + // Defer the verifier until FinalizeISel. + addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false); return false; } @@ -770,7 +826,6 @@ bool GCNPassConfig::addPreISel() { // FIXME: We need to run a pass to propagate the attributes when calls are // supported. - addPass(createAMDGPUAnnotateKernelFeaturesPass()); // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. 
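The pass-config hunks in this file all follow the same TargetPassConfig idiom: override a hook, add or reorder the target's passes, then delegate to the base class for the generic pipeline. A minimal sketch of that idiom, with invented names (MyPassConfig and createMyFixupPass are illustrative, not AMDGPU code):

class MyPassConfig : public TargetPassConfig {
public:
  MyPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  void addIRPasses() override {
    // Target-specific IR preparation runs first, mirroring how the patch
    // adds createAMDGPUPropagateAttributesEarlyPass before
    // createAtomicExpandPass in AMDGPUPassConfig::addIRPasses.
    addPass(createMyFixupPass());    // hypothetical target pass
    TargetPassConfig::addIRPasses(); // then the generic IR pipeline
  }
};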
@@ -783,6 +838,7 @@ bool GCNPassConfig::addPreISel() { if (!LateCFGStructurize) { addPass(createSIAnnotateControlFlowPass()); } + addPass(createLCSSAPass()); return false; } @@ -856,7 +912,7 @@ void GCNPassConfig::addPreRegAlloc() { addPass(createSIWholeQuadModePass()); } -void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { +void GCNPassConfig::addFastRegAlloc() { // FIXME: We have to disable the verifier here because of PHIElimination + // TwoAddressInstructions disabling it. @@ -865,28 +921,40 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowID, false); - // This must be run after SILowerControlFlow, since it needs to use the - // machine-level CFG, but before register allocation. - insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + // This must be run just after RegisterCoalescing. + insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false); - TargetPassConfig::addFastRegAlloc(RegAllocPass); + TargetPassConfig::addFastRegAlloc(); } -void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); - - insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID); +void GCNPassConfig::addOptimizedRegAlloc() { + if (OptExecMaskPreRA) { + insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); + insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID); + } else { + insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); + } // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowID, false); - // This must be run after SILowerControlFlow, since it needs to use the - // machine-level CFG, but before register allocation. - insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + // This must be run just after RegisterCoalescing. + insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false); + + if (EnableDCEInRA) + insertPass(&RenameIndependentSubregsID, &DeadMachineInstructionElimID); - TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); + TargetPassConfig::addOptimizedRegAlloc(); +} + +bool GCNPassConfig::addPreRewrite() { + if (EnableRegReassign) { + addPass(&GCNNSAReassignID); + addPass(&GCNRegBankReassignID); + } + return true; } void GCNPassConfig::addPostRegAlloc() { @@ -894,6 +962,9 @@ void GCNPassConfig::addPostRegAlloc() { if (getOptLevel() > CodeGenOpt::None) addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); + + // Equivalent of PEI for SGPRs. 
+ addPass(&SILowerSGPRSpillsID); } void GCNPassConfig::addPreSched2() { @@ -919,10 +990,164 @@ void GCNPassConfig::addPreEmitPass() { addPass(&PostRAHazardRecognizerID); addPass(&SIInsertSkipsPassID); - addPass(createSIDebuggerInsertNopsPass()); addPass(&BranchRelaxationPassID); } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { return new GCNPassConfig(*this, PM); } + +yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const { + return new yaml::SIMachineFunctionInfo(); +} + +yaml::MachineFunctionInfo * +GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + return new yaml::SIMachineFunctionInfo(*MFI, + *MF.getSubtarget<GCNSubtarget>().getRegisterInfo()); +} + +bool GCNTargetMachine::parseMachineFunctionInfo( + const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, SMRange &SourceRange) const { + const yaml::SIMachineFunctionInfo &YamlMFI = + reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_); + MachineFunction &MF = PFS.MF; + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + MFI->initializeBaseYamlFields(YamlMFI); + + auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) { + if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) { + SourceRange = RegName.SourceRange; + return true; + } + + return false; + }; + + auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { + // Create a diagnostic for the register string literal. + const MemoryBuffer &Buffer = + *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); + Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, + RegName.Value.size(), SourceMgr::DK_Error, + "incorrect register class for field", RegName.Value, + None, None); + SourceRange = RegName.SourceRange; + return true; + }; + + if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || + parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) || + parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || + parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) + return true; + + if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && + !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) { + return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); + } + + if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG && + !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) { + return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg); + } + + if (MFI->FrameOffsetReg != AMDGPU::FP_REG && + !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { + return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); + } + + if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG && + !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) { + return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); + } + + auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A, + const TargetRegisterClass &RC, + ArgDescriptor &Arg, unsigned UserSGPRs, + unsigned SystemSGPRs) { + // Skip parsing if it's not present. + if (!A) + return false; + + if (A->IsRegister) { + unsigned Reg; + if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) { + SourceRange = A->RegisterName.SourceRange; + return true; + } + if (!RC.contains(Reg)) + return diagnoseRegisterClass(A->RegisterName); + Arg = ArgDescriptor::createRegister(Reg); + } else + Arg = ArgDescriptor::createStack(A->StackOffset); + // Check and apply the optional mask.
+ if (A->Mask) + Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue()); + + MFI->NumUserSGPRs += UserSGPRs; + MFI->NumSystemSGPRs += SystemSGPRs; + return false; + }; + + if (YamlMFI.ArgInfo && + (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, + AMDGPU::SReg_128RegClass, + MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, + AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, + 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass, + MFI->ArgInfo.QueuePtr, 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr, + AMDGPU::SReg_64RegClass, + MFI->ArgInfo.KernargSegmentPtr, 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID, + AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID, + 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit, + AMDGPU::SReg_64RegClass, + MFI->ArgInfo.FlatScratchInit, 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, + AMDGPU::SGPR_32RegClass, + MFI->ArgInfo.PrivateSegmentSize, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, + AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, + 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY, + AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY, + 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ, + AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ, + 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo, + AMDGPU::SGPR_32RegClass, + MFI->ArgInfo.WorkGroupInfo, 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset, + AMDGPU::SGPR_32RegClass, + MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr, + AMDGPU::SReg_64RegClass, + MFI->ArgInfo.ImplicitArgPtr, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr, + AMDGPU::SReg_64RegClass, + MFI->ArgInfo.ImplicitBufferPtr, 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX, + AMDGPU::VGPR_32RegClass, + MFI->ArgInfo.WorkItemIDX, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY, + AMDGPU::VGPR_32RegClass, + MFI->ArgInfo.WorkItemIDY, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ, + AMDGPU::VGPR_32RegClass, + MFI->ArgInfo.WorkItemIDZ, 0, 0))) + return true; + + MFI->Mode.IEEE = YamlMFI.Mode.IEEE; + MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; + + return false; +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 62fbe71d1902..70fa3961236f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -1,9 +1,8 @@ //===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,7 +14,6 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringMap.h" @@ -95,7 +93,6 @@ public: class GCNTargetMachine final : public AMDGPUTargetMachine { private: - AMDGPUIntrinsicInfo IntrinsicInfo; mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap; public: @@ -110,13 +107,17 @@ public: TargetTransformInfo getTargetTransformInfo(const Function &F) override; - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } - bool useIPRA() const override { return true; } + + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; + yaml::MachineFunctionInfo * + convertFuncInfoToYAML(const MachineFunction &MF) const override; + bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, + PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, + SMRange &SourceRange) const override; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index c4e1efde130b..6569980d2c75 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index a4ae1a2c18c2..819bebb7932d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 11e4ba4b5010..aaed280a1270 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1,9 +1,8 @@ //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -118,8 +117,10 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, // Add a small bonus for each of such "if" statements. if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) { if (UP.Threshold < MaxBoost && Br->isConditional()) { - if (L->isLoopExiting(Br->getSuccessor(0)) || - L->isLoopExiting(Br->getSuccessor(1))) + BasicBlock *Succ0 = Br->getSuccessor(0); + BasicBlock *Succ1 = Br->getSuccessor(1); + if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) || + (L->contains(Succ1) && L->isLoopExiting(Succ1))) continue; if (dependsOnLocalPhi(L, Br->getCondition())) { UP.Threshold += UnrollThresholdIf; @@ -141,7 +142,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned Threshold = 0; if (AS == AMDGPUAS::PRIVATE_ADDRESS) Threshold = ThresholdPrivate; - else if (AS == AMDGPUAS::LOCAL_ADDRESS) + else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) Threshold = ThresholdLocal; else continue; @@ -159,7 +160,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; if (AllocaSize > MaxAlloca) continue; - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || + AS == AMDGPUAS::REGION_ADDRESS) { LocalGEPsSeen++; // Inhibit unroll for local memory if we have seen addressing not to // a variable, most likely we will be unable to combine it. @@ -254,7 +256,8 @@ unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || - AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) { return 512; } @@ -308,6 +311,8 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, switch (Inst->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { @@ -399,7 +404,7 @@ int GCNTTIImpl::getArithmeticInstrCost( if (SLT == MVT::f64) { int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost(); // Add cost of workaround.
- if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (!ST->hasUsableDivScaleConditionOutput()) Cost += 3 * getFullRateInstrCost(); return LT.first * Cost * NElts; @@ -577,6 +582,8 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { return false; case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_icmp: + case Intrinsic::amdgcn_fcmp: return true; } } @@ -607,7 +614,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, } bool GCNTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { + const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); const FeatureBitset &CallerBits = TM.getSubtargetImpl(*Caller)->getFeatureBits(); @@ -616,7 +623,14 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller, FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; - return ((RealCallerBits & RealCalleeBits) == RealCalleeBits); + if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) + return false; + + // FIXME: dx10_clamp can just take the caller setting, but there seems to be + // no way to support merge for backend defined attributes. + AMDGPU::SIModeRegisterDefaults CallerMode(*Caller); + AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee); + return CallerMode.isInlineCompatible(CalleeMode); } void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 397c5c6fa6fb..6f1bf5a26f0d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -1,9 +1,8 @@ //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -78,13 +77,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureAutoWaitcntBeforeBarrier, - AMDGPU::FeatureDebuggerEmitPrologue, - AMDGPU::FeatureDebuggerInsertNops, // Property of the kernel/environment which can't actually differ. AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK, AMDGPU::FeatureTrapHandler, + AMDGPU::FeatureCodeObjectV3, + + // The default assumption needs to be ecc is enabled, but no directly + // exposed operations depend on it, so it can be safely inlined. + AMDGPU::FeatureSRAMECC, // Perf-tuning features AMDGPU::FeatureFastFMAF32, @@ -178,8 +180,7 @@ public: // don't use flat addressing. if (IsGraphicsShader) return -1; - return ST->hasFlatAddressSpace() ?
- AMDGPUAS::FLAT_ADDRESS : AMDGPUAS::UNKNOWN_ADDRESS_SPACE; + return AMDGPUAS::FLAT_ADDRESS; } unsigned getVectorSplitCost() { return 0; } @@ -190,7 +191,9 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - unsigned getInliningThresholdMultiplier() { return 9; } + unsigned getInliningThresholdMultiplier() { return 7; } + + int getInlinerVectorBonusPercent() { return 0; } int getArithmeticReductionCost(unsigned Opcode, Type *Ty, diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index ced3f6f567e2..396e0ed2e76c 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -1,9 +1,8 @@ //===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -199,14 +198,11 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); } else { // Conditional branch. // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(), - "TransitionBlock", &F); - - // Move BI from BB to the new transition block. - BI->removeFromParent(); - TransitionBB->getInstList().push_back(BI); + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - // Create a branch that will always branch to the transition block. + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. + BB->getTerminator()->eraseFromParent(); BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); } } diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp index 1f6d9234c1ed..d4401a22a1ad 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -1,9 +1,8 @@ //===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 11cd49e5b3dc..12f2e9519c9e 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1,9 +1,8 @@ //===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //==-----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h index 289642aaa2d0..3e658a144c1f 100644 --- a/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -1,9 +1,8 @@ //===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file AMDKernelCodeT.h @@ -127,8 +126,12 @@ enum amd_code_property_mask_t { AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, - AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10, - AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6, + AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32 = ((1 << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, + + AMD_CODE_PROPERTY_RESERVED1_SHIFT = 11, + AMD_CODE_PROPERTY_RESERVED1_WIDTH = 5, AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT, /// Control wave ID base counter for GDS ordered-append. Used to set diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 3f9af27a2e5e..6d678966c98e 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1,9 +1,8 @@ //===- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -13,6 +12,7 @@ #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIDefines.h" #include "SIInstrInfo.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" @@ -69,7 +69,7 @@ namespace { class AMDGPUAsmParser; -enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL }; +enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_AGPR, IS_TTMP, IS_SPECIAL }; //===----------------------------------------------------------------------===// // Operand @@ -103,14 +103,14 @@ public: int64_t getFPModifiersOperand() const { int64_t Operand = 0; - Operand |= Abs ? SISrcMods::ABS : 0; - Operand |= Neg ? SISrcMods::NEG : 0; + Operand |= Abs ? SISrcMods::ABS : 0u; + Operand |= Neg ? 
SISrcMods::NEG : 0u; return Operand; } int64_t getIntModifiersOperand() const { int64_t Operand = 0; - Operand |= Sext ? SISrcMods::SEXT : 0; + Operand |= Sext ? SISrcMods::SEXT : 0u; return Operand; } @@ -140,21 +140,25 @@ public: ImmTyInstOffset, ImmTyOffset0, ImmTyOffset1, + ImmTyDLC, ImmTyGLC, ImmTySLC, ImmTyTFE, ImmTyD16, ImmTyClampSI, ImmTyOModSI, + ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, ImmTyDppBankMask, ImmTyDppBoundCtrl, + ImmTyDppFi, ImmTySdwaDstSel, ImmTySdwaSrc0Sel, ImmTySdwaSrc1Sel, ImmTySdwaDstUnused, ImmTyDMask, + ImmTyDim, ImmTyUNorm, ImmTyDA, ImmTyR128A16, @@ -174,9 +178,15 @@ public: ImmTyNegLo, ImmTyNegHi, ImmTySwizzle, - ImmTyHigh + ImmTyGprIdxMode, + ImmTyHigh, + ImmTyBLGP, + ImmTyCBSZ, + ImmTyABID, + ImmTyEndpgm, }; +private: struct TokOp { const char *Data; unsigned Length; @@ -191,7 +201,6 @@ public: struct RegOp { unsigned RegNo; - bool IsForcedVOP3; Modifiers Mods; }; @@ -202,6 +211,7 @@ public: const MCExpr *Expr; }; +public: bool isToken() const override { if (Kind == Token) return true; @@ -231,32 +241,32 @@ public: return isRegKind() && !hasModifiers(); } - bool isRegOrImmWithInputMods(MVT type) const { - return isRegKind() || isInlinableImm(type); + bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const { + return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type); } bool isRegOrImmWithInt16InputMods() const { - return isRegOrImmWithInputMods(MVT::i16); + return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i16); } bool isRegOrImmWithInt32InputMods() const { - return isRegOrImmWithInputMods(MVT::i32); + return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32); } bool isRegOrImmWithInt64InputMods() const { - return isRegOrImmWithInputMods(MVT::i64); + return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::i64); } bool isRegOrImmWithFP16InputMods() const { - return isRegOrImmWithInputMods(MVT::f16); + return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f16); } bool isRegOrImmWithFP32InputMods() const { - return isRegOrImmWithInputMods(MVT::f32); + return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f32); } bool isRegOrImmWithFP64InputMods() const { - return isRegOrImmWithInputMods(MVT::f64); + return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64); } bool isVReg() const { @@ -268,8 +278,12 @@ public: isRegClass(AMDGPU::VReg_512RegClassID); } + bool isVReg32() const { + return isRegClass(AMDGPU::VGPR_32RegClassID); + } + bool isVReg32OrOff() const { - return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); + return isOff() || isVReg32(); } bool isSDWAOperand(MVT type) const; @@ -289,6 +303,7 @@ public: bool isClampSI() const { return isImmTy(ImmTyClampSI); } bool isOModSI() const { return isImmTy(ImmTyOModSI); } bool isDMask() const { return isImmTy(ImmTyDMask); } + bool isDim() const { return isImmTy(ImmTyDim); } bool isUNorm() const { return isImmTy(ImmTyUNorm); } bool isDA() const { return isImmTy(ImmTyDA); } bool isR128A16() const { return isImmTy(ImmTyR128A16); } @@ -301,13 +316,13 @@ public: bool isIdxen() const { return isImmTy(ImmTyIdxen); } bool isAddr64() const { return isImmTy(ImmTyAddr64); } bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); } - bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); } + bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); } bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); } - bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || 
isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); } - bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); } + bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); } bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } + bool isDLC() const { return isImmTy(ImmTyDLC); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } @@ -316,6 +331,7 @@ public: bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } + bool isFI() const { return isImmTy(ImmTyDppFi); } bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); } bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); } bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); } @@ -339,6 +355,8 @@ public: bool isRegClass(unsigned RCID) const; + bool isInlineValue() const; + bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); } @@ -359,6 +377,8 @@ public: return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64); } + bool isBoolReg() const; + bool isSCSrcF16() const { return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); } @@ -411,6 +431,11 @@ public: return isSSrcF16(); } + bool isSSrcOrLdsB32() const { + return isRegOrInlineNoMods(AMDGPU::SRegOrLds_32RegClassID, MVT::i32) || + isLiteralImm(MVT::i32) || isExpr(); + } + bool isVCSrcB32() const { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } @@ -456,8 +481,7 @@ public: } bool isVSrcV2B16() const { - llvm_unreachable("cannot happen"); - return isVSrcB16(); + return isVSrcB16() || isLiteralImm(MVT::v2i16); } bool isVSrcF32() const { @@ -473,8 +497,127 @@ public: } bool isVSrcV2F16() const { - llvm_unreachable("cannot happen"); - return isVSrcF16(); + return isVSrcF16() || isLiteralImm(MVT::v2f16); + } + + bool isVISrcB32() const { + return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32); + } + + bool isVISrcB16() const { + return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i16); + } + + bool isVISrcV2B16() const { + return isVISrcB16(); + } + + bool isVISrcF32() const { + return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::f32); + } + + bool isVISrcF16() const { + return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::f16); + } + + bool isVISrcV2F16() const { + return isVISrcF16() || isVISrcB32(); + } + + bool isAISrcB32() const { + return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i32); + } + + bool isAISrcB16() const { + return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i16); + } + + bool isAISrcV2B16() const { + return isAISrcB16(); + } + + bool isAISrcF32() const { + return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::f32); + } + + bool isAISrcF16() const { + return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::f16); + } + + bool isAISrcV2F16() const { + return isAISrcF16() || isAISrcB32(); + } + + bool isAISrc_128B32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i32); + } + + bool isAISrc_128B16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i16); + } + + bool isAISrc_128V2B16() const { + return isAISrc_128B16(); + } + + bool isAISrc_128F32() const { + return 
isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::f32); + } + + bool isAISrc_128F16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::f16); + } + + bool isAISrc_128V2F16() const { + return isAISrc_128F16() || isAISrc_128B32(); + } + + bool isAISrc_512B32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i32); + } + + bool isAISrc_512B16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i16); + } + + bool isAISrc_512V2B16() const { + return isAISrc_512B16(); + } + + bool isAISrc_512F32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::f32); + } + + bool isAISrc_512F16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::f16); + } + + bool isAISrc_512V2F16() const { + return isAISrc_512F16() || isAISrc_512B32(); + } + + bool isAISrc_1024B32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::i32); + } + + bool isAISrc_1024B16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::i16); + } + + bool isAISrc_1024V2B16() const { + return isAISrc_1024B16(); + } + + bool isAISrc_1024F32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::f32); + } + + bool isAISrc_1024F16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::f16); + } + + bool isAISrc_1024V2F16() const { + return isAISrc_1024F16() || isAISrc_1024B32(); } bool isKImmFP32() const { @@ -504,10 +647,15 @@ public: bool isSMRDOffset8() const; bool isSMRDOffset20() const; bool isSMRDLiteralOffset() const; + bool isDPP8() const; bool isDPPCtrl() const; + bool isBLGP() const; + bool isCBSZ() const; + bool isABID() const; bool isGPRIdxMode() const; bool isS16Imm() const; bool isU16Imm() const; + bool isEndpgm() const; StringRef getExpressionAsToken() const { assert(isExpr()); @@ -535,6 +683,7 @@ public: } unsigned getReg() const override { + assert(isRegKind()); return Reg.RegNo; } @@ -594,6 +743,10 @@ public: void addRegOperands(MCInst &Inst, unsigned N) const; + void addBoolRegOperands(MCInst &Inst, unsigned N) const { + addRegOperands(Inst, N); + } + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { if (isRegKind()) addRegOperands(Inst, N); @@ -661,6 +814,7 @@ public: case ImmTyInstOffset: OS << "InstOffset"; break; case ImmTyOffset0: OS << "Offset0"; break; case ImmTyOffset1: OS << "Offset1"; break; + case ImmTyDLC: OS << "DLC"; break; case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; @@ -668,15 +822,18 @@ public: case ImmTyFORMAT: OS << "FORMAT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; + case ImmTyDPP8: OS << "DPP8"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; case ImmTyDppRowMask: OS << "DppRowMask"; break; case ImmTyDppBankMask: OS << "DppBankMask"; break; case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break; + case ImmTyDppFi: OS << "FI"; break; case ImmTySdwaDstSel: OS << "SdwaDstSel"; break; case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break; case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break; case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break; case ImmTyDMask: OS << "DMask"; break; + case ImmTyDim: OS << "Dim"; break; case ImmTyUNorm: OS << "UNorm"; break; case ImmTyDA: OS << "DA"; break; case ImmTyR128A16: OS << "R128A16"; break; @@ -695,7 +852,12 @@ public: case ImmTyNegLo: OS << "NegLo"; break; case ImmTyNegHi: OS << "NegHi"; break; case ImmTySwizzle: OS << "Swizzle"; break; + case ImmTyGprIdxMode: OS << "GprIdxMode"; 
break; case ImmTyHigh: OS << "High"; break; + case ImmTyBLGP: OS << "BLGP"; break; + case ImmTyCBSZ: OS << "CBSZ"; break; + case ImmTyABID: OS << "ABID"; break; + case ImmTyEndpgm: OS << "Endpgm"; break; } } @@ -747,12 +909,10 @@ public: static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser, unsigned RegNo, SMLoc S, - SMLoc E, - bool ForceVOP3) { + SMLoc E) { auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser); Op->Reg.RegNo = RegNo; Op->Reg.Mods = Modifiers(); - Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; Op->EndLoc = E; return Op; } @@ -817,6 +977,7 @@ public: void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) { switch (RegKind) { case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break; + case IS_AGPR: // fall through case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break; default: break; } @@ -853,6 +1014,8 @@ private: /// \param VCCUsed [in] Whether VCC special SGPR is reserved. /// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved. /// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved. + /// \param EnableWavefrontSize32 [in] Value of ENABLE_WAVEFRONT_SIZE32 kernel + /// descriptor field, if valid. /// \param NextFreeVGPR [in] Max VGPR number referenced, plus one. /// \param VGPRRange [in] Token range, used for VGPR diagnostics. /// \param NextFreeSGPR [in] Max SGPR number referenced, plus one. @@ -861,9 +1024,10 @@ private: /// \param SGPRBlocks [out] Result SGPR block count. bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed, - unsigned NextFreeVGPR, SMRange VGPRRange, - unsigned NextFreeSGPR, SMRange SGPRRange, - unsigned &VGPRBlocks, unsigned &SGPRBlocks); + Optional<bool> EnableWavefrontSize32, unsigned NextFreeVGPR, + SMRange VGPRRange, unsigned NextFreeSGPR, + SMRange SGPRRange, unsigned &VGPRBlocks, + unsigned &SGPRBlocks); bool ParseDirectiveAMDGCNTarget(); bool ParseDirectiveAMDHSAKernel(); bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); @@ -876,7 +1040,15 @@ private: bool ParseDirectiveISAVersion(); bool ParseDirectiveHSAMetadata(); + bool ParseDirectivePALMetadataBegin(); bool ParseDirectivePALMetadata(); + bool ParseDirectiveAMDGPULDS(); + + /// Common code to parse out a block of text (typically YAML) between start and + /// end directives. + bool ParseToEndDirective(const char *AssemblerDirectiveBegin, + const char *AssemblerDirectiveEnd, + std::string &CollectString); bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, @@ -884,6 +1056,8 @@ private: bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex); + bool isRegister(); + bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const; Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind); void initializeGprCountSymbol(RegisterKind RegKind); bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex, @@ -897,6 +1071,10 @@ public: enum AMDGPUMatchResultTy { Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY }; + enum OperandMode { + OperandMode_Default, + OperandMode_NSA, + }; using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>; @@ -908,7 +1086,7 @@ public: if (getFeatureBits().none()) { // Set default features.
- copySTI().ToggleFeature("SOUTHERN_ISLANDS"); + copySTI().ToggleFeature("southern-islands"); } setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); @@ -924,6 +1102,10 @@ public: MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number")); Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_minor")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_stepping")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); } else { MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); @@ -969,6 +1151,10 @@ public: return AMDGPU::isGFX9(getSTI()); } + bool isGFX10() const { + return AMDGPU::isGFX10(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -978,7 +1164,11 @@ public: } bool hasSGPR102_SGPR103() const { - return !isVI(); + return !isVI() && !isGFX9(); + } + + bool hasSGPR104_SGPR105() const { + return isGFX10(); } bool hasIntClamp() const { @@ -1024,7 +1214,8 @@ public: uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool ParseDirective(AsmToken DirectiveID) override; - OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic, + OperandMode Mode = OperandMode_Default); StringRef parseMnemonicSuffix(StringRef Name); bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; @@ -1037,11 +1228,11 @@ public: AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, bool (*ConvertResult)(int64_t &) = nullptr); - OperandMatchResultTy parseOperandArrayWithPrefix( - const char *Prefix, - OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, - bool (*ConvertResult)(int64_t&) = nullptr); + OperandMatchResultTy + parseOperandArrayWithPrefix(const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool (*ConvertResult)(int64_t&) = nullptr); OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, @@ -1049,10 +1240,15 @@ public: OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value); - bool parseAbsoluteExpr(int64_t &Val, bool AbsMod = false); - OperandMatchResultTy parseImm(OperandVector &Operands, bool AbsMod = false); + bool isModifier(); + bool isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; + bool isRegOrOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; + bool isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; + bool isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const; + bool parseSP3NegModifier(); + OperandMatchResultTy parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false); OperandMatchResultTy parseReg(OperandVector &Operands); - OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool AbsMod = false); + OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false); OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true); OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); @@ -1073,33 +1269,63 @@ private: struct OperandInfoTy { int64_t Id; 
bool IsSymbolic = false; + bool IsDefined = false; OperandInfoTy(int64_t Id_) : Id(Id_) {} }; - bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId); - bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width); + bool parseSendMsgBody(OperandInfoTy &Msg, OperandInfoTy &Op, OperandInfoTy &Stream); + bool validateSendMsg(const OperandInfoTy &Msg, + const OperandInfoTy &Op, + const OperandInfoTy &Stream, + const SMLoc Loc); + + bool parseHwregBody(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width); + bool validateHwreg(const OperandInfoTy &HwReg, + const int64_t Offset, + const int64_t Width, + const SMLoc Loc); void errorExpTgt(); OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val); + SMLoc getFlatOffsetLoc(const OperandVector &Operands) const; - bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc); + bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands); + bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands); + bool validateSOPLiteral(const MCInst &Inst) const; bool validateConstantBusLimitations(const MCInst &Inst); bool validateEarlyClobberLimitations(const MCInst &Inst); bool validateIntClampSupported(const MCInst &Inst); bool validateMIMGAtomicDMask(const MCInst &Inst); bool validateMIMGGatherDMask(const MCInst &Inst); bool validateMIMGDataSize(const MCInst &Inst); + bool validateMIMGAddrSize(const MCInst &Inst); bool validateMIMGD16(const MCInst &Inst); + bool validateMIMGDim(const MCInst &Inst); + bool validateLdsDirect(const MCInst &Inst); + bool validateOpSel(const MCInst &Inst); + bool validateVccOperand(unsigned Reg) const; + bool validateVOP3Literal(const MCInst &Inst) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + bool isId(const StringRef Id) const; + bool isId(const AsmToken &Token, const StringRef Id) const; + bool isToken(const AsmToken::TokenKind Kind) const; bool trySkipId(const StringRef Id); + bool trySkipId(const StringRef Id, const AsmToken::TokenKind Kind); bool trySkipToken(const AsmToken::TokenKind Kind); bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg); bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string"); + void peekTokens(MutableArrayRef Tokens); + AsmToken::TokenKind getTokenKind() const; bool parseExpr(int64_t &Imm); + StringRef getTokenStr() const; + AsmToken peekToken(); + AsmToken getToken() const; + SMLoc getLoc() const; + void lex(); public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -1110,6 +1336,7 @@ public: OperandMatchResultTy parseInterpSlot(OperandVector &Operands); OperandMatchResultTy parseInterpAttr(OperandVector &Operands); OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + OperandMatchResultTy parseBoolReg(OperandVector &Operands); bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op, const unsigned MinVal, @@ -1124,20 +1351,23 @@ public: bool parseSwizzleSwap(int64_t &Imm); bool parseSwizzleReverse(int64_t &Imm); + OperandMatchResultTy parseGPRIdxMode(OperandVector &Operands); + int64_t parseGPRIdxMacro(); + void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void 
cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); } void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultDLC() const; AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr defaultSMRDOffset8() const; AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; - AMDGPUOperand::Ptr defaultOffsetU12() const; - AMDGPUOperand::Ptr defaultOffsetS13() const; + AMDGPUOperand::Ptr defaultFlatOffset() const; OperandMatchResultTy parseOModOperand(OperandVector &Operands); @@ -1153,11 +1383,15 @@ public: bool IsAtomic = false); void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseDim(OperandVector &Operands); + OperandMatchResultTy parseDPP8(OperandVector &Operands); OperandMatchResultTy parseDPPCtrl(OperandVector &Operands); AMDGPUOperand::Ptr defaultRowMask() const; AMDGPUOperand::Ptr defaultBankMask() const; AMDGPUOperand::Ptr defaultBoundCtrl() const; - void cvtDPP(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultFI() const; + void cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false); + void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); } OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix, AMDGPUOperand::ImmTy Type); @@ -1168,6 +1402,13 @@ public: void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, uint64_t BasicInstType, bool skipVcc = false); + + AMDGPUOperand::Ptr defaultBLGP() const; + AMDGPUOperand::Ptr defaultCBSZ() const; + AMDGPUOperand::Ptr defaultABID() const; + + OperandMatchResultTy parseEndpgmOp(OperandVector &Operands); + AMDGPUOperand::Ptr defaultEndpgmImmOperands() const; }; struct OptionalOperand { @@ -1203,6 +1444,8 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: return &APFloat::IEEEsingle(); case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: @@ -1215,6 +1458,12 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: return &APFloat::IEEEhalf(); default: llvm_unreachable("unsupported fp type"); @@ -1243,7 +1492,20 @@ static bool canLosslesslyConvertToFPType(APFloat &FPLiteral, MVT VT) { return true; } +static bool isSafeTruncation(int64_t Val, unsigned Size) { + return isUIntN(Size, Val) || isIntN(Size, Val); +} + bool AMDGPUOperand::isInlinableImm(MVT type) const { + + // This is a hack to enable named inline values like + // shared_base with both 32-bit and 64-bit operands. + // Note that these values are defined as + // 32-bit operands only. 
+ if (isInlineValue()) { + return true; + } + if (!isImmTy(ImmTyNone)) { // Only plain immediates are inlinable (e.g. "clamp" attribute is not) return false; } @@ -1282,6 +1544,10 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { AsmParser->hasInv2PiInlineImm()); } + if (!isSafeTruncation(Imm.Val, type.getScalarSizeInBits())) { + return false; + } + if (type.getScalarSizeInBits() == 16) { return AMDGPU::isInlinableLiteral16( static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()), @@ -1315,7 +1581,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP // types. - return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val); + return isSafeTruncation(Imm.Val, Size); } // We got fp literal token @@ -1330,8 +1596,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { return false; } + // We allow fp literals with f16x2 operands assuming that the specified + // literal goes into the lower half and the upper half is zero. We also + // require that the literal may be losslessly converted to f16. + MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : + (type == MVT::v2i16)? MVT::i16 : type; + APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); - return canLosslesslyConvertToFPType(FPLiteral, type); + return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); } bool AMDGPUOperand::isRegClass(unsigned RCID) const { @@ -1340,9 +1612,9 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { bool AMDGPUOperand::isSDWAOperand(MVT type) const { if (AsmParser->isVI()) - return isVReg(); - else if (AsmParser->isGFX9()) - return isRegKind() || isInlinableImm(type); + return isVReg32(); + else if (AsmParser->isGFX9() || AsmParser->isGFX10()) + return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type); else return false; } @@ -1363,6 +1635,11 @@ bool AMDGPUOperand::isSDWAInt32Operand() const { return isSDWAOperand(MVT::i32); } +bool AMDGPUOperand::isBoolReg() const { + return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
+ isSCSrcB64() : isSCSrcB32(); +} + uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const { assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); @@ -1441,12 +1718,20 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: { bool lost; APFloat FPLiteral(APFloat::IEEEdouble(), Literal); // Convert literal to single precision @@ -1456,11 +1741,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // checked earlier in isLiteralImm() uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue(); - if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || - OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) { - ImmVal |= (ImmVal << 16); - } - Inst.addOperand(MCOperand::createImm(ImmVal)); return; } @@ -1471,15 +1751,18 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo return; } - // We got int literal token. + // We got int literal token. // Only sign extend inline immediates. - // FIXME: No errors on truncation switch (OpTy) { case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: - if (isInt<32>(Val) && + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + if (isSafeTruncation(Val, 32) && AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); @@ -1505,7 +1788,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - if (isInt<16>(Val) && + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + if (isSafeTruncation(Val, 16) && AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); @@ -1516,14 +1801,14 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo return; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { - auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue()); - assert(AMDGPU::isInlinableLiteral16(LiteralVal, + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { + assert(isSafeTruncation(Val, 16)); + assert(AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val), AsmParser->hasInv2PiInlineImm())); - uint32_t ImmVal = static_cast<uint32_t>(LiteralVal) << 16 | - static_cast<uint32_t>(LiteralVal); - Inst.addOperand(MCOperand::createImm(ImmVal)); + Inst.addOperand(MCOperand::createImm(Val));
return; } default: @@ -1552,6 +1837,27 @@ void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI()))); } +static bool isInlineValue(unsigned Reg) { + switch (Reg) { + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + return true; + case AMDGPU::SRC_VCCZ: + case AMDGPU::SRC_EXECZ: + case AMDGPU::SRC_SCC: + return true; + default: + return false; + } +} + +bool AMDGPUOperand::isInlineValue() const { + return isRegKind() && ::isInlineValue(getReg()); +} + //===----------------------------------------------------------------------===// // AsmParser //===----------------------------------------------------------------------===// @@ -1585,6 +1891,15 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 8: return AMDGPU::SGPR_256RegClassID; case 16: return AMDGPU::SGPR_512RegClassID; } + } else if (Is == IS_AGPR) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::AGPR_32RegClassID; + case 2: return AMDGPU::AReg_64RegClassID; + case 4: return AMDGPU::AReg_128RegClassID; + case 16: return AMDGPU::AReg_512RegClassID; + case 32: return AMDGPU::AReg_1024RegClassID; + } } return -1; } @@ -1595,8 +1910,25 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("vcc", AMDGPU::VCC) .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("xnack_mask", AMDGPU::XNACK_MASK) + .Case("shared_base", AMDGPU::SRC_SHARED_BASE) + .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE) + .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT) + .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT) + .Case("private_base", AMDGPU::SRC_PRIVATE_BASE) + .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE) + .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT) + .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT) + .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) + .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) + .Case("lds_direct", AMDGPU::LDS_DIRECT) + .Case("src_lds_direct", AMDGPU::LDS_DIRECT) .Case("m0", AMDGPU::M0) - .Case("scc", AMDGPU::SCC) + .Case("vccz", AMDGPU::SRC_VCCZ) + .Case("src_vccz", AMDGPU::SRC_VCCZ) + .Case("execz", AMDGPU::SRC_EXECZ) + .Case("src_execz", AMDGPU::SRC_EXECZ) + .Case("scc", AMDGPU::SRC_SCC) + .Case("src_scc", AMDGPU::SRC_SCC) .Case("tba", AMDGPU::TBA) .Case("tma", AMDGPU::TMA) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) @@ -1611,6 +1943,7 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("tma_hi", AMDGPU::TMA_HI) .Case("tba_lo", AMDGPU::TBA_LO) .Case("tba_hi", AMDGPU::TBA_HI) + .Case("null", AMDGPU::SGPR_NULL) .Default(0); } @@ -1663,6 +1996,7 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, return false; case IS_VGPR: case IS_SGPR: + case IS_AGPR: case IS_TTMP: if (Reg1 != Reg + RegWidth) { return false; @@ -1674,6 +2008,53 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, } } +static const StringRef Registers[] = { + { "v" }, + { "s" }, + { "ttmp" }, + { "acc" }, + { "a" }, +}; + +bool +AMDGPUAsmParser::isRegister(const AsmToken &Token, + const AsmToken &NextToken) const { + + // A list of consecutive registers: [s0,s1,s2,s3] + if (Token.is(AsmToken::LBrac)) + return true; + + if (!Token.is(AsmToken::Identifier)) + return false; + + // A single register like s0 or a range of registers like s[0:1] + + StringRef RegName = 
Token.getString(); + + for (StringRef Reg : Registers) { + if (RegName.startswith(Reg)) { + if (Reg.size() < RegName.size()) { + unsigned RegNum; + // A single register with an index: rXX + if (!RegName.substr(Reg.size()).getAsInteger(10, RegNum)) + return true; + } else { + // A range of registers: r[XX:YY]. + if (NextToken.is(AsmToken::LBrac)) + return true; + } + } + } + + return getSpecialRegForName(RegName); +} + +bool +AMDGPUAsmParser::isRegister() +{ + return isRegister(getToken(), peekToken()); +} + bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, unsigned &RegNum, unsigned &RegWidth, unsigned *DwordRegIndex) { @@ -1692,6 +2073,9 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, } else if (RegName[0] == 's') { RegNumIndex = 1; RegKind = IS_SGPR; + } else if (RegName[0] == 'a') { + RegNumIndex = RegName.startswith("acc") ? 3 : 1; + RegKind = IS_AGPR; } else if (RegName.startswith("ttmp")) { RegNumIndex = strlen("ttmp"); RegKind = IS_TTMP; @@ -1773,6 +2157,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, break; case IS_VGPR: case IS_SGPR: + case IS_AGPR: case IS_TTMP: { unsigned Size = 1; @@ -1859,6 +2244,8 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { unsigned Reg, RegNum, RegWidth, DwordRegIndex; if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) { + //FIXME: improve error messages (bug 41303). + Error(StartLoc, "not a valid operand."); return nullptr; } if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { @@ -1866,202 +2253,261 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { return nullptr; } else KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); - return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false); + return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc); } -bool -AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) { - if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) && - (getLexer().getKind() == AsmToken::Integer || - getLexer().getKind() == AsmToken::Real)) { - // This is a workaround for handling operands like these: - // |1.0| - // |-1| - // This syntax is not compatible with syntax of standard - // MC expressions (due to the trailing '|'). - - SMLoc EndLoc; - const MCExpr *Expr; +OperandMatchResultTy +AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { + // TODO: add syntactic sugar for 1/(2*PI) - if (getParser().parsePrimaryExpr(Expr, EndLoc)) { - return true; - } + assert(!isRegister()); + assert(!isModifier()); + + const auto& Tok = getToken(); + const auto& NextTok = peekToken(); + bool IsReal = Tok.is(AsmToken::Real); + SMLoc S = getLoc(); + bool Negate = false; - return !Expr->evaluateAsAbsolute(Val); + if (!IsReal && Tok.is(AsmToken::Minus) && NextTok.is(AsmToken::Real)) { + lex(); + IsReal = true; + Negate = true; } - return getParser().parseAbsoluteExpression(Val); -} + if (IsReal) { + // Floating-point expressions are not supported. + // Can only allow floating-point literals with an + // optional sign.
-OperandMatchResultTy -AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) { - // TODO: add syntactic sugar for 1/(2*PI) - bool Minus = false; - if (getLexer().getKind() == AsmToken::Minus) { - const AsmToken NextToken = getLexer().peekTok(); - if (!NextToken.is(AsmToken::Integer) && - !NextToken.is(AsmToken::Real)) { - return MatchOperand_NoMatch; - } - Minus = true; - Parser.Lex(); - } + StringRef Num = getTokenStr(); + lex(); - SMLoc S = Parser.getTok().getLoc(); - switch(getLexer().getKind()) { - case AsmToken::Integer: { - int64_t IntVal; - if (parseAbsoluteExpr(IntVal, AbsMod)) + APFloat RealVal(APFloat::IEEEdouble()); + auto roundMode = APFloat::rmNearestTiesToEven; + if (RealVal.convertFromString(Num, roundMode) == APFloat::opInvalidOp) { return MatchOperand_ParseFail; - if (Minus) - IntVal *= -1; - Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S)); + } + if (Negate) + RealVal.changeSign(); + + Operands.push_back( + AMDGPUOperand::CreateImm(this, RealVal.bitcastToAPInt().getZExtValue(), S, + AMDGPUOperand::ImmTyNone, true)); + return MatchOperand_Success; - } - case AsmToken::Real: { + + } else { int64_t IntVal; - if (parseAbsoluteExpr(IntVal, AbsMod)) - return MatchOperand_ParseFail; + const MCExpr *Expr; + SMLoc S = getLoc(); + + if (HasSP3AbsModifier) { + // This is a workaround for handling expressions + // as arguments of SP3 'abs' modifier, for example: + // |1.0| + // |-1| + // |1+x| + // This syntax is not compatible with syntax of standard + // MC expressions (due to the trailing '|'). + SMLoc EndLoc; + if (getParser().parsePrimaryExpr(Expr, EndLoc)) + return MatchOperand_ParseFail; + } else { + if (Parser.parseExpression(Expr)) + return MatchOperand_ParseFail; + } + + if (Expr->evaluateAsAbsolute(IntVal)) { + Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S)); + } else { + Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); + } - APFloat F(BitsToDouble(IntVal)); - if (Minus) - F.changeSign(); - Operands.push_back( - AMDGPUOperand::CreateImm(this, F.bitcastToAPInt().getZExtValue(), S, - AMDGPUOperand::ImmTyNone, true)); return MatchOperand_Success; } - default: - return MatchOperand_NoMatch; - } + + return MatchOperand_NoMatch; } OperandMatchResultTy AMDGPUAsmParser::parseReg(OperandVector &Operands) { + if (!isRegister()) + return MatchOperand_NoMatch; + if (auto R = parseRegister()) { assert(R->isReg()); - R->Reg.IsForcedVOP3 = isForcedVOP3(); Operands.push_back(std::move(R)); return MatchOperand_Success; } - return MatchOperand_NoMatch; + return MatchOperand_ParseFail; } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool AbsMod) { - auto res = parseImm(Operands, AbsMod); +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod) { + auto res = parseReg(Operands); if (res != MatchOperand_NoMatch) { return res; + } else if (isModifier()) { + return MatchOperand_NoMatch; + } else { + return parseImm(Operands, HasSP3AbsMod); } +} - return parseReg(Operands); +bool +AMDGPUAsmParser::isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const { + if (Token.is(AsmToken::Identifier) && NextToken.is(AsmToken::LParen)) { + const auto &str = Token.getString(); + return str == "abs" || str == "neg" || str == "sext"; + } + return false; } -OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, - bool AllowImm) { - bool Negate = false, Negate2 = false, Abs = false, Abs2 = false; +bool +AMDGPUAsmParser::isOpcodeModifierWithVal(const 
AsmToken &Token, const AsmToken &NextToken) const { + return Token.is(AsmToken::Identifier) && NextToken.is(AsmToken::Colon); +} + +bool +AMDGPUAsmParser::isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const { + return isNamedOperandModifier(Token, NextToken) || Token.is(AsmToken::Pipe); +} - if (getLexer().getKind()== AsmToken::Minus) { - const AsmToken NextToken = getLexer().peekTok(); +bool +AMDGPUAsmParser::isRegOrOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const { + return isRegister(Token, NextToken) || isOperandModifier(Token, NextToken); +} + +// Check if this is an operand modifier or an opcode modifier +// which may look like an expression but it is not. We should +// avoid parsing these modifiers as expressions. Currently +// recognized sequences are: +// |...| +// abs(...) +// neg(...) +// sext(...) +// -reg +// -|...| +// -abs(...) +// name:... +// Note that simple opcode modifiers like 'gds' may be parsed as +// expressions; this is a special case. See getExpressionAsToken. +// +bool +AMDGPUAsmParser::isModifier() { - // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead. - if (NextToken.is(AsmToken::Minus)) { - Error(Parser.getTok().getLoc(), "invalid syntax, expected 'neg' modifier"); - return MatchOperand_ParseFail; - } + AsmToken Tok = getToken(); + AsmToken NextToken[2]; + peekTokens(NextToken); - // '-' followed by an integer literal N should be interpreted as integer - // negation rather than a floating-point NEG modifier applied to N. - // Beside being contr-intuitive, such use of floating-point NEG modifier - // results in different meaning of integer literals used with VOP1/2/C - // and VOP3, for example: - // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF - // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001 - // Negative fp literals should be handled likewise for unifomtity - if (!NextToken.is(AsmToken::Integer) && !NextToken.is(AsmToken::Real)) { - Parser.Lex(); - Negate = true; - } + return isOperandModifier(Tok, NextToken[0]) || + (Tok.is(AsmToken::Minus) && isRegOrOperandModifier(NextToken[0], NextToken[1])) || + isOpcodeModifierWithVal(Tok, NextToken[0]); +} + +// Check if the current token is an SP3 'neg' modifier. +// Currently this modifier is allowed in the following context: +// +// 1. Before a register, e.g. "-v0", "-v[...]" or "-[v0,v1]". +// 2. Before an 'abs' modifier: -abs(...) +// 3. Before an SP3 'abs' modifier: -|...| +// +// In all other cases "-" is handled as a part +// of an expression that follows the sign. +// +// Note: When "-" is followed by an integer literal, +// this is interpreted as integer negation rather +// than a floating-point NEG modifier applied to N. 
+// Beside being contr-intuitive, such use of floating-point +// NEG modifier would have resulted in different meaning +// of integer literals used with VOP1/2/C and VOP3, +// for example: +// v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF +// v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001 +// Negative fp literals with preceding "-" are +// handled likewise for unifomtity +// +bool +AMDGPUAsmParser::parseSP3NegModifier() { + + AsmToken NextToken[2]; + peekTokens(NextToken); + + if (isToken(AsmToken::Minus) && + (isRegister(NextToken[0], NextToken[1]) || + NextToken[0].is(AsmToken::Pipe) || + isId(NextToken[0], "abs"))) { + lex(); + return true; } - if (getLexer().getKind() == AsmToken::Identifier && - Parser.getTok().getString() == "neg") { - if (Negate) { - Error(Parser.getTok().getLoc(), "expected register or immediate"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Negate2 = true; - if (getLexer().isNot(AsmToken::LParen)) { - Error(Parser.getTok().getLoc(), "expected left paren after neg"); - return MatchOperand_ParseFail; - } - Parser.Lex(); + return false; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, + bool AllowImm) { + bool Neg, SP3Neg; + bool Abs, SP3Abs; + SMLoc Loc; + + // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead. + if (isToken(AsmToken::Minus) && peekToken().is(AsmToken::Minus)) { + Error(getLoc(), "invalid syntax, expected 'neg' modifier"); + return MatchOperand_ParseFail; } - if (getLexer().getKind() == AsmToken::Identifier && - Parser.getTok().getString() == "abs") { - Parser.Lex(); - Abs2 = true; - if (getLexer().isNot(AsmToken::LParen)) { - Error(Parser.getTok().getLoc(), "expected left paren after abs"); - return MatchOperand_ParseFail; - } - Parser.Lex(); + SP3Neg = parseSP3NegModifier(); + + Loc = getLoc(); + Neg = trySkipId("neg"); + if (Neg && SP3Neg) { + Error(Loc, "expected register or immediate"); + return MatchOperand_ParseFail; } + if (Neg && !skipToken(AsmToken::LParen, "expected left paren after neg")) + return MatchOperand_ParseFail; - if (getLexer().getKind() == AsmToken::Pipe) { - if (Abs2) { - Error(Parser.getTok().getLoc(), "expected register or immediate"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Abs = true; + Abs = trySkipId("abs"); + if (Abs && !skipToken(AsmToken::LParen, "expected left paren after abs")) + return MatchOperand_ParseFail; + + Loc = getLoc(); + SP3Abs = trySkipToken(AsmToken::Pipe); + if (Abs && SP3Abs) { + Error(Loc, "expected register or immediate"); + return MatchOperand_ParseFail; } OperandMatchResultTy Res; if (AllowImm) { - Res = parseRegOrImm(Operands, Abs); + Res = parseRegOrImm(Operands, SP3Abs); } else { Res = parseReg(Operands); } if (Res != MatchOperand_Success) { - return Res; + return (SP3Neg || Neg || SP3Abs || Abs)? 
MatchOperand_ParseFail : Res; } + if (SP3Abs && !skipToken(AsmToken::Pipe, "expected vertical bar")) + return MatchOperand_ParseFail; + if (Abs && !skipToken(AsmToken::RParen, "expected closing parentheses")) + return MatchOperand_ParseFail; + if (Neg && !skipToken(AsmToken::RParen, "expected closing parentheses")) + return MatchOperand_ParseFail; + AMDGPUOperand::Modifiers Mods; - if (Abs) { - if (getLexer().getKind() != AsmToken::Pipe) { - Error(Parser.getTok().getLoc(), "expected vertical bar"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Mods.Abs = true; - } - if (Abs2) { - if (getLexer().isNot(AsmToken::RParen)) { - Error(Parser.getTok().getLoc(), "expected closing parentheses"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Mods.Abs = true; - } + Mods.Abs = Abs || SP3Abs; + Mods.Neg = Neg || SP3Neg; - if (Negate) { - Mods.Neg = true; - } else if (Negate2) { - if (getLexer().isNot(AsmToken::RParen)) { - Error(Parser.getTok().getLoc(), "expected closing parentheses"); + if (Mods.hasFPModifiers()) { + AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + if (Op.isExpr()) { + Error(Op.getStartLoc(), "expected an absolute expression"); return MatchOperand_ParseFail; } - Parser.Lex(); - Mods.Neg = true; - } - - if (Mods.hasFPModifiers()) { - AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); - Op.setModifiers(Mods); + Op.setModifiers(Mods); } return MatchOperand_Success; } @@ -2069,18 +2515,9 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, OperandMatchResultTy AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) { - bool Sext = false; - - if (getLexer().getKind() == AsmToken::Identifier && - Parser.getTok().getString() == "sext") { - Parser.Lex(); - Sext = true; - if (getLexer().isNot(AsmToken::LParen)) { - Error(Parser.getTok().getLoc(), "expected left paren after sext"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - } + bool Sext = trySkipId("sext"); + if (Sext && !skipToken(AsmToken::LParen, "expected left paren after sext")) + return MatchOperand_ParseFail; OperandMatchResultTy Res; if (AllowImm) { @@ -2089,21 +2526,21 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, Res = parseReg(Operands); } if (Res != MatchOperand_Success) { - return Res; + return Sext? 
MatchOperand_ParseFail : Res; } + if (Sext && !skipToken(AsmToken::RParen, "expected closing parentheses")) + return MatchOperand_ParseFail; + AMDGPUOperand::Modifiers Mods; - if (Sext) { - if (getLexer().isNot(AsmToken::RParen)) { - Error(Parser.getTok().getLoc(), "expected closing parentheses"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Mods.Sext = true; - } + Mods.Sext = Sext; if (Mods.hasIntModifiers()) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + if (Op.isExpr()) { + Error(Op.getStartLoc(), "expected an absolute expression"); + return MatchOperand_ParseFail; + } Op.setModifiers(Mods); } @@ -2121,21 +2558,24 @@ AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) { } OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) { + auto Loc = getLoc(); + if (trySkipId("off")) { + Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Loc, + AMDGPUOperand::ImmTyOff, false)); + return MatchOperand_Success; + } + + if (!isRegister()) + return MatchOperand_NoMatch; + std::unique_ptr<AMDGPUOperand> Reg = parseRegister(); if (Reg) { Operands.push_back(std::move(Reg)); return MatchOperand_Success; } - const AsmToken &Tok = Parser.getTok(); - if (Tok.getString() == "off") { - Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Tok.getLoc(), - AMDGPUOperand::ImmTyOff, false)); - Parser.Lex(); - return MatchOperand_Success; - } + return MatchOperand_ParseFail; - return MatchOperand_NoMatch; } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { @@ -2163,15 +2603,6 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { } } - if ((TSFlags & SIInstrFlags::FLAT) && !hasFlatOffsets()) { - // FIXME: Produces error without correct column reported. - auto OpNum = - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset); - const auto &Op = Inst.getOperand(OpNum); - if (Op.getImm() != 0) - return Match_InvalidOperand; - } - return Match_Success; } @@ -2214,7 +2645,10 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { switch (Reg) { case AMDGPU::FLAT_SCR: case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: case AMDGPU::M0: + case AMDGPU::SGPR_NULL: return Reg; default: break; } @@ -2248,7 +2682,11 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, case 2: { const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType; if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || - OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) { + OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 || + OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16 || + OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) { return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); } else { return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); @@ -2272,6 +2710,8 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { const unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); unsigned ConstantBusUseCount = 0; + unsigned NumLiterals = 0; + unsigned LiteralSize; if (Desc.TSFlags & (SIInstrFlags::VOPC | @@ -2283,8 +2723,10 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { ++ConstantBusUseCount; } + SmallDenseSet<unsigned> SGPRsUsed; unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); if (SGPRUsed != AMDGPU::NoRegister) { + SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } @@ -2307,16 +2749,41 @@ bool 
AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { // flat_scratch_lo, flat_scratch_hi // are theoretically valid but they are disabled anyway. // Note that this code mimics SIInstrInfo::verifyInstruction - if (Reg != SGPRUsed) { + if (!SGPRsUsed.count(Reg)) { + SGPRsUsed.insert(Reg); ++ConstantBusUseCount; } - SGPRUsed = Reg; } else { // Expression or a literal - ++ConstantBusUseCount; + + if (Desc.OpInfo[OpIdx].OperandType == MCOI::OPERAND_IMMEDIATE) + continue; // special operand like VINTERP attr_chan + + // An instruction may use only one literal. + // This has been validated on the previous step. + // See validateVOP3Literal. + // This literal may be used as more than one operand. + // If all these operands are of the same size, + // this literal counts as one scalar value. + // Otherwise it counts as 2 scalar values. + // See "GFX10 Shader Programming", section 3.6.2.3. + + unsigned Size = AMDGPU::getOperandSize(Desc, OpIdx); + if (Size < 4) Size = 4; + + if (NumLiterals == 0) { + NumLiterals = 1; + LiteralSize = Size; + } else if (LiteralSize != Size) { + NumLiterals = 2; + } } } } } + ConstantBusUseCount += NumLiterals; + + if (isGFX10()) + return ConstantBusUseCount <= 2; return ConstantBusUseCount <= 1; } @@ -2405,6 +2872,46 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { return (VDataSize / 4) == DataSize + TFESize; } +bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0 || !isGFX10()) + return true; + + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); + int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim); + + assert(VAddr0Idx != -1); + assert(SrsrcIdx != -1); + assert(DimIdx != -1); + assert(SrsrcIdx > VAddr0Idx); + + unsigned Dim = Inst.getOperand(DimIdx).getImm(); + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); + bool IsNSA = SrsrcIdx - VAddr0Idx > 1; + unsigned VAddrSize = + IsNSA ? SrsrcIdx - VAddr0Idx + : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4; + + unsigned AddrSize = BaseOpcode->NumExtraArgs + + (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) + + (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 
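// For illustration only (not part of the patch): the literal-counting rule
// implemented above, restated as a standalone sketch. Per the quoted "GFX10
// Shader Programming" rule (sec. 3.6.2.3), one literal reused by several
// operands counts as one scalar value when every use has the same operand
// size, and as two otherwise.

#include <algorithm>
#include <vector>

static unsigned countLiteralUses(const std::vector<unsigned> &UseSizesInBytes) {
  unsigned NumLiterals = 0, LiteralSize = 0;
  for (unsigned Size : UseSizesInBytes) {
    Size = std::max(Size, 4u);   // sub-dword uses count as one dword, as above
    if (NumLiterals == 0) {
      NumLiterals = 1;
      LiteralSize = Size;
    } else if (LiteralSize != Size) {
      NumLiterals = 2;
    }
  }
  return NumLiterals;             // added to ConstantBusUseCount (limit 2 on GFX10)
}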
1 : 0); + if (!IsNSA) { + if (AddrSize > 8) + AddrSize = 16; + else if (AddrSize > 4) + AddrSize = 8; + } + + return VAddrSize == AddrSize; +} + bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); @@ -2461,8 +2968,346 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) { return true; } +bool AMDGPUAsmParser::validateMIMGDim(const MCInst &Inst) { + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + + int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim); + if (DimIdx < 0) + return true; + + long Imm = Inst.getOperand(DimIdx).getImm(); + if (Imm < 0 || Imm >= 8) + return false; + + return true; +} + +static bool IsRevOpcode(const unsigned Opcode) +{ + switch (Opcode) { + case AMDGPU::V_SUBREV_F32_e32: + case AMDGPU::V_SUBREV_F32_e64: + case AMDGPU::V_SUBREV_F32_e32_gfx10: + case AMDGPU::V_SUBREV_F32_e32_gfx6_gfx7: + case AMDGPU::V_SUBREV_F32_e32_vi: + case AMDGPU::V_SUBREV_F32_e64_gfx10: + case AMDGPU::V_SUBREV_F32_e64_gfx6_gfx7: + case AMDGPU::V_SUBREV_F32_e64_vi: + + case AMDGPU::V_SUBREV_I32_e32: + case AMDGPU::V_SUBREV_I32_e64: + case AMDGPU::V_SUBREV_I32_e32_gfx6_gfx7: + case AMDGPU::V_SUBREV_I32_e64_gfx6_gfx7: + + case AMDGPU::V_SUBBREV_U32_e32: + case AMDGPU::V_SUBBREV_U32_e64: + case AMDGPU::V_SUBBREV_U32_e32_gfx6_gfx7: + case AMDGPU::V_SUBBREV_U32_e32_vi: + case AMDGPU::V_SUBBREV_U32_e64_gfx6_gfx7: + case AMDGPU::V_SUBBREV_U32_e64_vi: + + case AMDGPU::V_SUBREV_U32_e32: + case AMDGPU::V_SUBREV_U32_e64: + case AMDGPU::V_SUBREV_U32_e32_gfx9: + case AMDGPU::V_SUBREV_U32_e32_vi: + case AMDGPU::V_SUBREV_U32_e64_gfx9: + case AMDGPU::V_SUBREV_U32_e64_vi: + + case AMDGPU::V_SUBREV_F16_e32: + case AMDGPU::V_SUBREV_F16_e64: + case AMDGPU::V_SUBREV_F16_e32_gfx10: + case AMDGPU::V_SUBREV_F16_e32_vi: + case AMDGPU::V_SUBREV_F16_e64_gfx10: + case AMDGPU::V_SUBREV_F16_e64_vi: + + case AMDGPU::V_SUBREV_U16_e32: + case AMDGPU::V_SUBREV_U16_e64: + case AMDGPU::V_SUBREV_U16_e32_vi: + case AMDGPU::V_SUBREV_U16_e64_vi: + + case AMDGPU::V_SUBREV_CO_U32_e32_gfx9: + case AMDGPU::V_SUBREV_CO_U32_e64_gfx10: + case AMDGPU::V_SUBREV_CO_U32_e64_gfx9: + + case AMDGPU::V_SUBBREV_CO_U32_e32_gfx9: + case AMDGPU::V_SUBBREV_CO_U32_e64_gfx9: + + case AMDGPU::V_SUBREV_NC_U32_e32_gfx10: + case AMDGPU::V_SUBREV_NC_U32_e64_gfx10: + + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_e64_gfx10: + + case AMDGPU::V_LSHRREV_B32_e32: + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_LSHRREV_B32_e32_gfx6_gfx7: + case AMDGPU::V_LSHRREV_B32_e64_gfx6_gfx7: + case AMDGPU::V_LSHRREV_B32_e32_vi: + case AMDGPU::V_LSHRREV_B32_e64_vi: + case AMDGPU::V_LSHRREV_B32_e32_gfx10: + case AMDGPU::V_LSHRREV_B32_e64_gfx10: + + case AMDGPU::V_ASHRREV_I32_e32: + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_ASHRREV_I32_e32_gfx10: + case AMDGPU::V_ASHRREV_I32_e32_gfx6_gfx7: + case AMDGPU::V_ASHRREV_I32_e32_vi: + case AMDGPU::V_ASHRREV_I32_e64_gfx10: + case AMDGPU::V_ASHRREV_I32_e64_gfx6_gfx7: + case AMDGPU::V_ASHRREV_I32_e64_vi: + + case AMDGPU::V_LSHLREV_B32_e32: + case AMDGPU::V_LSHLREV_B32_e64: + case AMDGPU::V_LSHLREV_B32_e32_gfx10: + case AMDGPU::V_LSHLREV_B32_e32_gfx6_gfx7: + case AMDGPU::V_LSHLREV_B32_e32_vi: + case AMDGPU::V_LSHLREV_B32_e64_gfx10: + case AMDGPU::V_LSHLREV_B32_e64_gfx6_gfx7: + case AMDGPU::V_LSHLREV_B32_e64_vi: + + case AMDGPU::V_LSHLREV_B16_e32: + case AMDGPU::V_LSHLREV_B16_e64: + case AMDGPU::V_LSHLREV_B16_e32_vi: + case 
AMDGPU::V_LSHLREV_B16_e64_vi: + case AMDGPU::V_LSHLREV_B16_gfx10: + + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_LSHRREV_B16_e64: + case AMDGPU::V_LSHRREV_B16_e32_vi: + case AMDGPU::V_LSHRREV_B16_e64_vi: + case AMDGPU::V_LSHRREV_B16_gfx10: + + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_ASHRREV_I16_e64: + case AMDGPU::V_ASHRREV_I16_e32_vi: + case AMDGPU::V_ASHRREV_I16_e64_vi: + case AMDGPU::V_ASHRREV_I16_gfx10: + + case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHLREV_B64_vi: + + case AMDGPU::V_LSHRREV_B64: + case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHRREV_B64_vi: + + case AMDGPU::V_ASHRREV_I64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHRREV_I64_vi: + + case AMDGPU::V_PK_LSHLREV_B16: + case AMDGPU::V_PK_LSHLREV_B16_gfx10: + case AMDGPU::V_PK_LSHLREV_B16_vi: + + case AMDGPU::V_PK_LSHRREV_B16: + case AMDGPU::V_PK_LSHRREV_B16_gfx10: + case AMDGPU::V_PK_LSHRREV_B16_vi: + case AMDGPU::V_PK_ASHRREV_I16: + case AMDGPU::V_PK_ASHRREV_I16_gfx10: + case AMDGPU::V_PK_ASHRREV_I16_vi: + return true; + default: + return false; + } +} + +bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { + + using namespace SIInstrFlags; + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + + // lds_direct register is defined so that it can be used + // with 9-bit operands only. Ignore encodings which do not accept these. + if ((Desc.TSFlags & (VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA)) == 0) + return true; + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + const int SrcIndices[] = { Src1Idx, Src2Idx }; + + // lds_direct cannot be specified as either src1 or src2. + for (int SrcIdx : SrcIndices) { + if (SrcIdx == -1) break; + const MCOperand &Src = Inst.getOperand(SrcIdx); + if (Src.isReg() && Src.getReg() == LDS_DIRECT) { + return false; + } + } + + if (Src0Idx == -1) + return true; + + const MCOperand &Src = Inst.getOperand(Src0Idx); + if (!Src.isReg() || Src.getReg() != LDS_DIRECT) + return true; + + // lds_direct is specified as src0. Check additional limitations. + return (Desc.TSFlags & SIInstrFlags::SDWA) == 0 && !IsRevOpcode(Opcode); +} + +SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const { + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + if (Op.isFlatOffset()) + return Op.getStartLoc(); + } + return getLoc(); +} + +bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst, + const OperandVector &Operands) { + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & SIInstrFlags::FLAT) == 0) + return true; + + auto Opcode = Inst.getOpcode(); + auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset); + assert(OpNum != -1); + + const auto &Op = Inst.getOperand(OpNum); + if (!hasFlatOffsets() && Op.getImm() != 0) { + Error(getFlatOffsetLoc(Operands), + "flat offset modifier is not supported on this GPU"); + return false; + } + + // Address offset is 12-bit signed for GFX10, 13-bit for GFX9. + // For FLAT segment the offset must be positive; + // MSB is ignored and forced to zero. + unsigned OffsetSize = isGFX9() ? 13 : 12; + if (TSFlags & SIInstrFlags::IsNonFlatSeg) { + if (!isIntN(OffsetSize, Op.getImm())) { + Error(getFlatOffsetLoc(Operands), + isGFX9() ? 
"expected a 13-bit signed offset" : + "expected a 12-bit signed offset"); + return false; + } + } else { + if (!isUIntN(OffsetSize - 1, Op.getImm())) { + Error(getFlatOffsetLoc(Operands), + isGFX9() ? "expected a 12-bit unsigned offset" : + "expected an 11-bit unsigned offset"); + return false; + } + } + + return true; +} + +bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { + unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + if (!(Desc.TSFlags & (SIInstrFlags::SOP2 | SIInstrFlags::SOPC))) + return true; + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + + const int OpIndices[] = { Src0Idx, Src1Idx }; + + unsigned NumLiterals = 0; + uint32_t LiteralValue; + + for (int OpIdx : OpIndices) { + if (OpIdx == -1) break; + + const MCOperand &MO = Inst.getOperand(OpIdx); + if (MO.isImm() && + // Exclude special imm operands (like that used by s_set_gpr_idx_on) + AMDGPU::isSISrcOperand(Desc, OpIdx) && + !isInlineConstant(Inst, OpIdx)) { + uint32_t Value = static_cast(MO.getImm()); + if (NumLiterals == 0 || LiteralValue != Value) { + LiteralValue = Value; + ++NumLiterals; + } + } + } + + return NumLiterals <= 1; +} + +bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { + const unsigned Opc = Inst.getOpcode(); + if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 || + Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) { + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + + if (OpSel & ~3) + return false; + } + return true; +} + +// Check if VCC register matches wavefront size +bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const { + auto FB = getFeatureBits(); + return (FB[AMDGPU::FeatureWavefrontSize64] && Reg == AMDGPU::VCC) || + (FB[AMDGPU::FeatureWavefrontSize32] && Reg == AMDGPU::VCC_LO); +} + +// VOP3 literal is only allowed in GFX10+ and only one can be used +bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const { + unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + if (!(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P))) + return true; + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + unsigned NumLiterals = 0; + uint32_t LiteralValue; + + for (int OpIdx : OpIndices) { + if (OpIdx == -1) break; + + const MCOperand &MO = Inst.getOperand(OpIdx); + if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx)) + continue; + + if (!isInlineConstant(Inst, OpIdx)) { + uint32_t Value = static_cast(MO.getImm()); + if (NumLiterals == 0 || LiteralValue != Value) { + LiteralValue = Value; + ++NumLiterals; + } + } + } + + return !NumLiterals || + (NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]); +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, - const SMLoc &IDLoc) { + const SMLoc &IDLoc, + const OperandVector &Operands) { + if (!validateLdsDirect(Inst)) { + Error(IDLoc, + "invalid use of lds_direct"); + return false; + } + if (!validateSOPLiteral(Inst)) { + Error(IDLoc, + "only one literal operand is allowed"); + return false; + } + if (!validateVOP3Literal(Inst)) { + Error(IDLoc, + "invalid literal operand"); + return false; + } if 
(!validateConstantBusLimitations(Inst)) { Error(IDLoc, "invalid operand (violates constant bus restrictions)"); @@ -2478,17 +3323,31 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "integer clamping is not supported on this GPU"); return false; } + if (!validateOpSel(Inst)) { + Error(IDLoc, + "invalid op_sel operand"); + return false; + } // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate. if (!validateMIMGD16(Inst)) { Error(IDLoc, "d16 modifier is not supported on this GPU"); return false; } + if (!validateMIMGDim(Inst)) { + Error(IDLoc, "dim modifier is required on this GPU"); + return false; + } if (!validateMIMGDataSize(Inst)) { Error(IDLoc, "image data size does not match dmask and tfe"); return false; } + if (!validateMIMGAddrSize(Inst)) { + Error(IDLoc, + "image address size does not match dim and a16"); + return false; + } if (!validateMIMGAtomicDMask(Inst)) { Error(IDLoc, "invalid atomic image dmask"); @@ -2499,11 +3358,15 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "invalid image_gather dmask: only one bit must be set"); return false; } + if (!validateFlatOffset(Inst, Operands)) { + return false; + } return true; } -static std::string AMDGPUMnemonicSpellCheck(StringRef S, uint64_t FBS, +static std::string AMDGPUMnemonicSpellCheck(StringRef S, + const FeatureBitset &FBS, unsigned VariantID = 0); bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -2538,7 +3401,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (Result) { default: break; case Match_Success: - if (!validateInstruction(Inst, IDLoc)) { + if (!validateInstruction(Inst, IDLoc, Operands)) { return true; } Inst.setLoc(IDLoc); @@ -2549,7 +3412,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "instruction not supported on this GPU"); case Match_MnemonicFail: { - uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); std::string Suggestion = AMDGPUMnemonicSpellCheck( ((AMDGPUOperand &)*Operands[0]).getToken(), FBS); return Error(IDLoc, "invalid instruction" + Suggestion, @@ -2632,32 +3495,39 @@ bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) { bool AMDGPUAsmParser::calculateGPRBlocks( const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, - bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange, - unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks, - unsigned &SGPRBlocks) { + bool XNACKUsed, Optional<bool> EnableWavefrontSize32, unsigned NextFreeVGPR, + SMRange VGPRRange, unsigned NextFreeSGPR, SMRange SGPRRange, + unsigned &VGPRBlocks, unsigned &SGPRBlocks) { // TODO(scott.linder): These calculations are duplicated from // AMDGPUAsmPrinter::getSIProgramInfo and could be unified. 
IsaVersion Version = getIsaVersion(getSTI().getCPU()); unsigned NumVGPRs = NextFreeVGPR; unsigned NumSGPRs = NextFreeSGPR; - unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(&getSTI()); - if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) && - NumSGPRs > MaxAddressableNumSGPRs) - return OutOfRangeError(SGPRRange); + if (Version.Major >= 10) + NumSGPRs = 0; + else { + unsigned MaxAddressableNumSGPRs = + IsaInfo::getAddressableNumSGPRs(&getSTI()); - NumSGPRs += - IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed); + if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) && + NumSGPRs > MaxAddressableNumSGPRs) + return OutOfRangeError(SGPRRange); - if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && - NumSGPRs > MaxAddressableNumSGPRs) - return OutOfRangeError(SGPRRange); + NumSGPRs += + IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed); - if (Features.test(FeatureSGPRInitBug)) - NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && + NumSGPRs > MaxAddressableNumSGPRs) + return OutOfRangeError(SGPRRange); + + if (Features.test(FeatureSGPRInitBug)) + NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + } - VGPRBlocks = IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs); + VGPRBlocks = + IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs, EnableWavefrontSize32); SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs); return false; @@ -2674,7 +3544,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (getParser().parseIdentifier(KernelName)) return true; - kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(); + kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI()); StringSet<> Seen; @@ -2688,6 +3558,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { bool ReserveVCC = true; bool ReserveFlatScr = true; bool ReserveXNACK = hasXNACK(); + Optional<bool> EnableWavefrontSize32; while (true) { while (getLexer().is(AsmToken::EndOfStatement)) @@ -2736,37 +3607,45 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 4; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 1; + } else if (ID == ".amdhsa_wavefront_size32") { 
+ if (IVersion.Major < 10) + return getParser().Error(IDRange.Start, "directive requires gfx10+", + IDRange); + EnableWavefrontSize32 = Val; + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, + Val, ValRange); } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") { PARSE_BITS_ENTRY( KD.compute_pgm_rsrc2, @@ -2841,6 +3720,24 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val, ValRange); + } else if (ID == ".amdhsa_workgroup_processor_mode") { + if (IVersion.Major < 10) + return getParser().Error(IDRange.Start, "directive requires gfx10+", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_WGP_MODE, Val, + ValRange); + } else if (ID == ".amdhsa_memory_ordered") { + if (IVersion.Major < 10) + return getParser().Error(IDRange.Start, "directive requires gfx10+", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_MEM_ORDERED, Val, + ValRange); + } else if (ID == ".amdhsa_forward_progress") { + if (IVersion.Major < 10) + return getParser().Error(IDRange.Start, "directive requires gfx10+", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val, + ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { PARSE_BITS_ENTRY( KD.compute_pgm_rsrc2, @@ -2888,8 +3785,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { unsigned VGPRBlocks; unsigned SGPRBlocks; if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr, - ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR, - SGPRRange, VGPRBlocks, SGPRBlocks)) + ReserveXNACK, EnableWavefrontSize32, NextFreeVGPR, + VGPRRange, NextFreeSGPR, SGPRRange, VGPRBlocks, + SGPRBlocks)) return true; if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>( @@ -2994,6 +3892,46 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, return TokError(Err.str()); } Lex(); + + if (ID == "enable_wavefront_size32") { + if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) { + if (!isGFX10()) + return TokError("enable_wavefront_size32=1 is only allowed on GFX10+"); + if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32]) + return TokError("enable_wavefront_size32=1 requires +WavefrontSize32"); + } else { + if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64]) + return TokError("enable_wavefront_size32=0 requires +WavefrontSize64"); + } + } + + if (ID == "wavefront_size") { + if (Header.wavefront_size == 5) { + if (!isGFX10()) + return TokError("wavefront_size=5 is only allowed on GFX10+"); + if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32]) + return TokError("wavefront_size=5 requires +WavefrontSize32"); + } else if (Header.wavefront_size == 6) { + if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64]) + return TokError("wavefront_size=6 requires +WavefrontSize64"); + } + } + + if (ID == "enable_wgp_mode") { + if (G_00B848_WGP_MODE(Header.compute_pgm_resource_registers) && !isGFX10()) + return TokError("enable_wgp_mode=1 is only allowed on GFX10+"); + } + + if (ID == "enable_mem_ordered") { + if (G_00B848_MEM_ORDERED(Header.compute_pgm_resource_registers) && !isGFX10()) + return TokError("enable_mem_ordered=1 is only allowed on GFX10+"); + } + + if (ID == "enable_fwd_progress") { + if (G_00B848_FWD_PROGRESS(Header.compute_pgm_resource_registers) && !isGFX10()) + return TokError("enable_fwd_progress=1 is only allowed on GFX10+"); + } + return false; } @@ -3081,14 +4019,35 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { } std::string 
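// For illustration only (not part of the patch): amd_kernel_code_t stores
// the wave size as a log2, which is why the checks above pair
// wavefront_size=5 with +WavefrontSize32 and 6 with +WavefrontSize64.
// Likewise, the .amdhsa_user_sgpr_* bookkeeping above adds the register
// width of each preloaded value: a 128-bit buffer resource occupies 4
// SGPRs, a 64-bit pointer 2, and a 32-bit size 1.

static unsigned wavefrontSizeFromLog2(unsigned Log2WaveSize) {
  return 1u << Log2WaveSize;   // 5 -> 32 lanes, 6 -> 64 lanes
}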
HSAMetadataString; - raw_string_ostream YamlStream(HSAMetadataString); + if (ParseToEndDirective(AssemblerDirectiveBegin, AssemblerDirectiveEnd, + HSAMetadataString)) + return true; + + if (IsaInfo::hasCodeObjectV3(&getSTI())) { + if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) + return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + } else { + if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString)) + return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + } + + return false; +} + +/// Common code to parse out a block of text (typically YAML) between start and +/// end directives. +bool AMDGPUAsmParser::ParseToEndDirective(const char *AssemblerDirectiveBegin, + const char *AssemblerDirectiveEnd, + std::string &CollectString) { + + raw_string_ostream CollectStream(CollectString); getLexer().setSkipSpace(false); bool FoundEnd = false; while (!getLexer().is(AsmToken::Eof)) { while (getLexer().is(AsmToken::Space)) { - YamlStream << getLexer().getTok().getString(); + CollectStream << getLexer().getTok().getString(); Lex(); } @@ -3101,8 +4060,8 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { } } - YamlStream << Parser.parseStringToEndOfStatement() - << getContext().getAsmInfo()->getSeparatorString(); + CollectStream << Parser.parseStringToEndOfStatement() + << getContext().getAsmInfo()->getSeparatorString(); Parser.eatToEndOfStatement(); } @@ -3111,22 +4070,27 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { if (getLexer().is(AsmToken::Eof) && !FoundEnd) { return TokError(Twine("expected directive ") + - Twine(HSAMD::AssemblerDirectiveEnd) + Twine(" not found")); + Twine(AssemblerDirectiveEnd) + Twine(" not found")); } - YamlStream.flush(); + CollectStream.flush(); + return false; +} - if (IsaInfo::hasCodeObjectV3(&getSTI())) { - if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) - return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); - } else { - if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString)) - return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); - } +/// Parse the assembler directive for new MsgPack-format PAL metadata. +bool AMDGPUAsmParser::ParseDirectivePALMetadataBegin() { + std::string String; + if (ParseToEndDirective(AMDGPU::PALMD::AssemblerDirectiveBegin, + AMDGPU::PALMD::AssemblerDirectiveEnd, String)) + return true; + auto PALMetadata = getTargetStreamer().getPALMetadata(); + if (!PALMetadata->setFromString(String)) + return Error(getParser().getTok().getLoc(), "invalid PAL metadata"); return false; } +/// Parse the assembler directive for old linear-format PAL metadata. 
bool AMDGPUAsmParser::ParseDirectivePALMetadata() { if (getSTI().getTargetTriple().getOS() != Triple::AMDPAL) { return Error(getParser().getTok().getLoc(), @@ -3134,19 +4098,82 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadata() { "not available on non-amdpal OSes")).str()); } - PALMD::Metadata PALMetadata; + auto PALMetadata = getTargetStreamer().getPALMetadata(); + PALMetadata->setLegacy(); for (;;) { - uint32_t Value; + uint32_t Key, Value; + if (ParseAsAbsoluteExpression(Key)) { + return TokError(Twine("invalid value in ") + + Twine(PALMD::AssemblerDirective)); + } + if (getLexer().isNot(AsmToken::Comma)) { + return TokError(Twine("expected an even number of values in ") + + Twine(PALMD::AssemblerDirective)); + } + Lex(); if (ParseAsAbsoluteExpression(Value)) { return TokError(Twine("invalid value in ") + Twine(PALMD::AssemblerDirective)); } - PALMetadata.push_back(Value); + PALMetadata->setRegister(Key, Value); if (getLexer().isNot(AsmToken::Comma)) break; Lex(); } - getTargetStreamer().EmitPALMetadata(PALMetadata); + return false; +} + +/// ParseDirectiveAMDGPULDS +/// ::= .amdgpu_lds identifier ',' size_expression [',' align_expression] +bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { + if (getParser().checkForValidSection()) + return true; + + StringRef Name; + SMLoc NameLoc = getLexer().getLoc(); + if (getParser().parseIdentifier(Name)) + return TokError("expected identifier in directive"); + + MCSymbol *Symbol = getContext().getOrCreateSymbol(Name); + if (parseToken(AsmToken::Comma, "expected ','")) + return true; + + unsigned LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(&getSTI()); + + int64_t Size; + SMLoc SizeLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(Size)) + return true; + if (Size < 0) + return Error(SizeLoc, "size must be non-negative"); + if (Size > LocalMemorySize) + return Error(SizeLoc, "size is too large"); + + int64_t Align = 4; + if (getLexer().is(AsmToken::Comma)) { + Lex(); + SMLoc AlignLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(Align)) + return true; + if (Align < 0 || !isPowerOf2_64(Align)) + return Error(AlignLoc, "alignment must be a power of two"); + + // Alignment larger than the size of LDS is possible in theory, as long + // as the linker manages to place the symbol at address 0, but we do want + // to make sure the alignment fits nicely into a 32-bit integer. + if (Align >= 1u << 31) + return Error(AlignLoc, "alignment is too large"); + } + + if (parseToken(AsmToken::EndOfStatement, + "unexpected token in '.amdgpu_lds' directive")) + return true; + + Symbol->redefineIfPossible(); + if (!Symbol->isUndefined()) + return Error(NameLoc, "invalid symbol redefinition"); + + getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align); return false; } @@ -3183,6 +4210,12 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { return ParseDirectiveHSAMetadata(); } + if (IDVal == ".amdgpu_lds") + return ParseDirectiveAMDGPULDS(); + + if (IDVal == PALMD::AssemblerDirectiveBegin) + return ParseDirectivePALMetadataBegin(); + if (IDVal == PALMD::AssemblerDirective) return ParseDirectivePALMetadata(); @@ -3195,21 +4228,36 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true); R.isValid(); ++R) { if (*R == RegNo) - return isGFX9(); + return isGFX9() || isGFX10(); + } + + // GFX10 has 2 more SGPRs 104 and 105. 
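// For illustration only (not part of the patch): hypothetical uses of the
// .amdgpu_lds directive implemented above (symbol names and values are made
// up). The size must fit in LDS and the optional alignment must be a power
// of two below 2^31; when omitted, the alignment defaults to 4:
//
//   .amdgpu_lds my_lds_array, 4096, 16
//   .amdgpu_lds small_lds_scratch, 64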
+ for (MCRegAliasIterator R(AMDGPU::SGPR104_SGPR105, &MRI, true); + R.isValid(); ++R) { + if (*R == RegNo) + return hasSGPR104_SGPR105(); } switch (RegNo) { + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + return !isCI() && !isSI() && !isVI(); case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: case AMDGPU::TMA: case AMDGPU::TMA_LO: case AMDGPU::TMA_HI: - return !isGFX9(); + return !isGFX9() && !isGFX10(); case AMDGPU::XNACK_MASK: case AMDGPU::XNACK_MASK_LO: case AMDGPU::XNACK_MASK_HI: - return !isCI() && !isSI() && hasXNACK(); + return !isCI() && !isSI() && !isGFX10() && hasXNACK(); + case AMDGPU::SGPR_NULL: + return isGFX10(); default: break; } @@ -3217,8 +4265,10 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, if (isCI()) return true; - if (isSI()) { - // No flat_scr + if (isSI() || isGFX10()) { + // No flat_scr on SI. + // On GFX10 flat scratch is not a valid register operand and can only be + // accessed with s_setreg/s_getreg. switch (RegNo) { case AMDGPU::FLAT_SCR: case AMDGPU::FLAT_SCR_LO: @@ -3234,14 +4284,15 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true); R.isValid(); ++R) { if (*R == RegNo) - return false; + return hasSGPR102_SGPR103(); } return true; } OperandMatchResultTy -AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { +AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, + OperandMode Mode) { // Try to parse with a custom parser OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); @@ -3255,28 +4306,36 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { getLexer().is(AsmToken::EndOfStatement)) return ResTy; - ResTy = parseRegOrImm(Operands); + if (Mode == OperandMode_NSA && getLexer().is(AsmToken::LBrac)) { + unsigned Prefix = Operands.size(); + SMLoc LBraceLoc = getTok().getLoc(); + Parser.Lex(); // eat the '[' - if (ResTy == MatchOperand_Success) - return ResTy; + for (;;) { + ResTy = parseReg(Operands); + if (ResTy != MatchOperand_Success) + return ResTy; - const auto &Tok = Parser.getTok(); - SMLoc S = Tok.getLoc(); + if (getLexer().is(AsmToken::RBrac)) + break; - const MCExpr *Expr = nullptr; - if (!Parser.parseExpression(Expr)) { - Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); - return MatchOperand_Success; - } + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + } - // Possibly this is an instruction flag like 'gds'. 
- if (Tok.getKind() == AsmToken::Identifier) { - Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), S)); - Parser.Lex(); + if (Operands.size() - Prefix > 1) { + Operands.insert(Operands.begin() + Prefix, + AMDGPUOperand::CreateToken(this, "[", LBraceLoc)); + Operands.push_back(AMDGPUOperand::CreateToken(this, "]", + getTok().getLoc())); + } + + Parser.Lex(); // eat the ']' return MatchOperand_Success; } - return MatchOperand_NoMatch; + return parseRegOrImm(Operands); } StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { @@ -3308,8 +4367,13 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, Name = parseMnemonicSuffix(Name); Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc)); + bool IsMIMG = Name.startswith("image_"); + while (!getLexer().is(AsmToken::EndOfStatement)) { - OperandMatchResultTy Res = parseOperand(Operands, Name); + OperandMode Mode = OperandMode_Default; + if (IsMIMG && isGFX10() && Operands.size() == 2) + Mode = OperandMode_NSA; + OperandMatchResultTy Res = parseOperand(Operands, Name, Mode); // Eat the comma or space if there is one. if (getLexer().is(AsmToken::Comma)) @@ -3318,12 +4382,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, switch (Res) { case MatchOperand_Success: break; case MatchOperand_ParseFail: + // FIXME: use real operand location rather than the current location. Error(getLexer().getLoc(), "failed parsing operand."); while (!getLexer().is(AsmToken::EndOfStatement)) { Parser.Lex(); } return true; case MatchOperand_NoMatch: + // FIXME: use real operand location rather than the current location. Error(getLexer().getLoc(), "not a valid operand."); while (!getLexer().is(AsmToken::EndOfStatement)) { Parser.Lex(); @@ -3340,46 +4406,19 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, //===----------------------------------------------------------------------===// OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { - switch(getLexer().getKind()) { - default: return MatchOperand_NoMatch; - case AsmToken::Identifier: { - StringRef Name = Parser.getTok().getString(); - if (!Name.equals(Prefix)) { - return MatchOperand_NoMatch; - } - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Colon)) - return MatchOperand_ParseFail; +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &IntVal) { - Parser.Lex(); - - bool IsMinus = false; - if (getLexer().getKind() == AsmToken::Minus) { - Parser.Lex(); - IsMinus = true; - } - - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; - - if (getParser().parseAbsoluteExpression(Int)) - return MatchOperand_ParseFail; + if (!trySkipId(Prefix, AsmToken::Colon)) + return MatchOperand_NoMatch; - if (IsMinus) - Int = -Int; - break; - } - } - return MatchOperand_Success; + return parseExpr(IntVal) ? 
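// For illustration only (not part of the patch): under OperandMode_NSA a
// GFX10 MIMG address may be a bracketed list of non-sequential VGPRs
// (register choices and modifiers here are arbitrary):
//
//   image_sample v[0:3], [v4, v6, v9], s[0:7], s[12:15] dmask:0xf
//
// The loop above re-inserts the '[' and ']' tokens only when more than one
// register was parsed, so a single bracketed register degenerates to the
// ordinary operand form.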
MatchOperand_Success : MatchOperand_ParseFail; } OperandMatchResultTy AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy, bool (*ConvertResult)(int64_t&)) { - SMLoc S = Parser.getTok().getLoc(); + SMLoc S = getLoc(); int64_t Value = 0; OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value); @@ -3387,59 +4426,55 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, return Res; if (ConvertResult && !ConvertResult(Value)) { - return MatchOperand_ParseFail; + Error(S, "invalid " + StringRef(Prefix) + " value."); } Operands.push_back(AMDGPUOperand::CreateImm(this, Value, S, ImmTy)); return MatchOperand_Success; } -OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix( - const char *Prefix, - OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy, - bool (*ConvertResult)(int64_t&)) { - StringRef Name = Parser.getTok().getString(); - if (!Name.equals(Prefix)) +OperandMatchResultTy +AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy, + bool (*ConvertResult)(int64_t&)) { + SMLoc S = getLoc(); + if (!trySkipId(Prefix, AsmToken::Colon)) return MatchOperand_NoMatch; - Parser.Lex(); - if (getLexer().isNot(AsmToken::Colon)) + if (!skipToken(AsmToken::LBrac, "expected a left square bracket")) return MatchOperand_ParseFail; - Parser.Lex(); - if (getLexer().isNot(AsmToken::LBrac)) - return MatchOperand_ParseFail; - Parser.Lex(); - unsigned Val = 0; - SMLoc S = Parser.getTok().getLoc(); + const unsigned MaxSize = 4; // FIXME: How to verify the number of elements matches the number of src // operands? - for (int I = 0; I < 4; ++I) { - if (I != 0) { - if (getLexer().is(AsmToken::RBrac)) - break; + for (int I = 0; ; ++I) { + int64_t Op; + SMLoc Loc = getLoc(); + if (!parseExpr(Op)) + return MatchOperand_ParseFail; - if (getLexer().isNot(AsmToken::Comma)) - return MatchOperand_ParseFail; - Parser.Lex(); + if (Op != 0 && Op != 1) { + Error(Loc, "invalid " + StringRef(Prefix) + " value."); + return MatchOperand_ParseFail; } - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; + Val |= (Op << I); - int64_t Op; - if (getParser().parseAbsoluteExpression(Op)) + if (trySkipToken(AsmToken::RBrac)) + break; + + if (I + 1 == MaxSize) { + Error(getLoc(), "expected a closing square bracket"); return MatchOperand_ParseFail; + } - if (Op != 0 && Op != 1) + if (!skipToken(AsmToken::Comma, "expected a comma")) return MatchOperand_ParseFail; - Val |= (Op << I); } - Parser.Lex(); Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy)); return MatchOperand_Success; } @@ -3459,7 +4494,7 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, if (Tok == Name) { if (Tok == "r128" && isGFX9()) Error(S, "r128 modifier is not supported on this GPU"); - if (Tok == "a16" && !isGFX9()) + if (Tok == "a16" && !isGFX9() && !isGFX10()) Error(S, "a16 modifier is not supported on this GPU"); Bit = 1; Parser.Lex(); @@ -3476,6 +4511,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, } } + if (!isGFX10() && ImmTy == AMDGPUOperand::ImmTyDLC) + return MatchOperand_ParseFail; + Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy)); return MatchOperand_Success; } @@ -3616,7 +4654,8 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, } AMDGPUOperand::ImmTy OffsetType = - (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si || + (Inst.getOpcode() == 
AMDGPU::DS_SWIZZLE_B32_gfx10 || + Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7 || Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle : AMDGPUOperand::ImmTyOffset; @@ -3716,20 +4755,18 @@ encodeCnt( } bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { - StringRef CntName = Parser.getTok().getString(); - int64_t CntVal; - Parser.Lex(); - if (getLexer().isNot(AsmToken::LParen)) - return true; + SMLoc CntLoc = getLoc(); + StringRef CntName = getTokenStr(); - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return true; + if (!skipToken(AsmToken::Identifier, "expected a counter name") || + !skipToken(AsmToken::LParen, "expected a left parenthesis")) + return false; - SMLoc ValLoc = Parser.getTok().getLoc(); - if (getParser().parseAbsoluteExpression(CntVal)) - return true; + int64_t CntVal; + SMLoc ValLoc = getLoc(); + if (!parseExpr(CntVal)) + return false; AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); @@ -3742,265 +4779,240 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt); } else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") { Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt); + } else { + Error(CntLoc, "invalid counter name " + CntName); + return false; } if (Failed) { Error(ValLoc, "too large value for " + CntName); - return true; + return false; } - if (getLexer().isNot(AsmToken::RParen)) { - return true; - } + if (!skipToken(AsmToken::RParen, "expected a closing parenthesis")) + return false; - Parser.Lex(); - if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) { - const AsmToken NextToken = getLexer().peekTok(); - if (NextToken.is(AsmToken::Identifier)) { - Parser.Lex(); + if (trySkipToken(AsmToken::Amp) || trySkipToken(AsmToken::Comma)) { + if (isToken(AsmToken::EndOfStatement)) { + Error(getLoc(), "expected a counter name"); + return false; } } - return false; + return true; } OperandMatchResultTy AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); int64_t Waitcnt = getWaitcntBitMask(ISA); - SMLoc S = Parser.getTok().getLoc(); + SMLoc S = getLoc(); - switch(getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: - // The operand can be an integer value. - if (getParser().parseAbsoluteExpression(Waitcnt)) - return MatchOperand_ParseFail; - break; - - case AsmToken::Identifier: - do { - if (parseCnt(Waitcnt)) - return MatchOperand_ParseFail; - } while(getLexer().isNot(AsmToken::EndOfStatement)); - break; + // If parse failed, do not return error code + // to avoid excessive error messages. 
+ if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) { + while (parseCnt(Waitcnt) && !isToken(AsmToken::EndOfStatement)); + } else { + parseExpr(Waitcnt); } + Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S)); return MatchOperand_Success; } -bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, - int64_t &Width) { - using namespace llvm::AMDGPU::Hwreg; +bool +AMDGPUOperand::isSWaitCnt() const { + return isImm(); +} - if (Parser.getTok().getString() != "hwreg") - return true; - Parser.Lex(); +//===----------------------------------------------------------------------===// +// hwreg +//===----------------------------------------------------------------------===// - if (getLexer().isNot(AsmToken::LParen)) - return true; - Parser.Lex(); +bool +AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg, + int64_t &Offset, + int64_t &Width) { + using namespace llvm::AMDGPU::Hwreg; - if (getLexer().is(AsmToken::Identifier)) { + // The register may be specified by name or using a numeric code + if (isToken(AsmToken::Identifier) && + (HwReg.Id = getHwregId(getTokenStr())) >= 0) { HwReg.IsSymbolic = true; - HwReg.Id = ID_UNKNOWN_; - const StringRef tok = Parser.getTok().getString(); - int Last = ID_SYMBOLIC_LAST_; - if (isSI() || isCI() || isVI()) - Last = ID_SYMBOLIC_FIRST_GFX9_; - for (int i = ID_SYMBOLIC_FIRST_; i < Last; ++i) { - if (tok == IdSymbolic[i]) { - HwReg.Id = i; - break; - } - } - Parser.Lex(); - } else { - HwReg.IsSymbolic = false; - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(HwReg.Id)) - return true; - } - - if (getLexer().is(AsmToken::RParen)) { - Parser.Lex(); + lex(); // skip message name + } else if (!parseExpr(HwReg.Id)) { return false; } - // optional params - if (getLexer().isNot(AsmToken::Comma)) + if (trySkipToken(AsmToken::RParen)) return true; - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(Offset)) - return true; - - if (getLexer().isNot(AsmToken::Comma)) - return true; - Parser.Lex(); + // parse optional params + return + skipToken(AsmToken::Comma, "expected a comma or a closing parenthesis") && + parseExpr(Offset) && + skipToken(AsmToken::Comma, "expected a comma") && + parseExpr(Width) && + skipToken(AsmToken::RParen, "expected a closing parenthesis"); +} - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(Width)) - return true; +bool +AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg, + const int64_t Offset, + const int64_t Width, + const SMLoc Loc) { - if (getLexer().isNot(AsmToken::RParen)) - return true; - Parser.Lex(); + using namespace llvm::AMDGPU::Hwreg; - return false; + if (HwReg.IsSymbolic && !isValidHwreg(HwReg.Id, getSTI())) { + Error(Loc, "specified hardware register is not supported on this GPU"); + return false; + } else if (!isValidHwreg(HwReg.Id)) { + Error(Loc, "invalid code of hardware register: only 6-bit values are legal"); + return false; + } else if (!isValidHwregOffset(Offset)) { + Error(Loc, "invalid bit offset: only 5-bit values are legal"); + return false; + } else if (!isValidHwregWidth(Width)) { + Error(Loc, "invalid bitfield width: only values from 1 to 32 are legal"); + return false; + } + return true; } -OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { +OperandMatchResultTy +AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { using namespace llvm::AMDGPU::Hwreg; - int64_t 
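// For illustration only (not part of the patch): a sketch of the simm16
// operand that parseCnt helps to assemble, assuming the pre-GFX10 field
// layout (vmcnt in bits [3:0], with its high bits in [15:14] on GFX9;
// expcnt in [6:4]; lgkmcnt in [11:8]).

#include <cstdint>

static uint16_t encodeWaitcntGfx9(unsigned Vm, unsigned Exp, unsigned Lgkm) {
  return uint16_t((Vm & 0xf) |           // vmcnt low bits
                  ((Vm & 0x30) << 10) |  // vmcnt high bits -> [15:14]
                  ((Exp & 0x7) << 4) |   // expcnt
                  ((Lgkm & 0xf) << 8));  // lgkmcnt
}

// "s_waitcnt vmcnt(0) lgkmcnt(0)" then encodes with expcnt left at its
// "no wait" maximum.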
Imm16Val = 0; - SMLoc S = Parser.getTok().getLoc(); - - switch(getLexer().getKind()) { - default: return MatchOperand_NoMatch; - case AsmToken::Integer: - // The operand can be an integer value. - if (getParser().parseAbsoluteExpression(Imm16Val)) - return MatchOperand_NoMatch; - if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) { - Error(S, "invalid immediate: only 16-bit values are legal"); - // Do not return error code, but create an imm operand anyway and proceed - // to the next operand, if any. That avoids unnecessary error messages. - } - break; - - case AsmToken::Identifier: { - OperandInfoTy HwReg(ID_UNKNOWN_); - int64_t Offset = OFFSET_DEFAULT_; - int64_t Width = WIDTH_M1_DEFAULT_ + 1; - if (parseHwregConstruct(HwReg, Offset, Width)) - return MatchOperand_ParseFail; - if (HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id)) { - if (HwReg.IsSymbolic) - Error(S, "invalid symbolic name of hardware register"); - else - Error(S, "invalid code of hardware register: only 6-bit values are legal"); - } - if (Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset)) - Error(S, "invalid bit offset: only 5-bit values are legal"); - if ((Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1)) - Error(S, "invalid bitfield width: only values from 1 to 32 are legal"); - Imm16Val = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_); - } - break; + int64_t ImmVal = 0; + SMLoc Loc = getLoc(); + + // If parse failed, do not return error code + // to avoid excessive error messages. + if (trySkipId("hwreg", AsmToken::LParen)) { + OperandInfoTy HwReg(ID_UNKNOWN_); + int64_t Offset = OFFSET_DEFAULT_; + int64_t Width = WIDTH_DEFAULT_; + if (parseHwregBody(HwReg, Offset, Width) && + validateHwreg(HwReg, Offset, Width, Loc)) { + ImmVal = encodeHwreg(HwReg.Id, Offset, Width); + } + } else if (parseExpr(ImmVal)) { + if (ImmVal < 0 || !isUInt<16>(ImmVal)) + Error(Loc, "invalid immediate: only 16-bit values are legal"); } - Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTyHwreg)); - return MatchOperand_Success; -} -bool AMDGPUOperand::isSWaitCnt() const { - return isImm(); + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg)); + return MatchOperand_Success; } bool AMDGPUOperand::isHwreg() const { return isImmTy(ImmTyHwreg); } -bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) { +//===----------------------------------------------------------------------===// +// sendmsg +//===----------------------------------------------------------------------===// + +bool +AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg, + OperandInfoTy &Op, + OperandInfoTy &Stream) { using namespace llvm::AMDGPU::SendMsg; - if (Parser.getTok().getString() != "sendmsg") - return true; - Parser.Lex(); + if (isToken(AsmToken::Identifier) && (Msg.Id = getMsgId(getTokenStr())) >= 0) { + Msg.IsSymbolic = true; + lex(); // skip message name + } else if (!parseExpr(Msg.Id)) { + return false; + } - if (getLexer().isNot(AsmToken::LParen)) - return true; - Parser.Lex(); + if (trySkipToken(AsmToken::Comma)) { + Op.IsDefined = true; + if (isToken(AsmToken::Identifier) && + (Op.Id = getMsgOpId(Msg.Id, getTokenStr())) >= 0) { + lex(); // skip operation name + } else if (!parseExpr(Op.Id)) { + return false; + } - if (getLexer().is(AsmToken::Identifier)) { - Msg.IsSymbolic = true; - Msg.Id = ID_UNKNOWN_; - const std::string tok = Parser.getTok().getString(); - for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) { - switch(i) { - default: continue; // Omit 
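// For illustration only (not part of the patch): the hwreg() immediate
// layout spelled out by the removed code above; the shift values 0, 6 and
// 11 for ID_SHIFT_, OFFSET_SHIFT_ and WIDTH_M1_SHIFT_ are assumed from
// SIDefines.h, and the new encodeHwreg is assumed to pack the same fields.

#include <cstdint>

static uint16_t encodeHwregImm(unsigned Id, unsigned Offset, unsigned Width) {
  return uint16_t((Id & 0x3f) |                  // 6-bit register id
                  ((Offset & 0x1f) << 6) |       // 5-bit bit offset
                  (((Width - 1) & 0x1f) << 11)); // width stored as width-1
}

// hwreg(id) defaults to offset 0 and width 32, so it encodes as
// encodeHwregImm(id, 0, 32).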
gaps. - case ID_INTERRUPT: case ID_GS: case ID_GS_DONE: case ID_SYSMSG: break; - } - if (tok == IdSymbolic[i]) { - Msg.Id = i; - break; - } + if (trySkipToken(AsmToken::Comma)) { + Stream.IsDefined = true; + if (!parseExpr(Stream.Id)) + return false; } - Parser.Lex(); - } else { - Msg.IsSymbolic = false; - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(Msg.Id)) - return true; - if (getLexer().is(AsmToken::Integer)) - if (getParser().parseAbsoluteExpression(Msg.Id)) - Msg.Id = ID_UNKNOWN_; } - if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest. - return false; - if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) { - if (getLexer().isNot(AsmToken::RParen)) - return true; - Parser.Lex(); + return skipToken(AsmToken::RParen, "expected a closing parenthesis"); +} + +bool +AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, + const OperandInfoTy &Op, + const OperandInfoTy &Stream, + const SMLoc S) { + using namespace llvm::AMDGPU::SendMsg; + + // Validation strictness depends on whether message is specified + // in a symbolic or in a numeric form. In the latter case + // only encoding possibility is checked. + bool Strict = Msg.IsSymbolic; + + if (!isValidMsgId(Msg.Id, getSTI(), Strict)) { + Error(S, "invalid message id"); + return false; + } else if (Strict && (msgRequiresOp(Msg.Id) != Op.IsDefined)) { + Error(S, Op.IsDefined ? + "message does not support operations" : + "missing message operation"); + return false; + } else if (!isValidMsgOp(Msg.Id, Op.Id, Strict)) { + Error(S, "invalid operation id"); + return false; + } else if (Strict && !msgSupportsStream(Msg.Id, Op.Id) && Stream.IsDefined) { + Error(S, "message operation does not support streams"); + return false; + } else if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, Strict)) { + Error(S, "invalid message stream id"); + return false; + } + return true; +} - if (getLexer().isNot(AsmToken::Comma)) - return true; - Parser.Lex(); +OperandMatchResultTy +AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { + using namespace llvm::AMDGPU::SendMsg; - assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG); - Operation.Id = ID_UNKNOWN_; - if (getLexer().is(AsmToken::Identifier)) { - Operation.IsSymbolic = true; - const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic; - const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_; - const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_; - const StringRef Tok = Parser.getTok().getString(); - for (int i = F; i < L; ++i) { - if (Tok == S[i]) { - Operation.Id = i; - break; - } + int64_t ImmVal = 0; + SMLoc Loc = getLoc(); + + // If parse failed, do not return error code + // to avoid excessive error messages. + if (trySkipId("sendmsg", AsmToken::LParen)) { + OperandInfoTy Msg(ID_UNKNOWN_); + OperandInfoTy Op(OP_NONE_); + OperandInfoTy Stream(STREAM_ID_NONE_); + if (parseSendMsgBody(Msg, Op, Stream) && + validateSendMsg(Msg, Op, Stream, Loc)) { + ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id); + } + } else if (parseExpr(ImmVal)) { + if (ImmVal < 0 || !isUInt<16>(ImmVal)) + Error(Loc, "invalid immediate: only 16-bit values are legal"); } - if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) { - // Stream id is optional. 
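// For illustration only (not part of the patch): the sendmsg() immediate
// layout used by the removed encoding above; the shift values 0, 4 and 8
// for ID_SHIFT_, OP_SHIFT_ and STREAM_ID_SHIFT_ are assumed from
// SIDefines.h, and the new encodeMsg is assumed to pack the same fields.

#include <cstdint>

static uint16_t encodeSendMsgImm(unsigned MsgId, unsigned OpId,
                                 unsigned StreamId) {
  return uint16_t((MsgId & 0xf) |           // message id, bits [3:0]
                  ((OpId & 0x7) << 4) |     // operation id, bits [6:4]
                  ((StreamId & 0x3) << 8)); // stream id, bits [9:8]
}

// Example: sendmsg(MSG_GS, GS_OP_EMIT, 0) with MsgId = 2 and OpId = 2
// yields 0x22.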
- if (getLexer().is(AsmToken::RParen)) { - Parser.Lex(); - return false; - } - - if (getLexer().isNot(AsmToken::Comma)) - return true; - Parser.Lex(); - - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(StreamId)) - return true; - } + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTySendMsg)); + return MatchOperand_Success; +} - if (getLexer().isNot(AsmToken::RParen)) - return true; - Parser.Lex(); - return false; +bool AMDGPUOperand::isSendMsg() const { + return isImmTy(ImmTySendMsg); } +//===----------------------------------------------------------------------===// +// v_interp +//===----------------------------------------------------------------------===// + OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) { if (getLexer().getKind() != AsmToken::Identifier) return MatchOperand_NoMatch; @@ -4062,6 +5074,10 @@ OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) { return MatchOperand_Success; } +//===----------------------------------------------------------------------===// +// exp +//===----------------------------------------------------------------------===// + void AMDGPUAsmParser::errorExpTgt() { Error(Parser.getTok().getLoc(), "invalid exp target"); } @@ -4094,13 +5110,18 @@ OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str, if (Str.getAsInteger(10, Val)) return MatchOperand_ParseFail; - if (Val > 3) + if (Val > 4 || (Val == 4 && !isGFX10())) errorExpTgt(); Val += 12; return MatchOperand_Success; } + if (isGFX10() && Str == "prim") { + Val = 20; + return MatchOperand_Success; + } + if (Str.startswith("param")) { Str = Str.drop_front(5); if (Str.getAsInteger(10, Val)) @@ -4118,121 +5139,62 @@ OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str, if (Str.getAsInteger(10, Val)) return MatchOperand_ParseFail; - errorExpTgt(); - return MatchOperand_Success; - } - - return MatchOperand_NoMatch; -} - -OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) { - uint8_t Val; - StringRef Str = Parser.getTok().getString(); - - auto Res = parseExpTgtImpl(Str, Val); - if (Res != MatchOperand_Success) - return Res; - - SMLoc S = Parser.getTok().getLoc(); - Parser.Lex(); - - Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, - AMDGPUOperand::ImmTyExpTgt)); - return MatchOperand_Success; -} - -OperandMatchResultTy -AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { - using namespace llvm::AMDGPU::SendMsg; - - int64_t Imm16Val = 0; - SMLoc S = Parser.getTok().getLoc(); - - switch(getLexer().getKind()) { - default: - return MatchOperand_NoMatch; - case AsmToken::Integer: - // The operand can be an integer value. - if (getParser().parseAbsoluteExpression(Imm16Val)) - return MatchOperand_NoMatch; - if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) { - Error(S, "invalid immediate: only 16-bit values are legal"); - // Do not return error code, but create an imm operand anyway and proceed - // to the next operand, if any. That avoids unneccessary error messages. - } - break; - case AsmToken::Identifier: { - OperandInfoTy Msg(ID_UNKNOWN_); - OperandInfoTy Operation(OP_UNKNOWN_); - int64_t StreamId = STREAM_ID_DEFAULT_; - if (parseSendMsgConstruct(Msg, Operation, StreamId)) - return MatchOperand_ParseFail; - do { - // Validate and encode message ID. - if (! 
((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE) - || Msg.Id == ID_SYSMSG)) { - if (Msg.IsSymbolic) - Error(S, "invalid/unsupported symbolic name of message"); - else - Error(S, "invalid/unsupported code of message"); - break; - } - Imm16Val = (Msg.Id << ID_SHIFT_); - // Validate and encode operation ID. - if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) { - if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) { - if (Operation.IsSymbolic) - Error(S, "invalid symbolic name of GS_OP"); - else - Error(S, "invalid code of GS_OP: only 2-bit values are legal"); - break; - } - if (Operation.Id == OP_GS_NOP - && Msg.Id != ID_GS_DONE) { - Error(S, "invalid GS_OP: NOP is for GS_DONE only"); - break; - } - Imm16Val |= (Operation.Id << OP_SHIFT_); - } - if (Msg.Id == ID_SYSMSG) { - if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) { - if (Operation.IsSymbolic) - Error(S, "invalid/unsupported symbolic name of SYSMSG_OP"); - else - Error(S, "invalid/unsupported code of SYSMSG_OP"); - break; - } - Imm16Val |= (Operation.Id << OP_SHIFT_); - } - // Validate and encode stream ID. - if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) { - if (! (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) { - Error(S, "invalid stream id: only 2-bit values are legal"); - break; - } - Imm16Val |= (StreamId << STREAM_ID_SHIFT_); - } - } while (false); - } - break; + errorExpTgt(); + return MatchOperand_Success; } - Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTySendMsg)); - return MatchOperand_Success; + + return MatchOperand_NoMatch; } -bool AMDGPUOperand::isSendMsg() const { - return isImmTy(ImmTySendMsg); +OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) { + uint8_t Val; + StringRef Str = Parser.getTok().getString(); + + auto Res = parseExpTgtImpl(Str, Val); + if (Res != MatchOperand_Success) + return Res; + + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); + + Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, + AMDGPUOperand::ImmTyExpTgt)); + return MatchOperand_Success; } //===----------------------------------------------------------------------===// // parser helpers //===----------------------------------------------------------------------===// +bool +AMDGPUAsmParser::isId(const AsmToken &Token, const StringRef Id) const { + return Token.is(AsmToken::Identifier) && Token.getString() == Id; +} + +bool +AMDGPUAsmParser::isId(const StringRef Id) const { + return isId(getToken(), Id); +} + +bool +AMDGPUAsmParser::isToken(const AsmToken::TokenKind Kind) const { + return getTokenKind() == Kind; +} + bool AMDGPUAsmParser::trySkipId(const StringRef Id) { - if (getLexer().getKind() == AsmToken::Identifier && - Parser.getTok().getString() == Id) { - Parser.Lex(); + if (isId(Id)) { + lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::trySkipId(const StringRef Id, const AsmToken::TokenKind Kind) { + if (isId(Id) && peekToken().is(Kind)) { + lex(); + lex(); return true; } return false; @@ -4240,8 +5202,8 @@ AMDGPUAsmParser::trySkipId(const StringRef Id) { bool AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) { - if (getLexer().getKind() == Kind) { - Parser.Lex(); + if (isToken(Kind)) { + lex(); return true; } return false; @@ -4251,7 +5213,7 @@ bool AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg) { if (!trySkipToken(Kind)) { - Error(Parser.getTok().getLoc(), ErrMsg); + Error(getLoc(), ErrMsg); return false; } return 
true; @@ -4264,17 +5226,54 @@ AMDGPUAsmParser::parseExpr(int64_t &Imm) { bool AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { - SMLoc S = Parser.getTok().getLoc(); - if (getLexer().getKind() == AsmToken::String) { - Val = Parser.getTok().getStringContents(); - Parser.Lex(); + if (isToken(AsmToken::String)) { + Val = getToken().getStringContents(); + lex(); return true; } else { - Error(S, ErrMsg); + Error(getLoc(), ErrMsg); return false; } } +AsmToken +AMDGPUAsmParser::getToken() const { + return Parser.getTok(); +} + +AsmToken +AMDGPUAsmParser::peekToken() { + return getLexer().peekTok(); +} + +void +AMDGPUAsmParser::peekTokens(MutableArrayRef Tokens) { + auto TokCount = getLexer().peekTokens(Tokens); + + for (auto Idx = TokCount; Idx < Tokens.size(); ++Idx) + Tokens[Idx] = AsmToken(AsmToken::Error, ""); +} + +AsmToken::TokenKind +AMDGPUAsmParser::getTokenKind() const { + return getLexer().getKind(); +} + +SMLoc +AMDGPUAsmParser::getLoc() const { + return getToken().getLoc(); +} + +StringRef +AMDGPUAsmParser::getTokenStr() const { + return getToken().getString(); +} + +void +AMDGPUAsmParser::lex() { + Parser.Lex(); +} + //===----------------------------------------------------------------------===// // swizzle //===----------------------------------------------------------------------===// @@ -4322,8 +5321,8 @@ AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) { if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX, "expected a 2-bit lane id")) { Imm = QUAD_PERM_ENC; - for (auto i = 0; i < LANE_NUM; ++i) { - Imm |= Lane[i] << (LANE_SHIFT * i); + for (unsigned I = 0; I < LANE_NUM; ++I) { + Imm |= Lane[I] << (LANE_SHIFT * I); } return true; } @@ -4518,6 +5517,88 @@ AMDGPUOperand::isSwizzle() const { return isImmTy(ImmTySwizzle); } +//===----------------------------------------------------------------------===// +// VGPR Index Mode +//===----------------------------------------------------------------------===// + +int64_t AMDGPUAsmParser::parseGPRIdxMacro() { + + using namespace llvm::AMDGPU::VGPRIndexMode; + + if (trySkipToken(AsmToken::RParen)) { + return OFF; + } + + int64_t Imm = 0; + + while (true) { + unsigned Mode = 0; + SMLoc S = Parser.getTok().getLoc(); + + for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) { + if (trySkipId(IdSymbolic[ModeId])) { + Mode = 1 << ModeId; + break; + } + } + + if (Mode == 0) { + Error(S, (Imm == 0)? + "expected a VGPR index mode or a closing parenthesis" : + "expected a VGPR index mode"); + break; + } + + if (Imm & Mode) { + Error(S, "duplicate VGPR index mode"); + break; + } + Imm |= Mode; + + if (trySkipToken(AsmToken::RParen)) + break; + if (!skipToken(AsmToken::Comma, + "expected a comma or a closing parenthesis")) + break; + } + + return Imm; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) { + + int64_t Imm = 0; + SMLoc S = Parser.getTok().getLoc(); + + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == "gpr_idx" && + getLexer().peekTok().is(AsmToken::LParen)) { + + Parser.Lex(); + Parser.Lex(); + + // If parse failed, trigger an error but do not return error code + // to avoid excessive error messages. 
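+    // Accepted forms (illustrative): the symbolic macro, e.g.
+    //   s_set_gpr_idx_on s0, gpr_idx(SRC0,SRC1,SRC2,DST)
+    // or a raw 4-bit mode immediate, e.g. "s_set_gpr_idx_on s0, 15".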
+ Imm = parseGPRIdxMacro(); + + } else { + if (getParser().parseAbsoluteExpression(Imm)) + return MatchOperand_NoMatch; + if (Imm < 0 || !isUInt<4>(Imm)) { + Error(S, "invalid immediate: only 4-bit values are legal"); + } + } + + Operands.push_back( + AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyGprIdxMode)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isGPRIdxMode() const { + return isImmTy(ImmTyGprIdxMode); +} + //===----------------------------------------------------------------------===// // sopp branch targets //===----------------------------------------------------------------------===// @@ -4545,10 +5626,23 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { } } +//===----------------------------------------------------------------------===// +// Boolean holding registers +//===----------------------------------------------------------------------===// + +OperandMatchResultTy +AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) { + return parseReg(Operands); +} + //===----------------------------------------------------------------------===// // mubuf //===----------------------------------------------------------------------===// +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDLC() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDLC); +} + AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC); } @@ -4566,13 +5660,19 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, bool HasLdsModifier = false; OptionalImmIndexMap OptionalIdx; assert(IsAtomicReturn ? IsAtomic : true); + unsigned FirstOperandIdx = 1; - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); // Add the register arguments if (Op.isReg()) { Op.addRegOperands(Inst, 1); + // Insert a tied src for atomic return dst. + // This cannot be postponed as subsequent calls to + // addImmOperands rely on correct number of MC operands. + if (IsAtomicReturn && i == FirstOperandIdx) + Op.addRegOperands(Inst, 1); continue; } @@ -4582,7 +5682,7 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, continue; } - HasLdsModifier = Op.isLDS(); + HasLdsModifier |= Op.isLDS(); // Handle tokens like 'offen' which are sometimes hard-coded into the // asm string. There are no MCInst operands for these. @@ -4610,12 +5710,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, } } - // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns. - if (IsAtomicReturn) { - MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning. - Inst.insert(I, *I); - } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); if (!IsAtomic) { // glc is hard-coded. 
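    // (For atomics, glc comes from the pseudo's glc_value instead:
    // 0 for the no-return and 1 for the return variants.)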
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); @@ -4625,6 +5719,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, if (!IsLdsOpcode) { // tfe is not legal with lds opcodes addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } + + if (isGFX10()) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); } void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { @@ -4662,6 +5759,9 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + + if (isGFX10()) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); } //===----------------------------------------------------------------------===// @@ -4692,19 +5792,26 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, Op.addRegOperands(Inst, 1); } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; - } else { + } else if (!Op.isToken()) { llvm_unreachable("unexpected operand type"); } } + bool IsGFX10 = isGFX10(); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); + if (IsGFX10) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); + if (IsGFX10) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + if (!IsGFX10) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16); } @@ -4742,11 +5849,7 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const { +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFlatOffset() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } @@ -4801,7 +5904,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr}, - {"dfmt", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, + {"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr}, + {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, @@ -4816,9 +5920,11 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, 
nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, + {"dim", AMDGPUOperand::ImmTyDim, false, nullptr}, {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl}, + {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr}, {"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr}, {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr}, {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, @@ -4828,7 +5934,10 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr}, {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr}, {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr}, - {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr} + {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr}, + {"blgp", AMDGPUOperand::ImmTyBLGP, false, nullptr}, + {"cbsz", AMDGPUOperand::ImmTyCBSZ, false, nullptr}, + {"abid", AMDGPUOperand::ImmTyABID, false, nullptr} }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { @@ -4884,7 +5993,9 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) Op.Type == AMDGPUOperand::ImmTyNegHi) { res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); - } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) { + } else if (Op.Type == AMDGPUOperand::ImmTyDim) { + res = parseDim(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT && !isGFX10()) { res = parseDfmtNfmt(Operands); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); @@ -4964,7 +6075,7 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) } else if (Op.isInterpSlot() || Op.isInterpAttr() || Op.isAttrChan()) { - Inst.addOperand(MCOperand::createImm(Op.Imm.Val)); + Inst.addOperand(MCOperand::createImm(Op.getImm())); } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; } else { @@ -5029,14 +6140,17 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); } - // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906): + // Special case v_mac_{f16, f32} and v_fmac_{f16, f32} (gfx906/gfx10+): // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers // should be 0. 
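  // e.g. in "v_mac_f32 v0, v1, v2" the tied src2 (v2) takes no abs/neg
  // modifiers, so a zero src2_modifiers operand is inserted below.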
- if (Opc == AMDGPU::V_MAC_F32_e64_si || + if (Opc == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 || + Opc == AMDGPU::V_MAC_F32_e64_gfx10 || Opc == AMDGPU::V_MAC_F32_e64_vi || Opc == AMDGPU::V_MAC_F16_e64_vi || - Opc == AMDGPU::V_FMAC_F32_e64_vi) { + Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || + Opc == AMDGPU::V_FMAC_F32_e64_vi || + Opc == AMDGPU::V_FMAC_F16_e64_gfx10) { auto it = Inst.begin(); std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 @@ -5137,6 +6251,10 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, // dpp //===----------------------------------------------------------------------===// +bool AMDGPUOperand::isDPP8() const { + return isImmTy(ImmTyDPP8); +} + bool AMDGPUOperand::isDPPCtrl() const { using namespace AMDGPU::DPP; @@ -5154,13 +6272,27 @@ bool AMDGPUOperand::isDPPCtrl() const { (Imm == DppCtrl::ROW_MIRROR) || (Imm == DppCtrl::ROW_HALF_MIRROR) || (Imm == DppCtrl::BCAST15) || - (Imm == DppCtrl::BCAST31); + (Imm == DppCtrl::BCAST31) || + (Imm >= DppCtrl::ROW_SHARE_FIRST && Imm <= DppCtrl::ROW_SHARE_LAST) || + (Imm >= DppCtrl::ROW_XMASK_FIRST && Imm <= DppCtrl::ROW_XMASK_LAST); } return false; } -bool AMDGPUOperand::isGPRIdxMode() const { - return isImm() && isUInt<4>(getImm()); +//===----------------------------------------------------------------------===// +// mAI +//===----------------------------------------------------------------------===// + +bool AMDGPUOperand::isBLGP() const { + return isImm() && getImmTy() == ImmTyBLGP && isUInt<3>(getImm()); +} + +bool AMDGPUOperand::isCBSZ() const { + return isImm() && getImmTy() == ImmTyCBSZ && isUInt<3>(getImm()); +} + +bool AMDGPUOperand::isABID() const { + return isImm() && getImmTy() == ImmTyABID && isUInt<4>(getImm()); } bool AMDGPUOperand::isS16Imm() const { @@ -5171,6 +6303,108 @@ bool AMDGPUOperand::isU16Imm() const { return isImm() && isUInt<16>(getImm()); } +OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) { + if (!isGFX10()) + return MatchOperand_NoMatch; + + SMLoc S = Parser.getTok().getLoc(); + + if (getLexer().isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + if (getLexer().getTok().getString() != "dim") + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + + // We want to allow "dim:1D" etc., but the initial 1 is tokenized as an + // integer. 
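+  // e.g. "dim:2D" lexes as the integer 2 followed by the identifier D,
+  // while "dim:SQ_RSRC_IMG_2D" arrives as a single identifier.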
+ std::string Token; + if (getLexer().is(AsmToken::Integer)) { + SMLoc Loc = getLexer().getTok().getEndLoc(); + Token = getLexer().getTok().getString(); + Parser.Lex(); + if (getLexer().getTok().getLoc() != Loc) + return MatchOperand_ParseFail; + } + if (getLexer().isNot(AsmToken::Identifier)) + return MatchOperand_ParseFail; + Token += getLexer().getTok().getString(); + + StringRef DimId = Token; + if (DimId.startswith("SQ_RSRC_IMG_")) + DimId = DimId.substr(12); + + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId); + if (!DimInfo) + return MatchOperand_ParseFail; + + Parser.Lex(); + + Operands.push_back(AMDGPUOperand::CreateImm(this, DimInfo->Encoding, S, + AMDGPUOperand::ImmTyDim)); + return MatchOperand_Success; +} + +OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + StringRef Prefix; + + if (getLexer().getKind() == AsmToken::Identifier) { + Prefix = Parser.getTok().getString(); + } else { + return MatchOperand_NoMatch; + } + + if (Prefix != "dpp8") + return parseDPPCtrl(Operands); + if (!isGFX10()) + return MatchOperand_NoMatch; + + // dpp8:[%d,%d,%d,%d,%d,%d,%d,%d] + + int64_t Sels[8]; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(Sels[0])) + return MatchOperand_ParseFail; + if (0 > Sels[0] || 7 < Sels[0]) + return MatchOperand_ParseFail; + + for (size_t i = 1; i < 8; ++i) { + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(Sels[i])) + return MatchOperand_ParseFail; + if (0 > Sels[i] || 7 < Sels[i]) + return MatchOperand_ParseFail; + } + + if (getLexer().isNot(AsmToken::RBrac)) + return MatchOperand_ParseFail; + Parser.Lex(); + + unsigned DPP8 = 0; + for (size_t i = 0; i < 8; ++i) + DPP8 |= (Sels[i] << (i * 3)); + + Operands.push_back(AMDGPUOperand::CreateImm(this, DPP8, S, AMDGPUOperand::ImmTyDPP8)); + return MatchOperand_Success; +} + OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { using namespace AMDGPU::DPP; @@ -5201,10 +6435,21 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { && Prefix != "wave_rol" && Prefix != "wave_shr" && Prefix != "wave_ror" - && Prefix != "row_bcast") { + && Prefix != "row_bcast" + && Prefix != "row_share" + && Prefix != "row_xmask") { return MatchOperand_NoMatch; } + if (!isGFX10() && (Prefix == "row_share" || Prefix == "row_xmask")) + return MatchOperand_NoMatch; + + if (!isVI() && !isGFX9() && + (Prefix == "wave_shl" || Prefix == "wave_shr" || + Prefix == "wave_rol" || Prefix == "wave_ror" || + Prefix == "row_bcast")) + return MatchOperand_NoMatch; + Parser.Lex(); if (getLexer().isNot(AsmToken::Colon)) return MatchOperand_ParseFail; @@ -5262,6 +6507,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { } else { return MatchOperand_ParseFail; } + } else if (Prefix == "row_share" && 0 <= Int && Int <= 15) { + Int |= DppCtrl::ROW_SHARE_FIRST; + } else if (Prefix == "row_xmask" && 0 <= Int && Int <= 15) { + Int |= DppCtrl::ROW_XMASK_FIRST; } else { return MatchOperand_ParseFail; } @@ -5276,6 +6525,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const { return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultEndpgmImmOperands() const { + return 
AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyEndpgm); +} + AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const { return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask); } @@ -5284,7 +6537,11 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl); } -void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi); +} + +void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) { OptionalImmIndexMap OptionalIdx; unsigned I = 1; @@ -5293,6 +6550,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } + int Fi = 0; for (unsigned E = Operands.size(); I != E; ++I) { auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO); @@ -5303,25 +6561,49 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { } AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + if (Op.isReg() && validateVccOperand(Op.getReg())) { // VOP2b (v_add_u32, v_sub_u32 ...) dpp use "vcc" token. // Skip it. continue; - } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegWithFPInputModsOperands(Inst, 2); - } else if (Op.isDPPCtrl()) { - Op.addImmOperands(Inst, 1); - } else if (Op.isImm()) { - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = I; + } + + if (IsDPP8) { + if (Op.isDPP8()) { + Op.addImmOperands(Inst, 1); + } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegWithFPInputModsOperands(Inst, 2); + } else if (Op.isFI()) { + Fi = Op.getImm(); + } else if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + } else { + llvm_unreachable("Invalid operand type"); + } } else { - llvm_unreachable("Invalid operand type"); + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegWithFPInputModsOperands(Inst, 2); + } else if (Op.isDPPCtrl()) { + Op.addImmOperands(Inst, 1); + } else if (Op.isImm()) { + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("Invalid operand type"); + } } } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); + if (IsDPP8) { + using namespace llvm::AMDGPU::DPP; + Inst.addOperand(MCOperand::createImm(Fi? 
DPP8_FI_1 : DPP8_FI_0)); + } else { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::fi) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi); + } + } } //===----------------------------------------------------------------------===// @@ -5422,7 +6704,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + if (skipVcc && !skippedVcc && Op.isReg() && + (Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) { // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. @@ -5448,7 +6731,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, skippedVcc = false; } - if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && + if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx10 && + Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { // v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments switch (BasicInstType) { @@ -5474,7 +6758,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, break; case SIInstrFlags::VOPC: - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::clamp) != -1) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; @@ -5495,6 +6780,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } } +//===----------------------------------------------------------------------===// +// mAI +//===----------------------------------------------------------------------===// + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBLGP() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyBLGP); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCBSZ() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCBSZ); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultABID() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyABID); +} + /// Force static initialization. 
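/// (invoked through the target registry when the AMDGPU target is linked
/// in, e.g. by tools calling InitializeAllAsmParsers)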
extern "C" void LLVMInitializeAMDGPUAsmParser() { RegisterMCAsmParser A(getTheAMDGPUTarget()); @@ -5552,3 +6853,28 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Match_InvalidOperand; } } + +//===----------------------------------------------------------------------===// +// endpgm +//===----------------------------------------------------------------------===// + +OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int64_t Imm = 0; + + if (!parseExpr(Imm)) { + // The operand is optional, if not present default to 0 + Imm = 0; + } + + if (!isUInt<16>(Imm)) { + Error(S, "expected a 16-bit value"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyEndpgm)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); } diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 51c2abeac2ff..62a19d848af2 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -1,37 +1,22 @@ //===-- BUFInstructions.td - Buffer Instruction Defintions ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// def MUBUFAddr32 : ComplexPattern; -def MUBUFAddr64 : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; def MUBUFAddr64Atomic : ComplexPattern; def MUBUFScratchOffen : ComplexPattern; def MUBUFScratchOffset : ComplexPattern; -def MUBUFOffset : ComplexPattern; +def MUBUFOffset : ComplexPattern; def MUBUFOffsetNoGLC : ComplexPattern; def MUBUFOffsetAtomic : ComplexPattern; -class MubufLoad : PatFrag < - (ops node:$ptr), (op node:$ptr), [{ - auto const AS = cast(N)->getAddressSpace(); - return AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; -}]>; - -def mubuf_load : MubufLoad ; -def mubuf_az_extloadi8 : MubufLoad ; -def mubuf_sextloadi8 : MubufLoad ; -def mubuf_az_extloadi16 : MubufLoad ; -def mubuf_sextloadi16 : MubufLoad ; -def mubuf_load_atomic : MubufLoad ; - def BUFAddrKind { int Offset = 0; int OffEn = 1; @@ -97,7 +82,9 @@ class MTBUF_Pseudo has_vdata = 1; bits<1> has_vaddr = 1; bits<1> has_glc = 1; + bits<1> has_dlc = 1; bits<1> glc_value = 0; // the value for glc if no such operand + bits<1> dlc_value = 0; // the value for dlc if no such operand bits<1> has_srsrc = 1; bits<1> has_soffset = 1; bits<1> has_offset = 1; @@ -120,6 +107,7 @@ class MTBUF_Real : bits<12> offset; bits<1> glc; + bits<1> dlc; bits<7> format; bits<8> vaddr; bits<8> vdata; @@ -138,17 +126,17 @@ class getMTBUFInsDA vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc) ); dag 
InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe), + SLC:$slc, TFE:$tfe, DLC:$dlc), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe) + SLC:$slc, TFE:$tfe, DLC:$dlc) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -199,7 +187,7 @@ class MTBUF_Load_Pseudo .ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; @@ -214,13 +202,13 @@ multiclass MTBUF_Pseudo_Loads, + i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>, MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Load_Pseudo , + i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>, MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Load_Pseudo ; @@ -245,7 +233,7 @@ class MTBUF_Store_Pseudo .ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; @@ -260,13 +248,13 @@ multiclass MTBUF_Pseudo_Stores, + i1:$slc, i1:$tfe, i1:$dlc))]>, MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Store_Pseudo , + i1:$slc, i1:$tfe, i1:$dlc))]>, MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Store_Pseudo ; @@ -324,7 +312,9 @@ class MUBUF_Pseudo has_vdata = 1; bits<1> has_vaddr = 1; bits<1> has_glc = 1; + bits<1> has_dlc = 1; bits<1> glc_value = 0; // the value for glc if no such operand + bits<1> dlc_value = 0; // the value for dlc if no such operand bits<1> has_srsrc = 1; bits<1> has_soffset = 1; bits<1> has_offset = 1; @@ -333,7 +323,7 @@ class MUBUF_Pseudo dwords = 0; } -class MUBUF_Real op, MUBUF_Pseudo ps> : +class MUBUF_Real : InstSI { let isPseudo = 0; @@ -348,6 +338,7 @@ class MUBUF_Real op, MUBUF_Pseudo ps> : bits<12> offset; bits<1> glc; + bits<1> dlc; bits<8> vaddr; bits<8> vdata; bits<7> srsrc; @@ -358,7 +349,7 @@ class MUBUF_Real op, MUBUF_Pseudo ps> : // For cache invalidation instructions. 
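// (e.g. buffer_wbinvl1, or buffer_gl0_inv/buffer_gl1_inv on GFX10; these
// encode no vdata, vaddr or offset operands.)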
-class MUBUF_Invalidate : +class MUBUF_Invalidate : MUBUF_Pseudo { let AsmMatchConverter = ""; @@ -373,7 +364,9 @@ class MUBUF_Invalidate : let has_vdata = 0; let has_vaddr = 0; let has_glc = 0; + let has_dlc = 0; let glc_value = 0; + let dlc_value = 0; let has_srsrc = 0; let has_soffset = 0; let has_offset = 0; @@ -400,7 +393,7 @@ class getMUBUFInsDA vdataList, ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins), (ins TFE:$tfe)) + !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc)) ); } @@ -460,7 +453,7 @@ class MUBUF_Load_Pseudo .ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe"), + !if(isLds, " lds", "$tfe") # "$dlc", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -477,6 +470,24 @@ class MUBUF_Load_Pseudo .ret; } +class MUBUF_Offset_Load_Pat : Pat < + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) +>; + +class MUBUF_Addr64_Load_Pat : Pat < + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) +>; + +multiclass MUBUF_Pseudo_Load_Pats { + def : MUBUF_Offset_Load_Pat(BaseInst#"_OFFSET"), load_vt, ld>; + def : MUBUF_Addr64_Load_Pat(BaseInst#"_ADDR64"), load_vt, ld>; +} + + // FIXME: tfe can't be an operand because it requires a separate // opcode because it needs an N+1 register class dest register. multiclass MUBUF_Pseudo_Loads { - def _OFFSET : MUBUF_Load_Pseudo , + def _OFFSET : MUBUF_Load_Pseudo , MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>; - def _ADDR64 : MUBUF_Load_Pseudo , + def _ADDR64 : MUBUF_Load_Pseudo , MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>; def _OFFEN : MUBUF_Load_Pseudo ; @@ -531,7 +532,7 @@ class MUBUF_Store_Pseudo .ret, - " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe", + " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; @@ -547,12 +548,12 @@ multiclass MUBUF_Pseudo_Stores, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>, MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo , + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>, MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo ; @@ -638,6 +639,7 @@ class MUBUF_Atomic_Pseudo.ret, 0> { let PseudoInstr = opName # "_" # getAddrName.ret; let glc_value = 0; + let dlc_value = 0; let AsmMatchConverter = "cvtMubufAtomic"; } @@ -673,6 +676,7 @@ class MUBUF_AtomicRet_Pseudo.ret, 1> { let PseudoInstr = opName # "_rtn_" # getAddrName.ret; let glc_value = 1; + let dlc_value = 0; let Constraints = "$vdata = $vdata_in"; let DisableEncoding = "$vdata_in"; let AsmMatchConverter = "cvtMubufAtomicReturn"; @@ -681,34 +685,53 @@ class MUBUF_AtomicRet_Pseudo { + SDPatternOperator atomic, + bit isFP = getIsFP.ret> { + let FPAtomic = isFP in def _OFFSET : MUBUF_AtomicNoRet_Pseudo , MUBUFAddr64Table <0, NAME>; + + let FPAtomic = isFP in def _ADDR64 : MUBUF_AtomicNoRet_Pseudo , MUBUFAddr64Table <1, NAME>; + + let FPAtomic = isFP in def _OFFEN : MUBUF_AtomicNoRet_Pseudo ; + + let FPAtomic = isFP in + def _IDXEN : MUBUF_AtomicNoRet_Pseudo ; + + let FPAtomic = isFP in def _BOTHEN : MUBUF_AtomicNoRet_Pseudo ; } multiclass MUBUF_Pseudo_Atomics_RTN { + 
SDPatternOperator atomic, + bit isFP = getIsFP.ret> { + let FPAtomic = isFP in def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo , MUBUFAddr64Table <0, NAME # "_RTN">; + let FPAtomic = isFP in def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo , MUBUFAddr64Table <1, NAME # "_RTN">; + let FPAtomic = isFP in def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo ; + + let FPAtomic = isFP in def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo ; + + let FPAtomic = isFP in def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo ; } @@ -804,34 +827,45 @@ let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { } // End HasPackedD16VMem. defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds < - "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 + "buffer_load_ubyte", VGPR_32, i32 >; defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds < - "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8 + "buffer_load_sbyte", VGPR_32, i32 >; defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds < - "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16 + "buffer_load_ushort", VGPR_32, i32 >; defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds < - "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16 + "buffer_load_sshort", VGPR_32, i32 >; defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds < - "buffer_load_dword", VGPR_32, i32, mubuf_load + "buffer_load_dword", VGPR_32, i32 >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load + "buffer_load_dwordx2", VReg_64, v2i32 >; defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", VReg_96, untyped, mubuf_load + "buffer_load_dwordx3", VReg_96, v3i32 >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load + "buffer_load_dwordx4", VReg_128, v4i32 >; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>; + // This is not described in AMD documentation, // but 'lds' versions of these opcodes are available // in at least GFX8+ chips. See Bug 37653. 
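// (These use the regular mnemonics with an "lds" suffix appended, e.g.
// "buffer_load_dwordx2 ... lds".)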
-let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8GFX9 in { defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads < "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1 >; @@ -856,7 +890,7 @@ defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores < "buffer_store_dwordx2", VReg_64, v2i32, store_global >; defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx3", VReg_96, untyped, store_global + "buffer_store_dwordx3", VReg_96, v3i32, store_global >; defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < "buffer_store_dwordx4", VReg_128, v4i32, store_global @@ -940,11 +974,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global >; -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8GFX9 in { def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; } -let SubtargetPredicate = isSI in { // isn't on CI & VI +let SubtargetPredicate = isGFX6 in { // isn't on CI & VI /* defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">; defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">; @@ -1006,17 +1040,28 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; +let SubtargetPredicate = HasAtomicFaddInsts in { + +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < + "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global +>; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < + "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global +>; + +} // End SubtargetPredicate = HasAtomicFaddInsts + //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>; defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>; defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { @@ -1041,19 +1086,21 @@ let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; } // End HasPackedD16VMem. -let SubtargetPredicate = isCIVI in { +let SubtargetPredicate = isGFX7Plus in { //===----------------------------------------------------------------------===// // Instruction definitions for CI and newer. 
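// (CI = GFX7; the GFX10-only buffer_gl0_inv/buffer_gl1_inv follow below.)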
//===----------------------------------------------------------------------===// -// Remaining instructions: -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol>; -} // End let SubtargetPredicate = isCIVI +} // End let SubtargetPredicate = isGFX7Plus + +let SubtargetPredicate = isGFX10Plus in { + def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">; + def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">; +} // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// // MUBUF Patterns @@ -1067,6 +1114,10 @@ def extract_slc : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8); }]>; +def extract_dlc : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1077,21 +1128,21 @@ multiclass MUBUF_LoadIntrinsicPat(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, imm:$cachepolicy, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, imm:$cachepolicy, imm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1100,7 +1151,7 @@ multiclass MUBUF_LoadIntrinsicPat(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; } @@ -1108,6 +1159,8 @@ defm : MUBUF_LoadIntrinsicPat defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; @@ -1131,8 +1184,14 @@ defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; multiclass MUBUF_StoreIntrinsicPat { @@ -1140,21 +1199,23 @@ multiclass MUBUF_StoreIntrinsicPat(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, imm:$cachepolicy, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc 
$cachepolicy), (extract_slc $cachepolicy), 0) + (as_i16imm $offset), (extract_glc $cachepolicy), + (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, imm:$cachepolicy, imm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (as_i16imm $offset), (extract_glc $cachepolicy), + (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1163,8 +1224,8 @@ multiclass MUBUF_StoreIntrinsicPat(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy), + (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; } @@ -1172,6 +1233,8 @@ defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; @@ -1195,42 +1258,47 @@ defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; //===----------------------------------------------------------------------===// // buffer_atomic patterns //===----------------------------------------------------------------------===// -multiclass BufferAtomicPatterns { +multiclass BufferAtomicPatterns { def : GCNPat< - (name i32:$vdata_in, v4i32:$rsrc, 0, + (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + imm:$cachepolicy, 0)), (!cast(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + imm:$cachepolicy, imm)), (!cast(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name i32:$vdata_in, v4i32:$rsrc, 0, + (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + imm:$cachepolicy, 0)), (!cast(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + imm:$cachepolicy, imm)), (!cast(opcode # _BOTHEN_RTN) $vdata_in, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), @@ -1238,16 +1306,66 @@ multiclass BufferAtomicPatterns { >; } -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; -defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : 
BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; + +multiclass BufferAtomicPatterns_NO_RTN { + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, 0, + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), + (!cast(opcode # _OFFSET) $vdata_in, $rsrc, $soffset, + (as_i16imm $offset), (extract_slc $cachepolicy)) + >; + + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), + (!cast(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (extract_slc $cachepolicy)) + >; + + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, 0, + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), + (!cast(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (extract_slc $cachepolicy)) + >; + + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), + (!cast(opcode # _BOTHEN) + $vdata_in, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) + >; +} + +defm : BufferAtomicPatterns_NO_RTN; +defm : BufferAtomicPatterns_NO_RTN; def : GCNPat< (SIbuffer_atomic_cmpswap @@ -1298,12 +1416,11 @@ def : GCNPat< sub0) >; - class MUBUFLoad_PatternADDR64 : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) >; multiclass MUBUFLoad_Atomic_Pattern ; def : GCNPat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) >; } -let SubtargetPredicate = isSICI in { +let SubtargetPredicate = isGFX6GFX7 in { def : MUBUFLoad_PatternADDR64 ; -def : MUBUFLoad_PatternADDR64 ; +def : MUBUFLoad_PatternADDR64 ; +def : MUBUFLoad_PatternADDR64 ; def : MUBUFLoad_PatternADDR64 ; -def : MUBUFLoad_PatternADDR64 ; +def : MUBUFLoad_PatternADDR64 ; +def : MUBUFLoad_PatternADDR64 ; -defm : MUBUFLoad_Atomic_Pattern ; -defm : MUBUFLoad_Atomic_Pattern ; -} // End SubtargetPredicate = isSICI +defm : MUBUFLoad_Atomic_Pattern ; +defm : MUBUFLoad_Atomic_Pattern ; +} // End SubtargetPredicate = isGFX6GFX7 multiclass MUBUFLoad_Pattern { def : GCNPat < (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), - (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), + (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) >; } let OtherPredicates = [Has16BitInsts] in { defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; } // End OtherPredicates = [Has16BitInsts] @@ -1357,111 
+1478,79 @@ multiclass MUBUFScratchLoadPat ; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0) >; } // XXX - Is it possible to have a complex pattern in a PatFrag? -multiclass MUBUFScratchLoadPat_Hi16 { - def : GCNPat < - (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset)))), - (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo)) - >; - - def : GCNPat < - (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset)))))), - (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo)) - >; - - - def : GCNPat < - (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))), - (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo)) - >; - - def : GCNPat < - (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))), - (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo)) - >; -} - -multiclass MUBUFScratchLoadPat_Lo16 { - def : GCNPat < - (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (vt (Hi16Elt vt:$hi))), - (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi)) - >; - + ValueType vt, PatFrag ld_frag> { def : GCNPat < - (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))))), - (f16 (Hi16Elt f16:$hi))), - (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) >; def : GCNPat < - (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (vt (Hi16Elt vt:$hi))), - (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi)) - >; - - def : GCNPat < - (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))), - (f16 (Hi16Elt f16:$hi))), - (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) >; } defm : MUBUFScratchLoadPat ; -defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; -defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; -defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; let OtherPredicates = [D16PreservesUnusedBits] in { -defm : MUBUFScratchLoadPat_Hi16; -defm : MUBUFScratchLoadPat_Hi16; -defm : MUBUFScratchLoadPat_Hi16; +defm : MUBUFScratchLoadPat_D16; +defm : MUBUFScratchLoadPat_D16; +defm : MUBUFScratchLoadPat_D16; +defm : MUBUFScratchLoadPat_D16; +defm : MUBUFScratchLoadPat_D16; +defm : MUBUFScratchLoadPat_D16; -defm : MUBUFScratchLoadPat_Lo16; -defm : MUBUFScratchLoadPat_Lo16; -defm : MUBUFScratchLoadPat_Lo16; +defm : MUBUFScratchLoadPat_D16; +defm : MUBUFScratchLoadPat_D16; +defm : MUBUFScratchLoadPat_D16; +defm : MUBUFScratchLoadPat_D16; +defm : 
MUBUFScratchLoadPat_D16; +defm : MUBUFScratchLoadPat_D16; } + multiclass MUBUFStore_Atomic_Pattern { // Store follows atomic op convention so address is first def : GCNPat < (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0) + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0) >; def : GCNPat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0) + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) >; } -let SubtargetPredicate = isSICI in { +let SubtargetPredicate = isGFX6GFX7 in { defm : MUBUFStore_Atomic_Pattern ; defm : MUBUFStore_Atomic_Pattern ; -} // End Predicates = isSICI +} // End Predicates = isGFX6GFX7 multiclass MUBUFStore_Pattern ; } @@ -1479,17 +1568,18 @@ defm : MUBUFStore_Pattern ; multiclass MUBUFScratchStorePat { + ValueType vt, PatFrag st, + RegisterClass rc = VGPR_32> { def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), - (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0) >; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), - (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0) + (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0) >; } @@ -1498,8 +1588,9 @@ defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; -defm : MUBUFScratchStorePat ; -defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; let OtherPredicates = [D16PreservesUnusedBits] in { @@ -1526,7 +1617,7 @@ multiclass MTBUF_LoadIntrinsicPat(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1534,7 +1625,7 @@ multiclass MTBUF_LoadIntrinsicPat(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1542,7 +1633,7 @@ multiclass MTBUF_LoadIntrinsicPat(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1552,15 +1643,17 @@ multiclass MTBUF_LoadIntrinsicPat; } defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { @@ -1582,7 +1675,7 @@ multiclass MTBUF_StoreIntrinsicPat(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1590,7 +1683,7 @@ multiclass MTBUF_StoreIntrinsicPat(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm
$offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1598,7 +1691,7 @@ multiclass MTBUF_StoreIntrinsicPat(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1608,17 +1701,17 @@ multiclass MTBUF_StoreIntrinsicPat; } defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; -defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; -defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { @@ -1634,28 +1727,22 @@ let SubtargetPredicate = HasPackedD16VMem in { } // End HasPackedD16VMem. //===----------------------------------------------------------------------===// -// Target instructions, move to the appropriate target TD file +// Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// SI +// Base ENC_MUBUF for GFX6, GFX7, GFX10. //===----------------------------------------------------------------------===// -class MUBUF_Real_si op, MUBUF_Pseudo ps> : - MUBUF_Real, - Enc64, - SIMCInstr { - let AssemblerPredicate=isSICI; - let DecoderNamespace="SICI"; - +class Base_MUBUF_Real_gfx6_gfx7_gfx10 op, MUBUF_Pseudo ps, int ef> : + MUBUF_Real, Enc64, SIMCInstr { let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); - let Inst{15} = ps.addr64; let Inst{16} = !if(ps.lds, 1, 0); let Inst{24-18} = op; - let Inst{31-26} = 0x38; //encoding + let Inst{31-26} = 0x38; let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); @@ -1664,125 +1751,250 @@ class MUBUF_Real_si op, MUBUF_Pseudo ps> : let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -multiclass MUBUF_Real_AllAddr_si op> { - def _OFFSET_si : MUBUF_Real_si (NAME#"_OFFSET")>; - def _ADDR64_si : MUBUF_Real_si (NAME#"_ADDR64")>; - def _OFFEN_si : MUBUF_Real_si (NAME#"_OFFEN")>; - def _IDXEN_si : MUBUF_Real_si (NAME#"_IDXEN")>; - def _BOTHEN_si : MUBUF_Real_si (NAME#"_BOTHEN")>; -} - -multiclass MUBUF_Real_AllAddr_Lds_si op> { - - def _OFFSET_si : MUBUF_Real_si (NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_si">; - def _ADDR64_si : MUBUF_Real_si (NAME#"_ADDR64")>, - MUBUFLdsTable<0, NAME # "_ADDR64_si">; - def _OFFEN_si : MUBUF_Real_si (NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_si">; - def _IDXEN_si : MUBUF_Real_si (NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # "_IDXEN_si">; - def _BOTHEN_si : MUBUF_Real_si (NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_si">; - - def _LDS_OFFSET_si : MUBUF_Real_si (NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_si">; - def _LDS_ADDR64_si : MUBUF_Real_si (NAME#"_LDS_ADDR64")>, - MUBUFLdsTable<1, NAME # "_ADDR64_si">; - def _LDS_OFFEN_si : MUBUF_Real_si (NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_si">; - def _LDS_IDXEN_si : MUBUF_Real_si (NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_si">; - 
def _LDS_BOTHEN_si : MUBUF_Real_si (NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_si">; -} - -multiclass MUBUF_Real_Atomic_si op> : MUBUF_Real_AllAddr_si { - def _OFFSET_RTN_si : MUBUF_Real_si (NAME#"_OFFSET_RTN")>; - def _ADDR64_RTN_si : MUBUF_Real_si (NAME#"_ADDR64_RTN")>; - def _OFFEN_RTN_si : MUBUF_Real_si (NAME#"_OFFEN_RTN")>; - def _IDXEN_RTN_si : MUBUF_Real_si (NAME#"_IDXEN_RTN")>; - def _BOTHEN_RTN_si : MUBUF_Real_si (NAME#"_BOTHEN_RTN")>; -} - -defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_si <0x00>; -defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>; -defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>; -defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>; -defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>; -defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>; -defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>; -defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_si <0x08>; -defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_si <0x09>; -defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_si <0x0a>; -defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_si <0x0b>; -defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_si <0x0c>; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>; -defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>; -defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_si <0x18>; -defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_si <0x1a>; -defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_si <0x1c>; -defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_si <0x1d>; -defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_si <0x1e>; -defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_si <0x1f>; - -defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_si <0x30>; -defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_si <0x31>; -defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_si <0x32>; -defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_si <0x33>; -//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomic_si <0x34>; // isn't on CI & VI -defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_si <0x35>; -defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_si <0x36>; -defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_si <0x37>; -defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_si <0x38>; -defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_si <0x39>; -defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_si <0x3a>; -defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_si <0x3b>; -defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_si <0x3c>; -defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_si <0x3d>; - -//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_si <0x3e>; // isn't on VI -//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_si <0x3f>; // isn't on VI -//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_si <0x40>; // isn't on VI -defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_si <0x50>; -defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_si <0x51>; -defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_si <0x52>; -defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_si <0x53>; -//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomic_si <0x54>; // isn't on CI & VI -defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_si <0x55>; -defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_si <0x56>; -defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_si <0x57>; -defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_si <0x58>; -defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_si <0x59>; -defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_si <0x5a>; -defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>; -defm 
BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>; -defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>; -// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI. -//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e">; // isn't on VI -//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI -//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI - -def BUFFER_WBINVL1_SC_si : MUBUF_Real_si <0x70, BUFFER_WBINVL1_SC>; -def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>; - -class MTBUF_Real_si op, MTBUF_Pseudo ps> : - MTBUF_Real, - Enc64, - SIMCInstr { - let AssemblerPredicate=isSICI; - let DecoderNamespace="SICI"; +class MUBUF_Real_gfx10 op, MUBUF_Pseudo ps> : + Base_MUBUF_Real_gfx6_gfx7_gfx10 { + let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value); + let Inst{25} = op{7}; +} + +class MUBUF_Real_gfx6_gfx7 op, MUBUF_Pseudo ps> : + Base_MUBUF_Real_gfx6_gfx7_gfx10 { + let Inst{15} = ps.addr64; +} +//===----------------------------------------------------------------------===// +// MUBUF - GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass MUBUF_Real_gfx10_with_name op, string opName, + string asmName> { + def _gfx10 : MUBUF_Real_gfx10(opName)> { + MUBUF_Pseudo ps = !cast(opName); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass MUBUF_Real_AllAddr_gfx10 op> { + def _BOTHEN_gfx10 : + MUBUF_Real_gfx10(NAME#"_BOTHEN")>; + def _IDXEN_gfx10 : + MUBUF_Real_gfx10(NAME#"_IDXEN")>; + def _OFFEN_gfx10 : + MUBUF_Real_gfx10(NAME#"_OFFEN")>; + def _OFFSET_gfx10 : + MUBUF_Real_gfx10(NAME#"_OFFSET")>; + } + multiclass MUBUF_Real_AllAddr_Lds_gfx10 op> { + def _OFFSET_gfx10 : MUBUF_Real_gfx10(NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_gfx10">; + def _OFFEN_gfx10 : MUBUF_Real_gfx10(NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_gfx10">; + def _IDXEN_gfx10 : MUBUF_Real_gfx10(NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_gfx10">; + def _BOTHEN_gfx10 : MUBUF_Real_gfx10(NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # "_BOTHEN_gfx10">; + + def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_gfx10">; + def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_gfx10">; + def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_gfx10">; + def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">; + } + multiclass MUBUF_Real_Atomics_gfx10 op> : + MUBUF_Real_AllAddr_gfx10 { + def _BOTHEN_RTN_gfx10 : + MUBUF_Real_gfx10(NAME#"_BOTHEN_RTN")>; + def _IDXEN_RTN_gfx10 : + MUBUF_Real_gfx10(NAME#"_IDXEN_RTN")>; + def _OFFEN_RTN_gfx10 : + MUBUF_Real_gfx10(NAME#"_OFFEN_RTN")>; + def _OFFSET_RTN_gfx10 : + MUBUF_Real_gfx10(NAME#"_OFFSET_RTN")>; + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; +defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x01b>; +defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx10<0x020>; +defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x021>; +defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx10<0x022>; +defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x023>; +defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx10<0x024>; +defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x025>; +// FIXME-GFX10: 
Add following instructions: +//defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x026>; +//defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x027>; +defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx10<0x080>; +defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx10<0x081>; +defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx10<0x082>; +defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx10<0x083>; +defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx10<0x084>; +defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx10<0x085>; +defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx10<0x086>; +defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx10<0x087>; + +def BUFFER_GL0_INV_gfx10 : + MUBUF_Real_gfx10<0x071, BUFFER_GL0_INV>; +def BUFFER_GL1_INV_gfx10 : + MUBUF_Real_gfx10<0x072, BUFFER_GL1_INV>; + +//===----------------------------------------------------------------------===// +// MUBUF - GFX6, GFX7, GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX6, DecoderNamespace = "GFX6" in { + multiclass MUBUF_Real_gfx6 op> { + def _gfx6 : MUBUF_Real_gfx6_gfx7(NAME)>; + } +} // End AssemblerPredicate = isGFX6, DecoderNamespace = "GFX6" + +let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { + multiclass MUBUF_Real_gfx7 op> { + def _gfx7 : MUBUF_Real_gfx6_gfx7(NAME)>; + } +} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" + +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass MUBUF_Real_AllAddr_gfx6_gfx7 op> { + def _ADDR64_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_ADDR64")>; + def _BOTHEN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_BOTHEN")>; + def _IDXEN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_IDXEN")>; + def _OFFEN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_OFFEN")>; + def _OFFSET_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_OFFSET")>; + } + multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7 op> { + def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_gfx6_gfx7">; + def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_ADDR64")>, + MUBUFLdsTable<0, NAME # "_ADDR64_gfx6_gfx7">; + def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_gfx6_gfx7">; + def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_gfx6_gfx7">; + def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # "_BOTHEN_gfx6_gfx7">; + + def _LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_gfx6_gfx7">; + def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_ADDR64")>, + MUBUFLdsTable<1, NAME # "_ADDR64_gfx6_gfx7">; + def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_gfx6_gfx7">; + def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_gfx6_gfx7">; + def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7(NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">; + } + multiclass MUBUF_Real_Atomics_gfx6_gfx7 op> : + MUBUF_Real_AllAddr_gfx6_gfx7 { + def _ADDR64_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_ADDR64_RTN")>; + def _BOTHEN_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_BOTHEN_RTN")>; + def _IDXEN_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_IDXEN_RTN")>; + def _OFFEN_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_OFFEN_RTN")>; + def 
_OFFSET_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7(NAME#"_OFFSET_RTN")>; + } +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass MUBUF_Real_AllAddr_gfx6_gfx7_gfx10 op> : + MUBUF_Real_AllAddr_gfx6_gfx7, MUBUF_Real_AllAddr_gfx10; + +multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10 op> : + MUBUF_Real_AllAddr_Lds_gfx6_gfx7, MUBUF_Real_AllAddr_Lds_gfx10; + +multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10 op> : + MUBUF_Real_Atomics_gfx6_gfx7, MUBUF_Real_Atomics_gfx10; + +// FIXME-GFX6: Following instructions are available only on GFX6. +//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomics_gfx6 <0x034>; +//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomics_gfx6 <0x054>; + +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x000>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x001>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x002>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x003>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x004>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x005>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x006>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>; +defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x008>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x009>; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00a>; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00b>; +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00c>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00d>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00e>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00f>; +defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x018>; +defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01a>; +defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01c>; +defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01d>; +defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01e>; +defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01f>; + +defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x030>; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x031>; +defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x032>; +defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x033>; +defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x035>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x036>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x037>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x038>; +defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x039>; +defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03a>; +defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03b>; +defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03c>; +defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03d>; +// FIXME-GFX6-GFX7-GFX10: Add following instructions: +//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>; +//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>; +//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>; +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x050>; 
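A quick cross-check of the encodings defined above, before the remaining BUFFER_ATOMIC_* opcodes continue below: the shared ENC_MUBUF bit layout can be exercised outside TableGen. Below is a minimal standalone C++ sketch of the packing spelled out by Base_MUBUF_Real_gfx6_gfx7_gfx10 and its two subclasses (GFX6/GFX7 put addr64 in bit 15; GFX10 reuses bit 15 for dlc and carries op{7} in bit 25). MUBUFFields and packMUBUF are illustrative names, not LLVM API, and the slc/tfe bits elided by the hunk above are elided here as well.

#include <cassert>
#include <cstdint>

// Illustrative only: mirrors the Inst{} assignments of
// Base_MUBUF_Real_gfx6_gfx7_gfx10 and its two subclasses above.
struct MUBUFFields {
  uint16_t offset;           // 12-bit unsigned immediate offset
  bool offen, idxen, glc, lds;
  bool addr64;               // GFX6/GFX7 only (bit 15)
  bool dlc;                  // GFX10 only (bit 15)
  uint8_t op;                // opcode; GFX10 carries op{7} in Inst{25}
  uint8_t vaddr, vdata, soffset;
  uint8_t srsrc;             // encoded as srsrc{6-2} (128-bit SGPR quads)
};

uint64_t packMUBUF(const MUBUFFields &F, bool IsGFX10) {
  uint64_t Inst = 0;
  Inst |= uint64_t(F.offset & 0xfff);                 // Inst{11-0}
  Inst |= uint64_t(F.offen) << 12;                    // Inst{12}
  Inst |= uint64_t(F.idxen) << 13;                    // Inst{13}
  Inst |= uint64_t(F.glc) << 14;                      // Inst{14}
  Inst |= uint64_t(IsGFX10 ? F.dlc : F.addr64) << 15; // Inst{15}
  Inst |= uint64_t(F.lds) << 16;                      // Inst{16}
  Inst |= uint64_t(F.op & 0x7f) << 18;                // Inst{24-18}
  if (IsGFX10)
    Inst |= uint64_t((F.op >> 7) & 1) << 25;          // Inst{25} = op{7}
  Inst |= uint64_t(0x38) << 26;                       // Inst{31-26}: ENC_MUBUF
  Inst |= uint64_t(F.vaddr) << 32;                    // Inst{39-32}
  Inst |= uint64_t(F.vdata) << 40;                    // Inst{47-40}
  Inst |= uint64_t((F.srsrc >> 2) & 0x1f) << 48;      // Inst{52-48} = srsrc{6-2}
  Inst |= uint64_t(F.soffset) << 56;                  // Inst{63-56}
  return Inst;
}

int main() {
  MUBUFFields F = {};
  F.offset = 16;
  F.glc = true;
  F.op = 0x30; // e.g. BUFFER_ATOMIC_SWAP
  assert(((packMUBUF(F, false) >> 26) & 0x3f) == 0x38 && "ENC_MUBUF tag");
  return 0;
}

Keeping bit 15 as the only generation-dependent field (plus the extra opcode bit on GFX10) is what lets one base class serve all three generations; only the two thin subclasses differ.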
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x051>; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x052>; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x053>; +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x055>; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x056>; +defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x057>; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x058>; +defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x059>; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05a>; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05b>; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>; +// FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7. +// FIXME-GFX6-GFX7-GFX10: Add following instructions: +//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; +//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; +//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; + +defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>; +defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>; +def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>; + +//===----------------------------------------------------------------------===// +// Base ENC_MTBUF for GFX6, GFX7, GFX10. +//===----------------------------------------------------------------------===// + +class Base_MTBUF_Real_gfx6_gfx7_gfx10 op, MTBUF_Pseudo ps, int ef> : + MTBUF_Real, Enc64, SIMCInstr { let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); - let Inst{15} = ps.addr64; let Inst{18-16} = op; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); @@ -1792,47 +2004,87 @@ class MTBUF_Real_si op, MTBUF_Pseudo ps> : let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -multiclass MTBUF_Real_AllAddr_si op> { - def _OFFSET_si : MTBUF_Real_si (NAME#"_OFFSET")>; - def _ADDR64_si : MTBUF_Real_si (NAME#"_ADDR64")>; - def _OFFEN_si : MTBUF_Real_si (NAME#"_OFFEN")>; - def _IDXEN_si : MTBUF_Real_si (NAME#"_IDXEN")>; - def _BOTHEN_si : MTBUF_Real_si (NAME#"_BOTHEN")>; -} +//===----------------------------------------------------------------------===// +// MTBUF - GFX10. 
+//===----------------------------------------------------------------------===// + +class MTBUF_Real_gfx10 op, MTBUF_Pseudo ps> : + Base_MTBUF_Real_gfx6_gfx7_gfx10 { + let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value); + let Inst{25-19} = format; + let Inst{53} = op{3}; +} + +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass MTBUF_Real_AllAddr_gfx10 op> { + def _BOTHEN_gfx10 : + MTBUF_Real_gfx10(NAME#"_BOTHEN")>; + def _IDXEN_gfx10 : + MTBUF_Real_gfx10(NAME#"_IDXEN")>; + def _OFFEN_gfx10 : + MTBUF_Real_gfx10(NAME#"_OFFEN")>; + def _OFFSET_gfx10 : + MTBUF_Real_gfx10(NAME#"_OFFSET")>; + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; +defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x008>; +defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x009>; +defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx10<0x00a>; +defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx10<0x00b>; +defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x00c>; +defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x00d>; +defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx10<0x00e>; +defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx10<0x00f>; //===----------------------------------------------------------------------===// -// CI -// MTBUF - GFX6, GFX7. +// MTBUF - GFX6, GFX7, GFX10. 
//===----------------------------------------------------------------------===// -class MUBUF_Real_ci op, MUBUF_Pseudo ps> : - MUBUF_Real_si { - let AssemblerPredicate=isCIOnly; - let DecoderNamespace="CI"; +class MTBUF_Real_gfx6_gfx7 op, MTBUF_Pseudo ps> : + Base_MTBUF_Real_gfx6_gfx7_gfx10 { + let Inst{15} = ps.addr64; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; } -def BUFFER_WBINVL1_VOL_ci : MUBUF_Real_ci <0x70, BUFFER_WBINVL1_VOL>; +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass MTBUF_Real_AllAddr_gfx6_gfx7 op> { + def _ADDR64_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7(NAME#"_ADDR64")>; + def _BOTHEN_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7(NAME#"_BOTHEN")>; + def _IDXEN_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7(NAME#"_IDXEN")>; + def _OFFEN_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7(NAME#"_OFFEN")>; + def _OFFSET_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7(NAME#"_OFFSET")>; + } +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass MTBUF_Real_AllAddr_gfx6_gfx7_gfx10 op> : + MTBUF_Real_AllAddr_gfx6_gfx7, MTBUF_Real_AllAddr_gfx10; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x000>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x001>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x002>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x003>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x004>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x005>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x006>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>; //===----------------------------------------------------------------------===// -// VI +// GFX8, GFX9 (VI). 
//===----------------------------------------------------------------------===// class MUBUF_Real_vi op, MUBUF_Pseudo ps> : - MUBUF_Real, + MUBUF_Real, Enc64, SIMCInstr { - let AssemblerPredicate=isVI; - let DecoderNamespace="VI"; + let AssemblerPredicate = isGFX8GFX9; + let DecoderNamespace = "GFX8"; let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; @@ -1878,7 +2130,7 @@ multiclass MUBUF_Real_AllAddr_Lds_vi op> { } class MUBUF_Real_gfx80 op, MUBUF_Pseudo ps> : - MUBUF_Real, + MUBUF_Real, Enc64, SIMCInstr { let AssemblerPredicate=HasUnpackedD16VMem; @@ -2002,12 +2254,19 @@ def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>; def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; +let SubtargetPredicate = HasAtomicFaddInsts in { + +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_AllAddr_vi <0x4d>; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>; + +} // End SubtargetPredicate = HasAtomicFaddInsts + class MTBUF_Real_vi op, MTBUF_Pseudo ps> : MTBUF_Real, Enc64, SIMCInstr { - let AssemblerPredicate=isVI; - let DecoderNamespace="VI"; + let AssemblerPredicate = isGFX8GFX9; + let DecoderNamespace = "GFX8"; let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td index ae40c6387982..1a526675164a 100644 --- a/lib/Target/AMDGPU/CaymanInstructions.td +++ b/lib/Target/AMDGPU/CaymanInstructions.td @@ -1,9 +1,8 @@ //===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index 31d2ebef481d..c52eaaa3fdc5 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -1,9 +1,8 @@ //===-- DSInstructions.td - DS Instruction Definitions --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -11,8 +10,6 @@ class DS_Pseudo patt InstSI , SIMCInstr { - let SubtargetPredicate = isGCN; - let LGKM_CNT = 1; let DS = 1; let Size = 8; @@ -21,6 +18,7 @@ class DS_Pseudo patt // Most instructions load and store data, so set this as the default.
let mayLoad = 1; let mayStore = 1; + let maybeAtomic = 1; let hasSideEffects = 0; let SchedRW = [WriteLDS]; @@ -40,6 +38,8 @@ class DS_Pseudo patt bits<1> has_data0 = 1; bits<1> has_data1 = 1; + bits<1> has_gws_data0 = 0; // data0 is encoded as addr + bits<1> has_offset = 1; // has "offset" that should be split to offset0,1 bits<1> has_offset0 = 1; bits<1> has_offset1 = 1; @@ -61,6 +61,7 @@ class DS_Real : // copy relevant pseudo op flags let SubtargetPredicate = ds.SubtargetPredicate; + let OtherPredicates = ds.OtherPredicates; let AsmMatchConverter = ds.AsmMatchConverter; // encoding fields @@ -322,7 +323,7 @@ class DS_GWS_1D : DS_GWS { - let has_data0 = 1; + let has_gws_data0 = 1; } class DS_VOID : DS_Pseudo; defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>; defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>; -def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">; +let isConvergent = 1, usesCustomInserter = 1 in { +def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> { + let mayLoad = 0; +} def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">; def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">; def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">; def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">; +} def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">; def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">; @@ -550,12 +555,14 @@ def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; // Instruction definitions for CI and newer. //===----------------------------------------------------------------------===// -let SubtargetPredicate = isCIVI in { +let SubtargetPredicate = isGFX7Plus in { defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>; defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>; +let isConvergent = 1, usesCustomInserter = 1 in { def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">; +} let mayStore = 0 in { defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>; @@ -569,13 +576,13 @@ defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", VReg_128>; def DS_NOP : DS_VOID<"ds_nop">; -} // let SubtargetPredicate = isCIVI +} // let SubtargetPredicate = isGFX7Plus //===----------------------------------------------------------------------===// // Instruction definitions for VI and newer. 
//===----------------------------------------------------------------------===// -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8Plus in { let Uses = [EXEC] in { def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32", @@ -586,7 +593,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; -} // let SubtargetPredicate = isVI +} // let SubtargetPredicate = isGFX8Plus //===----------------------------------------------------------------------===// // DS Patterns @@ -597,9 +604,9 @@ def : GCNPat < (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) >; -class DSReadPat : GCNPat < +class DSReadPat : GCNPat < (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst $ptr, (as_i16imm $offset), (i1 0)) + (inst $ptr, (as_i16imm $offset), (i1 gds)) >; multiclass DSReadPat_mc { @@ -613,38 +620,21 @@ multiclass DSReadPat_mc { } } - -multiclass DSReadPat_Hi16 { - def : GCNPat < - (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))), - (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo)) - >; - - def : GCNPat < - (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))), - (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo)) - >; -} - -multiclass DSReadPat_Lo16 { - def : GCNPat < - (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))), - (v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi)) - >; - - def : GCNPat < - (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))), - (v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi)) - >; -} +class DSReadPat_D16 : GCNPat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in), + (inst $ptr, (as_i16imm $offset), (i1 0), $in) +>; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; @@ -658,21 +648,24 @@ defm : DSReadPat_mc ; } // End AddedComplexity = 100 let OtherPredicates = [D16PreservesUnusedBits] in { -let AddedComplexity = 100 in { -defm : DSReadPat_Hi16; -defm : DSReadPat_Hi16; -defm : DSReadPat_Hi16; - -defm : DSReadPat_Lo16; -defm : DSReadPat_Lo16; -defm : DSReadPat_Lo16; - -} +def : DSReadPat_D16; +def : DSReadPat_D16; +def : DSReadPat_D16; +def : DSReadPat_D16; +def : DSReadPat_D16; +def : DSReadPat_D16; + +def : DSReadPat_D16; +def : DSReadPat_D16; +def : DSReadPat_D16; +def : DSReadPat_D16; +def : DSReadPat_D16; +def : DSReadPat_D16; } -class DSWritePat : GCNPat < +class DSWritePat : GCNPat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) + (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) >; multiclass DSWritePat_mc { @@ -730,7 +723,7 @@ class DS64Bit4ByteAlignedWritePat : GCNPat< // v2i32 loads are split into i32 loads on SI during lowering, due to a bug // related to bounds checking. 
-let OtherPredicates = [LDSRequiresM0Init, isCIVI] in { +let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in { def : DS64Bit4ByteAlignedReadPat; def : DS64Bit4ByteAlignedWritePat; } @@ -747,260 +740,313 @@ defm : DSWritePat_mc ; defm : DSWritePat_mc ; } // End AddedComplexity = 100 -class DSAtomicRetPat : GCNPat < +class DSAtomicRetPat : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) + (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) >; multiclass DSAtomicRetPat_mc { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicRetPat(frag#"_m0")>; + def : DSAtomicRetPat(frag#"_local_m0")>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat(!cast(inst)#"_gfx9"), vt, - !cast(frag)>; + !cast(frag#"_local")>; } + + def : DSAtomicRetPat(frag#"_region_m0"), 1>; } -class DSAtomicCmpXChg : GCNPat < +class DSAtomicCmpXChg : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds)) >; multiclass DSAtomicCmpXChg_mc { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChg(frag#"_m0")>; + def : DSAtomicCmpXChg(frag#"_local_m0")>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicCmpXChg(!cast(inst)#"_gfx9"), vt, - !cast(frag)>; + !cast(frag#"_local")>; } + + def : DSAtomicCmpXChg(frag#"_region_m0"), 1>; } // 32-bit atomics. -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicCmpXChg_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicCmpXChg_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; // 64-bit atomics. -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; -defm : DSAtomicRetPat_mc; - -defm : DSAtomicCmpXChg_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; + +defm : DSAtomicCmpXChg_mc; + +def : Pat < + (SIds_ordered_count i32:$value, i16:$offset), + (DS_ORDERED_COUNT $value, (as_i16imm $offset)) +>; //===----------------------------------------------------------------------===// -// Real instructions +// Target-specific instruction encodings. 
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// SIInstructions.td +// Base ENC_DS for GFX6, GFX7, GFX10. //===----------------------------------------------------------------------===// -class DS_Real_si op, DS_Pseudo ds> : - DS_Real , - SIMCInstr { - let AssemblerPredicates=[isSICI]; - let DecoderNamespace="SICI"; +class Base_DS_Real_gfx6_gfx7_gfx10 op, DS_Pseudo ps, int ef> : + DS_Real, SIMCInstr { - // encoding - let Inst{7-0} = !if(ds.has_offset0, offset0, 0); - let Inst{15-8} = !if(ds.has_offset1, offset1, 0); - let Inst{17} = !if(ds.has_gds, gds, ds.gdsValue); + let Inst{7-0} = !if(ps.has_offset0, offset0, 0); + let Inst{15-8} = !if(ps.has_offset1, offset1, 0); + let Inst{17} = !if(ps.has_gds, gds, ps.gdsValue); let Inst{25-18} = op; - let Inst{31-26} = 0x36; // ds prefix - let Inst{39-32} = !if(ds.has_addr, addr, 0); - let Inst{47-40} = !if(ds.has_data0, data0, 0); - let Inst{55-48} = !if(ds.has_data1, data1, 0); - let Inst{63-56} = !if(ds.has_vdst, vdst, 0); + let Inst{31-26} = 0x36; + let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0)); + let Inst{47-40} = !if(ps.has_data0, data0, 0); + let Inst{55-48} = !if(ps.has_data1, data1, 0); + let Inst{63-56} = !if(ps.has_vdst, vdst, 0); } -def DS_ADD_U32_si : DS_Real_si<0x0, DS_ADD_U32>; -def DS_SUB_U32_si : DS_Real_si<0x1, DS_SUB_U32>; -def DS_RSUB_U32_si : DS_Real_si<0x2, DS_RSUB_U32>; -def DS_INC_U32_si : DS_Real_si<0x3, DS_INC_U32>; -def DS_DEC_U32_si : DS_Real_si<0x4, DS_DEC_U32>; -def DS_MIN_I32_si : DS_Real_si<0x5, DS_MIN_I32>; -def DS_MAX_I32_si : DS_Real_si<0x6, DS_MAX_I32>; -def DS_MIN_U32_si : DS_Real_si<0x7, DS_MIN_U32>; -def DS_MAX_U32_si : DS_Real_si<0x8, DS_MAX_U32>; -def DS_AND_B32_si : DS_Real_si<0x9, DS_AND_B32>; -def DS_OR_B32_si : DS_Real_si<0xa, DS_OR_B32>; -def DS_XOR_B32_si : DS_Real_si<0xb, DS_XOR_B32>; -def DS_MSKOR_B32_si : DS_Real_si<0xc, DS_MSKOR_B32>; -def DS_WRITE_B32_si : DS_Real_si<0xd, DS_WRITE_B32>; -def DS_WRITE2_B32_si : DS_Real_si<0xe, DS_WRITE2_B32>; -def DS_WRITE2ST64_B32_si : DS_Real_si<0xf, DS_WRITE2ST64_B32>; -def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>; -def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>; -def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>; -def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>; -def DS_NOP_si : DS_Real_si<0x14, DS_NOP>; -def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>; -def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>; -def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>; -def DS_GWS_SEMA_P_si : DS_Real_si<0x1c, DS_GWS_SEMA_P>; -def DS_GWS_BARRIER_si : DS_Real_si<0x1d, DS_GWS_BARRIER>; -def DS_WRITE_B8_si : DS_Real_si<0x1e, DS_WRITE_B8>; -def DS_WRITE_B16_si : DS_Real_si<0x1f, DS_WRITE_B16>; -def DS_ADD_RTN_U32_si : DS_Real_si<0x20, DS_ADD_RTN_U32>; -def DS_SUB_RTN_U32_si : DS_Real_si<0x21, DS_SUB_RTN_U32>; -def DS_RSUB_RTN_U32_si : DS_Real_si<0x22, DS_RSUB_RTN_U32>; -def DS_INC_RTN_U32_si : DS_Real_si<0x23, DS_INC_RTN_U32>; -def DS_DEC_RTN_U32_si : DS_Real_si<0x24, DS_DEC_RTN_U32>; -def DS_MIN_RTN_I32_si : DS_Real_si<0x25, DS_MIN_RTN_I32>; -def DS_MAX_RTN_I32_si : DS_Real_si<0x26, DS_MAX_RTN_I32>; -def DS_MIN_RTN_U32_si : DS_Real_si<0x27, DS_MIN_RTN_U32>; -def DS_MAX_RTN_U32_si : DS_Real_si<0x28, DS_MAX_RTN_U32>; -def DS_AND_RTN_B32_si : DS_Real_si<0x29, DS_AND_RTN_B32>; -def DS_OR_RTN_B32_si : DS_Real_si<0x2a, DS_OR_RTN_B32>; -def DS_XOR_RTN_B32_si : DS_Real_si<0x2b, DS_XOR_RTN_B32>; -def 
DS_MSKOR_RTN_B32_si : DS_Real_si<0x2c, DS_MSKOR_RTN_B32>; -def DS_WRXCHG_RTN_B32_si : DS_Real_si<0x2d, DS_WRXCHG_RTN_B32>; -def DS_WRXCHG2_RTN_B32_si : DS_Real_si<0x2e, DS_WRXCHG2_RTN_B32>; -def DS_WRXCHG2ST64_RTN_B32_si : DS_Real_si<0x2f, DS_WRXCHG2ST64_RTN_B32>; -def DS_CMPST_RTN_B32_si : DS_Real_si<0x30, DS_CMPST_RTN_B32>; -def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>; -def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>; -def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>; - -// These instruction are CI/VI only -def DS_WRAP_RTN_B32_si : DS_Real_si<0x34, DS_WRAP_RTN_B32>; -def DS_CONDXCHG32_RTN_B64_si : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>; -def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>; - -def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>; -def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>; -def DS_READ2_B32_si : DS_Real_si<0x37, DS_READ2_B32>; -def DS_READ2ST64_B32_si : DS_Real_si<0x38, DS_READ2ST64_B32>; -def DS_READ_I8_si : DS_Real_si<0x39, DS_READ_I8>; -def DS_READ_U8_si : DS_Real_si<0x3a, DS_READ_U8>; -def DS_READ_I16_si : DS_Real_si<0x3b, DS_READ_I16>; -def DS_READ_U16_si : DS_Real_si<0x3c, DS_READ_U16>; -def DS_CONSUME_si : DS_Real_si<0x3d, DS_CONSUME>; -def DS_APPEND_si : DS_Real_si<0x3e, DS_APPEND>; -def DS_ORDERED_COUNT_si : DS_Real_si<0x3f, DS_ORDERED_COUNT>; -def DS_ADD_U64_si : DS_Real_si<0x40, DS_ADD_U64>; -def DS_SUB_U64_si : DS_Real_si<0x41, DS_SUB_U64>; -def DS_RSUB_U64_si : DS_Real_si<0x42, DS_RSUB_U64>; -def DS_INC_U64_si : DS_Real_si<0x43, DS_INC_U64>; -def DS_DEC_U64_si : DS_Real_si<0x44, DS_DEC_U64>; -def DS_MIN_I64_si : DS_Real_si<0x45, DS_MIN_I64>; -def DS_MAX_I64_si : DS_Real_si<0x46, DS_MAX_I64>; -def DS_MIN_U64_si : DS_Real_si<0x47, DS_MIN_U64>; -def DS_MAX_U64_si : DS_Real_si<0x48, DS_MAX_U64>; -def DS_AND_B64_si : DS_Real_si<0x49, DS_AND_B64>; -def DS_OR_B64_si : DS_Real_si<0x4a, DS_OR_B64>; -def DS_XOR_B64_si : DS_Real_si<0x4b, DS_XOR_B64>; -def DS_MSKOR_B64_si : DS_Real_si<0x4c, DS_MSKOR_B64>; -def DS_WRITE_B64_si : DS_Real_si<0x4d, DS_WRITE_B64>; -def DS_WRITE2_B64_si : DS_Real_si<0x4E, DS_WRITE2_B64>; -def DS_WRITE2ST64_B64_si : DS_Real_si<0x4f, DS_WRITE2ST64_B64>; -def DS_CMPST_B64_si : DS_Real_si<0x50, DS_CMPST_B64>; -def DS_CMPST_F64_si : DS_Real_si<0x51, DS_CMPST_F64>; -def DS_MIN_F64_si : DS_Real_si<0x52, DS_MIN_F64>; -def DS_MAX_F64_si : DS_Real_si<0x53, DS_MAX_F64>; - -def DS_ADD_RTN_U64_si : DS_Real_si<0x60, DS_ADD_RTN_U64>; -def DS_SUB_RTN_U64_si : DS_Real_si<0x61, DS_SUB_RTN_U64>; -def DS_RSUB_RTN_U64_si : DS_Real_si<0x62, DS_RSUB_RTN_U64>; -def DS_INC_RTN_U64_si : DS_Real_si<0x63, DS_INC_RTN_U64>; -def DS_DEC_RTN_U64_si : DS_Real_si<0x64, DS_DEC_RTN_U64>; -def DS_MIN_RTN_I64_si : DS_Real_si<0x65, DS_MIN_RTN_I64>; -def DS_MAX_RTN_I64_si : DS_Real_si<0x66, DS_MAX_RTN_I64>; -def DS_MIN_RTN_U64_si : DS_Real_si<0x67, DS_MIN_RTN_U64>; -def DS_MAX_RTN_U64_si : DS_Real_si<0x68, DS_MAX_RTN_U64>; -def DS_AND_RTN_B64_si : DS_Real_si<0x69, DS_AND_RTN_B64>; -def DS_OR_RTN_B64_si : DS_Real_si<0x6a, DS_OR_RTN_B64>; -def DS_XOR_RTN_B64_si : DS_Real_si<0x6b, DS_XOR_RTN_B64>; -def DS_MSKOR_RTN_B64_si : DS_Real_si<0x6c, DS_MSKOR_RTN_B64>; -def DS_WRXCHG_RTN_B64_si : DS_Real_si<0x6d, DS_WRXCHG_RTN_B64>; -def DS_WRXCHG2_RTN_B64_si : DS_Real_si<0x6e, DS_WRXCHG2_RTN_B64>; -def DS_WRXCHG2ST64_RTN_B64_si : DS_Real_si<0x6f, DS_WRXCHG2ST64_RTN_B64>; -def DS_CMPST_RTN_B64_si : DS_Real_si<0x70, DS_CMPST_RTN_B64>; -def DS_CMPST_RTN_F64_si : DS_Real_si<0x71, DS_CMPST_RTN_F64>; -def DS_MIN_RTN_F64_si : 
DS_Real_si<0x72, DS_MIN_RTN_F64>; -def DS_MAX_RTN_F64_si : DS_Real_si<0x73, DS_MAX_RTN_F64>; - -def DS_READ_B64_si : DS_Real_si<0x76, DS_READ_B64>; -def DS_READ2_B64_si : DS_Real_si<0x77, DS_READ2_B64>; -def DS_READ2ST64_B64_si : DS_Real_si<0x78, DS_READ2ST64_B64>; - -def DS_ADD_SRC2_U32_si : DS_Real_si<0x80, DS_ADD_SRC2_U32>; -def DS_SUB_SRC2_U32_si : DS_Real_si<0x81, DS_SUB_SRC2_U32>; -def DS_RSUB_SRC2_U32_si : DS_Real_si<0x82, DS_RSUB_SRC2_U32>; -def DS_INC_SRC2_U32_si : DS_Real_si<0x83, DS_INC_SRC2_U32>; -def DS_DEC_SRC2_U32_si : DS_Real_si<0x84, DS_DEC_SRC2_U32>; -def DS_MIN_SRC2_I32_si : DS_Real_si<0x85, DS_MIN_SRC2_I32>; -def DS_MAX_SRC2_I32_si : DS_Real_si<0x86, DS_MAX_SRC2_I32>; -def DS_MIN_SRC2_U32_si : DS_Real_si<0x87, DS_MIN_SRC2_U32>; -def DS_MAX_SRC2_U32_si : DS_Real_si<0x88, DS_MAX_SRC2_U32>; -def DS_AND_SRC2_B32_si : DS_Real_si<0x89, DS_AND_SRC2_B32>; -def DS_OR_SRC2_B32_si : DS_Real_si<0x8a, DS_OR_SRC2_B32>; -def DS_XOR_SRC2_B32_si : DS_Real_si<0x8b, DS_XOR_SRC2_B32>; -def DS_WRITE_SRC2_B32_si : DS_Real_si<0x8d, DS_WRITE_SRC2_B32>; - -def DS_MIN_SRC2_F32_si : DS_Real_si<0x92, DS_MIN_SRC2_F32>; -def DS_MAX_SRC2_F32_si : DS_Real_si<0x93, DS_MAX_SRC2_F32>; - -def DS_ADD_SRC2_U64_si : DS_Real_si<0xc0, DS_ADD_SRC2_U64>; -def DS_SUB_SRC2_U64_si : DS_Real_si<0xc1, DS_SUB_SRC2_U64>; -def DS_RSUB_SRC2_U64_si : DS_Real_si<0xc2, DS_RSUB_SRC2_U64>; -def DS_INC_SRC2_U64_si : DS_Real_si<0xc3, DS_INC_SRC2_U64>; -def DS_DEC_SRC2_U64_si : DS_Real_si<0xc4, DS_DEC_SRC2_U64>; -def DS_MIN_SRC2_I64_si : DS_Real_si<0xc5, DS_MIN_SRC2_I64>; -def DS_MAX_SRC2_I64_si : DS_Real_si<0xc6, DS_MAX_SRC2_I64>; -def DS_MIN_SRC2_U64_si : DS_Real_si<0xc7, DS_MIN_SRC2_U64>; -def DS_MAX_SRC2_U64_si : DS_Real_si<0xc8, DS_MAX_SRC2_U64>; -def DS_AND_SRC2_B64_si : DS_Real_si<0xc9, DS_AND_SRC2_B64>; -def DS_OR_SRC2_B64_si : DS_Real_si<0xca, DS_OR_SRC2_B64>; -def DS_XOR_SRC2_B64_si : DS_Real_si<0xcb, DS_XOR_SRC2_B64>; -def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>; - -def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>; -def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>; -def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>; -def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>; -def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>; -def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>; +//===----------------------------------------------------------------------===// +// GFX10. 
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass DS_Real_gfx10 op> { + def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10(NAME), + SIEncodingFamily.GFX10>; + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +defm DS_ADD_F32 : DS_Real_gfx10<0x015>; +defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>; +defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>; +defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>; +defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>; +defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>; +defm DS_READ_U8_D16_HI : DS_Real_gfx10<0x0a3>; +defm DS_READ_I8_D16 : DS_Real_gfx10<0x0a4>; +defm DS_READ_I8_D16_HI : DS_Real_gfx10<0x0a5>; +defm DS_READ_U16_D16 : DS_Real_gfx10<0x0a6>; +defm DS_READ_U16_D16_HI : DS_Real_gfx10<0x0a7>; +defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>; +defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>; +defm DS_PERMUTE_B32 : DS_Real_gfx10<0x0b2>; +defm DS_BPERMUTE_B32 : DS_Real_gfx10<0x0b3>; + +//===----------------------------------------------------------------------===// +// GFX7, GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { + multiclass DS_Real_gfx7 op> { + def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10(NAME), + SIEncodingFamily.SI>; + } +} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" + +multiclass DS_Real_gfx7_gfx10 op> : + DS_Real_gfx7, DS_Real_gfx10; + +// FIXME-GFX7: Add tests when upstreaming this part. +defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10<0x018>; +defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10<0x034>; +defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10<0x07e>; +defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>; +defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>; +defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>; +defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>; + +//===----------------------------------------------------------------------===// +// GFX6, GFX7, GFX10. 
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass DS_Real_gfx6_gfx7 op> { + def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10(NAME), + SIEncodingFamily.SI>; + } +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass DS_Real_gfx6_gfx7_gfx10 op> : + DS_Real_gfx6_gfx7, DS_Real_gfx10; + +defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10<0x000>; +defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x001>; +defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x002>; +defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10<0x003>; +defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10<0x004>; +defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10<0x005>; +defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10<0x006>; +defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10<0x007>; +defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10<0x008>; +defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10<0x009>; +defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00a>; +defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00b>; +defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00c>; +defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>; +defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>; +defm DS_WRITE2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x00f>; +defm DS_CMPST_B32 : DS_Real_gfx6_gfx7_gfx10<0x010>; +defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>; +defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10<0x012>; +defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10<0x013>; +defm DS_NOP : DS_Real_gfx6_gfx7_gfx10<0x014>; +defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10<0x019>; +defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10<0x01a>; +defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10<0x01b>; +defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10<0x01c>; +defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10<0x01d>; +defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>; +defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>; +defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x020>; +defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x021>; +defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x022>; +defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x023>; +defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x024>; +defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x025>; +defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x026>; +defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x027>; +defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x028>; +defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x029>; +defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02a>; +defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02b>; +defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02c>; +defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>; +defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02f>; +defm DS_CMPST_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x030>; +defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>; +defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x032>; +defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x033>; +defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10<0x035>; +defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>; +defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>; +defm DS_READ2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x038>; +defm DS_READ_I8 : DS_Real_gfx6_gfx7_gfx10<0x039>; +defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>; +defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>; +defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>; +defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10<0x03d>; +defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10<0x03e>; +defm 
DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10<0x03f>; +defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10<0x040>; +defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x041>; +defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x042>; +defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10<0x043>; +defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10<0x044>; +defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10<0x045>; +defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10<0x046>; +defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10<0x047>; +defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10<0x048>; +defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10<0x049>; +defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04a>; +defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04b>; +defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04c>; +defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>; +defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>; +defm DS_WRITE2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x04f>; +defm DS_CMPST_B64 : DS_Real_gfx6_gfx7_gfx10<0x050>; +defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>; +defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10<0x052>; +defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10<0x053>; +defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x060>; +defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x061>; +defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x062>; +defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x063>; +defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x064>; +defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x065>; +defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x066>; +defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x067>; +defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x068>; +defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x069>; +defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06a>; +defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06b>; +defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06c>; +defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>; +defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>; +defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06f>; +defm DS_CMPST_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x070>; +defm DS_CMPST_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x071>; +defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x072>; +defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x073>; +defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>; +defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>; +defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>; +defm DS_ADD_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x080>; +defm DS_SUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x081>; +defm DS_RSUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x082>; +defm DS_INC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x083>; +defm DS_DEC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x084>; +defm DS_MIN_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x085>; +defm DS_MAX_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x086>; +defm DS_MIN_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x087>; +defm DS_MAX_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x088>; +defm DS_AND_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x089>; +defm DS_OR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08a>; +defm DS_XOR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08b>; +defm DS_WRITE_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08d>; +defm DS_MIN_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x092>; +defm DS_MAX_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x093>; +defm DS_ADD_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c0>; +defm DS_SUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c1>; +defm DS_RSUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c2>; +defm DS_INC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c3>; +defm DS_DEC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c4>; +defm DS_MIN_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c5>; 
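The same cross-check applies to DS, since every opcode in this run shares the ENC_DS layout of Base_DS_Real_gfx6_gfx7_gfx10 above: opcode in Inst{25-18}, gds in bit 17, and the DS prefix 0x36 in Inst{31-26} (the GFX8/GFX9 class further down moves gds to bit 16 and the opcode to Inst{24-17}). The minimal standalone C++ sketch below also mirrors the new GWS special case, where data0 rides in the otherwise unused addr slot; DSFields and packDS are illustrative names, not LLVM API. The remaining DS_*_SRC2 opcodes continue directly below.

#include <cassert>
#include <cstdint>

// Illustrative only: mirrors the Inst{} assignments of
// Base_DS_Real_gfx6_gfx7_gfx10 above.
struct DSFields {
  uint8_t offset0, offset1;  // 16-bit offset split into two halves
  bool gds;
  uint8_t op;                // Inst{25-18} on this encoding
  uint8_t addr, data0, data1, vdst;
  bool hasAddr, hasData0, hasGWSData0;
};

uint64_t packDS(const DSFields &F) {
  uint64_t Inst = 0;
  Inst |= uint64_t(F.offset0);                      // Inst{7-0}
  Inst |= uint64_t(F.offset1) << 8;                 // Inst{15-8}
  Inst |= uint64_t(F.gds) << 17;                    // Inst{17}
  Inst |= uint64_t(F.op) << 18;                     // Inst{25-18}
  Inst |= uint64_t(0x36) << 26;                     // Inst{31-26}: DS prefix
  // !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0)):
  // GWS instructions have no address, so data0 rides in the addr slot.
  Inst |= uint64_t(F.hasAddr ? F.addr
                             : (F.hasGWSData0 ? F.data0 : 0)) << 32; // Inst{39-32}
  Inst |= uint64_t(F.hasData0 ? F.data0 : 0) << 40; // Inst{47-40}
  Inst |= uint64_t(F.data1) << 48;                  // Inst{55-48}
  Inst |= uint64_t(F.vdst) << 56;                   // Inst{63-56}
  return Inst;
}

int main() {
  DSFields F = {};
  F.op = 0x19; // e.g. DS_GWS_INIT
  F.gds = true;
  F.hasGWSData0 = true;
  F.data0 = 7;
  assert(((packDS(F) >> 32) & 0xff) == 7 && "GWS data0 lands in addr field");
  return 0;
}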
+defm DS_MAX_SRC2_I64         : DS_Real_gfx6_gfx7_gfx10<0x0c6>;
+defm DS_MIN_SRC2_U64         : DS_Real_gfx6_gfx7_gfx10<0x0c7>;
+defm DS_MAX_SRC2_U64         : DS_Real_gfx6_gfx7_gfx10<0x0c8>;
+defm DS_AND_SRC2_B64         : DS_Real_gfx6_gfx7_gfx10<0x0c9>;
+defm DS_OR_SRC2_B64          : DS_Real_gfx6_gfx7_gfx10<0x0ca>;
+defm DS_XOR_SRC2_B64         : DS_Real_gfx6_gfx7_gfx10<0x0cb>;
+defm DS_WRITE_SRC2_B64       : DS_Real_gfx6_gfx7_gfx10<0x0cd>;
+defm DS_MIN_SRC2_F64         : DS_Real_gfx6_gfx7_gfx10<0x0d2>;
+defm DS_MAX_SRC2_F64         : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
+
 //===----------------------------------------------------------------------===//
-// VIInstructions.td
+// GFX8, GFX9 (VI).
 //===----------------------------------------------------------------------===//
 
 class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
   DS_Real <ds>,
   SIMCInstr <ds.PseudoInstr, SIEncodingFamily.VI> {
-  let AssemblerPredicates = [isVI];
-  let DecoderNamespace="VI";
+  let AssemblerPredicates = [isGFX8GFX9];
+  let DecoderNamespace = "GFX8";
 
   // encoding
   let Inst{7-0}   = !if(ds.has_offset0, offset0, 0);
@@ -1008,7 +1054,7 @@ class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
   let Inst{16}    = !if(ds.has_gds, gds, ds.gdsValue);
   let Inst{24-17} = op;
   let Inst{31-26} = 0x36; // ds prefix
-  let Inst{39-32} = !if(ds.has_addr, addr, 0);
+  let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0, 0));
   let Inst{47-40} = !if(ds.has_data0, data0, 0);
   let Inst{55-48} = !if(ds.has_data1, data1, 0);
   let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index f3de903f21b2..4ec4be9bc485 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -1,9 +1,8 @@
 //===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -22,13 +21,14 @@
 #include "AMDGPURegisterInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIDefines.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm-c/Disassembler.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCExpr.h"
@@ -52,8 +52,22 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-disassembler"
 
+#define SGPR_MAX (isGFX10() ? AMDGPU::EncValues::SGPR_MAX_GFX10 \
+                            : AMDGPU::EncValues::SGPR_MAX_SI)
+
 using DecodeStatus = llvm::MCDisassembler::DecodeStatus;
 
+AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
+                                       MCContext &Ctx,
+                                       MCInstrInfo const *MCII) :
+  MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
+  TargetMaxInstBytes(Ctx.getAsmInfo()->getMaxInstLength(&STI)) {
+
+  // ToDo: AMDGPUDisassembler supports only VI ISA.
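+  // (Editorial annotation, not part of the upstream change: the check below
+  // rejects every subtarget that is neither GFX8/GFX9, i.e. GCN3 encoding,
+  // nor GFX10. TargetMaxInstBytes, initialized above from
+  // MCAsmInfo::getMaxInstLength(&STI), is what later lets getInstruction()
+  // consume more than the old hard-coded 8 bytes, e.g. for an encoding with
+  // a trailing literal or extra GFX10 NSA address dwords.)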
+ if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10()) + report_fatal_error("Disassembly not yet supported for subtarget"); +} + inline static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand& Opnd) { Inst.addOperand(Opnd); @@ -77,6 +91,8 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { auto DAsm = static_cast(Decoder); + // Our branches take a simm16, but we need two extra bits to account for the + // factor of 4. APInt SignedOffset(18, Imm * 4, true); int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue(); @@ -85,6 +101,12 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Imm)); } +static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, + uint64_t Addr, const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeBoolReg(Val)); +} + #define DECODE_OPERAND(StaticDecoderName, DecoderName) \ static DecodeStatus StaticDecoderName(MCInst &Inst, \ unsigned Imm, \ @@ -98,6 +120,7 @@ static DecodeStatus StaticDecoderName(MCInst &Inst, \ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) DECODE_OPERAND_REG(VGPR_32) +DECODE_OPERAND_REG(VRegOrLds_32) DECODE_OPERAND_REG(VS_32) DECODE_OPERAND_REG(VS_64) DECODE_OPERAND_REG(VS_128) @@ -109,12 +132,20 @@ DECODE_OPERAND_REG(VReg_128) DECODE_OPERAND_REG(SReg_32) DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) DECODE_OPERAND_REG(SReg_32_XEXEC_HI) +DECODE_OPERAND_REG(SRegOrLds_32) DECODE_OPERAND_REG(SReg_64) DECODE_OPERAND_REG(SReg_64_XEXEC) DECODE_OPERAND_REG(SReg_128) DECODE_OPERAND_REG(SReg_256) DECODE_OPERAND_REG(SReg_512) +DECODE_OPERAND_REG(AGPR_32) +DECODE_OPERAND_REG(AReg_128) +DECODE_OPERAND_REG(AReg_512) +DECODE_OPERAND_REG(AReg_1024) +DECODE_OPERAND_REG(AV_32) +DECODE_OPERAND_REG(AV_64) + static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -131,6 +162,62 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } +static DecodeStatus decodeOperand_VS_16(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); +} + +static DecodeStatus decodeOperand_VS_32(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm)); +} + +static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512)); +} + +static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm | 512)); +} + +static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512)); +} + +static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeOperand_SReg_32(Imm)); +} + +static DecodeStatus decodeOperand_VGPR_32(MCInst &Inst, + unsigned 
Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW32, Imm)); +} + #define DECODE_SDWA(DecName) \ DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) @@ -168,6 +255,16 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table, return MCDisassembler::Fail; } +static bool isValidDPP8(const MCInst &MI) { + using namespace llvm::AMDGPU::DPP; + int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi); + assert(FiIdx != -1); + if ((unsigned)FiIdx >= MI.getNumOperands()) + return false; + unsigned Fi = MI.getOperand(FiIdx).getImm(); + return Fi == DPP8_FI_0 || Fi == DPP8_FI_1; +} + DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes_, uint64_t Address, @@ -176,11 +273,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, CommentStream = &CS; bool IsSDWA = false; - // ToDo: AMDGPUDisassembler supports only VI ISA. - if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]) - report_fatal_error("Disassembly not yet supported for subtarget"); - - const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size()); + unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size()); Bytes = Bytes_.slice(0, MaxInstBytesNum); DecodeStatus Res = MCDisassembler::Fail; @@ -192,6 +285,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // encodings if (Bytes.size() >= 8) { const uint64_t QW = eatBytes(Bytes); + + Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address); + if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + break; + + MI = MCInst(); // clear + Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address); if (Res) break; @@ -201,6 +301,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); if (Res) { IsSDWA = true; break; } + Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address); + if (Res) { IsSDWA = true; break; } + + // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and + // v_mad_mixhi_f16 for FMA variants. Try to decode using this special + // table first so we print the correct name. 
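+      // (Editorial sketch, not upstream code: each branch of this decode loop
+      // tries a more specific decoder table first and keeps the first hit,
+      // as in
+      //   for (const uint8_t *T : {SpecialTable64, GenericTable64})
+      //     if ((Res = tryDecodeInst(T, MI, QW, Address))) break;
+      // where SpecialTable64/GenericTable64 are stand-in names; that is why
+      // the FMA-mix table must be consulted before the generic tables that
+      // follow.)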
+ + if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) { + Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address); + if (Res) break; + } + if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) { Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address); if (Res) @@ -223,7 +335,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try decode 32-bit instruction if (Bytes.size() < 4) break; const uint32_t DW = eatBytes(Bytes); - Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address); + Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address); if (Res) break; Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address); @@ -232,33 +344,84 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address); if (Res) break; + Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address); + if (Res) break; + if (Bytes.size() < 4) break; const uint64_t QW = ((uint64_t)eatBytes(Bytes) << 32) | DW; - Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address); + Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address); if (Res) break; Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address); if (Res) break; Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address); } while (false); + if (Res && (MaxInstBytesNum - Bytes.size()) == 12 && (!HasLiteral || + !(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3))) { + MaxInstBytesNum = 8; + Bytes = Bytes_.slice(0, MaxInstBytesNum); + eatBytes(Bytes); + } + if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || - MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || + MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 || + MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 || MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi || - MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) { + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi || + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 || + MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10)) { // Insert dummy unused src2_modifiers. 
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); } if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) { - Res = convertMIMGInst(MI); + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); + int RsrcIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); + unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1; + if (VAddr0Idx >= 0 && NSAArgs > 0) { + unsigned NSAWords = (NSAArgs + 3) / 4; + if (Bytes.size() < 4 * NSAWords) { + Res = MCDisassembler::Fail; + } else { + for (unsigned i = 0; i < NSAArgs; ++i) { + MI.insert(MI.begin() + VAddr0Idx + 1 + i, + decodeOperand_VGPR_32(Bytes[i])); + } + Bytes = Bytes.slice(4 * NSAWords); + } + } + + if (Res) + Res = convertMIMGInst(MI); } if (Res && IsSDWA) Res = convertSDWAInst(MI); + int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vdst_in); + if (VDstIn_Idx != -1) { + int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx, + MCOI::OperandConstraint::TIED_TO); + if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx || + !MI.getOperand(VDstIn_Idx).isReg() || + MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) { + if (MI.getNumOperands() > (unsigned)VDstIn_Idx) + MI.erase(&MI.getOperand(VDstIn_Idx)); + insertNamedMCOperand(MI, + MCOperand::createReg(MI.getOperand(Tied).getReg()), + AMDGPU::OpName::vdst_in); + } + } + // if the opcode was not recognized we'll assume a Size of 4 bytes // (unless there are fewer bytes left) Size = Res ? (MaxInstBytesNum - Bytes.size()) @@ -267,7 +430,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { - if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] || + STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1) // VOPC - insert clamp insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); @@ -285,9 +449,27 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { return MCDisassembler::Success; } -// Note that MIMG format provides no information about VADDR size. -// Consequently, decoded instructions always show address -// as if it has 1 dword, which could be not really so. +DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { + unsigned Opc = MI.getOpcode(); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + + // Insert dummy unused src modifiers. + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src0_modifiers); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src1_modifiers); + + return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail; +} + +// Note that before gfx10, the MIMG encoding provided no information about +// VADDR size. Consequently, decoded instructions always show address as if it +// has 1 dword, which could be not really so. 
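+// (Editorial sketch, not upstream code: for non-NSA gfx10 encodings the
+// rounding performed in convertMIMGInst below is equivalent to
+//   unsigned RoundedAddrSize(unsigned N) { return N > 8 ? 16 : N > 4 ? 8 : N; }
+// because vaddr register tuples are only available in 1-4, 8, or 16 dword
+// widths.)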
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), @@ -295,7 +477,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); - + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dmask); @@ -308,16 +491,42 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { assert(DMaskIdx != -1); assert(TFEIdx != -1); + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); bool IsAtomic = (VDstIdx != -1); bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4; - unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf; - if (DMask == 0) - return MCDisassembler::Success; + bool IsNSA = false; + unsigned AddrSize = Info->VAddrDwords; + + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { + unsigned DimIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + const AMDGPU::MIMGDimInfo *Dim = + AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm()); + + AddrSize = BaseOpcode->NumExtraArgs + + (BaseOpcode->Gradients ? Dim->NumGradients : 0) + + (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA; + if (!IsNSA) { + if (AddrSize > 8) + AddrSize = 16; + else if (AddrSize > 4) + AddrSize = 8; + } else { + if (AddrSize > Info->VAddrDwords) { + // The NSA encoding does not contain enough operands for the combination + // of base opcode / dimension. Should this be an error? + return MCDisassembler::Success; + } + } + } - unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask); - if (DstSize == 1) - return MCDisassembler::Success; + unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf; + unsigned DstSize = IsGather4 ? 4 : std::max(countPopulation(DMask), 1u); bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm(); if (D16 && AMDGPU::hasPackedD16(STI)) { @@ -328,44 +537,64 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { if (MI.getOperand(TFEIdx).getImm()) return MCDisassembler::Success; - int NewOpcode = -1; + if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords) + return MCDisassembler::Success; + + int NewOpcode = + AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize); + if (NewOpcode == -1) + return MCDisassembler::Success; - if (IsGather4) { - if (D16 && AMDGPU::hasPackedD16(STI)) - NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2); - else + // Widen the register to the correct number of enabled channels. + unsigned NewVdata = AMDGPU::NoRegister; + if (DstSize != Info->VDataDwords) { + auto DataRCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass; + + // Get first subregister of VData + unsigned Vdata0 = MI.getOperand(VDataIdx).getReg(); + unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0); + Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0; + + NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, + &MRI.getRegClass(DataRCID)); + if (NewVdata == AMDGPU::NoRegister) { + // It's possible to encode this such that the low register + enabled + // components exceeds the register count. 
return MCDisassembler::Success; - } else { - NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize); - if (NewOpcode == -1) + } + } + + unsigned NewVAddr0 = AMDGPU::NoRegister; + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10] && !IsNSA && + AddrSize != Info->VAddrDwords) { + unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg(); + unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0); + VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0; + + auto AddrRCID = MCII->get(NewOpcode).OpInfo[VAddr0Idx].RegClass; + NewVAddr0 = MRI.getMatchingSuperReg(VAddr0, AMDGPU::sub0, + &MRI.getRegClass(AddrRCID)); + if (NewVAddr0 == AMDGPU::NoRegister) return MCDisassembler::Success; } - auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass; + MI.setOpcode(NewOpcode); - // Get first subregister of VData - unsigned Vdata0 = MI.getOperand(VDataIdx).getReg(); - unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0); - Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0; + if (NewVdata != AMDGPU::NoRegister) { + MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata); - // Widen the register to the correct number of enabled channels. - auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, - &MRI.getRegClass(RCID)); - if (NewVdata == AMDGPU::NoRegister) { - // It's possible to encode this such that the low register + enabled - // components exceeds the register count. - return MCDisassembler::Success; + if (IsAtomic) { + // Atomic operations have an additional operand (a copy of data) + MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata); + } } - MI.setOpcode(NewOpcode); - // vaddr will be always appear as a single VGPR. This will look different than - // how it is usually emitted because the number of register components is not - // in the instruction encoding. 
- MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata); - - if (IsAtomic) { - // Atomic operations have an additional operand (a copy of data) - MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata); + if (NewVAddr0 != AMDGPU::NoRegister) { + MI.getOperand(VAddr0Idx) = MCOperand::createReg(NewVAddr0); + } else if (IsNSA) { + assert(AddrSize <= Info->VAddrDwords); + MI.erase(MI.begin() + VAddr0Idx + AddrSize, + MI.begin() + VAddr0Idx + Info->VAddrDwords); } return MCDisassembler::Success; @@ -470,6 +699,34 @@ MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { return createRegOperand(AMDGPU::VGPR_32RegClassID, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VRegOrLds_32(unsigned Val) const { + return decodeSrcOp(OPW32, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AGPR_32(unsigned Val) const { + return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AReg_1024(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_1024RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AV_32(unsigned Val) const { + return decodeSrcOp(OPW32, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AV_64(unsigned Val) const { + return decodeSrcOp(OPW64, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const { return createRegOperand(AMDGPU::VReg_64RegClassID, Val); } @@ -482,6 +739,14 @@ MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const { return createRegOperand(AMDGPU::VReg_128RegClassID, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VReg_256(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_256RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_512RegClassID, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const { // table-gen generated disassembler doesn't care about operand types // leaving only registry class so SSrc_32 operand turns into SReg_32 @@ -501,6 +766,13 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XEXEC_HI( return decodeOperand_SReg_32(Val); } +MCOperand AMDGPUDisassembler::decodeOperand_SRegOrLds_32(unsigned Val) const { + // table-gen generated disassembler doesn't care about operand types + // leaving only registry class so SSrc_32 operand turns into SReg_32 + // and therefore we accept immediates and literals here as well + return decodeSrcOp(OPW32, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } @@ -628,6 +900,9 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) { // ToDo: case 248: 1/(2*PI) - is allowed only on VI switch (Width) { case OPW32: + case OPW128: // splat constants + case OPW512: + case OPW1024: return MCOperand::createImm(getInlineImmVal32(Imm)); case OPW64: return MCOperand::createImm(getInlineImmVal64(Imm)); @@ -654,6 +929,24 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { } } +unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const { + using namespace AMDGPU; + + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); + switch 
(Width) { + default: // fall + case OPW32: + case OPW16: + case OPWV216: + return AGPR_32RegClassID; + case OPW64: return AReg_64RegClassID; + case OPW128: return AReg_128RegClassID; + case OPW512: return AReg_512RegClassID; + case OPW1024: return AReg_1024RegClassID; + } +} + + unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { using namespace AMDGPU; @@ -691,8 +984,10 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const { using namespace AMDGPU::EncValues; - unsigned TTmpMin = isGFX9() ? TTMP_GFX9_MIN : TTMP_VI_MIN; - unsigned TTmpMax = isGFX9() ? TTMP_GFX9_MAX : TTMP_VI_MAX; + unsigned TTmpMin = + (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MIN : TTMP_VI_MIN; + unsigned TTmpMax = + (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MAX : TTMP_VI_MAX; return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1; } @@ -700,10 +995,14 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const { MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const { using namespace AMDGPU::EncValues; - assert(Val < 512); // enum9 + assert(Val < 1024); // enum10 + + bool IsAGPR = Val & 512; + Val &= 511; if (VGPR_MIN <= Val && Val <= VGPR_MAX) { - return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN); + return createRegOperand(IsAGPR ? getAgprClassId(Width) + : getVgprClassId(Width), Val - VGPR_MIN); } if (Val <= SGPR_MAX) { assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning. @@ -765,23 +1064,23 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { case 105: return createRegOperand(XNACK_MASK_HI); case 106: return createRegOperand(VCC_LO); case 107: return createRegOperand(VCC_HI); - case 108: assert(!isGFX9()); return createRegOperand(TBA_LO); - case 109: assert(!isGFX9()); return createRegOperand(TBA_HI); - case 110: assert(!isGFX9()); return createRegOperand(TMA_LO); - case 111: assert(!isGFX9()); return createRegOperand(TMA_HI); + case 108: return createRegOperand(TBA_LO); + case 109: return createRegOperand(TBA_HI); + case 110: return createRegOperand(TMA_LO); + case 111: return createRegOperand(TMA_HI); case 124: return createRegOperand(M0); + case 125: return createRegOperand(SGPR_NULL); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); case 235: return createRegOperand(SRC_SHARED_BASE); case 236: return createRegOperand(SRC_SHARED_LIMIT); case 237: return createRegOperand(SRC_PRIVATE_BASE); case 238: return createRegOperand(SRC_PRIVATE_LIMIT); - // TODO: SRC_POPS_EXITING_WAVE_ID - // ToDo: no support for vccz register - case 251: break; - // ToDo: no support for execz register - case 252: break; - case 253: return createRegOperand(SCC); + case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID); + case 251: return createRegOperand(SRC_VCCZ); + case 252: return createRegOperand(SRC_EXECZ); + case 253: return createRegOperand(SRC_SCC); + case 254: return createRegOperand(LDS_DIRECT); default: break; } return errOperand(Val, "unknown operand encoding " + Twine(Val)); @@ -794,9 +1093,17 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { case 102: return createRegOperand(FLAT_SCR); case 104: return createRegOperand(XNACK_MASK); case 106: return createRegOperand(VCC); - case 108: assert(!isGFX9()); return createRegOperand(TBA); - case 110: assert(!isGFX9()); return createRegOperand(TMA); + case 108: return createRegOperand(TBA); + case 110: return 
createRegOperand(TMA); case 126: return createRegOperand(EXEC); + case 235: return createRegOperand(SRC_SHARED_BASE); + case 236: return createRegOperand(SRC_SHARED_LIMIT); + case 237: return createRegOperand(SRC_PRIVATE_BASE); + case 238: return createRegOperand(SRC_PRIVATE_LIMIT); + case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID); + case 251: return createRegOperand(SRC_VCCZ); + case 252: return createRegOperand(SRC_EXECZ); + case 253: return createRegOperand(SRC_SCC); default: break; } return errOperand(Val, "unknown operand encoding " + Twine(Val)); @@ -807,16 +1114,18 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, using namespace AMDGPU::SDWA; using namespace AMDGPU::EncValues; - if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { - // XXX: static_cast is needed to avoid stupid warning: + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] || + STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { + // XXX: cast to int is needed to avoid stupid warning: // compare with unsigned is always true - if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast(Val) && + if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) && Val <= SDWA9EncValues::SRC_VGPR_MAX) { return createRegOperand(getVgprClassId(Width), Val - SDWA9EncValues::SRC_VGPR_MIN); } if (SDWA9EncValues::SRC_SGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_SGPR_MAX) { + Val <= (isGFX10() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10 + : SDWA9EncValues::SRC_SGPR_MAX_SI)) { return createSRegOperand(getSgprClassId(Width), Val - SDWA9EncValues::SRC_SGPR_MIN); } @@ -852,24 +1161,34 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const { MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { using namespace AMDGPU::SDWA; - assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] && - "SDWAVopcDst should be present only on GFX9"); + assert((STI.getFeatureBits()[AMDGPU::FeatureGFX9] || + STI.getFeatureBits()[AMDGPU::FeatureGFX10]) && + "SDWAVopcDst should be present only on GFX9+"); + + bool IsWave64 = STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64]; + if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; int TTmpIdx = getTTmpIdx(Val); if (TTmpIdx >= 0) { return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx); - } else if (Val > AMDGPU::EncValues::SGPR_MAX) { - return decodeSpecialReg64(Val); + } else if (Val > SGPR_MAX) { + return IsWave64 ? decodeSpecialReg64(Val) + : decodeSpecialReg32(Val); } else { - return createSRegOperand(getSgprClassId(OPW64), Val); + return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val); } } else { - return createRegOperand(AMDGPU::VCC); + return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO); } } +MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const { + return STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? 
+ decodeOperand_SReg_64(Val) : decodeOperand_SReg_32(Val); +} + bool AMDGPUDisassembler::isVI() const { return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; } @@ -878,6 +1197,10 @@ bool AMDGPUDisassembler::isGFX9() const { return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; } +bool AMDGPUDisassembler::isGFX10() const { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 75cfc5e11282..c5eaba615c2a 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -1,9 +1,8 @@ //===- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -42,15 +41,14 @@ class AMDGPUDisassembler : public MCDisassembler { private: std::unique_ptr const MCII; const MCRegisterInfo &MRI; + const unsigned TargetMaxInstBytes; mutable ArrayRef Bytes; mutable uint32_t Literal; mutable bool HasLiteral; public: AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, - MCInstrInfo const *MCII) : - MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()) {} - + MCInstrInfo const *MCII); ~AMDGPUDisassembler() override = default; DecodeStatus getInstruction(MCInst &MI, uint64_t &Size, @@ -69,9 +67,12 @@ public: uint64_t Address) const; DecodeStatus convertSDWAInst(MCInst &MI) const; + DecodeStatus convertDPP8Inst(MCInst &MI) const; DecodeStatus convertMIMGInst(MCInst &MI) const; MCOperand decodeOperand_VGPR_32(unsigned Val) const; + MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const; + MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; MCOperand decodeOperand_VS_128(unsigned Val) const; @@ -81,22 +82,33 @@ public: MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; MCOperand decodeOperand_VReg_128(unsigned Val) const; + MCOperand decodeOperand_VReg_256(unsigned Val) const; + MCOperand decodeOperand_VReg_512(unsigned Val) const; MCOperand decodeOperand_SReg_32(unsigned Val) const; MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const; MCOperand decodeOperand_SReg_32_XEXEC_HI(unsigned Val) const; + MCOperand decodeOperand_SRegOrLds_32(unsigned Val) const; MCOperand decodeOperand_SReg_64(unsigned Val) const; MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const; MCOperand decodeOperand_SReg_128(unsigned Val) const; MCOperand decodeOperand_SReg_256(unsigned Val) const; MCOperand decodeOperand_SReg_512(unsigned Val) const; + MCOperand decodeOperand_AGPR_32(unsigned Val) const; + MCOperand decodeOperand_AReg_128(unsigned Val) const; + MCOperand decodeOperand_AReg_512(unsigned Val) const; + MCOperand decodeOperand_AReg_1024(unsigned Val) const; + MCOperand decodeOperand_AV_32(unsigned Val) const; + MCOperand decodeOperand_AV_64(unsigned Val) const; + enum 
OpWidthTy { OPW32, OPW64, OPW128, OPW256, OPW512, + OPW1024, OPW16, OPWV216, OPW_LAST_, @@ -104,6 +116,7 @@ public: }; unsigned getVgprClassId(const OpWidthTy Width) const; + unsigned getAgprClassId(const OpWidthTy Width) const; unsigned getSgprClassId(const OpWidthTy Width) const; unsigned getTtmpClassId(const OpWidthTy Width) const; @@ -121,11 +134,14 @@ public: MCOperand decodeSDWASrc32(unsigned Val) const; MCOperand decodeSDWAVopcDst(unsigned Val) const; + MCOperand decodeBoolReg(unsigned Val) const; + int getTTmpIdx(unsigned Val) const; bool isVI() const; bool isGFX9() const; - }; + bool isGFX10() const; +}; //===----------------------------------------------------------------------===// // AMDGPUSymbolizer diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 944f4ffe598d..0550092ce1d6 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -1,9 +1,8 @@ //===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 44040d352e6a..889f60dae920 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -1,17 +1,16 @@ //===-- FLATInstructions.td - FLAT Instruction Defintions -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -def FLATAtomic : ComplexPattern; -def FLATOffset : ComplexPattern", [], [], -10>; +def FLATAtomic : ComplexPattern; +def FLATOffset : ComplexPattern", [], [SDNPWantRoot], -10>; -def FLATOffsetSigned : ComplexPattern", [], [], -10>; -def FLATSignedAtomic : ComplexPattern; +def FLATOffsetSigned : ComplexPattern", [], [SDNPWantRoot], -10>; +def FLATSignedAtomic : ComplexPattern; //===----------------------------------------------------------------------===// // FLAT classes @@ -52,6 +51,8 @@ class FLAT_Pseudo has_data = 1; bits<1> has_glc = 1; bits<1> glcValue = 0; + bits<1> has_dlc = 1; + bits<1> dlcValue = 0; let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts, !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace)); @@ -64,6 +65,8 @@ class FLAT_Pseudo op, FLAT_Pseudo ps> : @@ -87,6 +90,7 @@ class FLAT_Real op, FLAT_Pseudo ps> : bits<1> slc; bits<1> glc; + bits<1> dlc; // Only valid on gfx9 bits<1> lds = 0; // XXX - What does this actually do? @@ -131,18 +135,16 @@ class GlobalSaddrTable { // saddr is 32-bit (which isn't handled here yet). 
class FLAT_Load_Pseudo : FLAT_Pseudo< + bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs regClass:$vdst), !con( !con( - !con( - !con((ins VReg_64:$vaddr), - !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), - (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), - (ins GLC:$glc, SLC:$slc)), - !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), - " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { + !con((ins VReg_64:$vaddr), + !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), + (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), + " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> { let has_data = 0; let mayLoad = 1; let has_saddr = HasSaddr; @@ -155,16 +157,14 @@ class FLAT_Load_Pseudo : FLAT_Pseudo< + bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs), !con( - !con( - !con((ins VReg_64:$vaddr, vdataClass:$vdata), - !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), - (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), - (ins GLC:$glc, SLC:$slc)), - " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { + !con((ins VReg_64:$vaddr, vdataClass:$vdata), + !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), + (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -176,18 +176,18 @@ class FLAT_Store_Pseudo { let is_flat_global = 1 in { - def "" : FLAT_Load_Pseudo, + def "" : FLAT_Load_Pseudo, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Load_Pseudo, + def _SADDR : FLAT_Load_Pseudo, GlobalSaddrTable<1, opName>; } } multiclass FLAT_Global_Store_Pseudo { let is_flat_global = 1 in { - def "" : FLAT_Store_Pseudo, + def "" : FLAT_Store_Pseudo, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Store_Pseudo, + def _SADDR : FLAT_Store_Pseudo, GlobalSaddrTable<1, opName>; } } @@ -197,9 +197,9 @@ class FLAT_Scratch_Load_Pseudo { + (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc), + (ins VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc$dlc"> { let has_data = 0; let mayLoad = 1; let has_saddr = 1; @@ -213,9 +213,9 @@ class FLAT_Scratch_Store_Pseudo { + (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc), + (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -247,6 +247,8 @@ class FLAT_AtomicNoRet_Pseudo { + RegisterClass data_rc = vdst_rc, + bit isFP = getIsFP.ret> { def "" : FLAT_AtomicNoRet_Pseudo , GlobalSaddrTable<0, opName>, AtomicNoRet { let PseudoInstr = NAME; + let FPAtomic = isFP; } def _RTN : FLAT_AtomicRet_Pseudo , GlobalSaddrTable<0, opName#"_rtn">, - AtomicNoRet ; + AtomicNoRet { + let FPAtomic = isFP; + } } multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< @@ -292,27 +299,30 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< ValueType vt, SDPatternOperator atomic = null_frag, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc> { + RegisterClass data_rc = vdst_rc, + bit isFP = getIsFP.ret> { def "" : 
FLAT_AtomicNoRet_Pseudo , GlobalSaddrTable<0, opName>, AtomicNoRet { let has_saddr = 1; let PseudoInstr = NAME; + let FPAtomic = isFP; } def _SADDR : FLAT_AtomicNoRet_Pseudo , GlobalSaddrTable<1, opName>, AtomicNoRet { let has_saddr = 1; let enabled_saddr = 1; let PseudoInstr = NAME#"_SADDR"; + let FPAtomic = isFP; } } @@ -322,28 +332,31 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< ValueType vt, SDPatternOperator atomic = null_frag, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc> { + RegisterClass data_rc = vdst_rc, + bit isFP = getIsFP.ret> { def _RTN : FLAT_AtomicRet_Pseudo , GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet { let has_saddr = 1; + let FPAtomic = isFP; } def _SADDR_RTN : FLAT_AtomicRet_Pseudo , GlobalSaddrTable<1, opName#"_rtn">, AtomicNoRet { let has_saddr = 1; let enabled_saddr = 1; let PseudoInstr = NAME#"_SADDR_RTN"; + let FPAtomic = isFP; } } @@ -491,7 +504,8 @@ defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2", defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2", VReg_64, i64, atomic_dec_flat>; -let SubtargetPredicate = isCI in { // CI Only flat instructions : FIXME Only? +// GFX7-, GFX10-only flat instructions. +let SubtargetPredicate = isGFX7GFX10 in { defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>; @@ -511,7 +525,7 @@ defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2", defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", VReg_64, f64>; -} // End SubtargetPredicate = isCI +} // End SubtargetPredicate = isGFX7GFX10 let SubtargetPredicate = HasFlatGlobalInsts in { defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; @@ -654,6 +668,32 @@ defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_shor } // End SubtargetPredicate = HasFlatScratchInsts +let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { + defm GLOBAL_ATOMIC_FCMPSWAP : + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32>; + defm GLOBAL_ATOMIC_FMIN : + FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>; + defm GLOBAL_ATOMIC_FMAX : + FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>; + defm GLOBAL_ATOMIC_FCMPSWAP_X2 : + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64>; + defm GLOBAL_ATOMIC_FMIN_X2 : + FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>; + defm GLOBAL_ATOMIC_FMAX_X2 : + FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>; +} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1 + +let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in { + +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_add_f32", VGPR_32, f32, atomic_add_global +>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global +>; + +} // End SubtargetPredicate = HasAtomicFaddInsts + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -661,89 +701,51 @@ defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_shor // Patterns for global loads with no offset. 
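 // (Editorial note: each GCNPat below matches a source DAG such as
 // (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))) and rewrites it
 // to a concrete machine instruction; the result patterns gain one more
 // constant-0 operand in this change because the real instructions grew a
 // dlc cache-control input.)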
class FlatLoadPat : GCNPat < (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), - (inst $vaddr, $offset, 0, $slc) + (inst $vaddr, $offset, 0, 0, $slc) >; -multiclass FlatLoadPat_Hi16 { - def : GCNPat < - (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))), - (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0)) - >; - - def : GCNPat < - (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))), - (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0)) - >; -} - -multiclass FlatSignedLoadPat_Hi16 { - def : GCNPat < - (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))), - (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0)) - >; - - def : GCNPat < - (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))), - (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0)) - >; -} - -multiclass FlatLoadPat_Lo16 { - def : GCNPat < - (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))), - (v2i16 (inst $vaddr, $offset, 0, $slc, $hi)) - >; - - def : GCNPat < - (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))), - (v2f16 (inst $vaddr, $offset, 0, $slc, $hi)) - >; -} - -multiclass FlatSignedLoadPat_Lo16 { - def : GCNPat < - (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))), - (v2i16 (inst $vaddr, $offset, 0, $slc, $hi)) - >; +class FlatLoadPat_D16 : GCNPat < + (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in), + (inst $vaddr, $offset, 0, 0, $slc, $in) +>; - def : GCNPat < - (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))), - (v2f16 (inst $vaddr, $offset, 0, $slc, $hi)) - >; -} +class FlatSignedLoadPat_D16 : GCNPat < + (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in), + (inst $vaddr, $offset, 0, 0, $slc, $in) +>; class FlatLoadAtomicPat : GCNPat < - (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))), - (inst $vaddr, $offset, 0, $slc) + (vt (node (FLATAtomic (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))), + (inst $vaddr, $offset, 0, 0, $slc) >; class FlatLoadSignedPat : GCNPat < - (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), - (inst $vaddr, $offset, 0, $slc) + (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))), + (inst $vaddr, $offset, 0, 0, $slc) >; -class FlatStorePat : GCNPat < +class FlatStorePat : GCNPat < (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)), - (inst $vaddr, $data, $offset, 0, $slc) + (inst $vaddr, rc:$data, $offset, 0, 0, $slc) >; -class FlatStoreSignedPat : GCNPat < +class FlatStoreSignedPat : GCNPat < (node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)), - (inst $vaddr, $data, $offset, 0, $slc) + (inst $vaddr, rc:$data, $offset, 0, 0, $slc) >; -class FlatStoreAtomicPat : GCNPat < +class FlatStoreAtomicPat : GCNPat < // atomic store follows atomic binop convention so the address comes // first. (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), - (inst $vaddr, $data, $offset, 0, $slc) + (inst $vaddr, rc:$data, $offset, 0, 0, $slc) >; -class FlatStoreSignedAtomicPat : GCNPat < +class FlatStoreSignedAtomicPat : GCNPat < // atomic store follows atomic binop convention so the address comes // first. 
(node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), - (inst $vaddr, $data, $offset, 0, $slc) + (inst $vaddr, rc:$data, $offset, 0, 0, $slc) >; class FlatAtomicPat ; +class FlatAtomicPatNoRtn : GCNPat < + (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), + (inst $vaddr, $data, $offset, $slc) +>; + class FlatSignedAtomicPat : GCNPat < (vt (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)), @@ -760,28 +767,33 @@ class FlatSignedAtomicPat ; +def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; -def : FlatLoadPat ; +def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; -def : FlatLoadPat ; +def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; -def : FlatLoadAtomicPat ; -def : FlatLoadAtomicPat ; +def : FlatLoadAtomicPat ; +def : FlatLoadAtomicPat ; def : FlatStorePat ; def : FlatStorePat ; def : FlatStorePat ; -def : FlatStorePat ; -def : FlatStorePat ; +def : FlatStorePat ; +def : FlatStorePat ; +def : FlatStorePat ; -def : FlatStoreAtomicPat ; -def : FlatStoreAtomicPat ; +def : FlatStoreAtomicPat ; +def : FlatStoreAtomicPat ; def : FlatAtomicPat ; def : FlatAtomicPat ; @@ -818,62 +830,77 @@ let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatStorePat ; def : FlatStorePat ; -let AddedComplexity = 3 in { -defm : FlatLoadPat_Hi16 ; -defm : FlatLoadPat_Hi16 ; -defm : FlatLoadPat_Hi16 ; -} - -let AddedComplexity = 9 in { -defm : FlatLoadPat_Lo16 ; -defm : FlatLoadPat_Lo16 ; -defm : FlatLoadPat_Lo16 ; -} +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; + +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; +def : FlatLoadPat_D16 ; } } // End OtherPredicates = [HasFlatAddressSpace] +def atomic_fadd_global : global_binary_atomic_op_frag; +def atomic_pk_fadd_global : global_binary_atomic_op_frag; + let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in { -def : FlatLoadSignedPat ; +def : FlatLoadSignedPat ; +def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; +def : FlatLoadSignedPat ; +def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; +def : FlatLoadSignedPat ; +def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; +def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; -def : FlatLoadAtomicPat ; -def : FlatLoadAtomicPat ; +def : FlatLoadAtomicPat ; +def : FlatLoadAtomicPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; -defm : FlatSignedLoadPat_Hi16 ; -defm : FlatSignedLoadPat_Hi16 ; -defm : FlatSignedLoadPat_Hi16 ; - -defm : FlatSignedLoadPat_Lo16 ; -defm : FlatSignedLoadPat_Lo16 ; -defm : FlatSignedLoadPat_Lo16 ; - +def : FlatSignedLoadPat_D16 ; +def : FlatSignedLoadPat_D16 ; +def : FlatSignedLoadPat_D16 ; +def : 
FlatSignedLoadPat_D16 ; +def : FlatSignedLoadPat_D16 ; +def : FlatSignedLoadPat_D16 ; + +def : FlatSignedLoadPat_D16 ; +def : FlatSignedLoadPat_D16 ; +def : FlatSignedLoadPat_D16 ; +def : FlatSignedLoadPat_D16 ; +def : FlatSignedLoadPat_D16 ; +def : FlatSignedLoadPat_D16 ; } def : FlatStoreSignedAtomicPat ; -def : FlatStoreSignedAtomicPat ; +def : FlatStoreSignedAtomicPat ; def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; @@ -903,7 +930,10 @@ def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; -} // End OtherPredicates = [HasFlatGlobalInsts] +def : FlatAtomicPatNoRtn ; +def : FlatAtomicPatNoRtn ; + +} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 //===----------------------------------------------------------------------===// @@ -917,8 +947,8 @@ def : FlatSignedAtomicPat ; class FLAT_Real_ci op, FLAT_Pseudo ps> : FLAT_Real , SIMCInstr { - let AssemblerPredicate = isCIOnly; - let DecoderNamespace="CI"; + let AssemblerPredicate = isGFX7Only; + let DecoderNamespace="GFX7"; } def FLAT_LOAD_UBYTE_ci : FLAT_Real_ci <0x8, FLAT_LOAD_UBYTE>; @@ -985,8 +1015,8 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2 class FLAT_Real_vi op, FLAT_Pseudo ps> : FLAT_Real , SIMCInstr { - let AssemblerPredicate = isVI; - let DecoderNamespace="VI"; + let AssemblerPredicate = isGFX8GFX9; + let DecoderNamespace = "GFX8"; } multiclass FLAT_Real_AllAddr_vi op> { @@ -1133,3 +1163,200 @@ defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>; defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; + + +//===----------------------------------------------------------------------===// +// GFX10. +//===----------------------------------------------------------------------===// + +class FLAT_Real_gfx10 op, FLAT_Pseudo ps> : + FLAT_Real, SIMCInstr { + let AssemblerPredicate = isGFX10Plus; + let DecoderNamespace = "GFX10"; + + let Inst{11-0} = {offset{12}, offset{10-0}}; + let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue); + let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d); + let Inst{55} = 0; +} + + +multiclass FLAT_Real_Base_gfx10 op> { + def _gfx10 : + FLAT_Real_gfx10(NAME)>; +} + +multiclass FLAT_Real_RTN_gfx10 op> { + def _RTN_gfx10 : + FLAT_Real_gfx10(NAME#"_RTN")>; +} + +multiclass FLAT_Real_SADDR_gfx10 op> { + def _SADDR_gfx10 : + FLAT_Real_gfx10(NAME#"_SADDR")>; +} + +multiclass FLAT_Real_SADDR_RTN_gfx10 op> { + def _SADDR_RTN_gfx10 : + FLAT_Real_gfx10(NAME#"_SADDR_RTN")>; +} + + +multiclass FLAT_Real_AllAddr_gfx10 op> : + FLAT_Real_Base_gfx10, + FLAT_Real_SADDR_gfx10; + +multiclass FLAT_Real_Atomics_gfx10 op> : + FLAT_Real_Base_gfx10, + FLAT_Real_RTN_gfx10; + +multiclass FLAT_Real_GlblAtomics_gfx10 op> : + FLAT_Real_AllAddr_gfx10, + FLAT_Real_RTN_gfx10, + FLAT_Real_SADDR_RTN_gfx10; + + +// ENC_FLAT. 
+defm FLAT_LOAD_UBYTE            : FLAT_Real_Base_gfx10<0x008>;
+defm FLAT_LOAD_SBYTE            : FLAT_Real_Base_gfx10<0x009>;
+defm FLAT_LOAD_USHORT           : FLAT_Real_Base_gfx10<0x00a>;
+defm FLAT_LOAD_SSHORT           : FLAT_Real_Base_gfx10<0x00b>;
+defm FLAT_LOAD_DWORD            : FLAT_Real_Base_gfx10<0x00c>;
+defm FLAT_LOAD_DWORDX2          : FLAT_Real_Base_gfx10<0x00d>;
+defm FLAT_LOAD_DWORDX4          : FLAT_Real_Base_gfx10<0x00e>;
+defm FLAT_LOAD_DWORDX3          : FLAT_Real_Base_gfx10<0x00f>;
+defm FLAT_STORE_BYTE            : FLAT_Real_Base_gfx10<0x018>;
+defm FLAT_STORE_BYTE_D16_HI     : FLAT_Real_Base_gfx10<0x019>;
+defm FLAT_STORE_SHORT           : FLAT_Real_Base_gfx10<0x01a>;
+defm FLAT_STORE_SHORT_D16_HI    : FLAT_Real_Base_gfx10<0x01b>;
+defm FLAT_STORE_DWORD           : FLAT_Real_Base_gfx10<0x01c>;
+defm FLAT_STORE_DWORDX2         : FLAT_Real_Base_gfx10<0x01d>;
+defm FLAT_STORE_DWORDX4         : FLAT_Real_Base_gfx10<0x01e>;
+defm FLAT_STORE_DWORDX3         : FLAT_Real_Base_gfx10<0x01f>;
+defm FLAT_LOAD_UBYTE_D16        : FLAT_Real_Base_gfx10<0x020>;
+defm FLAT_LOAD_UBYTE_D16_HI     : FLAT_Real_Base_gfx10<0x021>;
+defm FLAT_LOAD_SBYTE_D16        : FLAT_Real_Base_gfx10<0x022>;
+defm FLAT_LOAD_SBYTE_D16_HI     : FLAT_Real_Base_gfx10<0x023>;
+defm FLAT_LOAD_SHORT_D16        : FLAT_Real_Base_gfx10<0x024>;
+defm FLAT_LOAD_SHORT_D16_HI     : FLAT_Real_Base_gfx10<0x025>;
+defm FLAT_ATOMIC_SWAP           : FLAT_Real_Atomics_gfx10<0x030>;
+defm FLAT_ATOMIC_CMPSWAP        : FLAT_Real_Atomics_gfx10<0x031>;
+defm FLAT_ATOMIC_ADD            : FLAT_Real_Atomics_gfx10<0x032>;
+defm FLAT_ATOMIC_SUB            : FLAT_Real_Atomics_gfx10<0x033>;
+defm FLAT_ATOMIC_SMIN           : FLAT_Real_Atomics_gfx10<0x035>;
+defm FLAT_ATOMIC_UMIN           : FLAT_Real_Atomics_gfx10<0x036>;
+defm FLAT_ATOMIC_SMAX           : FLAT_Real_Atomics_gfx10<0x037>;
+defm FLAT_ATOMIC_UMAX           : FLAT_Real_Atomics_gfx10<0x038>;
+defm FLAT_ATOMIC_AND            : FLAT_Real_Atomics_gfx10<0x039>;
+defm FLAT_ATOMIC_OR             : FLAT_Real_Atomics_gfx10<0x03a>;
+defm FLAT_ATOMIC_XOR            : FLAT_Real_Atomics_gfx10<0x03b>;
+defm FLAT_ATOMIC_INC            : FLAT_Real_Atomics_gfx10<0x03c>;
+defm FLAT_ATOMIC_DEC            : FLAT_Real_Atomics_gfx10<0x03d>;
+defm FLAT_ATOMIC_FCMPSWAP       : FLAT_Real_Atomics_gfx10<0x03e>;
+defm FLAT_ATOMIC_FMIN           : FLAT_Real_Atomics_gfx10<0x03f>;
+defm FLAT_ATOMIC_FMAX           : FLAT_Real_Atomics_gfx10<0x040>;
+defm FLAT_ATOMIC_SWAP_X2        : FLAT_Real_Atomics_gfx10<0x050>;
+defm FLAT_ATOMIC_CMPSWAP_X2     : FLAT_Real_Atomics_gfx10<0x051>;
+defm FLAT_ATOMIC_ADD_X2         : FLAT_Real_Atomics_gfx10<0x052>;
+defm FLAT_ATOMIC_SUB_X2         : FLAT_Real_Atomics_gfx10<0x053>;
+defm FLAT_ATOMIC_SMIN_X2        : FLAT_Real_Atomics_gfx10<0x055>;
+defm FLAT_ATOMIC_UMIN_X2        : FLAT_Real_Atomics_gfx10<0x056>;
+defm FLAT_ATOMIC_SMAX_X2        : FLAT_Real_Atomics_gfx10<0x057>;
+defm FLAT_ATOMIC_UMAX_X2        : FLAT_Real_Atomics_gfx10<0x058>;
+defm FLAT_ATOMIC_AND_X2         : FLAT_Real_Atomics_gfx10<0x059>;
+defm FLAT_ATOMIC_OR_X2          : FLAT_Real_Atomics_gfx10<0x05a>;
+defm FLAT_ATOMIC_XOR_X2         : FLAT_Real_Atomics_gfx10<0x05b>;
+defm FLAT_ATOMIC_INC_X2         : FLAT_Real_Atomics_gfx10<0x05c>;
+defm FLAT_ATOMIC_DEC_X2         : FLAT_Real_Atomics_gfx10<0x05d>;
+defm FLAT_ATOMIC_FCMPSWAP_X2    : FLAT_Real_Atomics_gfx10<0x05e>;
+defm FLAT_ATOMIC_FMIN_X2        : FLAT_Real_Atomics_gfx10<0x05f>;
+defm FLAT_ATOMIC_FMAX_X2        : FLAT_Real_Atomics_gfx10<0x060>;
+
+
+// ENC_FLAT_GLBL.
+defm GLOBAL_LOAD_UBYTE          : FLAT_Real_AllAddr_gfx10<0x008>;
+defm GLOBAL_LOAD_SBYTE          : FLAT_Real_AllAddr_gfx10<0x009>;
+defm GLOBAL_LOAD_USHORT         : FLAT_Real_AllAddr_gfx10<0x00a>;
+defm GLOBAL_LOAD_SSHORT         : FLAT_Real_AllAddr_gfx10<0x00b>;
+defm GLOBAL_LOAD_DWORD          : FLAT_Real_AllAddr_gfx10<0x00c>;
+defm GLOBAL_LOAD_DWORDX2        : FLAT_Real_AllAddr_gfx10<0x00d>;
+defm GLOBAL_LOAD_DWORDX4        : FLAT_Real_AllAddr_gfx10<0x00e>;
+defm GLOBAL_LOAD_DWORDX3        : FLAT_Real_AllAddr_gfx10<0x00f>;
+defm GLOBAL_STORE_BYTE          : FLAT_Real_AllAddr_gfx10<0x018>;
+defm GLOBAL_STORE_BYTE_D16_HI   : FLAT_Real_AllAddr_gfx10<0x019>;
+defm GLOBAL_STORE_SHORT         : FLAT_Real_AllAddr_gfx10<0x01a>;
+defm GLOBAL_STORE_SHORT_D16_HI  : FLAT_Real_AllAddr_gfx10<0x01b>;
+defm GLOBAL_STORE_DWORD         : FLAT_Real_AllAddr_gfx10<0x01c>;
+defm GLOBAL_STORE_DWORDX2       : FLAT_Real_AllAddr_gfx10<0x01d>;
+defm GLOBAL_STORE_DWORDX4       : FLAT_Real_AllAddr_gfx10<0x01e>;
+defm GLOBAL_STORE_DWORDX3       : FLAT_Real_AllAddr_gfx10<0x01f>;
+defm GLOBAL_LOAD_UBYTE_D16      : FLAT_Real_AllAddr_gfx10<0x020>;
+defm GLOBAL_LOAD_UBYTE_D16_HI   : FLAT_Real_AllAddr_gfx10<0x021>;
+defm GLOBAL_LOAD_SBYTE_D16      : FLAT_Real_AllAddr_gfx10<0x022>;
+defm GLOBAL_LOAD_SBYTE_D16_HI   : FLAT_Real_AllAddr_gfx10<0x023>;
+defm GLOBAL_LOAD_SHORT_D16      : FLAT_Real_AllAddr_gfx10<0x024>;
+defm GLOBAL_LOAD_SHORT_D16_HI   : FLAT_Real_AllAddr_gfx10<0x025>;
+defm GLOBAL_ATOMIC_SWAP         : FLAT_Real_GlblAtomics_gfx10<0x030>;
+defm GLOBAL_ATOMIC_CMPSWAP      : FLAT_Real_GlblAtomics_gfx10<0x031>;
+defm GLOBAL_ATOMIC_ADD          : FLAT_Real_GlblAtomics_gfx10<0x032>;
+defm GLOBAL_ATOMIC_SUB          : FLAT_Real_GlblAtomics_gfx10<0x033>;
+defm GLOBAL_ATOMIC_SMIN         : FLAT_Real_GlblAtomics_gfx10<0x035>;
+defm GLOBAL_ATOMIC_UMIN         : FLAT_Real_GlblAtomics_gfx10<0x036>;
+defm GLOBAL_ATOMIC_SMAX         : FLAT_Real_GlblAtomics_gfx10<0x037>;
+defm GLOBAL_ATOMIC_UMAX         : FLAT_Real_GlblAtomics_gfx10<0x038>;
+defm GLOBAL_ATOMIC_AND          : FLAT_Real_GlblAtomics_gfx10<0x039>;
+defm GLOBAL_ATOMIC_OR           : FLAT_Real_GlblAtomics_gfx10<0x03a>;
+defm GLOBAL_ATOMIC_XOR          : FLAT_Real_GlblAtomics_gfx10<0x03b>;
+defm GLOBAL_ATOMIC_INC          : FLAT_Real_GlblAtomics_gfx10<0x03c>;
+defm GLOBAL_ATOMIC_DEC          : FLAT_Real_GlblAtomics_gfx10<0x03d>;
+defm GLOBAL_ATOMIC_FCMPSWAP     : FLAT_Real_GlblAtomics_gfx10<0x03e>;
+defm GLOBAL_ATOMIC_FMIN         : FLAT_Real_GlblAtomics_gfx10<0x03f>;
+defm GLOBAL_ATOMIC_FMAX         : FLAT_Real_GlblAtomics_gfx10<0x040>;
+defm GLOBAL_ATOMIC_SWAP_X2      : FLAT_Real_GlblAtomics_gfx10<0x050>;
+defm GLOBAL_ATOMIC_CMPSWAP_X2   : FLAT_Real_GlblAtomics_gfx10<0x051>;
+defm GLOBAL_ATOMIC_ADD_X2       : FLAT_Real_GlblAtomics_gfx10<0x052>;
+defm GLOBAL_ATOMIC_SUB_X2       : FLAT_Real_GlblAtomics_gfx10<0x053>;
+defm GLOBAL_ATOMIC_SMIN_X2      : FLAT_Real_GlblAtomics_gfx10<0x055>;
+defm GLOBAL_ATOMIC_UMIN_X2      : FLAT_Real_GlblAtomics_gfx10<0x056>;
+defm GLOBAL_ATOMIC_SMAX_X2      : FLAT_Real_GlblAtomics_gfx10<0x057>;
+defm GLOBAL_ATOMIC_UMAX_X2      : FLAT_Real_GlblAtomics_gfx10<0x058>;
+defm GLOBAL_ATOMIC_AND_X2       : FLAT_Real_GlblAtomics_gfx10<0x059>;
+defm GLOBAL_ATOMIC_OR_X2        : FLAT_Real_GlblAtomics_gfx10<0x05a>;
+defm GLOBAL_ATOMIC_XOR_X2       : FLAT_Real_GlblAtomics_gfx10<0x05b>;
+defm GLOBAL_ATOMIC_INC_X2       : FLAT_Real_GlblAtomics_gfx10<0x05c>;
+defm GLOBAL_ATOMIC_DEC_X2       : FLAT_Real_GlblAtomics_gfx10<0x05d>;
+defm GLOBAL_ATOMIC_FCMPSWAP_X2  : FLAT_Real_GlblAtomics_gfx10<0x05e>;
+defm GLOBAL_ATOMIC_FMIN_X2      : FLAT_Real_GlblAtomics_gfx10<0x05f>;
+defm GLOBAL_ATOMIC_FMAX_X2      : FLAT_Real_GlblAtomics_gfx10<0x060>;
+
+
+// ENC_FLAT_SCRATCH.
+defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>; +defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_gfx10<0x009>; +defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_gfx10<0x00a>; +defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_gfx10<0x00b>; +defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_gfx10<0x00c>; +defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x00d>; +defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x00e>; +defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x00f>; +defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_gfx10<0x018>; +defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x019>; +defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_gfx10<0x01a>; +defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x01b>; +defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_gfx10<0x01c>; +defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x01d>; +defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x01e>; +defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x01f>; +defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x020>; +defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x021>; +defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x022>; +defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x023>; +defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_gfx10<0x024>; +defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x025>; + +let SubtargetPredicate = HasAtomicFaddInsts in { + +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_AllAddr_vi <0x04d>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>; + +} // End SubtargetPredicate = HasAtomicFaddInsts diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp index 56071d0d2374..e1845e2e8e87 100644 --- a/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -1,37 +1,40 @@ //=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0 -// operand.If any of the use instruction cannot be combined with the mov the +// operand. If any of the use instructions cannot be combined with the mov, the // whole sequence is reverted. // // $old = ... // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane, -// dpp_controls..., $bound_ctrl -// $res = VALU $dpp_value, ...
+// dpp_controls..., $row_mask, $bank_mask, $bound_ctrl +// $res = VALU $dpp_value [, src1] // // to // -// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ..., -// dpp_controls..., $folded_bound_ctrl +// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,] +// dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl // // Combining rules : // -// $bound_ctrl is DPP_BOUND_ZERO, $old is any -// $bound_ctrl is DPP_BOUND_OFF, $old is 0 +// if $row_mask and $bank_mask are fully enabled (0xF) and +// $bound_ctrl==DPP_BOUND_ZERO or $old==0 +// -> $combined_old = undef, +// $combined_bound_ctrl = DPP_BOUND_ZERO // -// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO -// $bound_ctrl is DPP_BOUND_OFF, $old is undef +// if the VALU op is binary and +// $bound_ctrl==DPP_BOUND_OFF and +// $old==identity value (immediate) for the VALU op +// -> $combined_old = src1, +// $combined_bound_ctrl = DPP_BOUND_OFF // -// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF -// $bound_ctrl is DPP_BOUND_OFF, $old is foldable +// Otherwise cancel. // -// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF +// The mov_dpp instruction should reside in the same BB as all its uses //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -67,20 +70,16 @@ class GCNDPPCombine : public MachineFunctionPass { MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; - RegSubRegPair foldOldOpnd(MachineInstr &OrigMI, - RegSubRegPair OldOpndVGPR, - MachineOperand &OldOpndValue) const; - MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, - RegSubRegPair OldOpndVGPR, + RegSubRegPair CombOldVGPR, MachineOperand *OldOpnd, - bool BoundCtrlZero) const; + bool CombBCZ) const; MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, - RegSubRegPair OldOpndVGPR, - bool BoundCtrlZero) const; + RegSubRegPair CombOldVGPR, + bool CombBCZ) const; bool hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, @@ -153,8 +152,8 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, - RegSubRegPair OldOpndVGPR, - bool BoundCtrlZero) const { + RegSubRegPair CombOldVGPR, + bool CombBCZ) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg()); @@ -178,9 +177,15 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); if (OldIdx != -1) { assert(OldIdx == NumOperands); - assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI)); - DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg); + assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)); + DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg); ++NumOperands; + } else { + // TODO: this discards MAC/FMA instructions for now, let's add it later + LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction," + " TBD\n"); + Fail = true; + break; } if (auto *Mod0 = TII->getNamedOperand(OrigMI, @@ -199,6 +204,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, break; } DPPInst.add(*Src0); + DPPInst->getOperand(NumOperands).setIsKill(false); ++NumOperands; if (auto *Mod1 = TII->getNamedOperand(OrigMI, @@ -231,7 +237,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, 
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); - DPPInst.addImm(BoundCtrlZero ? 1 : 0); + DPPInst.addImm(CombBCZ ? 1 : 0); } while (false); if (Fail) { @@ -242,64 +248,81 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, return DPPInst.getInstr(); } -GCNDPPCombine::RegSubRegPair -GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI, - RegSubRegPair OldOpndVGPR, - MachineOperand &OldOpndValue) const { - assert(OldOpndValue.isImm()); - switch (OrigMI.getOpcode()) { +static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) { + assert(OldOpnd->isImm()); + switch (OrigMIOp) { default: break; + case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_U32_e64: + case AMDGPU::V_ADD_I32_e32: + case AMDGPU::V_ADD_I32_e64: + case AMDGPU::V_OR_B32_e32: + case AMDGPU::V_OR_B32_e64: + case AMDGPU::V_SUBREV_U32_e32: + case AMDGPU::V_SUBREV_U32_e64: + case AMDGPU::V_SUBREV_I32_e32: + case AMDGPU::V_SUBREV_I32_e64: case AMDGPU::V_MAX_U32_e32: - if (OldOpndValue.getImm() == std::numeric_limits::max()) - return OldOpndVGPR; + case AMDGPU::V_MAX_U32_e64: + case AMDGPU::V_XOR_B32_e32: + case AMDGPU::V_XOR_B32_e64: + if (OldOpnd->getImm() == 0) + return true; break; - case AMDGPU::V_MAX_I32_e32: - if (OldOpndValue.getImm() == std::numeric_limits::max()) - return OldOpndVGPR; + case AMDGPU::V_AND_B32_e32: + case AMDGPU::V_AND_B32_e64: + case AMDGPU::V_MIN_U32_e32: + case AMDGPU::V_MIN_U32_e64: + if (static_cast(OldOpnd->getImm()) == + std::numeric_limits::max()) + return true; break; case AMDGPU::V_MIN_I32_e32: - if (OldOpndValue.getImm() == std::numeric_limits::min()) - return OldOpndVGPR; + case AMDGPU::V_MIN_I32_e64: + if (static_cast(OldOpnd->getImm()) == + std::numeric_limits::max()) + return true; + break; + case AMDGPU::V_MAX_I32_e32: + case AMDGPU::V_MAX_I32_e64: + if (static_cast(OldOpnd->getImm()) == + std::numeric_limits::min()) + return true; break; - case AMDGPU::V_MUL_I32_I24_e32: + case AMDGPU::V_MUL_I32_I24_e64: case AMDGPU::V_MUL_U32_U24_e32: - if (OldOpndValue.getImm() == 1) { - auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); - assert(Src1 && Src1->isReg()); - return getRegSubRegPair(*Src1); - } + case AMDGPU::V_MUL_U32_U24_e64: + if (OldOpnd->getImm() == 1) + return true; break; } - return RegSubRegPair(); + return false; } -// Cases to combine: -// $bound_ctrl is DPP_BOUND_ZERO, $old is any -// $bound_ctrl is DPP_BOUND_OFF, $old is 0 -// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO - -// $bound_ctrl is DPP_BOUND_OFF, $old is undef -// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF - -// $bound_ctrl is DPP_BOUND_OFF, $old is foldable -// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF - MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, - RegSubRegPair OldOpndVGPR, + RegSubRegPair CombOldVGPR, MachineOperand *OldOpndValue, - bool BoundCtrlZero) const { - assert(OldOpndVGPR.Reg); - if (!BoundCtrlZero && OldOpndValue) { - assert(OldOpndValue->isImm()); - OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue); - if (!OldOpndVGPR.Reg) { - LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n"); + bool CombBCZ) const { + assert(CombOldVGPR.Reg); + if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) { + auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); + if (!Src1 || !Src1->isReg()) { + LLVM_DEBUG(dbgs() << " failed: no src1 or it 
isn't a register\n"); + return nullptr; + } + if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) { + LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n"); + return nullptr; + } + CombOldVGPR = getRegSubRegPair(*Src1); + if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) { + LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n"); return nullptr; } } - return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero); + return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ); } // returns true if MI doesn't have OpndName immediate operand or the @@ -316,31 +339,64 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); + + auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); + assert(DstOpnd && DstOpnd->isReg()); + auto DPPMovReg = DstOpnd->getReg(); + if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) { + LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" + " for all uses\n"); + return false; + } + + auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask); + assert(RowMaskOpnd && RowMaskOpnd->isImm()); + auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask); + assert(BankMaskOpnd && BankMaskOpnd->isImm()); + const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF && + BankMaskOpnd->getImm() == 0xF; + auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl); assert(BCZOpnd && BCZOpnd->isImm()); - bool BoundCtrlZero = 0 != BCZOpnd->getImm(); - - LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); + bool BoundCtrlZero = BCZOpnd->getImm(); auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old); assert(OldOpnd && OldOpnd->isReg()); - auto OldOpndVGPR = getRegSubRegPair(*OldOpnd); - auto *OldOpndValue = getOldOpndValue(*OldOpnd); + + auto * const OldOpndValue = getOldOpndValue(*OldOpnd); + // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else + // We could use: assert(!OldOpndValue || OldOpndValue->isImm()) + // but the third option is used to distinguish undef from non-immediate + // to reuse IMPLICIT_DEF instruction later assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd); - if (OldOpndValue) { - if (BoundCtrlZero) { - OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd - OldOpndValue = nullptr; - } else { - if (!OldOpndValue->isImm()) { - LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n"); - return false; - } - if (OldOpndValue->getImm() == 0) { - OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef - OldOpndValue = nullptr; - BoundCtrlZero = true; + + bool CombBCZ = false; + + if (MaskAllLanes && BoundCtrlZero) { // [1] + CombBCZ = true; + } else { + if (!OldOpndValue || !OldOpndValue->isImm()) { + LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n"); + return false; + } + + if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) { + LLVM_DEBUG(dbgs() << + " failed: old reg def and mov should be in the same BB\n"); + return false; + } + + if (OldOpndValue->getImm() == 0) { + if (MaskAllLanes) { + assert(!BoundCtrlZero); // by check [1] + CombBCZ = true; } + } else if (BoundCtrlZero) { + assert(!MaskAllLanes); // by check [1] + LLVM_DEBUG(dbgs() << + " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n"); + return false; } } @@ -348,25 +404,28 @@ bool 
GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { if (!OldOpndValue) dbgs() << "undef"; else - dbgs() << OldOpndValue->getImm(); - dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n'); - - std::vector OrigMIs, DPPMIs; - if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef - OldOpndVGPR = RegSubRegPair( + dbgs() << *OldOpndValue; + dbgs() << ", bound_ctrl=" << CombBCZ << '\n'); + + SmallVector OrigMIs, DPPMIs; + auto CombOldVGPR = getRegSubRegPair(*OldOpnd); + // try to reuse previous old reg if its undefined (IMPLICIT_DEF) + if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef + CombOldVGPR = RegSubRegPair( MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass)); auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), - TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg); + TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg); DPPMIs.push_back(UndefInst.getInstr()); } OrigMIs.push_back(&MovMI); bool Rollback = true; - for (auto &Use : MRI->use_nodbg_operands( - TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) { + for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) { Rollback = true; auto &OrigMI = *Use.getParent(); + LLVM_DEBUG(dbgs() << " try: " << OrigMI); + auto OrigOp = OrigMI.getOpcode(); if (TII->isVOP3(OrigOp)) { if (!TII->hasVALU32BitEncoding(OrigOp)) { @@ -389,8 +448,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { LLVM_DEBUG(dbgs() << " combining: " << OrigMI); if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { - if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR, - OldOpndValue, BoundCtrlZero)) { + if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, + OldOpndValue, CombBCZ)) { DPPMIs.push_back(DPPInst); Rollback = false; } @@ -401,8 +460,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { BB->insert(OrigMI, NewMI); if (TII->commuteInstruction(*NewMI)) { LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); - if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR, - OldOpndValue, BoundCtrlZero)) { + if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR, + OldOpndValue, CombBCZ)) { DPPMIs.push_back(DPPInst); Rollback = false; } diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index c6396de89c4f..885239e2faed 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1,9 +1,8 @@ //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -21,6 +20,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/MC/MCInstrDesc.h" @@ -38,6 +38,7 @@ using namespace llvm; //===----------------------------------------------------------------------===// GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : + IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), ST(MF.getSubtarget()), @@ -45,7 +46,8 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : TRI(TII.getRegisterInfo()), ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { - MaxLookAhead = 5; + MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5; + TSchedModel.init(&ST); } void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { @@ -88,18 +90,38 @@ static bool isSMovRel(unsigned Opcode) { } } -static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) { +static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, + const MachineInstr &MI) { + if (TII.isAlwaysGDS(MI.getOpcode())) + return true; + switch (MI.getOpcode()) { case AMDGPU::S_SENDMSG: case AMDGPU::S_SENDMSGHALT: case AMDGPU::S_TTRACEDATA: return true; + // These DS opcodes don't support GDS. + case AMDGPU::DS_NOP: + case AMDGPU::DS_PERMUTE_B32: + case AMDGPU::DS_BPERMUTE_B32: + return false; default: - // TODO: GDS + if (TII.isDS(MI.getOpcode())) { + int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::gds); + if (MI.getOperand(GDS).getImm()) + return true; + } return false; } } +static bool isPermlane(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == AMDGPU::V_PERMLANE16_B32 || + Opcode == AMDGPU::V_PERMLANEX16_B32; +} + static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); @@ -109,6 +131,8 @@ static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { ScheduleHazardRecognizer::HazardType GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { MachineInstr *MI = SU->getInstr(); + if (MI->isBundle()) + return NoHazard; if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) return NoopHazard; @@ -119,6 +143,15 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { && checkVMEMHazards(MI) > 0) return NoopHazard; + if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) + return NoopHazard; + + if (checkFPAtomicToDenormModeHazard(MI) > 0) + return NoopHazard; + + if (ST.hasNoDataDepHazard()) + return NoHazard; + if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) return NoopHazard; @@ -145,10 +178,16 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { checkReadM0Hazards(MI) > 0) return NoopHazard; - if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) && + if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) && checkReadM0Hazards(MI) > 0) return NoopHazard; + if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) + return NoopHazard; + + if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0) + return NoopHazard; + if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) return NoopHazard; @@ -158,22 +197,74 @@ 
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { return NoHazard; } +static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) { + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) + .addImm(0); +} + +void GCNHazardRecognizer::processBundle() { + MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); + MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); + // Check bundled MachineInstr's for hazards. + for (; MI != E && MI->isInsideBundle(); ++MI) { + CurrCycleInstr = &*MI; + unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); + + if (IsHazardRecognizerMode) + fixHazards(CurrCycleInstr); + + for (unsigned i = 0; i < WaitStates; ++i) + insertNoopInBundle(CurrCycleInstr, TII); + + // It's unnecessary to track more than MaxLookAhead instructions. Since we + // include the bundled MI directly after, only add a maximum of + // (MaxLookAhead - 1) noops to EmittedInstrs. + for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) + EmittedInstrs.push_front(nullptr); + + EmittedInstrs.push_front(CurrCycleInstr); + EmittedInstrs.resize(MaxLookAhead); + } + CurrCycleInstr = nullptr; +} + unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { - return PreEmitNoops(SU->getInstr()); + IsHazardRecognizerMode = false; + return PreEmitNoopsCommon(SU->getInstr()); } unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { + IsHazardRecognizerMode = true; + CurrCycleInstr = MI; + unsigned W = PreEmitNoopsCommon(MI); + fixHazards(MI); + CurrCycleInstr = nullptr; + return W; +} + +unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { + if (MI->isBundle()) + return 0; + int WaitStates = std::max(0, checkAnyInstHazards(MI)); if (SIInstrInfo::isSMRD(*MI)) return std::max(WaitStates, checkSMRDHazards(MI)); - if (SIInstrInfo::isVALU(*MI)) - WaitStates = std::max(WaitStates, checkVALUHazards(MI)); - if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); + if (ST.hasNSAtoVMEMBug()) + WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); + + WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); + + if (ST.hasNoDataDepHazard()) + return WaitStates; + + if (SIInstrInfo::isVALU(*MI)) + WaitStates = std::max(WaitStates, checkVALUHazards(MI)); + if (SIInstrInfo::isDPP(*MI)) WaitStates = std::max(WaitStates, checkDPPHazards(MI)); @@ -199,9 +290,15 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { isSMovRel(MI->getOpcode()))) return std::max(WaitStates, checkReadM0Hazards(MI)); - if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI)) + if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) return std::max(WaitStates, checkReadM0Hazards(MI)); + if (SIInstrInfo::isMAI(*MI)) + return std::max(WaitStates, checkMAIHazards(MI)); + + if (MI->mayLoad() || MI->mayStore()) + return std::max(WaitStates, checkMAILdStHazards(MI)); + return WaitStates; } @@ -218,10 +315,14 @@ void GCNHazardRecognizer::AdvanceCycle() { // Do not track non-instructions which do not affect the wait states. // If included, these instructions can lead to buffer overflow such that // detectable hazards are missed.
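A note on the arithmetic that every check* routine in this file relies on: a hazard is stated as a required number of wait states after a triggering event, the getWaitStatesSince* helpers report how long ago such an event executed (or std::numeric_limits<int>::max() when nothing is found within the new Limit bound), and the difference, clamped at zero, is the number of noop slots still owed. A small self-contained sketch of that pattern (an illustrative helper, not code from the patch):

#include <algorithm>
#include <limits>

// Wait-state bookkeeping in miniature: if the hazardous event executed
// `Since` wait states ago and the hardware needs `Required` states of
// separation, this many noops must still be emitted.
static int noopsStillNeeded(int Required, int Since) {
  // Since == INT_MAX means "no hazard found within the search limit", so
  // the subtraction goes negative and the clamp yields zero noops.
  return std::max(0, Required - Since);
}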
- if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) + if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() || + CurrCycleInstr->isKill()) return; - else if (CurrCycleInstr->isDebugInstr()) + + if (CurrCycleInstr->isBundle()) { + processBundle(); return; + } unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); @@ -252,41 +353,112 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -int GCNHazardRecognizer::getWaitStatesSince( - function_ref<bool(MachineInstr *)> IsHazard) { +typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn; + +// Returns the minimum number of wait states since \p I, walking all +// predecessors. Scanning stops once \p IsExpired returns true. +// Can only be run in hazard recognizer mode. +static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, + MachineBasicBlock *MBB, + MachineBasicBlock::reverse_instr_iterator I, + int WaitStates, + IsExpiredFn IsExpired, + DenseSet<const MachineBasicBlock *> &Visited) { + for (auto E = MBB->instr_rend(); I != E; ++I) { + // Don't add WaitStates for parent BUNDLE instructions. + if (I->isBundle()) + continue; + + if (IsHazard(&*I)) + return WaitStates; + + if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr()) + continue; + + WaitStates += SIInstrInfo::getNumWaitStates(*I); + + if (IsExpired(&*I, WaitStates)) + return std::numeric_limits<int>::max(); + } + + int MinWaitStates = WaitStates; + bool Found = false; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (!Visited.insert(Pred).second) + continue; + + int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), + WaitStates, IsExpired, Visited); + + if (W == std::numeric_limits<int>::max()) + continue; + + MinWaitStates = Found ? std::min(MinWaitStates, W) : W; + if (IsExpired(nullptr, MinWaitStates)) + return MinWaitStates; + + Found = true; + } + + if (Found) + return MinWaitStates; + + return std::numeric_limits<int>::max(); +} + +static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, + MachineInstr *MI, + IsExpiredFn IsExpired) { + DenseSet<const MachineBasicBlock *> Visited; + return getWaitStatesSince(IsHazard, MI->getParent(), + std::next(MI->getReverseIterator()), + 0, IsExpired, Visited); +} + +int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { + if (IsHazardRecognizerMode) { + auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) { + return WaitStates >= Limit; + }; + return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); + } + int WaitStates = 0; for (MachineInstr *MI : EmittedInstrs) { if (MI) { if (IsHazard(MI)) return WaitStates; - unsigned Opcode = MI->getOpcode(); - if (Opcode == AMDGPU::INLINEASM) + if (MI->isInlineAsm()) continue; } ++WaitStates; + + if (WaitStates >= Limit) + break; } return std::numeric_limits<int>::max(); } -int GCNHazardRecognizer::getWaitStatesSinceDef( - unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) { +int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, + IsHazardFn IsHazardDef, + int Limit) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) { return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI); }; - return getWaitStatesSince(IsHazardFn); + return getWaitStatesSince(IsHazardFn, Limit); } -int GCNHazardRecognizer::getWaitStatesSinceSetReg( - function_ref<bool(MachineInstr *)> IsHazard) { + +int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, + int Limit) { auto IsHazardFn = [IsHazard] (MachineInstr *MI) { return isSSetReg(MI->getOpcode()) && IsHazard(MI); }; - return getWaitStatesSince(IsHazardFn); + return
getWaitStatesSince(IsHazardFn); + return getWaitStatesSince(IsHazardFn, Limit); } //===----------------------------------------------------------------------===// @@ -328,9 +500,9 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { // instructions in this group may return out of order and/or may be // replayed (i.e. the same instruction issued more than once). // - // In order to handle these situations correctly we need to make sure - // that when a clause has more than one instruction, no instruction in the - // clause writes to a register that is read another instruction in the clause + // In order to handle these situations correctly we need to make sure that + // when a clause has more than one instruction, no instruction in the clause + // writes to a register that is read by another instruction in the clause // (including itself). If we encounter this situaion, we need to break the // clause by inserting a non SMEM instruction. @@ -363,13 +535,12 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { } int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { - const GCNSubtarget &ST = MF.getSubtarget(); int WaitStatesNeeded = 0; WaitStatesNeeded = checkSoftClauseHazards(SMRD); // This SMRD hazard only affects SI. - if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (!ST.hasSMRDReadVALUDefHazard()) return WaitStatesNeeded; // A read of an SGPR by SMRD instruction requires 4 wait states when the @@ -384,7 +555,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { if (!Use.isReg()) continue; int WaitStatesNeededForUse = - SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); + SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, + SmrdSgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); // This fixes what appears to be undocumented hardware behavior in SI where @@ -397,7 +569,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { if (IsBufferSMRD) { int WaitStatesNeededForUse = SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), - IsBufferHazardDefFn); + IsBufferHazardDefFn, + SmrdSgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } } @@ -406,7 +579,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { } int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { - if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (!ST.hasVMEMReadSGPRVALUDefHazard()) return 0; int WaitStatesNeeded = checkSoftClauseHazards(VMEM); @@ -415,13 +588,13 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { // SGPR was written by a VALU Instruction. 
const int VmemSgprWaitStates = 5; auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; - for (const MachineOperand &Use : VMEM->uses()) { if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) continue; int WaitStatesNeededForUse = - VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); + VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, + VmemSgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } return WaitStatesNeeded; @@ -441,13 +614,16 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) continue; int WaitStatesNeededForUse = - DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg()); + DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(), + [](MachineInstr *) { return true; }, + DppVgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } WaitStatesNeeded = std::max( WaitStatesNeeded, - DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn)); + DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, + DppExecWaitStates)); return WaitStatesNeeded; } @@ -459,7 +635,8 @@ int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { // instruction. const int DivFMasWaitStates = 4; auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; - int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn); + int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn, + DivFMasWaitStates); return DivFMasWaitStates - WaitStatesNeeded; } @@ -472,7 +649,7 @@ int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) { return GetRegHWReg == getHWReg(TII, *MI); }; - int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates); return GetRegWaitStates - WaitStatesNeeded; } @@ -481,12 +658,11 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { const SIInstrInfo *TII = ST.getInstrInfo(); unsigned HWReg = getHWReg(TII, *SetRegInstr); - const int SetRegWaitStates = - ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 
1 : 2; + const int SetRegWaitStates = ST.getSetRegWaitStates(); auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) { return HWReg == getHWReg(TII, *MI); }; - int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates); return SetRegWaitStates - WaitStatesNeeded; } @@ -557,7 +733,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg); }; int WaitStatesNeededForDef = - VALUWaitStates - getWaitStatesSince(IsHazardFn); + VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); return WaitStatesNeeded; @@ -622,12 +798,13 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { }; const int RWLaneWaitStates = 4; - int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn); + int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, + RWLaneWaitStates); return RWLaneWaitStates - WaitStatesSince; } int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { - if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (!ST.hasRFEHazards()) return 0; const SIInstrInfo *TII = ST.getInstrInfo(); @@ -637,7 +814,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { auto IsHazardFn = [TII] (MachineInstr *MI) { return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS; }; - int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); return RFEWaitStates - WaitStatesNeeded; } @@ -661,7 +838,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) { return MI->getOpcode() == AMDGPU::S_MOV_FED_B32; }; int WaitStatesNeededForUse = - MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn); + MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn, + MovFedWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } @@ -674,5 +852,557 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { auto IsHazardFn = [TII] (MachineInstr *MI) { return TII->isSALU(*MI); }; - return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn); + return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, + SMovRelWaitStates); +} + +void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { + fixVMEMtoScalarWriteHazards(MI); + fixVcmpxPermlaneHazards(MI); + fixSMEMtoVectorWriteHazards(MI); + fixVcmpxExecWARHazard(MI); + fixLdsBranchVmemWARHazard(MI); +} + +bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { + if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + auto IsHazardFn = [TII] (MachineInstr *MI) { + return TII->isVOPC(*MI); + }; + + auto IsExpiredFn = [] (MachineInstr *MI, int) { + if (!MI) + return false; + unsigned Opc = MI->getOpcode(); + return SIInstrInfo::isVALU(*MI) && + Opc != AMDGPU::V_NOP_e32 && + Opc != AMDGPU::V_NOP_e64 && + Opc != AMDGPU::V_NOP_sdwa; + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits<int>::max()) + return false; + + // V_NOP will be discarded by SQ. + // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* + // which is always a VGPR and available.
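+ // (Unlike V_NOP, a v_mov_b32 of a register onto itself is a real VALU + // write that the SQ will not drop, so it reliably closes the hazard window.)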
+ auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); + unsigned Reg = Src0->getReg(); + bool IsUndef = Src0->isUndef(); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32)) + .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) + .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill); + + return true; +} + +bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { + if (!ST.hasVMEMtoScalarWriteHazard()) + return false; + + if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI)) + return false; + + if (MI->getNumDefs() == 0) + return false; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + auto IsHazardFn = [TRI, MI] (MachineInstr *I) { + if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) && + !SIInstrInfo::isFLAT(*I)) + return false; + + for (const MachineOperand &Def : MI->defs()) { + MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI); + if (!Op) + continue; + return true; + } + return false; + }; + + auto IsExpiredFn = [] (MachineInstr *MI, int) { + return MI && (SIInstrInfo::isVALU(*MI) || + (MI->getOpcode() == AMDGPU::S_WAITCNT && + !MI->getOperand(0).getImm())); + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits<int>::max()) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); + return true; +} + +bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { + if (!ST.hasSMEMtoVectorWriteHazard()) + return false; + + if (!SIInstrInfo::isVALU(*MI)) + return false; + + unsigned SDSTName; + switch (MI->getOpcode()) { + case AMDGPU::V_READLANE_B32: + case AMDGPU::V_READFIRSTLANE_B32: + SDSTName = AMDGPU::OpName::vdst; + break; + default: + SDSTName = AMDGPU::OpName::sdst; + break; + } + + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU()); + const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName); + if (!SDST) { + for (const auto &MO : MI->implicit_operands()) { + if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) { + SDST = &MO; + break; + } + } + } + + if (!SDST) + return false; + + const unsigned SDSTReg = SDST->getReg(); + auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) { + return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI); + }; + + auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) { + if (MI) { + if (TII->isSALU(*MI)) { + switch (MI->getOpcode()) { + case AMDGPU::S_SETVSKIP: + case AMDGPU::S_VERSION: + case AMDGPU::S_WAITCNT_VSCNT: + case AMDGPU::S_WAITCNT_VMCNT: + case AMDGPU::S_WAITCNT_EXPCNT: + // These instructions cannot mitigate the hazard. + return false; + case AMDGPU::S_WAITCNT_LGKMCNT: + // Reducing lgkmcnt count to 0 always mitigates the hazard. + return (MI->getOperand(1).getImm() == 0) && + (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL); + case AMDGPU::S_WAITCNT: { + const int64_t Imm = MI->getOperand(0).getImm(); + AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); + return (Decoded.LgkmCnt == 0); + } + default: + // SOPP instructions cannot mitigate the hazard.
+ if (TII->isSOPP(*MI)) + return false; + // At this point the SALU can be assumed to mitigate the hazard + // because either: + // (a) it is independent of the at risk SMEM (breaking chain), + // or + // (b) it is dependent on the SMEM, in which case an appropriate + // s_waitcnt lgkmcnt _must_ exist between it and the at risk + // SMEM instruction. + return true; + } + } + } + return false; + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits::max()) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL) + .addImm(0); + return true; +} + +bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { + if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI)) + return false; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) + return false; + + auto IsHazardFn = [TRI] (MachineInstr *I) { + if (SIInstrInfo::isVALU(*I)) + return false; + return I->readsRegister(AMDGPU::EXEC, TRI); + }; + + const SIInstrInfo *TII = ST.getInstrInfo(); + auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) { + if (!MI) + return false; + if (SIInstrInfo::isVALU(*MI)) { + if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst)) + return true; + for (auto MO : MI->implicit_operands()) + if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) + return true; + } + if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe) + return true; + return false; + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits::max()) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xfffe); + return true; +} + +bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { + if (!ST.hasLdsBranchVmemWARHazard()) + return false; + + auto IsHazardInst = [] (const MachineInstr *MI) { + if (SIInstrInfo::isDS(*MI)) + return 1; + if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI)) + return 2; + return 0; + }; + + auto InstType = IsHazardInst(MI); + if (!InstType) + return false; + + auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) { + return I && (IsHazardInst(I) || + (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT && + I->getOperand(0).getReg() == AMDGPU::SGPR_NULL && + !I->getOperand(1).getImm())); + }; + + auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) { + if (!I->isBranch()) + return false; + + auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) { + auto InstType2 = IsHazardInst(I); + return InstType2 && InstType != InstType2; + }; + + auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) { + if (!I) + return false; + + auto InstType2 = IsHazardInst(I); + if (InstType == InstType2) + return true; + + return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT && + I->getOperand(0).getReg() == AMDGPU::SGPR_NULL && + !I->getOperand(1).getImm(); + }; + + return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) != + std::numeric_limits::max(); + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits::max()) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + + return true; +} + +int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { + int NSAtoVMEMWaitStates = 1; + + if 
(!ST.hasNSAtoVMEMBug()) + return 0; + + if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI)) + return 0; + + const SIInstrInfo *TII = ST.getInstrInfo(); + const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); + if (!Offset || (Offset->getImm() & 6) == 0) + return 0; + + auto IsHazardFn = [TII] (MachineInstr *I) { + if (!SIInstrInfo::isMIMG(*I)) + return false; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode()); + return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && + TII->getInstSizeInBytes(*I) >= 16; + }; + + return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); +} + +int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { + int FPAtomicToDenormModeWaitStates = 3; + + if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) + return 0; + + auto IsHazardFn = [] (MachineInstr *I) { + if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I)) + return false; + return SIInstrInfo::isFPAtomic(*I); + }; + + auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) { + if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI)) + return true; + + switch (MI->getOpcode()) { + case AMDGPU::S_WAITCNT: + case AMDGPU::S_WAITCNT_VSCNT: + case AMDGPU::S_WAITCNT_VMCNT: + case AMDGPU::S_WAITCNT_EXPCNT: + case AMDGPU::S_WAITCNT_LGKMCNT: + case AMDGPU::S_WAITCNT_IDLE: + return true; + default: + break; + } + + return false; + }; + + + return FPAtomicToDenormModeWaitStates - + ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); +} + +int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { + assert(SIInstrInfo::isMAI(*MI)); + + int WaitStatesNeeded = 0; + unsigned Opc = MI->getOpcode(); + + auto IsVALUFn = [] (MachineInstr *MI) { + return SIInstrInfo::isVALU(*MI); + }; + + if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write + const int LegacyVALUWritesVGPRWaitStates = 2; + const int VALUWritesExecWaitStates = 4; + const int MaxWaitStates = 4; + + int WaitStatesNeededForUse = VALUWritesExecWaitStates - + getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded < MaxWaitStates) { + for (const MachineOperand &Use : MI->explicit_uses()) { + const int MaxWaitStates = 2; + + if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg())) + continue; + + int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - + getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + break; + } + } + } + + auto IsMFMAFn = [] (MachineInstr *MI) { + return SIInstrInfo::isMAI(*MI) && + MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 && + MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32; + }; + + for (const MachineOperand &Op : MI->explicit_operands()) { + if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) + continue; + + if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32) + continue; + + const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; + const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; + const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; + const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; + const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; + const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; + const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; + const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; + const int MaxWaitStates = 18; + unsigned Reg = Op.getReg(); + unsigned 
HazardDefLatency = 0; + + auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this] + (MachineInstr *MI) { + if (!IsMFMAFn(MI)) + return false; + unsigned DstReg = MI->getOperand(0).getReg(); + if (DstReg == Reg) + return false; + HazardDefLatency = std::max(HazardDefLatency, + TSchedModel.computeInstrLatency(MI)); + return TRI.regsOverlap(DstReg, Reg); + }; + + int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, + MaxWaitStates); + int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; + int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + int OpNo = MI->getOperandNo(&Op); + if (OpNo == SrcCIdx) { + NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; + } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) { + switch (HazardDefLatency) { + case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; + break; + case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; + break; + } + } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) { + switch (HazardDefLatency) { + case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; + break; + case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; + break; + } + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + return WaitStatesNeeded; // Early exit. + + auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32) + return false; + unsigned DstReg = MI->getOperand(0).getReg(); + return TRI.regsOverlap(Reg, DstReg); + }; + + const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; + const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; + const int AccVGPRWriteAccVgprReadWaitStates = 3; + NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; + if (OpNo == SrcCIdx) + NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; + else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) + NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; + + WaitStatesNeededForUse = NeedWaitStates - + getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + return WaitStatesNeeded; // Early exit. 
+ } + + if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) { + const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; + const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; + const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; + const int MaxWaitStates = 13; + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned HazardDefLatency = 0; + + auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this] + (MachineInstr *MI) { + if (!IsMFMAFn(MI)) + return false; + unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + HazardDefLatency = std::max(HazardDefLatency, + TSchedModel.computeInstrLatency(MI)); + return TRI.regsOverlap(Reg, DstReg); + }; + + int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates); + int NeedWaitStates; + switch (HazardDefLatency) { + case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; + break; + case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; + break; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { + if (!ST.hasMAIInsts()) + return 0; + + int WaitStatesNeeded = 0; + + auto IsAccVgprReadFn = [] (MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32; + }; + + for (const MachineOperand &Op : MI->explicit_uses()) { + if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) + continue; + + unsigned Reg = Op.getReg(); + + const int AccVgprReadLdStWaitStates = 2; + const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1; + const int MaxWaitStates = 2; + + int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - + getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + return WaitStatesNeeded; // Early exit. + + auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32) + return false; + auto IsVALUFn = [] (MachineInstr *MI) { + return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI); + }; + return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) < + std::numeric_limits::max(); + }; + + WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates - + getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; } diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h index ca17e7cb6018..6aa2e70dfbfb 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -1,9 +1,8 @@ //===-- GCNHazardRecognizers.h - GCN Hazard Recognizers ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/TargetSchedule.h" #include namespace llvm { @@ -31,6 +31,13 @@ class SIRegisterInfo; class GCNSubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { +public: + typedef function_ref IsHazardFn; + +private: + // Distinguish if we are called from scheduler or hazard recognizer + bool IsHazardRecognizerMode; + // This variable stores the instruction that has been emitted this cycle. It // will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is // called. @@ -40,6 +47,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { const GCNSubtarget &ST; const SIInstrInfo &TII; const SIRegisterInfo &TRI; + TargetSchedModel TSchedModel; /// RegUnits of uses in the current soft memory clause. BitVector ClauseUses; @@ -54,11 +62,13 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { void addClauseInst(const MachineInstr &MI); - int getWaitStatesSince(function_ref IsHazard); - int getWaitStatesSinceDef(unsigned Reg, - function_ref IsHazardDef = - [](MachineInstr *) { return true; }); - int getWaitStatesSinceSetReg(function_ref IsHazard); + // Advance over a MachineInstr bundle. Look for hazards in the bundled + // instructions. + void processBundle(); + + int getWaitStatesSince(IsHazardFn IsHazard, int Limit); + int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit); + int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit); int checkSoftClauseHazards(MachineInstr *SMEM); int checkSMRDHazards(MachineInstr *SMRD); @@ -75,6 +85,18 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { int checkInlineAsmHazards(MachineInstr *IA); int checkAnyInstHazards(MachineInstr *MI); int checkReadM0Hazards(MachineInstr *SMovRel); + int checkNSAtoVMEMHazard(MachineInstr *MI); + int checkFPAtomicToDenormModeHazard(MachineInstr *MI); + void fixHazards(MachineInstr *MI); + bool fixVcmpxPermlaneHazards(MachineInstr *MI); + bool fixVMEMtoScalarWriteHazards(MachineInstr *MI); + bool fixSMEMtoVectorWriteHazards(MachineInstr *MI); + bool fixVcmpxExecWARHazard(MachineInstr *MI); + bool fixLdsBranchVmemWARHazard(MachineInstr *MI); + + int checkMAIHazards(MachineInstr *MI); + int checkMAILdStHazards(MachineInstr *MI); + public: GCNHazardRecognizer(const MachineFunction &MF); // We can only issue one instruction per cycle. @@ -85,6 +107,7 @@ public: void EmitNoop() override; unsigned PreEmitNoops(SUnit *SU) override; unsigned PreEmitNoops(MachineInstr *) override; + unsigned PreEmitNoopsCommon(MachineInstr *); void AdvanceCycle() override; void RecedeCycle() override; }; diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp index d62dc8d86781..1eb617640c32 100644 --- a/lib/Target/AMDGPU/GCNILPSched.cpp +++ b/lib/Target/AMDGPU/GCNILPSched.cpp @@ -1,9 +1,8 @@ //===---------------------------- GCNILPSched.cpp - -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 8e4cc391dc21..3525174223bd 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -1,9 +1,8 @@ //===- GCNIterativeScheduler.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.h b/lib/Target/AMDGPU/GCNIterativeScheduler.h index 14ef5147f32a..e6f83914af5b 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.h +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.h @@ -1,9 +1,8 @@ //===- GCNIterativeScheduler.h - GCN Scheduler ------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index ec6bcae33555..c469cf290e26 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -1,9 +1,8 @@ //===- GCNMinRegStrategy.cpp ----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp new file mode 100644 index 000000000000..51c4c99cfb18 --- /dev/null +++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -0,0 +1,343 @@ +//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential +/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA +/// with sequential versions where possible.
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-nsa-reassign" + +STATISTIC(NumNSAInstructions, + "Number of NSA instructions with non-sequential address found"); +STATISTIC(NumNSAConverted, + "Number of NSA instructions changed to sequential"); + +namespace { + +class GCNNSAReassign : public MachineFunctionPass { +public: + static char ID; + + GCNNSAReassign() : MachineFunctionPass(ID) { + initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN NSA Reassign"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addRequired<VirtRegMap>(); + AU.addRequired<LiveRegMatrix>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + typedef enum { + NOT_NSA, // Not an NSA instruction + FIXED, // NSA which we cannot modify + NON_CONTIGUOUS, // NSA with non-sequential address which we can try + // to optimize. + CONTIGUOUS // NSA with all sequential address registers + } NSA_Status; + + const GCNSubtarget *ST; + + const MachineRegisterInfo *MRI; + + const SIRegisterInfo *TRI; + + VirtRegMap *VRM; + + LiveRegMatrix *LRM; + + LiveIntervals *LIS; + + unsigned MaxNumVGPRs; + + const MCPhysReg *CSRegs; + + NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const; + + bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, + unsigned StartReg) const; + + bool canAssign(unsigned StartReg, unsigned NumRegs) const; + + bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const; +}; + +} // End anonymous namespace.
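For orientation, here is a minimal standalone sketch, not part of the patch, of the contiguity test that CheckNSA() performs over the vaddr dwords. Plain register indices stand in for the VirtRegMap::getPhys lookups, so this is an illustration of the idea rather than the pass's actual code:

#include <vector>

enum class NSAKind { Contiguous, NonContiguous };

// Illustration only: Phys[I] stands in for the physical register assigned
// to the I-th vaddr dword. The address is "non-contiguous" as soon as one
// dword breaks the Base, Base+1, Base+2, ... sequence.
NSAKind classifyVAddr(const std::vector<unsigned> &Phys) {
  if (Phys.empty())
    return NSAKind::Contiguous;
  unsigned Base = Phys.front();
  for (unsigned I = 1; I < Phys.size(); ++I)
    if (Phys[I] != Base + I)
      return NSAKind::NonContiguous;   // e.g. {7, 3, 9}
  return NSAKind::Contiguous;          // e.g. {4, 5, 6}
}

A non-contiguous result makes the instruction a reassignment candidate; a contiguous one lets SIShrinkInstructions switch to the shorter sequential encoding.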
+ +INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", + false, false) + + +char GCNNSAReassign::ID = 0; + +char &llvm::GCNNSAReassignID = GCNNSAReassign::ID; + +bool +GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, + unsigned StartReg) const { + unsigned NumRegs = Intervals.size(); + + for (unsigned N = 0; N < NumRegs; ++N) + if (VRM->hasPhys(Intervals[N]->reg)) + LRM->unassign(*Intervals[N]); + + for (unsigned N = 0; N < NumRegs; ++N) + if (LRM->checkInterference(*Intervals[N], StartReg + N)) + return false; + + for (unsigned N = 0; N < NumRegs; ++N) + LRM->assign(*Intervals[N], StartReg + N); + + return true; +} + +bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { + for (unsigned N = 0; N < NumRegs; ++N) { + unsigned Reg = StartReg + N; + if (!MRI->isAllocatable(Reg)) + return false; + + for (unsigned I = 0; CSRegs[I]; ++I) + if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && + !LRM->isPhysRegUsed(CSRegs[I])) + return false; + } + + return true; +} + +bool +GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const { + unsigned NumRegs = Intervals.size(); + + if (NumRegs > MaxNumVGPRs) + return false; + unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0; + + for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) { + if (!canAssign(Reg, NumRegs)) + continue; + + if (tryAssignRegisters(Intervals, Reg)) + return true; + } + + return false; +} + +GCNNSAReassign::NSA_Status +GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); + if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + return NSA_Status::NOT_NSA; + + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); + + unsigned VgprBase = 0; + bool NSA = false; + for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); + unsigned Reg = Op.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + return NSA_Status::FIXED; + + unsigned PhysReg = VRM->getPhys(Reg); + + if (!Fast) { + if (!PhysReg) + return NSA_Status::FIXED; + + // Bail if the address is not a VGPR32. It should be possible to extend + // the optimization to work with subregs of wider register tuples, but + // the logic to find free registers would be much more complicated, with + // much less chance of success. It seems reasonable to assume that in + // most cases a tuple is used because a vector variable contains + // different parts of an address and it is either already consecutive or + // cannot be reassigned if not. If needed, it is better to rely on the + // register coalescer to process such address tuples.
+ if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg()) + return NSA_Status::FIXED; + + const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); + + if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) + return NSA_Status::FIXED; + + for (auto U : MRI->use_nodbg_operands(Reg)) { + if (U.isImplicit()) + return NSA_Status::FIXED; + const MachineInstr *UseInst = U.getParent(); + if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) + return NSA_Status::FIXED; + } + + if (!LIS->hasInterval(Reg)) + return NSA_Status::FIXED; + } + + if (I == 0) + VgprBase = PhysReg; + else if (VgprBase + I != PhysReg) + NSA = true; + } + + return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS; +} + +bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget<GCNSubtarget>(); + if (ST->getGeneration() < GCNSubtarget::GFX10) + return false; + + MRI = &MF.getRegInfo(); + TRI = ST->getRegisterInfo(); + VRM = &getAnalysis<VirtRegMap>(); + LRM = &getAnalysis<LiveRegMatrix>(); + LIS = &getAnalysis<LiveIntervals>(); + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MaxNumVGPRs = ST->getMaxNumVGPRs(MF); + MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs); + CSRegs = MRI->getCalleeSavedRegs(); + + using Candidate = std::pair<const MachineInstr *, bool>; + SmallVector Candidates; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + switch (CheckNSA(MI)) { + default: + continue; + case NSA_Status::CONTIGUOUS: + Candidates.push_back(std::make_pair(&MI, true)); + break; + case NSA_Status::NON_CONTIGUOUS: + Candidates.push_back(std::make_pair(&MI, false)); + ++NumNSAInstructions; + break; + } + } + } + + bool Changed = false; + for (auto &C : Candidates) { + if (C.second) + continue; + + const MachineInstr *MI = C.first; + if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) { + // Already happens to be fixed. + C.second = true; + ++NumNSAConverted; + continue; + } + + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode()); + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0); + + SmallVector Intervals; + SmallVector OrigRegs; + SlotIndex MinInd, MaxInd; + for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); + unsigned Reg = Op.getReg(); + LiveInterval *LI = &LIS->getInterval(Reg); + if (llvm::find(Intervals, LI) != Intervals.end()) { + // Same register used, unable to make sequential + Intervals.clear(); + break; + } + Intervals.push_back(LI); + OrigRegs.push_back(VRM->getPhys(Reg)); + MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex(); + MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex(); + } + + if (Intervals.empty()) + continue; + + LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI + << "\tOriginal allocation:\t"; + for (auto *LI : Intervals) + dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI); + dbgs() << '\n'); + + bool Success = scavengeRegs(Intervals); + if (!Success) { + LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n"); + if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation. + continue; + } else { + // Check we did not make it worse for other instructions.
+ auto I = std::lower_bound(Candidates.begin(), &C, MinInd, + [this](const Candidate &C, SlotIndex I) { + return LIS->getInstructionIndex(*C.first) < I; + }); + for (auto E = Candidates.end(); Success && I != E && + LIS->getInstructionIndex(*I->first) < MaxInd; ++I) { + if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) { + Success = false; + LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first); + } + } + } + + if (!Success) { + for (unsigned I = 0; I < Info->VAddrDwords; ++I) + if (VRM->hasPhys(Intervals[I]->reg)) + LRM->unassign(*Intervals[I]); + + for (unsigned I = 0; I < Info->VAddrDwords; ++I) + LRM->assign(*Intervals[I], OrigRegs[I]); + + continue; + } + + C.second = true; + ++NumNSAConverted; + LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t [" + << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI) + << " : " + << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI) + << "]\n"); + Changed = true; + } + + return Changed; +} diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td index b8142a4e4ff8..b926041afb2f 100644 --- a/lib/Target/AMDGPU/GCNProcessors.td +++ b/lib/Target/AMDGPU/GCNProcessors.td @@ -1,163 +1,185 @@ //===-- GCNProcessors.td - GCN Processor definitions ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // The code produced for "generic" is only useful for tests and cannot // reasonably be expected to execute on any particular target. def : ProcessorModel<"generic", NoSchedModel, - [FeatureGCN, FeatureWavefrontSize64] + [FeatureWavefrontSize64] >; -//===----------------------------------------------------------------------===// +def : ProcessorModel<"generic-hsa", NoSchedModel, + [FeatureWavefrontSize64, FeatureFlatAddressSpace] +>; + +//===------------------------------------------------------------===// // GCN GFX6 (Southern Islands (SI)). -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// def : ProcessorModel<"gfx600", SIFullSpeedModel, - [FeatureISAVersion6_0_0] + FeatureISAVersion6_0_0.Features >; def : ProcessorModel<"tahiti", SIFullSpeedModel, - [FeatureISAVersion6_0_0] + FeatureISAVersion6_0_0.Features >; def : ProcessorModel<"gfx601", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; def : ProcessorModel<"hainan", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; def : ProcessorModel<"oland", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; def : ProcessorModel<"verde", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// // GCN GFX7 (Sea Islands (CI)). 
-//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// def : ProcessorModel<"gfx700", SIQuarterSpeedModel, - [FeatureISAVersion7_0_0] + FeatureISAVersion7_0_0.Features >; def : ProcessorModel<"kaveri", SIQuarterSpeedModel, - [FeatureISAVersion7_0_0] + FeatureISAVersion7_0_0.Features >; def : ProcessorModel<"gfx701", SIFullSpeedModel, - [FeatureISAVersion7_0_1] + FeatureISAVersion7_0_1.Features >; def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureISAVersion7_0_1] + FeatureISAVersion7_0_1.Features >; def : ProcessorModel<"gfx702", SIQuarterSpeedModel, - [FeatureISAVersion7_0_2] + FeatureISAVersion7_0_2.Features >; def : ProcessorModel<"gfx703", SIQuarterSpeedModel, - [FeatureISAVersion7_0_3] + FeatureISAVersion7_0_3.Features >; def : ProcessorModel<"kabini", SIQuarterSpeedModel, - [FeatureISAVersion7_0_3] + FeatureISAVersion7_0_3.Features >; def : ProcessorModel<"mullins", SIQuarterSpeedModel, - [FeatureISAVersion7_0_3] + FeatureISAVersion7_0_3.Features >; def : ProcessorModel<"gfx704", SIQuarterSpeedModel, - [FeatureISAVersion7_0_4] + FeatureISAVersion7_0_4.Features >; def : ProcessorModel<"bonaire", SIQuarterSpeedModel, - [FeatureISAVersion7_0_4] + FeatureISAVersion7_0_4.Features >; -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// // GCN GFX8 (Volcanic Islands (VI)). -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// def : ProcessorModel<"gfx801", SIQuarterSpeedModel, - [FeatureISAVersion8_0_1] + FeatureISAVersion8_0_1.Features >; def : ProcessorModel<"carrizo", SIQuarterSpeedModel, - [FeatureISAVersion8_0_1] + FeatureISAVersion8_0_1.Features >; def : ProcessorModel<"gfx802", SIQuarterSpeedModel, - [FeatureISAVersion8_0_2] + FeatureISAVersion8_0_2.Features >; def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureISAVersion8_0_2] + FeatureISAVersion8_0_2.Features >; def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureISAVersion8_0_2] + FeatureISAVersion8_0_2.Features >; def : ProcessorModel<"gfx803", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] + FeatureISAVersion8_0_3.Features >; def : ProcessorModel<"fiji", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] + FeatureISAVersion8_0_3.Features >; def : ProcessorModel<"polaris10", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] + FeatureISAVersion8_0_3.Features >; def : ProcessorModel<"polaris11", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] + FeatureISAVersion8_0_3.Features >; def : ProcessorModel<"gfx810", SIQuarterSpeedModel, - [FeatureISAVersion8_1_0] + FeatureISAVersion8_1_0.Features >; def : ProcessorModel<"stoney", SIQuarterSpeedModel, - [FeatureISAVersion8_1_0] + FeatureISAVersion8_1_0.Features >; -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// // GCN GFX9. 
-//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// def : ProcessorModel<"gfx900", SIQuarterSpeedModel, - [FeatureISAVersion9_0_0] + FeatureISAVersion9_0_0.Features >; def : ProcessorModel<"gfx902", SIQuarterSpeedModel, - [FeatureISAVersion9_0_2] + FeatureISAVersion9_0_2.Features >; def : ProcessorModel<"gfx904", SIQuarterSpeedModel, - [FeatureISAVersion9_0_4] + FeatureISAVersion9_0_4.Features >; def : ProcessorModel<"gfx906", SIQuarterSpeedModel, - [FeatureISAVersion9_0_6] + FeatureISAVersion9_0_6.Features +>; + +def : ProcessorModel<"gfx908", SIQuarterSpeedModel, + FeatureISAVersion9_0_8.Features >; def : ProcessorModel<"gfx909", SIQuarterSpeedModel, - [FeatureISAVersion9_0_9] + FeatureISAVersion9_0_9.Features +>; + +//===----------------------------------------------------------------------===// +// GCN GFX10. +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx1010", GFX10SpeedModel, + FeatureISAVersion10_1_0.Features >; +def : ProcessorModel<"gfx1011", GFX10SpeedModel, + FeatureISAVersion10_1_1.Features +>; + +def : ProcessorModel<"gfx1012", GFX10SpeedModel, + FeatureISAVersion10_1_2.Features +>; diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp new file mode 100644 index 000000000000..f0d47eaa4ed1 --- /dev/null +++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -0,0 +1,800 @@ +//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Try to reassign registers on GFX10+ to reduce register bank +/// conflicts. +/// +/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in +/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to +/// bank 1, etc. SGPRs have 8 banks and are allocated in pairs, so that s0:s1, +/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. +/// +/// The shader can read one dword from each of these banks once per cycle. +/// If an instruction has to read more register operands from the same bank +/// an additional cycle is needed. HW attempts to pre-load registers through +/// input operand gathering, but a stall cycle may occur if that fails. For +/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, +/// potentially incurring 2 stall cycles. +/// +/// The pass tries to reassign registers to reduce bank conflicts. +/// +/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so +/// that 4 has to be subtracted from an SGPR bank number to get the real value. +/// This also corresponds to bit numbers in bank masks used in the pass.
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign", + cl::desc("Verify stall cycles in the regbanks reassign pass"), + cl::value_desc("0|1|2"), + cl::init(0), cl::Hidden); + +#define DEBUG_TYPE "amdgpu-regbanks-reassign" + +#define NUM_VGPR_BANKS 4 +#define NUM_SGPR_BANKS 8 +#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) +#define SGPR_BANK_OFFSET NUM_VGPR_BANKS +#define VGPR_BANK_MASK 0xf +#define SGPR_BANK_MASK 0xff0 +#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) + +STATISTIC(NumStallsDetected, + "Number of operand read stalls detected"); +STATISTIC(NumStallsRecovered, + "Number of operand read stalls recovered"); + +namespace { + +class GCNRegBankReassign : public MachineFunctionPass { + + class OperandMask { + public: + OperandMask(unsigned r, unsigned s, unsigned m) + : Reg(r), SubReg(s), Mask(m) {} + unsigned Reg; + unsigned SubReg; + unsigned Mask; + }; + + class Candidate { + public: + Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, + unsigned weight) + : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} + + bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(const GCNRegBankReassign *P) const { + MI->dump(); + dbgs() << P->printReg(Reg) << " to banks "; + dumpFreeBanks(FreeBanks); + dbgs() << " weight " << Weight << '\n'; + } +#endif + + MachineInstr *MI; + unsigned Reg; + unsigned FreeBanks; + unsigned Weight; + }; + + class CandidateList : public std::list<Candidate> { + public: + // Speed up subsequent sort. + void push(const Candidate&& C) { + if (C.Weight) push_back(C); + else push_front(C); + } + }; + +public: + static char ID; + +public: + GCNRegBankReassign() : MachineFunctionPass(ID) { + initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN RegBank Reassign"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addRequired<MachineLoopInfo>(); + AU.addRequired<VirtRegMap>(); + AU.addRequired<LiveRegMatrix>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + const GCNSubtarget *ST; + + const MachineRegisterInfo *MRI; + + const SIRegisterInfo *TRI; + + MachineLoopInfo *MLI; + + VirtRegMap *VRM; + + LiveRegMatrix *LRM; + + LiveIntervals *LIS; + + unsigned MaxNumVGPRs; + + unsigned MaxNumSGPRs; + + BitVector RegsUsed; + + SmallVector OperandMasks; + + CandidateList Candidates; + + const MCPhysReg *CSRegs; + + // Returns bank for a phys reg. + unsigned getPhysRegBank(unsigned Reg) const; + + // Return a bit set for each register bank used. 4 banks for VGPRs and + // 8 banks for SGPRs. + // Registers already processed and recorded in RegsUsed are excluded. + // If Bank is not -1 assume Reg:SubReg belongs to that Bank.
+ unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); + + // Return the number of stalls in the instruction. + // UsedBanks has bits set for the banks used by all operands. + // If Reg and Bank are provided, substitute the Reg with the Bank. + unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks, + unsigned Reg = AMDGPU::NoRegister, int Bank = -1); + + // Return true if register is regular VGPR or SGPR or their tuples. + // Returns false for special registers like m0, vcc etc. + bool isReassignable(unsigned Reg) const; + + // Check if registers' defs are old and may be pre-loaded. + // Returns 0 if both registers are old enough, 1 or 2 if one or both + // registers will not likely be pre-loaded. + unsigned getOperandGatherWeight(const MachineInstr& MI, + unsigned Reg1, + unsigned Reg2, + unsigned StallCycles) const; + + + // Find all bank bits in UsedBanks where Mask can be relocated to. + unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; + + // Find all bank bits in UsedBanks where Mask can be relocated to. + // Bank is relative to the register and not its subregister component. + // Returns 0 if a register is not reassignable. + unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask, + unsigned UsedBanks) const; + + // Add candidate instruction to the work list. + void collectCandidates(MachineInstr& MI, unsigned UsedBanks, + unsigned StallCycles); + + // Collect candidate instructions across the function. Returns the number of + // stall cycles detected. Only counts stalls if Collect is false. + unsigned collectCandidates(MachineFunction &MF, bool Collect = true); + + // Remove all candidates that read specified register. + void removeCandidates(unsigned Reg); + + // Compute stalls within the uses of SrcReg replaced by a register from + // Bank. If Bank is -1, does not perform substitution. If Collect is set, + // candidates are collected and added to the work list. + unsigned computeStallCycles(unsigned SrcReg, + unsigned Reg = AMDGPU::NoRegister, + int Bank = -1, bool Collect = false); + + // Search for a register in Bank unused within LI. + // Returns phys reg or NoRegister. + unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; + + // Try to reassign candidate. Returns the number of stall cycles saved. + unsigned tryReassign(Candidate &C); + + bool verifyCycles(MachineFunction &MF, + unsigned OriginalCycles, unsigned CyclesSaved); + + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +public: + Printable printReg(unsigned Reg, unsigned SubReg = 0) const { + return Printable([Reg, SubReg, this](raw_ostream &OS) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + OS << llvm::printReg(Reg, TRI); + return; + } + if (!VRM->isAssignedReg(Reg)) + OS << "<unassigned> " << llvm::printReg(Reg, TRI); + else + OS << llvm::printReg(Reg, TRI) << '(' + << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; + if (SubReg) + OS << ':' << TRI->getSubRegIndexName(SubReg); + }); + } + + static Printable printBank(unsigned Bank) { + return Printable([Bank](raw_ostream &OS) { + OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); + }); + } + + static void dumpFreeBanks(unsigned FreeBanks) { + for (unsigned L = 0; L < NUM_BANKS; ++L) + if (FreeBanks & (1 << L)) + dbgs() << printBank(L) << ' '; + } +#endif +}; + +} // End anonymous namespace.
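To make the bank arithmetic concrete, here is a small standalone sketch, not part of the patch, of the 0-11 bank numbering described in the file comment (the real getPhysRegBank() goes through TRI encodings instead of raw indices), together with the way repeated hits on one bank turn into the stall count that analyzeInst() accumulates:

#include <initializer_list>

// Illustration of the pass's bank convention: VGPRs take banks 0-3
// round-robin; SGPR pairs take banks 4-11 (4 is the SGPR bank offset).
unsigned vgprBank(unsigned VgprIdx) { return VgprIdx % 4; }
unsigned sgprBank(unsigned SgprIdx) { return (SgprIdx / 2) % 8 + 4; }

// Stalls are the extra cycles from reading the same bank more than once in
// one instruction: v0, v4 and v8 all map to bank 0, so reading all three
// costs two extra cycles, matching the V_FMA_F32 example above.
unsigned stallCycles(std::initializer_list<unsigned> VgprIdxs) {
  unsigned UsedBanks = 0, Stalls = 0;
  for (unsigned V : VgprIdxs) {
    unsigned Mask = 1u << vgprBank(V);
    if (UsedBanks & Mask)
      ++Stalls;          // this bank was already read this cycle
    UsedBanks |= Mask;
  }
  return Stalls;         // stallCycles({0, 4, 8}) == 2
}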
+ +INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", + false, false) + + +char GCNRegBankReassign::ID = 0; + +char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; + +unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + unsigned Size = TRI->getRegSizeInBits(*RC); + if (Size > 32) + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + + if (TRI->hasVGPRs(RC)) { + Reg -= AMDGPU::VGPR0; + return Reg % NUM_VGPR_BANKS; + } + + Reg = TRI->getEncodingValue(Reg) / 2; + return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; +} + +unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, + int Bank) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!VRM->isAssignedReg(Reg)) + return 0; + + Reg = VRM->getPhys(Reg); + if (!Reg) + return 0; + if (SubReg) + Reg = TRI->getSubReg(Reg, SubReg); + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + unsigned Size = TRI->getRegSizeInBits(*RC) / 32; + if (Size > 1) + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + + if (TRI->hasVGPRs(RC)) { + // VGPRs have 4 banks assigned in a round-robin fashion. + Reg -= AMDGPU::VGPR0; + unsigned Mask = (1 << Size) - 1; + unsigned Used = 0; + // Bitmask lacks an extract method + for (unsigned I = 0; I < Size; ++I) + if (RegsUsed.test(Reg + I)) + Used |= 1 << I; + RegsUsed.set(Reg, Reg + Size); + Mask &= ~Used; + Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank); + return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; + } + + // SGPRs have 8 banks holding 2 consecutive registers each. + Reg = TRI->getEncodingValue(Reg) / 2; + unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); + if (Reg + StartBit >= RegsUsed.size()) + return 0; + + if (Size > 1) + Size /= 2; + unsigned Mask = (1 << Size) - 1; + unsigned Used = 0; + for (unsigned I = 0; I < Size; ++I) + if (RegsUsed.test(StartBit + Reg + I)) + Used |= 1 << I; + RegsUsed.set(StartBit + Reg, StartBit + Reg + Size); + Mask &= ~Used; + Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS + : unsigned(Bank - SGPR_BANK_OFFSET); + Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; + // Reserve 4 bank ids for VGPRs. + return Mask << SGPR_BANK_OFFSET; +} + +unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, + unsigned& UsedBanks, + unsigned Reg, + int Bank) { + unsigned StallCycles = 0; + UsedBanks = 0; + + if (MI.isDebugValue()) + return 0; + + RegsUsed.reset(); + OperandMasks.clear(); + for (const auto& Op : MI.explicit_uses()) { + // Undef can be assigned to any register, so two vregs can be assigned + // the same phys reg within the same instruction. + if (!Op.isReg() || Op.isUndef()) + continue; + + unsigned R = Op.getReg(); + if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) + continue; + + unsigned ShiftedBank = Bank; + + if (Bank != -1 && R == Reg && Op.getSubReg()) { + unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger(); + if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) { + // If a register spans all banks we cannot shift it to avoid conflict.
+ if (countPopulation(LM) >= NUM_VGPR_BANKS) + continue; + ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS; + } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) { + // If a register spans all banks we cannot shift it to avoid conflict. + if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS) + continue; + ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET + + (countTrailingZeros(LM) >> 1)) % + NUM_SGPR_BANKS; + } + } + + unsigned Mask = getRegBankMask(R, Op.getSubReg(), + (Reg == R) ? ShiftedBank : -1); + StallCycles += countPopulation(UsedBanks & Mask); + UsedBanks |= Mask; + OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); + } + + return StallCycles; +} + +unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, + unsigned Reg1, + unsigned Reg2, + unsigned StallCycles) const +{ + unsigned Defs = 0; + MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); + MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); + for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { + if (MI.isDebugInstr()) + continue; + --Def; + if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) + continue; + if (Def->modifiesRegister(Reg1, TRI)) + Defs |= 1; + if (Def->modifiesRegister(Reg2, TRI)) + Defs |= 2; + } + return countPopulation(Defs); +} + +bool GCNRegBankReassign::isReassignable(unsigned Reg) const { + if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + return false; + + const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); + + unsigned PhysReg = VRM->getPhys(Reg); + + if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) + return false; + + for (auto U : MRI->use_nodbg_operands(Reg)) { + if (U.isImplicit()) + return false; + const MachineInstr *UseInst = U.getParent(); + if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) + return false; + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); + if (TRI->hasVGPRs(RC)) + return true; + + unsigned Size = TRI->getRegSizeInBits(*RC); + if (Size > 32) + PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); + + return AMDGPU::SGPR_32RegClass.contains(PhysReg); +} + +unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, + unsigned UsedBanks) const { + unsigned Size = countPopulation(Mask); + unsigned FreeBanks = 0; + unsigned Bank = findFirstSet(Mask); + + UsedBanks &= ~Mask; + + // Find free VGPR banks + if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { + for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { + if (Bank == I) + continue; + unsigned NewMask = ((1 << Size) - 1) << I; + NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; + if (!(UsedBanks & NewMask)) + FreeBanks |= 1 << I; + } + return FreeBanks; + } + + // Find free SGPR banks + // SGPR tuples must be aligned, so step is size in banks it + // crosses. 
+ Bank -= SGPR_BANK_OFFSET; + for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { + if (Bank == I) + continue; + unsigned NewMask = ((1 << Size) - 1) << I; + NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; + if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) + FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; + } + + return FreeBanks; +} + +unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, + unsigned SubReg, + unsigned Mask, + unsigned UsedBanks) const { + if (!isReassignable(Reg)) + return 0; + + unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); + + unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger(); + if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) { + unsigned Shift = countTrailingZeros(LM); + if (Shift >= NUM_VGPR_BANKS) + return 0; + unsigned VB = FreeBanks & VGPR_BANK_MASK; + FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & + VGPR_BANK_MASK; + } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) { + unsigned Shift = countTrailingZeros(LM) >> 1; + if (Shift >= NUM_SGPR_BANKS) + return 0; + unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; + FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & + SGPR_BANK_SHIFTED_MASK; + FreeBanks <<= SGPR_BANK_OFFSET; + } + + LLVM_DEBUG(if (FreeBanks) { + dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) + << " to banks: "; dumpFreeBanks(FreeBanks); + dbgs() << '\n'; }); + + return FreeBanks; +} + +void GCNRegBankReassign::collectCandidates(MachineInstr& MI, + unsigned UsedBanks, + unsigned StallCycles) { + LLVM_DEBUG(MI.dump()); + + if (!StallCycles) + return; + + LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); + + for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { + for (unsigned J = I + 1; J != E; ++J) { + if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) + continue; + + unsigned Reg1 = OperandMasks[I].Reg; + unsigned Reg2 = OperandMasks[J].Reg; + unsigned SubReg1 = OperandMasks[I].SubReg; + unsigned SubReg2 = OperandMasks[J].SubReg; + unsigned Mask1 = OperandMasks[I].Mask; + unsigned Mask2 = OperandMasks[J].Mask; + unsigned Size1 = countPopulation(Mask1); + unsigned Size2 = countPopulation(Mask2); + + LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << + " and " << printReg(Reg2, SubReg2) << '\n'); + + unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); + Weight += MLI->getLoopDepth(MI.getParent()) * 10; + + LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); + + unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); + unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); + if (FreeBanks1) + Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight + + ((Size2 > Size1) ? 1 : 0))); + if (FreeBanks2) + Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight + + ((Size1 > Size2) ? 
1 : 0))); + } + } +} + +unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, + unsigned Reg, int Bank, + bool Collect) { + unsigned TotalStallCycles = 0; + unsigned UsedBanks = 0; + SmallSet Visited; + + for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { + if (MI.isBundle()) + continue; + if (!Visited.insert(&MI).second) + continue; + unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank); + TotalStallCycles += StallCycles; + if (Collect) + collectCandidates(MI, UsedBanks, StallCycles); + } + + return TotalStallCycles; +} + +unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, + unsigned Bank) const { + const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); + unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs + : MaxNumSGPRs; + unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 + : AMDGPU::SGPR0); + + for (unsigned Reg : RC->getRegisters()) { + // Check occupancy limit. + if (TRI->isSubRegisterEq(Reg, MaxReg)) + break; + + if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) + continue; + + for (unsigned I = 0; CSRegs[I]; ++I) + if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && + !LRM->isPhysRegUsed(CSRegs[I])) + return AMDGPU::NoRegister; + + LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); + + if (!LRM->checkInterference(LI, Reg)) + return Reg; + } + + return AMDGPU::NoRegister; +} + +unsigned GCNRegBankReassign::tryReassign(Candidate &C) { + if (!LIS->hasInterval(C.Reg)) + return 0; + + LiveInterval &LI = LIS->getInterval(C.Reg); + LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); + LI.dump()); + + // For each candidate bank walk all instructions in the range of live + // interval and check if replacing the register with one belonging to + // the candidate bank reduces conflicts. + + unsigned OrigStalls = computeStallCycles(C.Reg); + LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); + if (!OrigStalls) + return 0; + + struct BankStall { + BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; + bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; } + unsigned Bank; + unsigned Stalls; + }; + SmallVector BankStalls; + + for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { + if (C.FreeBanks & (1 << Bank)) { + LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); + unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); + if (Stalls < OrigStalls) { + LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " + << Stalls << '\n'); + BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); + } + } + } + std::sort(BankStalls.begin(), BankStalls.end()); + + unsigned OrigReg = VRM->getPhys(C.Reg); + LRM->unassign(LI); + while (!BankStalls.empty()) { + BankStall BS = BankStalls.pop_back_val(); + unsigned Reg = scavengeReg(LI, BS.Bank); + if (Reg == AMDGPU::NoRegister) { + LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) + << '\n'); + continue; + } + LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) + << (LRM->isPhysRegUsed(Reg) ? 
"" : " (new)") + << " in bank " << printBank(BS.Bank) << '\n'); + + LRM->assign(LI, Reg); + + LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); + + return OrigStalls - BS.Stalls; + } + LRM->assign(LI, OrigReg); + + return 0; +} + +unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, + bool Collect) { + unsigned TotalStallCycles = 0; + + for (MachineBasicBlock &MBB : MF) { + + LLVM_DEBUG(if (Collect) { + if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); + else dbgs() << MBB.getName(); dbgs() << ":\n"; + }); + + for (MachineInstr &MI : MBB.instrs()) { + if (MI.isBundle()) + continue; // we analyze the instructions inside the bundle individually + + unsigned UsedBanks = 0; + unsigned StallCycles = analyzeInst(MI, UsedBanks); + + if (Collect) + collectCandidates(MI, UsedBanks, StallCycles); + + TotalStallCycles += StallCycles; + } + + LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); + } + + return TotalStallCycles; +} + +void GCNRegBankReassign::removeCandidates(unsigned Reg) { + Candidates.remove_if([Reg, this](const Candidate& C) { + return C.MI->readsRegister(Reg, TRI); + }); +} + +bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, + unsigned OriginalCycles, + unsigned CyclesSaved) { + unsigned StallCycles = collectCandidates(MF, false); + LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles + << " stall cycles left\n"); + return StallCycles + CyclesSaved == OriginalCycles; +} + +bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget(); + if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) + return false; + + MRI = &MF.getRegInfo(); + TRI = ST->getRegisterInfo(); + MLI = &getAnalysis(); + VRM = &getAnalysis(); + LRM = &getAnalysis(); + LIS = &getAnalysis(); + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned Occupancy = MFI->getOccupancy(); + MaxNumVGPRs = ST->getMaxNumVGPRs(MF); + MaxNumSGPRs = ST->getMaxNumSGPRs(MF); + MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); + MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); + + CSRegs = MRI->getCalleeSavedRegs(); + + RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() + + TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1); + + LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() + << '\n'); + + unsigned StallCycles = collectCandidates(MF); + NumStallsDetected += StallCycles; + + LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " + "function " << MF.getName() << '\n'); + + Candidates.sort(); + + LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; + for (auto C : Candidates) C.dump(this); + dbgs() << "\n\n"); + + unsigned CyclesSaved = 0; + while (!Candidates.empty()) { + Candidate C = Candidates.back(); + unsigned LocalCyclesSaved = tryReassign(C); + CyclesSaved += LocalCyclesSaved; + + if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) + report_fatal_error("RegBank reassign stall cycles verification failed."); + + Candidates.pop_back(); + if (LocalCyclesSaved) { + removeCandidates(C.Reg); + computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); + Candidates.sort(); + + LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; + for (auto C : Candidates) + C.dump(this); + dbgs() << "\n\n"); + } + } + NumStallsRecovered += CyclesSaved; + + LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved + << " cycles saved in function " << MF.getName() << '\n'); + + Candidates.clear(); + + if (VerifyStallCycles == 1 && 
!verifyCycles(MF, StallCycles, CyclesSaved)) + report_fatal_error("RegBank reassign stall cycles verification failed."); + + RegsUsed.clear(); + + return CyclesSaved > 0; +} diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 3d8cacc4f02c..39460fbd8a84 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -1,9 +1,8 @@ //===- GCNRegPressure.cpp -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -64,9 +63,10 @@ void llvm::printLivesAt(SlotIndex SI, } if (!Num) dbgs() << " \n"; } +#endif -static bool isEqual(const GCNRPTracker::LiveRegSet &S1, - const GCNRPTracker::LiveRegSet &S2) { +bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, + const GCNRPTracker::LiveRegSet &S2) { if (S1.size() != S2.size()) return false; @@ -77,7 +77,7 @@ static bool isEqual(const GCNRPTracker::LiveRegSet &S1, } return true; } -#endif + /////////////////////////////////////////////////////////////////////////////// // GCNRegPressure @@ -89,7 +89,9 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg, auto STI = static_cast(MRI.getTargetRegisterInfo()); return STI->isSGPRClass(RC) ? (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) : - (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); + STI->hasAGPRs(RC) ? + (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) : + (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); } void GCNRegPressure::inc(unsigned Reg, @@ -110,16 +112,18 @@ void GCNRegPressure::inc(unsigned Reg, switch (auto Kind = getRegKind(Reg, MRI)) { case SGPR32: case VGPR32: + case AGPR32: assert(PrevMask.none() && NewMask == MaxMask); Value[Kind] += Sign; break; case SGPR_TUPLE: case VGPR_TUPLE: + case AGPR_TUPLE: assert(NewMask < MaxMask || NewMask == MaxMask); assert(PrevMask < NewMask); - Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] += + Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] += Sign * (~PrevMask & NewMask).getNumLanes(); if (PrevMask.none()) { diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index 357d3b7b2334..e4894418b943 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -1,9 +1,8 @@ //===- GCNRegPressure.h -----------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -32,6 +31,8 @@ struct GCNRegPressure { SGPR_TUPLE, VGPR32, VGPR_TUPLE, + AGPR32, + AGPR_TUPLE, TOTAL_KINDS }; @@ -44,9 +45,10 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } unsigned getSGPRNum() const { return Value[SGPR32]; } - unsigned getVGPRNum() const { return Value[VGPR32]; } + unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); } - unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } + unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE], + Value[AGPR_TUPLE]); } unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } unsigned getOccupancy(const GCNSubtarget &ST) const { @@ -191,6 +193,50 @@ GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); +/// creates a map MachineInstr -> LiveRegSet +/// R - range of iterators on instructions +/// After - upon entry or exit of every instruction +/// Note: there is no entry in the map for instructions with empty live reg set +/// Complexity = O(NumVirtRegs * averageLiveRangeSegmentsPerReg * lg(R)) +template <typename Range> +DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> +getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { + std::vector<SlotIndex> Indexes; + Indexes.reserve(std::distance(R.begin(), R.end())); + auto &SII = *LIS.getSlotIndexes(); + for (MachineInstr *I : R) { + auto SI = SII.getInstructionIndex(*I); + Indexes.push_back(After ? SI.getDeadSlot() : SI.getBaseIndex()); + } + std::sort(Indexes.begin(), Indexes.end()); + + auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo(); + DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap; + SmallVector LiveIdxs, SRLiveIdxs; + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = TargetRegisterInfo::index2VirtReg(I); + if (!LIS.hasInterval(Reg)) + continue; + auto &LI = LIS.getInterval(Reg); + LiveIdxs.clear(); + if (!LI.findIndexesLiveAt(Indexes, std::back_inserter(LiveIdxs))) + continue; + if (!LI.hasSubRanges()) { + for (auto SI : LiveIdxs) + LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] = + MRI.getMaxLaneMaskForVReg(Reg); + } else + for (const auto &S : LI.subranges()) { + // constrain search for subranges by indexes live at main range + SRLiveIdxs.clear(); + S.findIndexesLiveAt(LiveIdxs, std::back_inserter(SRLiveIdxs)); + for (auto SI : SRLiveIdxs) + LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] |= S.LaneMask; + } + } + return LiveRegMap; +} + inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI, const LiveIntervals &LIS) { return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS, @@ -212,6 +258,9 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, return Res; } +bool isEqual(const GCNRPTracker::LiveRegSet &S1, + const GCNRPTracker::LiveRegSet &S2); + void printLivesAt(SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index f09b7f6cff22..4ea990ae490e 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1,9 +1,8 @@ //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -446,8 +445,12 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { RPTracker.reset(*MBB->begin(), &LiveIn); MBBLiveIns.erase(LiveInIt); } else { - I = Regions[CurRegion].first; - RPTracker.reset(*I); + auto &Rgn = Regions[CurRegion]; + I = Rgn.first; + auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); + auto LRS = BBLiveInMap.lookup(NonDbgMI); + assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); + RPTracker.reset(*I, &LRS); } for ( ; ; ) { @@ -478,6 +481,23 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { } } +DenseMap +GCNScheduleDAGMILive::getBBLiveInMap() const { + assert(!Regions.empty()); + std::vector BBStarters; + BBStarters.reserve(Regions.size()); + auto I = Regions.rbegin(), E = Regions.rend(); + auto *BB = I->first->getParent(); + do { + auto *MI = &*skipDebugInstructionsForward(I->first, I->second); + BBStarters.push_back(MI); + do { + ++I; + } while (I != E && I->first->getParent() == BB); + } while (I != E); + return getLiveRegMap(BBStarters, false /*After*/, *LIS); +} + void GCNScheduleDAGMILive::finalizeSchedule() { GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); @@ -485,6 +505,9 @@ void GCNScheduleDAGMILive::finalizeSchedule() { LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); + if (!Regions.empty()) + BBLiveInMap = getBBLiveInMap(); + do { Stage++; RegionIdx = 0; diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h index 3ac6af89cb9b..eaf3dee9ba5d 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -1,9 +1,8 @@ //===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,7 +26,7 @@ class GCNSubtarget; /// and the GenericScheduler is that GCNSchedStrategy uses different /// heuristics to determine excess/critical pressure sets. Its goal is to /// maximize kernel occupancy (i.e. maximum number of waves per simd). -class GCNMaxOccupancySchedStrategy : public GenericScheduler { +class GCNMaxOccupancySchedStrategy final : public GenericScheduler { friend class GCNScheduleDAGMILive; SUnit *pickNodeBidirectional(bool &IsTopNode); @@ -60,7 +59,7 @@ public: void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; } }; -class GCNScheduleDAGMILive : public ScheduleDAGMILive { +class GCNScheduleDAGMILive final : public ScheduleDAGMILive { const GCNSubtarget &ST; @@ -78,7 +77,7 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive { // Current region index. 
size_t RegionIdx; - // Vecor of regions recorder for later rescheduling + // Vector of regions recorder for later rescheduling SmallVector, 32> Regions; @@ -91,6 +90,9 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive { // Temporary basic block live-in cache. DenseMap MBBLiveIns; + DenseMap BBLiveInMap; + DenseMap getBBLiveInMap() const; + // Return current region pressure. GCNRegPressure getRealRegPressure() const; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp deleted file mode 100644 index fab0f87dfcbe..000000000000 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ /dev/null @@ -1,1413 +0,0 @@ -//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUInstPrinter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "Utils/AMDGPUAsmUtils.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include - -using namespace llvm; -using namespace llvm::AMDGPU; - -void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, const MCSubtargetInfo &STI) { - OS.flush(); - printInstruction(MI, STI, OS); - printAnnotation(OS, Annot); -} - -void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xf); -} - -void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); -} - -void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - // It's possible to end up with a 32-bit literal used with a 16-bit operand - // with ignored high bits. Print as 32-bit anyway in that case. 
- int64_t Imm = MI->getOperand(OpNo).getImm(); - if (isInt<16>(Imm) || isUInt<16>(Imm)) - O << formatHex(static_cast(Imm & 0xffff)); - else - printU32ImmOperand(MI, OpNo, STI, O); -} - -void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(MI->getOperand(OpNo).getImm() & 0xf); -} - -void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); -} - -void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); -} - -void AMDGPUInstPrinter::printS13ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm())); -} - -void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); -} - -void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo, - raw_ostream &O, StringRef BitName) { - if (MI->getOperand(OpNo).getImm()) { - O << ' ' << BitName; - } -} - -void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "offen"); -} - -void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "idxen"); -} - -void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "addr64"); -} - -void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint16_t Imm = MI->getOperand(OpNo).getImm(); - if (Imm != 0) { - O << ((OpNo == 0)? "offset:" : " offset:"); - printU16ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint16_t Imm = MI->getOperand(OpNo).getImm(); - if (Imm != 0) { - O << ((OpNo == 0)? 
"offset:" : " offset:"); - printS13ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset0:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset1:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printU32ImmOperand(MI, OpNo, STI, O); -} - -void AMDGPUInstPrinter::printSMRDOffset20(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printU32ImmOperand(MI, OpNo, STI, O); -} - -void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printU32ImmOperand(MI, OpNo, STI, O); -} - -void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "gds"); -} - -void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "glc"); -} - -void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "slc"); -} - -void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "tfe"); -} - -void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " dmask:"; - printU16ImmOperand(MI, OpNo, STI, O); - } -} - -void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "unorm"); -} - -void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "da"); -} - -void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - if (STI.hasFeature(AMDGPU::FeatureR128A16)) - printNamedBit(MI, OpNo, O, "a16"); - else - printNamedBit(MI, OpNo, O, "r128"); -} - -void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "lwe"); -} - -void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "d16"); -} - -void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " compr"; -} - -void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " vm"; -} - -void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (unsigned Val = MI->getOperand(OpNo).getImm()) { - O << " dfmt:" << (Val & 15); - O << ", nfmt:" << (Val >> 4); - } -} - -void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, - const MCRegisterInfo &MRI) { - switch (RegNo) { - case AMDGPU::VCC: - O << "vcc"; - return; - case 
-  case AMDGPU::SCC:
-    O << "scc";
-    return;
-  case AMDGPU::EXEC:
-    O << "exec";
-    return;
-  case AMDGPU::M0:
-    O << "m0";
-    return;
-  case AMDGPU::FLAT_SCR:
-    O << "flat_scratch";
-    return;
-  case AMDGPU::XNACK_MASK:
-    O << "xnack_mask";
-    return;
-  case AMDGPU::VCC_LO:
-    O << "vcc_lo";
-    return;
-  case AMDGPU::VCC_HI:
-    O << "vcc_hi";
-    return;
-  case AMDGPU::TBA_LO:
-    O << "tba_lo";
-    return;
-  case AMDGPU::TBA_HI:
-    O << "tba_hi";
-    return;
-  case AMDGPU::TMA_LO:
-    O << "tma_lo";
-    return;
-  case AMDGPU::TMA_HI:
-    O << "tma_hi";
-    return;
-  case AMDGPU::EXEC_LO:
-    O << "exec_lo";
-    return;
-  case AMDGPU::EXEC_HI:
-    O << "exec_hi";
-    return;
-  case AMDGPU::FLAT_SCR_LO:
-    O << "flat_scratch_lo";
-    return;
-  case AMDGPU::FLAT_SCR_HI:
-    O << "flat_scratch_hi";
-    return;
-  case AMDGPU::XNACK_MASK_LO:
-    O << "xnack_mask_lo";
-    return;
-  case AMDGPU::XNACK_MASK_HI:
-    O << "xnack_mask_hi";
-    return;
-  case AMDGPU::FP_REG:
-  case AMDGPU::SP_REG:
-  case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
-  case AMDGPU::PRIVATE_RSRC_REG:
-    llvm_unreachable("pseudo-register should not ever be emitted");
-  default:
-    break;
-  }
-
-  // The low 8 bits of the encoding value is the register index, for both VGPRs
-  // and SGPRs.
-  unsigned RegIdx = MRI.getEncodingValue(RegNo) & ((1 << 8) - 1);
-
-  unsigned NumRegs;
-  if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(RegNo)) {
-    O << 'v';
-    NumRegs = 1;
-  } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(RegNo)) {
-    O << 's';
-    NumRegs = 1;
-  } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo)) {
-    O << 'v';
-    NumRegs = 2;
-  } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo)) {
-    O << 's';
-    NumRegs = 2;
-  } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo)) {
-    O << 'v';
-    NumRegs = 4;
-  } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo)) {
-    O << 's';
-    NumRegs = 4;
-  } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo)) {
-    O << 'v';
-    NumRegs = 3;
-  } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) {
-    O << 'v';
-    NumRegs = 8;
-  } else if (MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) {
-    O << 's';
-    NumRegs = 8;
-  } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) {
-    O << 'v';
-    NumRegs = 16;
-  } else if (MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo)) {
-    O << 's';
-    NumRegs = 16;
-  } else {
-    O << getRegisterName(RegNo);
-    return;
-  }
-
-  if (NumRegs == 1) {
-    O << RegIdx;
-    return;
-  }
-
-  O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
-}
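The register-printing logic deleted above reduces to a simple rule: pick a bank letter from the register class ('v' or 's'), take the low 8 bits of the encoding as the base index, and print either a single index or an inclusive range. A minimal standalone sketch of that formatting rule, illustrative only and not part of the patch (it does not consult the real MCRegisterInfo tables):

    // Sketch: format a GPR or GPR range the way printRegOperand does.
    #include <iostream>
    #include <sstream>
    #include <string>

    static std::string formatGpr(char Bank, unsigned RegIdx, unsigned NumRegs) {
      std::ostringstream OS;
      OS << Bank;                      // 'v' for VGPRs, 's' for SGPRs
      if (NumRegs == 1)
        OS << RegIdx;                  // e.g. "v7"
      else                             // e.g. "v[4:7]" for a 4-register tuple
        OS << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
      return OS.str();
    }

    int main() {
      std::cout << formatGpr('v', 7, 1) << '\n';  // prints: v7
      std::cout << formatGpr('s', 4, 4) << '\n';  // prints: s[4:7]
    }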
-
-void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
-                                    const MCSubtargetInfo &STI, raw_ostream &O) {
-  if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
-    O << "_e64 ";
-  else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
-    O << "_dpp ";
-  else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
-    O << "_sdwa ";
-  else
-    O << "_e32 ";
-
-  printOperand(MI, OpNo, STI, O);
-}
-
-void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
-                                       const MCSubtargetInfo &STI, raw_ostream &O) {
-  if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI))
-    O << " ";
-  else
-    O << "_e32 ";
-
-  printOperand(MI, OpNo, STI, O);
-}
-
-void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
-                                         const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
-  int16_t SImm = static_cast<int16_t>(Imm);
-  if (SImm >= -16 && SImm <= 64) {
-    O << SImm;
-    return;
-  }
-
-  if (Imm == 0x3C00)
-    O << "1.0";
-  else if (Imm == 0xBC00)
-    O << "-1.0";
-  else if (Imm == 0x3800)
-    O << "0.5";
-  else if (Imm == 0xB800)
-    O << "-0.5";
-  else if (Imm == 0x4000)
-    O << "2.0";
-  else if (Imm == 0xC000)
-    O << "-2.0";
-  else if (Imm == 0x4400)
-    O << "4.0";
-  else if (Imm == 0xC400)
-    O << "-4.0";
-  else if (Imm == 0x3118) {
-    assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]);
-    O << "0.15915494";
-  } else
-    O << formatHex(static_cast<uint64_t>(Imm));
-}
-
-void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
-                                           const MCSubtargetInfo &STI,
-                                           raw_ostream &O) {
-  uint16_t Lo16 = static_cast<uint16_t>(Imm);
-  printImmediate16(Lo16, STI, O);
-}
-
-void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
-                                         const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
-  int32_t SImm = static_cast<int32_t>(Imm);
-  if (SImm >= -16 && SImm <= 64) {
-    O << SImm;
-    return;
-  }
-
-  if (Imm == FloatToBits(0.0f))
-    O << "0.0";
-  else if (Imm == FloatToBits(1.0f))
-    O << "1.0";
-  else if (Imm == FloatToBits(-1.0f))
-    O << "-1.0";
-  else if (Imm == FloatToBits(0.5f))
-    O << "0.5";
-  else if (Imm == FloatToBits(-0.5f))
-    O << "-0.5";
-  else if (Imm == FloatToBits(2.0f))
-    O << "2.0";
-  else if (Imm == FloatToBits(-2.0f))
-    O << "-2.0";
-  else if (Imm == FloatToBits(4.0f))
-    O << "4.0";
-  else if (Imm == FloatToBits(-4.0f))
-    O << "-4.0";
-  else if (Imm == 0x3e22f983 &&
-           STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
-    O << "0.15915494";
-  else
-    O << formatHex(static_cast<uint64_t>(Imm));
-}
-
-void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
-                                         const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
-  int64_t SImm = static_cast<int64_t>(Imm);
-  if (SImm >= -16 && SImm <= 64) {
-    O << SImm;
-    return;
-  }
-
-  if (Imm == DoubleToBits(0.0))
-    O << "0.0";
-  else if (Imm == DoubleToBits(1.0))
-    O << "1.0";
-  else if (Imm == DoubleToBits(-1.0))
-    O << "-1.0";
-  else if (Imm == DoubleToBits(0.5))
-    O << "0.5";
-  else if (Imm == DoubleToBits(-0.5))
-    O << "-0.5";
-  else if (Imm == DoubleToBits(2.0))
-    O << "2.0";
-  else if (Imm == DoubleToBits(-2.0))
-    O << "-2.0";
-  else if (Imm == DoubleToBits(4.0))
-    O << "4.0";
-  else if (Imm == DoubleToBits(-4.0))
-    O << "-4.0";
-  else if (Imm == 0x3fc45f306dc9c882 &&
-           STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
-    O << "0.15915494";
-  else {
-    assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882);
-
-    // In rare situations, we will have a 32-bit literal in a 64-bit
-    // operand. This is technically allowed for the encoding of s_mov_b64.
-    O << formatHex(static_cast<uint64_t>(Imm));
-  }
-}
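The printImmediate* routines deleted above only print a bare value when it is one of the hardware's inline constants: the small integers -16..64, a short list of float constants, and 1/(2*pi) when the subtarget supports it; everything else falls back to hex. A standalone sketch of the 32-bit predicate, assuming only that 0x3e22f983 is the single-precision bit pattern of 1/(2*pi):

    // Sketch: is this 32-bit pattern an AMDGPU inline immediate?
    #include <cstdint>
    #include <cstring>

    static uint32_t bitsOf(float F) {
      uint32_t U;
      std::memcpy(&U, &F, sizeof(U)); // type-pun without UB
      return U;
    }

    static bool isInlineImm32(uint32_t Imm, bool HasInv2Pi) {
      int32_t S = static_cast<int32_t>(Imm);
      if (S >= -16 && S <= 64)        // small signed integers
        return true;
      const float FP[] = {0.0f, 1.0f, -1.0f, 0.5f, -0.5f,
                          2.0f, -2.0f, 4.0f, -4.0f};
      for (float F : FP)              // the fixed float constants
        if (Imm == bitsOf(F))
          return true;
      return HasInv2Pi && Imm == 0x3e22f983; // 1/(2*pi), gated on subtarget
    }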
-
-void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  if (OpNo >= MI->getNumOperands()) {
-    O << "/*Missing OP" << OpNo << "*/";
-    return;
-  }
-
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isReg()) {
-    printRegOperand(Op.getReg(), O, MRI);
-  } else if (Op.isImm()) {
-    const MCInstrDesc &Desc = MII.get(MI->getOpcode());
-    switch (Desc.OpInfo[OpNo].OperandType) {
-    case AMDGPU::OPERAND_REG_IMM_INT32:
-    case AMDGPU::OPERAND_REG_IMM_FP32:
-    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
-    case MCOI::OPERAND_IMMEDIATE:
-      printImmediate32(Op.getImm(), STI, O);
-      break;
-    case AMDGPU::OPERAND_REG_IMM_INT64:
-    case AMDGPU::OPERAND_REG_IMM_FP64:
-    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
-      printImmediate64(Op.getImm(), STI, O);
-      break;
-    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
-    case AMDGPU::OPERAND_REG_IMM_INT16:
-    case AMDGPU::OPERAND_REG_IMM_FP16:
-      printImmediate16(Op.getImm(), STI, O);
-      break;
-    case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-    case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
-      printImmediateV216(Op.getImm(), STI, O);
-      break;
-    case MCOI::OPERAND_UNKNOWN:
-    case MCOI::OPERAND_PCREL:
-      O << formatDec(Op.getImm());
-      break;
-    case MCOI::OPERAND_REGISTER:
-      // FIXME: This should be removed and handled somewhere else. Seems to come
-      // from a disassembler bug.
-      O << "/*invalid immediate*/";
-      break;
-    default:
-      // We hit this for the immediate instruction bits that don't yet have a
-      // custom printer.
-      llvm_unreachable("unexpected immediate operand type");
-    }
-  } else if (Op.isFPImm()) {
-    // We special case 0.0 because otherwise it will be printed as an integer.
-    if (Op.getFPImm() == 0.0)
-      O << "0.0";
-    else {
-      const MCInstrDesc &Desc = MII.get(MI->getOpcode());
-      int RCID = Desc.OpInfo[OpNo].RegClass;
-      unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
-      if (RCBits == 32)
-        printImmediate32(FloatToBits(Op.getFPImm()), STI, O);
-      else if (RCBits == 64)
-        printImmediate64(DoubleToBits(Op.getFPImm()), STI, O);
-      else
-        llvm_unreachable("Invalid register class size");
-    }
-  } else if (Op.isExpr()) {
-    const MCExpr *Exp = Op.getExpr();
-    Exp->print(O, &MAI);
-  } else {
-    O << "/*INV_OP*/";
-  }
-}
-
-void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
-                                                   unsigned OpNo,
-                                                   const MCSubtargetInfo &STI,
-                                                   raw_ostream &O) {
-  unsigned InputModifiers = MI->getOperand(OpNo).getImm();
-
-  // Use 'neg(...)' instead of '-' to avoid ambiguity.
-  // This is important for integer literals because
-  // -1 is not the same value as neg(1).
-  bool NegMnemo = false;
-
-  if (InputModifiers & SISrcMods::NEG) {
-    if (OpNo + 1 < MI->getNumOperands() &&
-        (InputModifiers & SISrcMods::ABS) == 0) {
-      const MCOperand &Op = MI->getOperand(OpNo + 1);
-      NegMnemo = Op.isImm() || Op.isFPImm();
-    }
-    if (NegMnemo) {
-      O << "neg(";
-    } else {
-      O << '-';
-    }
-  }
-
-  if (InputModifiers & SISrcMods::ABS)
-    O << '|';
-  printOperand(MI, OpNo + 1, STI, O);
-  if (InputModifiers & SISrcMods::ABS)
-    O << '|';
-
-  if (NegMnemo) {
-    O << ')';
-  }
-}
-
-void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
-                                                    unsigned OpNo,
-                                                    const MCSubtargetInfo &STI,
-                                                    raw_ostream &O) {
-  unsigned InputModifiers = MI->getOperand(OpNo).getImm();
-  if (InputModifiers & SISrcMods::SEXT)
-    O << "sext(";
-  printOperand(MI, OpNo + 1, STI, O);
-  if (InputModifiers & SISrcMods::SEXT)
-    O << ')';
-}
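The neg(...) spelling in the function above matters because the NEG source modifier negates the operand's floating-point interpretation, a sign-bit flip, which is not the two's-complement negation that the literal -1 denotes. A small illustration, under the assumption that f32 modifier negation can be modeled as flipping bit 31:

    // Sketch: neg(1) and the integer literal -1 are different bit patterns.
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t One = 1;
      uint32_t NegModApplied = One ^ 0x80000000u;       // NEG modifier: flip sign bit
      uint32_t IntMinusOne = static_cast<uint32_t>(-1); // two's complement -1
      std::printf("neg(1) operand bits: 0x%08x\n", NegModApplied); // 0x80000001
      std::printf("literal -1 bits:     0x%08x\n", IntMinusOne);   // 0xffffffff
    }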
-
-void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  using namespace AMDGPU::DPP;
-
-  unsigned Imm = MI->getOperand(OpNo).getImm();
-  if (Imm <= DppCtrl::QUAD_PERM_LAST) {
-    O << " quad_perm:[";
-    O << formatDec(Imm & 0x3) << ',';
-    O << formatDec((Imm & 0xc) >> 2) << ',';
-    O << formatDec((Imm & 0x30) >> 4) << ',';
-    O << formatDec((Imm & 0xc0) >> 6) << ']';
-  } else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
-             (Imm <= DppCtrl::ROW_SHL_LAST)) {
-    O << " row_shl:";
-    printU4ImmDecOperand(MI, OpNo, O);
-  } else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
-             (Imm <= DppCtrl::ROW_SHR_LAST)) {
-    O << " row_shr:";
-    printU4ImmDecOperand(MI, OpNo, O);
-  } else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
-             (Imm <= DppCtrl::ROW_ROR_LAST)) {
-    O << " row_ror:";
-    printU4ImmDecOperand(MI, OpNo, O);
-  } else if (Imm == DppCtrl::WAVE_SHL1) {
-    O << " wave_shl:1";
-  } else if (Imm == DppCtrl::WAVE_ROL1) {
-    O << " wave_rol:1";
-  } else if (Imm == DppCtrl::WAVE_SHR1) {
-    O << " wave_shr:1";
-  } else if (Imm == DppCtrl::WAVE_ROR1) {
-    O << " wave_ror:1";
-  } else if (Imm == DppCtrl::ROW_MIRROR) {
-    O << " row_mirror";
-  } else if (Imm == DppCtrl::ROW_HALF_MIRROR) {
-    O << " row_half_mirror";
-  } else if (Imm == DppCtrl::BCAST15) {
-    O << " row_bcast:15";
-  } else if (Imm == DppCtrl::BCAST31) {
-    O << " row_bcast:31";
-  } else {
-    O << " /* Invalid dpp_ctrl value */";
-  }
-}
-
-void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  O << " row_mask:";
-  printU4ImmOperand(MI, OpNo, STI, O);
-}
-
-void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo,
-                                      const MCSubtargetInfo &STI,
-                                      raw_ostream &O) {
-  O << " bank_mask:";
-  printU4ImmOperand(MI, OpNo, STI, O);
-}
-
-void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
-                                       const MCSubtargetInfo &STI,
-                                       raw_ostream &O) {
-  unsigned Imm = MI->getOperand(OpNo).getImm();
-  if (Imm) {
-    O << " bound_ctrl:0"; // XXX - this syntax is used in sp3
-  }
-}
-
-void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo,
-                                     raw_ostream &O) {
-  using namespace llvm::AMDGPU::SDWA;
-
-  unsigned Imm = MI->getOperand(OpNo).getImm();
-  switch (Imm) {
-  case SdwaSel::BYTE_0: O << "BYTE_0"; break;
-  case SdwaSel::BYTE_1: O << "BYTE_1"; break;
-  case SdwaSel::BYTE_2: O << "BYTE_2"; break;
-  case SdwaSel::BYTE_3: O << "BYTE_3"; break;
-  case SdwaSel::WORD_0: O << "WORD_0"; break;
-  case SdwaSel::WORD_1: O << "WORD_1"; break;
-  case SdwaSel::DWORD: O << "DWORD"; break;
-  default: llvm_unreachable("Invalid SDWA data select operand");
-  }
-}
-
-void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo,
-                                        const MCSubtargetInfo &STI,
-                                        raw_ostream &O) {
-  O << "dst_sel:";
-  printSDWASel(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
-                                         const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
-  O << "src0_sel:";
-  printSDWASel(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
-                                         const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
-  O << "src1_sel:";
-  printSDWASel(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo,
-                                           const MCSubtargetInfo &STI,
-                                           raw_ostream &O) {
-  using namespace llvm::AMDGPU::SDWA;
-
-  O << "dst_unused:";
-  unsigned Imm = MI->getOperand(OpNo).getImm();
-  switch (Imm) {
-  case DstUnused::UNUSED_PAD: O << "UNUSED_PAD"; break;
-  case DstUnused::UNUSED_SEXT: O << "UNUSED_SEXT"; break;
-  case DstUnused::UNUSED_PRESERVE: O << "UNUSED_PRESERVE"; break;
-  default: llvm_unreachable("Invalid SDWA dest_unused operand");
-  }
-}
-
-template <unsigned N>
-void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  unsigned Opc = MI->getOpcode();
-  int EnIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::en);
-  unsigned En = MI->getOperand(EnIdx).getImm();
-
-  int ComprIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::compr);
-
-  // If compr is set, print as src0, src0, src1, src1
-  if (MI->getOperand(ComprIdx).getImm()) {
-    if (N == 1 || N == 2)
-      --OpNo;
-    else if (N == 3)
-      OpNo -= 2;
-  }
-
-  if (En & (1 << N))
-    printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
-  else
-    O << "off";
-}
-
-void AMDGPUInstPrinter::printExpSrc0(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  printExpSrcN<0>(MI, OpNo, STI, O);
-}
-
-void AMDGPUInstPrinter::printExpSrc1(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  printExpSrcN<1>(MI, OpNo, STI, O);
-}
-
-void AMDGPUInstPrinter::printExpSrc2(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  printExpSrcN<2>(MI, OpNo, STI, O);
-}
-
-void AMDGPUInstPrinter::printExpSrc3(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  printExpSrcN<3>(MI, OpNo, STI, O);
-}
-
-void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
-                                    const MCSubtargetInfo &STI,
-                                    raw_ostream &O) {
-  // This is really a 6 bit field.
-  uint32_t Tgt = MI->getOperand(OpNo).getImm() & ((1 << 6) - 1);
-
-  if (Tgt <= 7)
-    O << " mrt" << Tgt;
-  else if (Tgt == 8)
-    O << " mrtz";
-  else if (Tgt == 9)
-    O << " null";
-  else if (Tgt >= 12 && Tgt <= 15)
-    O << " pos" << Tgt - 12;
-  else if (Tgt >= 32 && Tgt <= 63)
-    O << " param" << Tgt - 32;
-  else {
-    // Reserved values 10, 11
-    O << " invalid_target_" << Tgt;
-  }
-}
-
-static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod,
-                               bool IsPacked, bool HasDstSel) {
-  int DefaultValue = IsPacked && (Mod == SISrcMods::OP_SEL_1);
-
-  for (int I = 0; I < NumOps; ++I) {
-    if (!!(Ops[I] & Mod) != DefaultValue)
-      return false;
-  }
-
-  if (HasDstSel && (Ops[0] & SISrcMods::DST_OP_SEL) != 0)
-    return false;
-
-  return true;
-}
-
-void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
-                                            StringRef Name,
-                                            unsigned Mod,
-                                            raw_ostream &O) {
-  unsigned Opc = MI->getOpcode();
-  int NumOps = 0;
-  int Ops[3];
-
-  for (int OpName : { AMDGPU::OpName::src0_modifiers,
-                      AMDGPU::OpName::src1_modifiers,
-                      AMDGPU::OpName::src2_modifiers }) {
-    int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
-    if (Idx == -1)
-      break;
-
-    Ops[NumOps++] = MI->getOperand(Idx).getImm();
-  }
-
-  const bool HasDstSel =
-    NumOps > 0 &&
-    Mod == SISrcMods::OP_SEL_0 &&
-    MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3_OPSEL;
-
-  const bool IsPacked =
-    MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsPacked;
-
-  if (allOpsDefaultValue(Ops, NumOps, Mod, IsPacked, HasDstSel))
-    return;
-
-  O << Name;
-  for (int I = 0; I < NumOps; ++I) {
-    if (I != 0)
-      O << ',';
-
-    O << !!(Ops[I] & Mod);
-  }
-
-  if (HasDstSel) {
-    O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL);
-  }
-
-  O << ']';
-}
-
-void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
-                                   const MCSubtargetInfo &STI,
-                                   raw_ostream &O) {
-  printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
-}
-
-void AMDGPUInstPrinter::printOpSelHi(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  printPackedModifier(MI, " op_sel_hi:[", SISrcMods::OP_SEL_1, O);
-}
-
-void AMDGPUInstPrinter::printNegLo(const MCInst *MI, unsigned OpNo,
-                                   const MCSubtargetInfo &STI,
-                                   raw_ostream &O) {
-  printPackedModifier(MI, " neg_lo:[", SISrcMods::NEG, O);
-}
-
-void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo,
-                                   const MCSubtargetInfo &STI,
-                                   raw_ostream &O) {
-  printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O);
-}
-
-void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
-                                        const MCSubtargetInfo &STI,
-                                        raw_ostream &O) {
-  unsigned Imm = MI->getOperand(OpNum).getImm();
-  switch (Imm) {
-  case 0:
-    O << "p10";
-    break;
-  case 1:
-    O << "p20";
-    break;
-  case 2:
-    O << "p0";
-    break;
-  default:
-    O << "invalid_param_" << Imm;
-  }
-}
-
-void AMDGPUInstPrinter::printInterpAttr(const MCInst *MI, unsigned OpNum,
-                                        const MCSubtargetInfo &STI,
-                                        raw_ostream &O) {
-  unsigned Attr = MI->getOperand(OpNum).getImm();
-  O << "attr" << Attr;
-}
-
-void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum,
-                                            const MCSubtargetInfo &STI,
-                                            raw_ostream &O) {
-  unsigned Chan = MI->getOperand(OpNum).getImm();
-  O << '.' << "xyzw"[Chan & 0x3];
-}
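printPackedModifier above prints one bit per source operand, taken from each src*_modifiers immediate, and omits the list entirely when every bit has its default value. A sketch of just the bit-list rendering; the mask value standing in for OP_SEL_0 here is a placeholder, not the real SISrcMods constant:

    // Sketch: render a per-source modifier mask as " op_sel:[a,b,c]".
    #include <cstdio>

    static void printBitList(const char *Name, const int Mods[], int NumOps,
                             int Bit) {
      std::printf("%s[", Name);
      for (int I = 0; I < NumOps; ++I)                 // one 0/1 per source
        std::printf("%s%d", I ? "," : "", (Mods[I] & Bit) != 0);
      std::printf("]\n");
    }

    int main() {
      int Mods[3] = {4, 0, 4};          // pretend the op_sel bit is bit 2
      printBitList(" op_sel:", Mods, 3, 4); // prints: " op_sel:[1,0,1]"
    }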
<< "xyzw"[Chan & 0x3]; -} - -void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - if (Val == 0) { - O << " 0"; - return; - } - - if (Val & VGPRIndexMode::DST_ENABLE) - O << " dst"; - - if (Val & VGPRIndexMode::SRC0_ENABLE) - O << " src0"; - - if (Val & VGPRIndexMode::SRC1_ENABLE) - O << " src1"; - - if (Val & VGPRIndexMode::SRC2_ENABLE) - O << " src2"; -} - -void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printOperand(MI, OpNo, STI, O); - O << ", "; - printOperand(MI, OpNo + 1, STI, O); -} - -void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, - raw_ostream &O, StringRef Asm, - StringRef Default) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm()); - if (Op.getImm() == 1) { - O << Asm; - } else { - O << Default; - } -} - -void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, - raw_ostream &O, char Asm) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm()); - if (Op.getImm() == 1) - O << Asm; -} - -void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " high"; -} - -void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " clamp"; -} - -void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - int Imm = MI->getOperand(OpNo).getImm(); - if (Imm == SIOutMods::MUL2) - O << " mul:2"; - else if (Imm == SIOutMods::MUL4) - O << " mul:4"; - else if (Imm == SIOutMods::DIV2) - O << " div:2"; -} - -void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - using namespace llvm::AMDGPU::SendMsg; - - const unsigned SImm16 = MI->getOperand(OpNo).getImm(); - const unsigned Id = SImm16 & ID_MASK_; - do { - if (Id == ID_INTERRUPT) { - if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0. - break; - O << "sendmsg(" << IdSymbolic[Id] << ')'; - return; - } - if (Id == ID_GS || Id == ID_GS_DONE) { - if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0. - break; - const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_; - const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; - if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only. - break; - if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits. - break; - O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs]; - if (OpGs != OP_GS_NOP) { O << ", " << StreamId; } - O << ')'; - return; - } - if (Id == ID_SYSMSG) { - if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0. - break; - const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_; - if (! (OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown. - break; - O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')'; - return; - } - } while (false); - O << SImm16; // Unknown simm16 code. 
-}
-
-static void printSwizzleBitmask(const uint16_t AndMask,
-                                const uint16_t OrMask,
-                                const uint16_t XorMask,
-                                raw_ostream &O) {
-  using namespace llvm::AMDGPU::Swizzle;
-
-  uint16_t Probe0 = ((0 & AndMask) | OrMask) ^ XorMask;
-  uint16_t Probe1 = ((BITMASK_MASK & AndMask) | OrMask) ^ XorMask;
-
-  O << "\"";
-
-  for (unsigned Mask = 1 << (BITMASK_WIDTH - 1); Mask > 0; Mask >>= 1) {
-    uint16_t p0 = Probe0 & Mask;
-    uint16_t p1 = Probe1 & Mask;
-
-    if (p0 == p1) {
-      if (p0 == 0) {
-        O << "0";
-      } else {
-        O << "1";
-      }
-    } else {
-      if (p0 == 0) {
-        O << "p";
-      } else {
-        O << "i";
-      }
-    }
-  }
-
-  O << "\"";
-}
-
-void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  using namespace llvm::AMDGPU::Swizzle;
-
-  uint16_t Imm = MI->getOperand(OpNo).getImm();
-  if (Imm == 0) {
-    return;
-  }
-
-  O << " offset:";
-
-  if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) {
-
-    O << "swizzle(" << IdSymbolic[ID_QUAD_PERM];
-    for (auto i = 0; i < LANE_NUM; ++i) {
-      O << ",";
-      O << formatDec(Imm & LANE_MASK);
-      Imm >>= LANE_SHIFT;
-    }
-    O << ")";
-
-  } else if ((Imm & BITMASK_PERM_ENC_MASK) == BITMASK_PERM_ENC) {
-
-    uint16_t AndMask = (Imm >> BITMASK_AND_SHIFT) & BITMASK_MASK;
-    uint16_t OrMask = (Imm >> BITMASK_OR_SHIFT) & BITMASK_MASK;
-    uint16_t XorMask = (Imm >> BITMASK_XOR_SHIFT) & BITMASK_MASK;
-
-    if (AndMask == BITMASK_MAX &&
-        OrMask == 0 &&
-        countPopulation(XorMask) == 1) {
-
-      O << "swizzle(" << IdSymbolic[ID_SWAP];
-      O << ",";
-      O << formatDec(XorMask);
-      O << ")";
-
-    } else if (AndMask == BITMASK_MAX &&
-               OrMask == 0 && XorMask > 0 &&
-               isPowerOf2_64(XorMask + 1)) {
-
-      O << "swizzle(" << IdSymbolic[ID_REVERSE];
-      O << ",";
-      O << formatDec(XorMask + 1);
-      O << ")";
-
-    } else {
-
-      uint16_t GroupSize = BITMASK_MAX - AndMask + 1;
-      if (GroupSize > 1 &&
-          isPowerOf2_64(GroupSize) &&
-          OrMask < GroupSize &&
-          XorMask == 0) {
-
-        O << "swizzle(" << IdSymbolic[ID_BROADCAST];
-        O << ",";
-        O << formatDec(GroupSize);
-        O << ",";
-        O << formatDec(OrMask);
-        O << ")";
-
-      } else {
-        O << "swizzle(" << IdSymbolic[ID_BITMASK_PERM];
-        O << ",";
-        printSwizzleBitmask(AndMask, OrMask, XorMask, O);
-        O << ")";
-      }
-    }
-  } else {
-    printU16ImmDecOperand(MI, OpNo, O);
-  }
-}
-
-void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
-                                      const MCSubtargetInfo &STI,
-                                      raw_ostream &O) {
-  AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI.getCPU());
-
-  unsigned SImm16 = MI->getOperand(OpNo).getImm();
-  unsigned Vmcnt, Expcnt, Lgkmcnt;
-  decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt);
-
-  bool NeedSpace = false;
-
-  if (Vmcnt != getVmcntBitMask(ISA)) {
-    O << "vmcnt(" << Vmcnt << ')';
-    NeedSpace = true;
-  }
-
-  if (Expcnt != getExpcntBitMask(ISA)) {
-    if (NeedSpace)
-      O << ' ';
-    O << "expcnt(" << Expcnt << ')';
-    NeedSpace = true;
-  }
-
-  if (Lgkmcnt != getLgkmcntBitMask(ISA)) {
-    if (NeedSpace)
-      O << ' ';
-    O << "lgkmcnt(" << Lgkmcnt << ')';
-  }
-}
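printWaitFlag above decodes an s_waitcnt immediate into its three counters and omits any counter left at its all-ones "don't wait" value. A sketch using the original SI field layout (vmcnt in bits 3:0, expcnt in 6:4, lgkmcnt in 11:8); real code must go through decodeWaitcnt(), since the field widths differ across GPU generations:

    // Sketch: decode an s_waitcnt simm16, assuming the SI-era layout.
    #include <cstdio>

    int main() {
      unsigned SImm16 = 0x070;                // vmcnt(0), expcnt at max, lgkmcnt(0)
      unsigned Vmcnt = SImm16 & 0xf;
      unsigned Expcnt = (SImm16 >> 4) & 0x7;  // 7 == max == "don't wait"
      unsigned Lgkmcnt = (SImm16 >> 8) & 0xf;
      std::printf("vmcnt(%u) expcnt(%u) lgkmcnt(%u)\n", Vmcnt, Expcnt, Lgkmcnt);
    }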
-
-void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
-                                   const MCSubtargetInfo &STI, raw_ostream &O) {
-  using namespace llvm::AMDGPU::Hwreg;
-
-  unsigned SImm16 = MI->getOperand(OpNo).getImm();
-  const unsigned Id = (SImm16 & ID_MASK_) >> ID_SHIFT_;
-  const unsigned Offset = (SImm16 & OFFSET_MASK_) >> OFFSET_SHIFT_;
-  const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
-
-  O << "hwreg(";
-  unsigned Last = ID_SYMBOLIC_LAST_;
-  if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI) || AMDGPU::isVI(STI))
-    Last = ID_SYMBOLIC_FIRST_GFX9_;
-  if (ID_SYMBOLIC_FIRST_ <= Id && Id < Last && IdSymbolic[Id]) {
-    O << IdSymbolic[Id];
-  } else {
-    O << Id;
-  }
-  if (Width != WIDTH_M1_DEFAULT_ + 1 || Offset != OFFSET_DEFAULT_) {
-    O << ", " << Offset << ", " << Width;
-  }
-  O << ')';
-}
-
-#include "AMDGPUGenAsmWriter.inc"
-
-void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                                StringRef Annot, const MCSubtargetInfo &STI) {
-  O.flush();
-  printInstruction(MI, O);
-  printAnnotation(O, Annot);
-}
-
-void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
-                               raw_ostream &O) {
-  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|');
-}
-
-void R600InstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
-                                       raw_ostream &O) {
-  int BankSwizzle = MI->getOperand(OpNo).getImm();
-  switch (BankSwizzle) {
-  case 1:
-    O << "BS:VEC_021/SCL_122";
-    break;
-  case 2:
-    O << "BS:VEC_120/SCL_212";
-    break;
-  case 3:
-    O << "BS:VEC_102/SCL_221";
-    break;
-  case 4:
-    O << "BS:VEC_201";
-    break;
-  case 5:
-    O << "BS:VEC_210";
-    break;
-  default:
-    break;
-  }
-}
-
-void R600InstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
-                                 raw_ostream &O) {
-  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "_SAT");
-}
-
-void R600InstPrinter::printCT(const MCInst *MI, unsigned OpNo,
-                              raw_ostream &O) {
-  unsigned CT = MI->getOperand(OpNo).getImm();
-  switch (CT) {
-  case 0:
-    O << 'U';
-    break;
-  case 1:
-    O << 'N';
-    break;
-  default:
-    break;
-  }
-}
-
-void R600InstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
-                                  raw_ostream &O) {
-  int KCacheMode = MI->getOperand(OpNo).getImm();
-  if (KCacheMode > 0) {
-    int KCacheBank = MI->getOperand(OpNo - 2).getImm();
-    O << "CB" << KCacheBank << ':';
-    int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
-    int LineSize = (KCacheMode == 1) ? 16 : 32;
-    O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize;
-  }
-}
-
-void R600InstPrinter::printLast(const MCInst *MI, unsigned OpNo,
-                                raw_ostream &O) {
-  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "*", " ");
-}
-
-void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
-                                   raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  assert(Op.isImm() || Op.isExpr());
-  if (Op.isImm()) {
-    int64_t Imm = Op.getImm();
-    O << Imm << '(' << BitsToFloat(Imm) << ')';
-  }
-  if (Op.isExpr()) {
-    Op.getExpr()->print(O << '@', &MAI);
-  }
-}
-
-void R600InstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
-                               raw_ostream &O) {
-  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '-');
-}
-
-void R600InstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
-                                raw_ostream &O) {
-  switch (MI->getOperand(OpNo).getImm()) {
-  default: break;
-  case 1:
-    O << " * 2.0";
-    break;
-  case 2:
-    O << " * 4.0";
-    break;
-  case 3:
-    O << " / 2.0";
-    break;
-  }
-}
-
-void R600InstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
-                                      raw_ostream &O) {
-  printOperand(MI, OpNo, O);
-  O << ", ";
-  printOperand(MI, OpNo + 1, O);
-}
-
-void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                   raw_ostream &O) {
-  if (OpNo >= MI->getNumOperands()) {
-    O << "/*Missing OP" << OpNo << "*/";
-    return;
-  }
-
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isReg()) {
-    switch (Op.getReg()) {
-    // This is the default predicate state, so we don't need to print it.
-    case R600::PRED_SEL_OFF:
-      break;
-
-    default:
-      O << getRegisterName(Op.getReg());
-      break;
-    }
-  } else if (Op.isImm()) {
-    O << Op.getImm();
-  } else if (Op.isFPImm()) {
-    // We special case 0.0 because otherwise it will be printed as an integer.
-    if (Op.getFPImm() == 0.0)
-      O << "0.0";
-    else {
-      O << Op.getFPImm();
-    }
-  } else if (Op.isExpr()) {
-    const MCExpr *Exp = Op.getExpr();
-    Exp->print(O, &MAI);
-  } else {
-    O << "/*INV_OP*/";
-  }
-}
-
-void R600InstPrinter::printRel(const MCInst *MI, unsigned OpNo,
-                               raw_ostream &O) {
-  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '+');
-}
-
-void R600InstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
-                                raw_ostream &O) {
-  unsigned Sel = MI->getOperand(OpNo).getImm();
-  switch (Sel) {
-  case 0:
-    O << 'X';
-    break;
-  case 1:
-    O << 'Y';
-    break;
-  case 2:
-    O << 'Z';
-    break;
-  case 3:
-    O << 'W';
-    break;
-  case 4:
-    O << '0';
-    break;
-  case 5:
-    O << '1';
-    break;
-  case 7:
-    O << '_';
-    break;
-  default:
-    break;
-  }
-}
-
-void R600InstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
-                                          raw_ostream &O) {
-  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "ExecMask,");
-}
-
-void R600InstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
-                                      raw_ostream &O) {
-  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "Pred,");
-}
-
-void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
-                                 raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.getImm() == 0) {
-    O << " (MASKED)";
-  }
-}
-
-#include "R600GenAsmWriter.inc"
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
deleted file mode 100644
index 0ba74ca0f3e1..000000000000
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ /dev/null
@@ -1,250 +0,0 @@
-//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
-#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
-
-#include "llvm/MC/MCInstPrinter.h"
-
-namespace llvm {
-
-class AMDGPUInstPrinter : public MCInstPrinter {
-public:
-  AMDGPUInstPrinter(const MCAsmInfo &MAI,
-                    const MCInstrInfo &MII, const MCRegisterInfo &MRI)
-    : MCInstPrinter(MAI, MII, MRI) {}
-
-  //Autogenerated by tblgen
-  void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
-                        raw_ostream &O);
-  static const char *getRegisterName(unsigned RegNo);
-
-  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
-                 const MCSubtargetInfo &STI) override;
-  static void printRegOperand(unsigned RegNo, raw_ostream &O,
-                              const MCRegisterInfo &MRI);
-
-private:
-  void printU4ImmOperand(const MCInst *MI, unsigned OpNo,
-                         const MCSubtargetInfo &STI, raw_ostream &O);
-  void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
-                          const MCSubtargetInfo &STI, raw_ostream &O);
-  void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printS13ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
-                          const MCSubtargetInfo &STI, raw_ostream &O);
-  void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
-                     StringRef BitName);
-  void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                   raw_ostream &O);
-  void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                      raw_ostream &O);
-
-  void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printSMRDOffset8(const MCInst *MI, unsigned OpNo,
-                        const MCSubtargetInfo &STI, raw_ostream &O);
-  void printSMRDOffset20(const MCInst *MI, unsigned OpNo,
-                         const MCSubtargetInfo &STI, raw_ostream &O);
-  void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
-                              const MCSubtargetInfo &STI, raw_ostream &O);
-  void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                raw_ostream &O);
-  void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                raw_ostream &O);
-  void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                raw_ostream &O);
-  void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                raw_ostream &O);
-  void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                  raw_ostream &O);
-  void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                  raw_ostream &O);
-  void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-               raw_ostream &O);
-  void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printLWE(const MCInst *MI, unsigned OpNo,
-                const MCSubtargetInfo &STI, raw_ostream &O);
-  void printD16(const MCInst *MI, unsigned OpNo,
-                const MCSubtargetInfo &STI, raw_ostream &O);
-  void printExpCompr(const MCInst *MI, unsigned OpNo,
-                     const MCSubtargetInfo &STI, raw_ostream &O);
-  void printExpVM(const MCInst *MI, unsigned OpNo,
-                  const MCSubtargetInfo &STI, raw_ostream &O);
-  void printFORMAT(const MCInst *MI, unsigned OpNo,
-                   const MCSubtargetInfo &STI, raw_ostream &O);
-
-  void printRegOperand(unsigned RegNo, raw_ostream &O);
-  void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                   raw_ostream &O);
-  void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                      raw_ostream &O);
-  void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
-                        raw_ostream &O);
-  void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
-                          raw_ostream &O);
-  void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
-                        raw_ostream &O);
-  void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
-                        raw_ostream &O);
-  void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo,
-                                  const MCSubtargetInfo &STI, raw_ostream &O);
-  void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo,
-                                   const MCSubtargetInfo &STI, raw_ostream &O);
-  void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printBankMask(const MCInst *MI, unsigned OpNo,
-                     const MCSubtargetInfo &STI, raw_ostream &O);
-  void printBoundCtrl(const MCInst *MI, unsigned OpNo,
-                      const MCSubtargetInfo &STI, raw_ostream &O);
-  void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printSDWADstSel(const MCInst *MI, unsigned OpNo,
-                       const MCSubtargetInfo &STI, raw_ostream &O);
-  void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
-                        const MCSubtargetInfo &STI, raw_ostream &O);
-  void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
-                        const MCSubtargetInfo &STI, raw_ostream &O);
-  void printSDWADstUnused(const MCInst *MI, unsigned OpNo,
-                          const MCSubtargetInfo &STI, raw_ostream &O);
-  void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod,
-                           raw_ostream &O);
-  void printOpSel(const MCInst *MI, unsigned OpNo,
-                  const MCSubtargetInfo &STI, raw_ostream &O);
-  void printOpSelHi(const MCInst *MI, unsigned OpNo,
-                    const MCSubtargetInfo &STI, raw_ostream &O);
-  void printNegLo(const MCInst *MI, unsigned OpNo,
-                  const MCSubtargetInfo &STI, raw_ostream &O);
-  void printNegHi(const MCInst *MI, unsigned OpNo,
-                  const MCSubtargetInfo &STI, raw_ostream &O);
-  void printInterpSlot(const MCInst *MI, unsigned OpNo,
-                       const MCSubtargetInfo &STI, raw_ostream &O);
-  void printInterpAttr(const MCInst *MI, unsigned OpNo,
-                       const MCSubtargetInfo &STI, raw_ostream &O);
-  void printInterpAttrChan(const MCInst *MI, unsigned OpNo,
-                           const MCSubtargetInfo &STI, raw_ostream &O);
-
-  void printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
-                          const MCSubtargetInfo &STI, raw_ostream &O);
-  void printMemOperand(const MCInst *MI, unsigned OpNo,
-                       const MCSubtargetInfo &STI, raw_ostream &O);
-
-
-  template <unsigned N>
-  void printExpSrcN(const MCInst *MI, unsigned OpNo,
-                    const MCSubtargetInfo &STI, raw_ostream &O);
-  void printExpSrc0(const MCInst *MI, unsigned OpNo,
-                    const MCSubtargetInfo &STI, raw_ostream &O);
-  void printExpSrc1(const MCInst *MI, unsigned OpNo,
-                    const MCSubtargetInfo &STI, raw_ostream &O);
-  void printExpSrc2(const MCInst *MI, unsigned OpNo,
-                    const MCSubtargetInfo &STI, raw_ostream &O);
-  void printExpSrc3(const MCInst *MI, unsigned OpNo,
-                    const MCSubtargetInfo &STI, raw_ostream &O);
-  void printExpTgt(const MCInst *MI, unsigned OpNo,
-                   const MCSubtargetInfo &STI, raw_ostream &O);
-
-public:
-  static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
-                         StringRef Asm, StringRef Default = "");
-  static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
-                         char Asm);
-protected:
-  void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                raw_ostream &O);
-  void printHigh(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                 raw_ostream &O);
-  void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                  raw_ostream &O);
-  void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printOModSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                   raw_ostream &O);
-  void printLiteral(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printLast(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                 raw_ostream &O);
-  void printNeg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                raw_ostream &O);
-  void printOMOD(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                 raw_ostream &O);
-  void printRel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                raw_ostream &O);
-  void printUpdateExecMask(const MCInst *MI, unsigned OpNo,
-                           const MCSubtargetInfo &STI, raw_ostream &O);
-  void printUpdatePred(const MCInst *MI, unsigned OpNo,
-                       const MCSubtargetInfo &STI, raw_ostream &O);
-  void printWrite(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                  raw_ostream &O);
-  void printBankSwizzle(const MCInst *MI, unsigned OpNo,
-                        const MCSubtargetInfo &STI, raw_ostream &O);
-  void printRSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                 raw_ostream &O);
-  void printCT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-               raw_ostream &O);
-  void printKCache(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                   raw_ostream &O);
-  void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                    raw_ostream &O);
-  void printWaitFlag(const MCInst *MI, unsigned OpNo,
-                     const MCSubtargetInfo &STI, raw_ostream &O);
-  void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                  raw_ostream &O);
-};
-
-class R600InstPrinter : public MCInstPrinter {
-public:
-  R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                  const MCRegisterInfo &MRI)
-    : MCInstPrinter(MAI, MII, MRI) {}
-
-  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
-                 const MCSubtargetInfo &STI) override;
-  void printInstruction(const MCInst *MI, raw_ostream &O);
-  static const char *getRegisterName(unsigned RegNo);
-
-  void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-};
-
-} // End namespace llvm
-
-#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index abc88c02adca..57c0ba26cc3a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 /// \file
 //===----------------------------------------------------------------------===//
@@ -19,8 +18,10 @@
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "Utils/AMDGPUBaseInfo.h"
 
 using namespace llvm;
+using namespace llvm::AMDGPU;
 
 namespace {
 
@@ -36,17 +37,13 @@ public:
               const MCSubtargetInfo *STI) const override;
   bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
                             const MCRelaxableFragment *DF,
-                            const MCAsmLayout &Layout) const override {
-    return false;
-  }
+                            const MCAsmLayout &Layout) const override;
+
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
-                        MCInst &Res) const override {
-    llvm_unreachable("Not implemented");
-  }
+                        MCInst &Res) const override;
+
   bool mayNeedRelaxation(const MCInst &Inst,
-                         const MCSubtargetInfo &STI) const override {
-    return false;
-  }
+                         const MCSubtargetInfo &STI) const override;
 
   unsigned getMinimumNopSize() const override;
   bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
@@ -56,6 +53,36 @@ public:
 
 } //End anonymous namespace
 
+void AMDGPUAsmBackend::relaxInstruction(const MCInst &Inst,
+                                        const MCSubtargetInfo &STI,
+                                        MCInst &Res) const {
+  unsigned RelaxedOpcode = AMDGPU::getSOPPWithRelaxation(Inst.getOpcode());
+  Res.setOpcode(RelaxedOpcode);
+  Res.addOperand(Inst.getOperand(0));
+  return;
+}
+
+bool AMDGPUAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                            uint64_t Value,
+                                            const MCRelaxableFragment *DF,
+                                            const MCAsmLayout &Layout) const {
+  // If the branch target is at an offset of 0x3f dwords, the branch needs to
+  // be relaxed by adding an s_nop 0 immediately after it, effectively
+  // incrementing the offset, as a hardware workaround for gfx1010.
+  return (((int64_t(Value)/4)-1) == 0x3f);
+}
+
+bool AMDGPUAsmBackend::mayNeedRelaxation(const MCInst &Inst,
+                                         const MCSubtargetInfo &STI) const {
+  if (!STI.getFeatureBits()[AMDGPU::FeatureOffset3fBug])
+    return false;
+
+  if (AMDGPU::getSOPPWithRelaxation(Inst.getOpcode()) >= 0)
+    return true;
+
+  return false;
+}
+
 static unsigned getFixupKindNumBytes(unsigned Kind) {
   switch (Kind) {
   case AMDGPU::fixup_si_sopp_br:
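The relaxation predicate added above fires when a SOPP branch lands exactly 0x3f dwords away, the distance affected by the gfx1010 hardware bug. A standalone restatement of the arithmetic, assuming (as the code does) that the fixup value is a byte distance measured to the instruction after the branch:

    // Sketch: the gfx1010 offset-0x3f hazard check.
    #include <cstdint>
    #include <cstdio>

    static bool hitsOffset3fBug(int64_t FixupValueBytes) {
      // Branch offsets are signed dword counts relative to the next
      // instruction, so divide by 4 and subtract 1.
      return ((FixupValueBytes / 4) - 1) == 0x3f;
    }

    int main() {
      std::printf("%d\n", hitsOffset3fBug(256)); // (256/4)-1 == 0x3f -> 1
    }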
@@ -173,11 +200,13 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
   bool Is64Bit;
   bool HasRelocationAddend;
   uint8_t OSABI = ELF::ELFOSABI_NONE;
+  uint8_t ABIVersion = 0;
 
 public:
-  ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) :
+  ELFAMDGPUAsmBackend(const Target &T, const Triple &TT, uint8_t ABIVersion) :
     AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
-    HasRelocationAddend(TT.getOS() == Triple::AMDHSA) {
+    HasRelocationAddend(TT.getOS() == Triple::AMDHSA),
+    ABIVersion(ABIVersion) {
     switch (TT.getOS()) {
     case Triple::AMDHSA:
       OSABI = ELF::ELFOSABI_AMDGPU_HSA;
@@ -195,7 +224,8 @@ public:
 
   std::unique_ptr<MCObjectTargetWriter>
   createObjectTargetWriter() const override {
-    return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend);
+    return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend,
+                                       ABIVersion);
   }
 };
 
@@ -206,5 +236,6 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
                                            const MCRegisterInfo &MRI,
                                            const MCTargetOptions &Options) {
   // Use 64-bit ELF for amdgcn
-  return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple());
+  return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(),
+                                 IsaInfo::hasCodeObjectV3(&STI) ? 1 : 0);
 }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index c85a1ea5b054..6549a8d7d592 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -1,9 +1,8 @@
 //===- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -23,7 +22,8 @@ namespace {
 
 class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
 public:
-  AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend);
+  AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend,
+                        uint8_t ABIVersion);
 
 protected:
   unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
@@ -35,9 +35,10 @@
 
 AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
                                              uint8_t OSABI,
-                                             bool HasRelocationAddend)
+                                             bool HasRelocationAddend,
+                                             uint8_t ABIVersion)
   : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU,
-                            HasRelocationAddend) {}
+                            HasRelocationAddend, ABIVersion) {}
 
 unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
                                              const MCValue &Target,
@@ -84,7 +85,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
 
 std::unique_ptr<MCObjectTargetWriter>
 llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
-                                  bool HasRelocationAddend) {
+                                  bool HasRelocationAddend,
+                                  uint8_t ABIVersion) {
   return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
-                                                  HasRelocationAddend);
+                                                  HasRelocationAddend,
+                                                  ABIVersion);
 }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index c627a08e7463..40437d8fa1a4 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -1,9 +1,8 @@
 //===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 41e9063a759e..9fbf53c944ef 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -1,9 +1,8 @@
 //===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
index 20c1adfbc6b9..d49bb196ab3a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
@@ -1,9 +1,8 @@
 //===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
new file mode 100644
index 000000000000..01b53432cbb7
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -0,0 +1,1568 @@
+//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUAsmUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cmath>
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot, const MCSubtargetInfo &STI) {
+  OS.flush();
+  printInstruction(MI, STI, OS);
+  printAnnotation(OS, Annot);
+}
+
+void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
+void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
+}
+
+void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  // It's possible to end up with a 32-bit literal used with a 16-bit operand
+  // with ignored high bits. Print as 32-bit anyway in that case.
+  int64_t Imm = MI->getOperand(OpNo).getImm();
+  if (isInt<16>(Imm) || isUInt<16>(Imm))
+    O << formatHex(static_cast<uint64_t>(Imm & 0xffff));
+  else
+    printU32ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
+void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  O << formatDec(MI->getOperand(OpNo).getImm() & 0xff);
+}
+
+void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                              raw_ostream &O) {
+  O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
+}
+
+void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
+}
+
+void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O, StringRef BitName) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << ' ' << BitName;
+  }
+}
+
+void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "offen");
+}
+
+void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "idxen");
+}
+
+void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "addr64");
+}
+
+void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << " offset:";
+    printU16ImmDecOperand(MI, OpNo, O);
+  }
+}
+
+void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  uint16_t Imm = MI->getOperand(OpNo).getImm();
+  if (Imm != 0) {
+    O << ((OpNo == 0)? "offset:" : " offset:");
+    printU16ImmDecOperand(MI, OpNo, O);
+  }
+}
"offset:" : " offset:"); + + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + bool IsFlatSeg = !(Desc.TSFlags & SIInstrFlags::IsNonFlatSeg); + + if (IsFlatSeg) { // Unsigned offset + printU16ImmDecOperand(MI, OpNo, O); + } else { // Signed offset + if (AMDGPU::isGFX10(STI)) { + O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm())); + } else { + O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm())); + } + } + } +} + +void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset0:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset1:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printU32ImmOperand(MI, OpNo, STI, O); +} + +void AMDGPUInstPrinter::printSMRDOffset20(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printU32ImmOperand(MI, OpNo, STI, O); +} + +void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printU32ImmOperand(MI, OpNo, STI, O); +} + +void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "gds"); +} + +void AMDGPUInstPrinter::printDLC(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + if (AMDGPU::isGFX10(STI)) + printNamedBit(MI, OpNo, O, "dlc"); +} + +void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "glc"); +} + +void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "slc"); +} + +void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "tfe"); +} + +void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dmask:"; + printU16ImmOperand(MI, OpNo, STI, O); + } +} + +void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + unsigned Dim = MI->getOperand(OpNo).getImm(); + O << " dim:SQ_RSRC_IMG_"; + + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); + if (DimInfo) + O << DimInfo->AsmSuffix; + else + O << Dim; +} + +void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "unorm"); +} + +void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "da"); +} + +void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + if (STI.hasFeature(AMDGPU::FeatureR128A16)) + printNamedBit(MI, OpNo, O, "a16"); + else + printNamedBit(MI, OpNo, O, "r128"); +} + +void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "lwe"); +} + +void AMDGPUInstPrinter::printD16(const 
MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "d16"); +} + +void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " compr"; +} + +void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " vm"; +} + +void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (unsigned Val = MI->getOperand(OpNo).getImm()) { + if (AMDGPU::isGFX10(STI)) + O << " format:" << Val; + else { + O << " dfmt:" << (Val & 15); + O << ", nfmt:" << (Val >> 4); + } + } +} + +void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, + const MCRegisterInfo &MRI) { +#if !defined(NDEBUG) + switch (RegNo) { + case AMDGPU::FP_REG: + case AMDGPU::SP_REG: + case AMDGPU::SCRATCH_WAVE_OFFSET_REG: + case AMDGPU::PRIVATE_RSRC_REG: + llvm_unreachable("pseudo-register should not ever be emitted"); + case AMDGPU::SCC: + llvm_unreachable("pseudo scc should not ever be emitted"); + default: + break; + } +#endif + + unsigned AltName = AMDGPU::Reg32; + + if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg64; + else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg128; + else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SReg_96RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg96; + else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SReg_160RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg160; + else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg256; + else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg512; + else if (MRI.getRegClass(AMDGPU::VReg_1024RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SReg_1024RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::AReg_1024RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg1024; + + O << getRegisterName(RegNo, AltName); +} + +void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + if (OpNo == 0) { + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) + O << "_e64 "; + else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP) + O << "_dpp "; + else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA) + O << "_sdwa "; + else + O << "_e32 "; + } + + printOperand(MI, OpNo, STI, O); + + // Print default vcc/vcc_lo operand. 
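+  // E.g. for the carry opcodes below the carry-out register is not an
+  // explicit MCInst operand, so it is spelled out here by hand: roughly
+  // "v_add_co_ci_u32_e32 v1, vcc_lo, ..." on wave32 and ", vcc" on wave64
+  // (illustrative syntax; see printDefaultVccOperand).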
+  switch (MI->getOpcode()) {
+  default: break;
+
+  case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
+  case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10:
+  case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+  case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10:
+  case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10:
+  case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
+  case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10:
+  case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10:
+  case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10:
+  case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
+  case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
+  case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
+    printDefaultVccOperand(1, STI, O);
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
+                                       const MCSubtargetInfo &STI, raw_ostream &O) {
+  if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI))
+    O << " ";
+  else
+    O << "_e32 ";
+
+  printOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  int16_t SImm = static_cast<int16_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
+    return;
+  }
+
+  if (Imm == 0x3C00)
+    O << "1.0";
+  else if (Imm == 0xBC00)
+    O << "-1.0";
+  else if (Imm == 0x3800)
+    O << "0.5";
+  else if (Imm == 0xB800)
+    O << "-0.5";
+  else if (Imm == 0x4000)
+    O << "2.0";
+  else if (Imm == 0xC000)
+    O << "-2.0";
+  else if (Imm == 0x4400)
+    O << "4.0";
+  else if (Imm == 0xC400)
+    O << "-4.0";
+  else if (Imm == 0x3118) {
+    assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]);
+    O << "0.15915494";
+  } else
+    O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  uint16_t Lo16 = static_cast<uint16_t>(Imm);
+  printImmediate16(Lo16, STI, O);
+}
+
+void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  int32_t SImm = static_cast<int32_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
+    return;
+  }
+
+  if (Imm == FloatToBits(0.0f))
+    O << "0.0";
+  else if (Imm == FloatToBits(1.0f))
+    O << "1.0";
+  else if (Imm == FloatToBits(-1.0f))
+    O << "-1.0";
+  else if (Imm == FloatToBits(0.5f))
+    O << "0.5";
+  else if (Imm == FloatToBits(-0.5f))
+    O << "-0.5";
+  else if (Imm == FloatToBits(2.0f))
+    O << "2.0";
+  else if (Imm == FloatToBits(-2.0f))
+    O << "-2.0";
+  else if (Imm == FloatToBits(4.0f))
+    O << "4.0";
+  else if (Imm == FloatToBits(-4.0f))
+    O << "-4.0";
+  else if (Imm == 0x3e22f983 &&
+           STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+    O << "0.15915494";
+  else
+    O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  int64_t SImm = static_cast<int64_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
+    return;
+  }
+
+  if (Imm == DoubleToBits(0.0))
+    O << "0.0";
+  else if (Imm == DoubleToBits(1.0))
+    O << "1.0";
+  else if (Imm == DoubleToBits(-1.0))
+    O << "-1.0";
+  else if (Imm == DoubleToBits(0.5))
+    O << "0.5";
+  else if (Imm == DoubleToBits(-0.5))
+    O << "-0.5";
+  else if (Imm == DoubleToBits(2.0))
+    O << "2.0";
+  else if (Imm == DoubleToBits(-2.0))
+    O << "-2.0";
+  else if (Imm == DoubleToBits(4.0))
+    O << "4.0";
+  else if (Imm == DoubleToBits(-4.0))
+    O << "-4.0";
+  else if (Imm == 0x3fc45f306dc9c882 &&
+           STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+    O << "0.15915494309189532";
+  else {
+    assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882);
+
+    // In rare situations, we will have a 32-bit literal in a 64-bit
+    // operand. This is technically allowed for the encoding of s_mov_b64.
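+    // E.g. "s_mov_b64 s[0:1], 0x1234" (illustrative) carries 0x1234 as a
+    // 32-bit literal in a 64-bit operand; the assertion above admits
+    // exactly these values.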
+    O << formatHex(static_cast<uint64_t>(Imm));
+  }
+}
+
+void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  if (!Imm)
+    return;
+
+  O << " blgp:" << Imm;
+}
+
+void AMDGPUInstPrinter::printCBSZ(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  if (!Imm)
+    return;
+
+  O << " cbsz:" << Imm;
+}
+
+void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  if (!Imm)
+    return;
+
+  O << " abid:" << Imm;
+}
+
+void AMDGPUInstPrinter::printDefaultVccOperand(unsigned OpNo,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  if (OpNo > 0)
+    O << ", ";
+  printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
+                  AMDGPU::VCC : AMDGPU::VCC_LO, O, MRI);
+  if (OpNo == 0)
+    O << ", ";
+}
+
+void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  // Print default vcc/vcc_lo operand of VOPC.
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  if (OpNo == 0 && (Desc.TSFlags & SIInstrFlags::VOPC) &&
+      (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
+       Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)))
+    printDefaultVccOperand(OpNo, STI, O);
+
+  if (OpNo >= MI->getNumOperands()) {
+    O << "/*Missing OP" << OpNo << "*/";
+    return;
+  }
+
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegOperand(Op.getReg(), O, MRI);
+  } else if (Op.isImm()) {
+    switch (Desc.OpInfo[OpNo].OperandType) {
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+    case MCOI::OPERAND_IMMEDIATE:
+      printImmediate32(Op.getImm(), STI, O);
+      break;
+    case AMDGPU::OPERAND_REG_IMM_INT64:
+    case AMDGPU::OPERAND_REG_IMM_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+      printImmediate64(Op.getImm(), STI, O);
+      break;
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+    case AMDGPU::OPERAND_REG_IMM_INT16:
+    case AMDGPU::OPERAND_REG_IMM_FP16:
+      printImmediate16(Op.getImm(), STI, O);
+      break;
+    case AMDGPU::OPERAND_REG_IMM_V2INT16:
+    case AMDGPU::OPERAND_REG_IMM_V2FP16:
+      if (!isUInt<16>(Op.getImm()) &&
+          STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) {
+        printImmediate32(Op.getImm(), STI, O);
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+    case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+    case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+      printImmediateV216(Op.getImm(), STI, O);
+      break;
+    case MCOI::OPERAND_UNKNOWN:
+    case MCOI::OPERAND_PCREL:
+      O << formatDec(Op.getImm());
+      break;
+    case MCOI::OPERAND_REGISTER:
+      // FIXME: This should be removed and handled somewhere else. Seems to come
+      // from a disassembler bug.
+      O << "/*invalid immediate*/";
+      break;
+    default:
+      // We hit this for the immediate instruction bits that don't yet have a
+      // custom printer.
+      llvm_unreachable("unexpected immediate operand type");
+    }
+  } else if (Op.isFPImm()) {
+    // We special case 0.0 because otherwise it will be printed as an integer.
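+    // (FloatToBits(0.0) is 0, which printImmediate32/64 below would render
+    // as the inline integer "0" rather than "0.0".)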
+    if (Op.getFPImm() == 0.0)
+      O << "0.0";
+    else {
+      const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+      int RCID = Desc.OpInfo[OpNo].RegClass;
+      unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
+      if (RCBits == 32)
+        printImmediate32(FloatToBits(Op.getFPImm()), STI, O);
+      else if (RCBits == 64)
+        printImmediate64(DoubleToBits(Op.getFPImm()), STI, O);
+      else
+        llvm_unreachable("Invalid register class size");
+    }
+  } else if (Op.isExpr()) {
+    const MCExpr *Exp = Op.getExpr();
+    Exp->print(O, &MAI);
+  } else {
+    O << "/*INV_OP*/";
+  }
+
+  // Print default vcc/vcc_lo operand of v_cndmask_b32_e32.
+  switch (MI->getOpcode()) {
+  default: break;
+
+  case AMDGPU::V_CNDMASK_B32_e32_gfx10:
+  case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
+  case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10:
+  case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+  case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10:
+  case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10:
+  case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10:
+  case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
+  case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
+  case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
+
+  case AMDGPU::V_CNDMASK_B32_e32_gfx6_gfx7:
+  case AMDGPU::V_CNDMASK_B32_e32_vi:
+    if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                                AMDGPU::OpName::src1))
+      printDefaultVccOperand(OpNo, STI, O);
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
+                                                   unsigned OpNo,
+                                                   const MCSubtargetInfo &STI,
+                                                   raw_ostream &O) {
+  unsigned InputModifiers = MI->getOperand(OpNo).getImm();
+
+  // Use 'neg(...)' instead of '-' to avoid ambiguity.
+  // This is important for integer literals because
+  // -1 is not the same value as neg(1).
+  bool NegMnemo = false;
+
+  if (InputModifiers & SISrcMods::NEG) {
+    if (OpNo + 1 < MI->getNumOperands() &&
+        (InputModifiers & SISrcMods::ABS) == 0) {
+      const MCOperand &Op = MI->getOperand(OpNo + 1);
+      NegMnemo = Op.isImm() || Op.isFPImm();
+    }
+    if (NegMnemo) {
+      O << "neg(";
+    } else {
+      O << '-';
+    }
+  }
+
+  if (InputModifiers & SISrcMods::ABS)
+    O << '|';
+  printOperand(MI, OpNo + 1, STI, O);
+  if (InputModifiers & SISrcMods::ABS)
+    O << '|';
+
+  if (NegMnemo) {
+    O << ')';
+  }
+}
+
+void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
+                                                    unsigned OpNo,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O) {
+  unsigned InputModifiers = MI->getOperand(OpNo).getImm();
+  if (InputModifiers & SISrcMods::SEXT)
+    O << "sext(";
+  printOperand(MI, OpNo + 1, STI, O);
+  if (InputModifiers & SISrcMods::SEXT)
+    O << ')';
+
+  // Print default vcc/vcc_lo operand of VOP2b.
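+  // E.g. the gfx10 SDWA form "v_add_co_ci_u32_sdwa v1, vcc_lo, v2, v3,
+  // vcc_lo ..." (illustrative) ends with an implied carry-in; since it is
+  // not an explicit MCInst operand, it is appended here right after src1.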
+  switch (MI->getOpcode()) {
+  default: break;
+
+  case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10:
+  case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10:
+  case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
+    if ((int)OpNo + 1 == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                                    AMDGPU::OpName::src1))
+      printDefaultVccOperand(OpNo, STI, O);
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printDPP8(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  if (!AMDGPU::isGFX10(STI))
+    llvm_unreachable("dpp8 is not supported on ASICs earlier than GFX10");
+
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  O << " dpp8:[" << formatDec(Imm & 0x7);
+  for (size_t i = 1; i < 8; ++i) {
+    O << ',' << formatDec((Imm >> (3 * i)) & 0x7);
+  }
+  O << ']';
+}
+
+void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  using namespace AMDGPU::DPP;
+
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  if (Imm <= DppCtrl::QUAD_PERM_LAST) {
+    O << " quad_perm:[";
+    O << formatDec(Imm & 0x3) << ',';
+    O << formatDec((Imm & 0xc) >> 2) << ',';
+    O << formatDec((Imm & 0x30) >> 4) << ',';
+    O << formatDec((Imm & 0xc0) >> 6) << ']';
+  } else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
+             (Imm <= DppCtrl::ROW_SHL_LAST)) {
+    O << " row_shl:";
+    printU4ImmDecOperand(MI, OpNo, O);
+  } else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
+             (Imm <= DppCtrl::ROW_SHR_LAST)) {
+    O << " row_shr:";
+    printU4ImmDecOperand(MI, OpNo, O);
+  } else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
+             (Imm <= DppCtrl::ROW_ROR_LAST)) {
+    O << " row_ror:";
+    printU4ImmDecOperand(MI, OpNo, O);
+  } else if (Imm == DppCtrl::WAVE_SHL1) {
+    if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+      O << " /* wave_shl is not supported starting from GFX10 */";
+      return;
+    }
+    O << " wave_shl:1";
+  } else if (Imm == DppCtrl::WAVE_ROL1) {
+    if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+      O << " /* wave_rol is not supported starting from GFX10 */";
+      return;
+    }
+    O << " wave_rol:1";
+  } else if (Imm == DppCtrl::WAVE_SHR1) {
+    if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+      O << " /* wave_shr is not supported starting from GFX10 */";
+      return;
+    }
+    O << " wave_shr:1";
+  } else if (Imm == DppCtrl::WAVE_ROR1) {
+    if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+      O << " /* wave_ror is not supported starting from GFX10 */";
+      return;
+    }
+    O << " wave_ror:1";
+  } else if (Imm == DppCtrl::ROW_MIRROR) {
+    O << " row_mirror";
+  } else if (Imm == DppCtrl::ROW_HALF_MIRROR) {
+    O << " row_half_mirror";
+  } else if (Imm == DppCtrl::BCAST15) {
+    if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+      O << " /* row_bcast is not supported starting from GFX10 */";
+      return;
+    }
+    O << " row_bcast:15";
+  } else if (Imm == DppCtrl::BCAST31) {
+    if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+      O << " /* row_bcast is not supported starting from GFX10 */";
+      return;
+    }
+    O << " row_bcast:31";
+  } else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) &&
+             (Imm <= DppCtrl::ROW_SHARE_LAST)) {
+    if (!AMDGPU::isGFX10(STI)) {
+      O << " /* row_share is not supported on ASICs earlier than GFX10 */";
+      return;
+    }
+    O << " row_share:";
+    printU4ImmDecOperand(MI, OpNo, O);
+  } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
+             (Imm <= DppCtrl::ROW_XMASK_LAST)) {
+    if (!AMDGPU::isGFX10(STI)) {
+      O << " /* row_xmask is not supported on ASICs earlier than GFX10 */";
+      return;
+    }
+    O << " row_xmask:";
+    printU4ImmDecOperand(MI, OpNo, O);
+  } else {
+    O << " /* Invalid dpp_ctrl value */";
+  }
+}
+
+void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  O << " row_mask:";
+  printU4ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  O << " bank_mask:";
+  printU4ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  if (Imm) {
+    O << " bound_ctrl:0"; // XXX - this syntax is used in sp3
+  }
+}
+
+void AMDGPUInstPrinter::printFI(const MCInst *MI, unsigned OpNo,
+                                const MCSubtargetInfo &STI,
+                                raw_ostream &O) {
+  using namespace llvm::AMDGPU::DPP;
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  if (Imm == DPP_FI_1 || Imm == DPP8_FI_1) {
+    O << " fi:1";
+  }
+}
+
+void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  switch (Imm) {
+  case SdwaSel::BYTE_0: O << "BYTE_0"; break;
+  case SdwaSel::BYTE_1: O << "BYTE_1"; break;
+  case SdwaSel::BYTE_2: O << "BYTE_2"; break;
+  case SdwaSel::BYTE_3: O << "BYTE_3"; break;
+  case SdwaSel::WORD_0: O << "WORD_0"; break;
+  case SdwaSel::WORD_1: O << "WORD_1"; break;
+  case SdwaSel::DWORD: O << "DWORD"; break;
+  default: llvm_unreachable("Invalid SDWA data select operand");
+  }
+}
+
+void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  O << "dst_sel:";
+  printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  O << "src0_sel:";
+  printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  O << "src1_sel:";
+  printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  O << "dst_unused:";
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  switch (Imm) {
+  case DstUnused::UNUSED_PAD: O << "UNUSED_PAD"; break;
+  case DstUnused::UNUSED_SEXT: O << "UNUSED_SEXT"; break;
+  case DstUnused::UNUSED_PRESERVE: O << "UNUSED_PRESERVE"; break;
+  default: llvm_unreachable("Invalid SDWA dest_unused operand");
+  }
+}
+
+template <unsigned N>
+void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  unsigned Opc = MI->getOpcode();
+  int EnIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::en);
+  unsigned En = MI->getOperand(EnIdx).getImm();
+
+  int ComprIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::compr);
+
+  // If compr is set, print as src0, src0, src1, src1
+  if (MI->getOperand(ComprIdx).getImm()) {
+    if (N == 1 || N == 2)
+      --OpNo;
+    else if (N == 3)
+      OpNo -= 2;
+  }
+
+  if (En & (1 << N))
+    printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
+  else
+    O << "off";
+}
+
+void AMDGPUInstPrinter::printExpSrc0(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<0>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc1(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<1>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc2(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<2>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc3(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<3>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  // This is really a 6 bit field.
+  uint32_t Tgt = MI->getOperand(OpNo).getImm() & ((1 << 6) - 1);
+
+  if (Tgt <= 7)
+    O << " mrt" << Tgt;
+  else if (Tgt == 8)
+    O << " mrtz";
+  else if (Tgt == 9)
+    O << " null";
+  else if ((Tgt >= 12 && Tgt <= 15) || (Tgt == 16 && AMDGPU::isGFX10(STI)))
+    O << " pos" << Tgt - 12;
+  else if (AMDGPU::isGFX10(STI) && Tgt == 20)
+    O << " prim";
+  else if (Tgt >= 32 && Tgt <= 63)
+    O << " param" << Tgt - 32;
+  else {
+    // Reserved values 10, 11
+    O << " invalid_target_" << Tgt;
+  }
+}
+
+static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod,
+                               bool IsPacked, bool HasDstSel) {
+  int DefaultValue = IsPacked && (Mod == SISrcMods::OP_SEL_1);
+
+  for (int I = 0; I < NumOps; ++I) {
+    if (!!(Ops[I] & Mod) != DefaultValue)
+      return false;
+  }
+
+  if (HasDstSel && (Ops[0] & SISrcMods::DST_OP_SEL) != 0)
+    return false;
+
+  return true;
+}
+
+void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
+                                            StringRef Name,
+                                            unsigned Mod,
+                                            raw_ostream &O) {
+  unsigned Opc = MI->getOpcode();
+  int NumOps = 0;
+  int Ops[3];
+
+  for (int OpName : { AMDGPU::OpName::src0_modifiers,
+                      AMDGPU::OpName::src1_modifiers,
+                      AMDGPU::OpName::src2_modifiers }) {
+    int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+    if (Idx == -1)
+      break;
+
+    Ops[NumOps++] = MI->getOperand(Idx).getImm();
+  }
+
+  const bool HasDstSel =
+    NumOps > 0 &&
+    Mod == SISrcMods::OP_SEL_0 &&
+    MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3_OPSEL;
+
+  const bool IsPacked =
+    MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsPacked;
+
+  if (allOpsDefaultValue(Ops, NumOps, Mod, IsPacked, HasDstSel))
+    return;
+
+  O << Name;
+  for (int I = 0; I < NumOps; ++I) {
+    if (I != 0)
+      O << ',';
+
+    O << !!(Ops[I] & Mod);
+  }
+
+  if (HasDstSel) {
+    O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL);
+  }
+
+  O << ']';
+}
+
+void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  unsigned Opc = MI->getOpcode();
+  if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
+      Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) {
+    auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+    auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+    unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0);
+    unsigned BC = !!(MI->getOperand(BCN).getImm() & SISrcMods::OP_SEL_0);
+    if (FI || BC)
+      O << " op_sel:[" << FI << ',' << BC << ']';
+    return;
+  }
+
+  printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
+}
+
+void AMDGPUInstPrinter::printOpSelHi(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printPackedModifier(MI, " op_sel_hi:[", SISrcMods::OP_SEL_1, O);
+}
+
+void AMDGPUInstPrinter::printNegLo(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  printPackedModifier(MI, " neg_lo:[", SISrcMods::NEG, O);
+}
+
+void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O);
+}
+
+void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  switch (Imm) {
+  case 0:
+    O << "p10";
+    break;
+  case 1:
+    O << "p20";
+    break;
+  case 2:
+    O << "p0";
+    break;
+  default:
+    O << "invalid_param_" << Imm;
+  }
+}
+
+void AMDGPUInstPrinter::printInterpAttr(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned Attr = MI->getOperand(OpNum).getImm();
+  O << "attr" << Attr;
+}
+
+void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  unsigned Chan = MI->getOperand(OpNum).getImm();
+  O << '.' << "xyzw"[Chan & 0x3];
+}
+
+void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  using namespace llvm::AMDGPU::VGPRIndexMode;
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  if ((Val & ~ENABLE_MASK) != 0) {
+    O << " " << formatHex(static_cast<uint64_t>(Val));
+  } else {
+    O << " gpr_idx(";
+    bool NeedComma = false;
+    for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) {
+      if (Val & (1 << ModeId)) {
+        if (NeedComma)
+          O << ',';
+        O << IdSymbolic[ModeId];
+        NeedComma = true;
+      }
+    }
+    O << ')';
+  }
+}
+
+void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  printOperand(MI, OpNo, STI, O);
+  O << ", ";
+  printOperand(MI, OpNo + 1, STI, O);
+}
+
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O, StringRef Asm,
+                                   StringRef Default) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm());
+  if (Op.getImm() == 1) {
+    O << Asm;
+  } else {
+    O << Default;
+  }
+}
+
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O, char Asm) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm());
+  if (Op.getImm() == 1)
+    O << Asm;
+}
+
+void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " high";
+}
+
+void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " clamp";
+}
+
+void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  int Imm = MI->getOperand(OpNo).getImm();
+  if (Imm == SIOutMods::MUL2)
+    O << " mul:2";
+  else if (Imm == SIOutMods::MUL4)
+    O << " mul:4";
+  else if (Imm == SIOutMods::DIV2)
+    O << " div:2";
+}
+
+void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  using namespace llvm::AMDGPU::SendMsg;
+
+  const unsigned Imm16 = MI->getOperand(OpNo).getImm();
+
+  uint16_t MsgId;
+  uint16_t OpId;
+  uint16_t StreamId;
+  decodeMsg(Imm16, MsgId, OpId, StreamId);
+
+  if (isValidMsgId(MsgId, STI) &&
+      isValidMsgOp(MsgId, OpId) &&
+      isValidMsgStream(MsgId, OpId, StreamId)) {
+    O << "sendmsg(" << getMsgName(MsgId);
+    if (msgRequiresOp(MsgId)) {
+      O << ", " << getMsgOpName(MsgId, OpId);
+      if (msgSupportsStream(MsgId, OpId)) {
+        O << ", " << StreamId;
+      }
+    }
+    O << ')';
+  } else if (encodeMsg(MsgId, OpId, StreamId) == Imm16) {
+    O << "sendmsg(" << MsgId << ", " << OpId << ", " << StreamId << ')';
+  } else {
+    O << Imm16; // Unknown imm16 code.
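+    // E.g. a well-formed value might print as "sendmsg(MSG_GS, GS_OP_EMIT, 0)",
+    // a round-trippable but unnamed encoding as "sendmsg(10, 0, 0)", and
+    // anything else falls back to the raw immediate (illustrative examples).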
+  }
+}
+
+static void printSwizzleBitmask(const uint16_t AndMask,
+                                const uint16_t OrMask,
+                                const uint16_t XorMask,
+                                raw_ostream &O) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  uint16_t Probe0 = ((0 & AndMask) | OrMask) ^ XorMask;
+  uint16_t Probe1 = ((BITMASK_MASK & AndMask) | OrMask) ^ XorMask;
+
+  O << "\"";
+
+  for (unsigned Mask = 1 << (BITMASK_WIDTH - 1); Mask > 0; Mask >>= 1) {
+    uint16_t p0 = Probe0 & Mask;
+    uint16_t p1 = Probe1 & Mask;
+
+    if (p0 == p1) {
+      if (p0 == 0) {
+        O << "0";
+      } else {
+        O << "1";
+      }
+    } else {
+      if (p0 == 0) {
+        O << "p";
+      } else {
+        O << "i";
+      }
+    }
+  }
+
+  O << "\"";
+}
+
+void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  using namespace llvm::AMDGPU::Swizzle;
+
+  uint16_t Imm = MI->getOperand(OpNo).getImm();
+  if (Imm == 0) {
+    return;
+  }
+
+  O << " offset:";
+
+  if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) {
+
+    O << "swizzle(" << IdSymbolic[ID_QUAD_PERM];
+    for (unsigned I = 0; I < LANE_NUM; ++I) {
+      O << ",";
+      O << formatDec(Imm & LANE_MASK);
+      Imm >>= LANE_SHIFT;
+    }
+    O << ")";
+
+  } else if ((Imm & BITMASK_PERM_ENC_MASK) == BITMASK_PERM_ENC) {
+
+    uint16_t AndMask = (Imm >> BITMASK_AND_SHIFT) & BITMASK_MASK;
+    uint16_t OrMask = (Imm >> BITMASK_OR_SHIFT) & BITMASK_MASK;
+    uint16_t XorMask = (Imm >> BITMASK_XOR_SHIFT) & BITMASK_MASK;
+
+    if (AndMask == BITMASK_MAX &&
+        OrMask == 0 &&
+        countPopulation(XorMask) == 1) {
+
+      O << "swizzle(" << IdSymbolic[ID_SWAP];
+      O << ",";
+      O << formatDec(XorMask);
+      O << ")";
+
+    } else if (AndMask == BITMASK_MAX &&
+               OrMask == 0 && XorMask > 0 &&
+               isPowerOf2_64(XorMask + 1)) {
+
+      O << "swizzle(" << IdSymbolic[ID_REVERSE];
+      O << ",";
+      O << formatDec(XorMask + 1);
+      O << ")";
+
+    } else {
+
+      uint16_t GroupSize = BITMASK_MAX - AndMask + 1;
+      if (GroupSize > 1 &&
+          isPowerOf2_64(GroupSize) &&
+          OrMask < GroupSize &&
+          XorMask == 0) {
+
+        O << "swizzle(" << IdSymbolic[ID_BROADCAST];
+        O << ",";
+        O << formatDec(GroupSize);
+        O << ",";
+        O << formatDec(OrMask);
+        O << ")";
+
+      } else {
+        O << "swizzle(" << IdSymbolic[ID_BITMASK_PERM];
+        O << ",";
+        printSwizzleBitmask(AndMask, OrMask, XorMask, O);
+        O << ")";
+      }
+    }
+  } else {
+    printU16ImmDecOperand(MI, OpNo, O);
+  }
+}
+
+void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI.getCPU());
+
+  unsigned SImm16 = MI->getOperand(OpNo).getImm();
+  unsigned Vmcnt, Expcnt, Lgkmcnt;
+  decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt);
+
+  bool NeedSpace = false;
+
+  if (Vmcnt != getVmcntBitMask(ISA)) {
+    O << "vmcnt(" << Vmcnt << ')';
+    NeedSpace = true;
+  }
+
+  if (Expcnt != getExpcntBitMask(ISA)) {
+    if (NeedSpace)
+      O << ' ';
+    O << "expcnt(" << Expcnt << ')';
+    NeedSpace = true;
+  }
+
+  if (Lgkmcnt != getLgkmcntBitMask(ISA)) {
+    if (NeedSpace)
+      O << ' ';
+    O << "lgkmcnt(" << Lgkmcnt << ')';
+  }
+}
+
+void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  unsigned Id;
+  unsigned Offset;
+  unsigned Width;
+
+  using namespace llvm::AMDGPU::Hwreg;
+  unsigned Val = MI->getOperand(OpNo).getImm();
+  decodeHwreg(Val, Id, Offset, Width);
+  StringRef HwRegName = getHwreg(Id, STI);
+
+  O << "hwreg(";
+  if (!HwRegName.empty()) {
+    O << HwRegName;
+  } else {
+    O << Id;
+  }
+  if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) {
+    O << ", " << Offset << ", " << Width;
+  }
+  O << ')';
+}
+
+void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  uint16_t Imm = MI->getOperand(OpNo).getImm();
+  if (Imm == 0) {
+    return;
+  }
+
+  O << ' ' << formatDec(Imm);
+}
+
+#include "AMDGPUGenAsmWriter.inc"
+
+void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                StringRef Annot, const MCSubtargetInfo &STI) {
+  O.flush();
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
+                               raw_ostream &O) {
+  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|');
+}
+
+void R600InstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  int BankSwizzle = MI->getOperand(OpNo).getImm();
+  switch (BankSwizzle) {
+  case 1:
+    O << "BS:VEC_021/SCL_122";
+    break;
+  case 2:
+    O << "BS:VEC_120/SCL_212";
+    break;
+  case 3:
+    O << "BS:VEC_102/SCL_221";
+    break;
+  case 4:
+    O << "BS:VEC_201";
+    break;
+  case 5:
+    O << "BS:VEC_210";
+    break;
+  default:
+    break;
+  }
+}
+
+void R600InstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "_SAT");
+}
+
+void R600InstPrinter::printCT(const MCInst *MI, unsigned OpNo,
+                              raw_ostream &O) {
+  unsigned CT = MI->getOperand(OpNo).getImm();
+  switch (CT) {
+  case 0:
+    O << 'U';
+    break;
+  case 1:
+    O << 'N';
+    break;
+  default:
+    break;
+  }
+}
+
+void R600InstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  int KCacheMode = MI->getOperand(OpNo).getImm();
+  if (KCacheMode > 0) {
+    int KCacheBank = MI->getOperand(OpNo - 2).getImm();
+    O << "CB" << KCacheBank << ':';
+    int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
+    int LineSize = (KCacheMode == 1) ? 16 : 32;
+    O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize;
+  }
+}
+
+void R600InstPrinter::printLast(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O) {
+  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "*", " ");
+}
+
+void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm() || Op.isExpr());
+  if (Op.isImm()) {
+    int64_t Imm = Op.getImm();
+    O << Imm << '(' << BitsToFloat(Imm) << ')';
+  }
+  if (Op.isExpr()) {
+    Op.getExpr()->print(O << '@', &MAI);
+  }
+}
+
+void R600InstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
+                               raw_ostream &O) {
+  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '-');
+}
+
+void R600InstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O) {
+  switch (MI->getOperand(OpNo).getImm()) {
+  default: break;
+  case 1:
+    O << " * 2.0";
+    break;
+  case 2:
+    O << " * 4.0";
+    break;
+  case 3:
+    O << " / 2.0";
+    break;
+  }
+}
+
+void R600InstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  printOperand(MI, OpNo, O);
+  O << ", ";
+  printOperand(MI, OpNo + 1, O);
+}
+
+void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  if (OpNo >= MI->getNumOperands()) {
+    O << "/*Missing OP" << OpNo << "*/";
+    return;
+  }
+
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    switch (Op.getReg()) {
+    // This is the default predicate state, so we don't need to print it.
+    case R600::PRED_SEL_OFF:
+      break;
+
+    default:
+      O << getRegisterName(Op.getReg());
+      break;
+    }
+  } else if (Op.isImm()) {
+    O << Op.getImm();
+  } else if (Op.isFPImm()) {
+    // We special case 0.0 because otherwise it will be printed as an integer.
+    if (Op.getFPImm() == 0.0)
+      O << "0.0";
+    else {
+      O << Op.getFPImm();
+    }
+  } else if (Op.isExpr()) {
+    const MCExpr *Exp = Op.getExpr();
+    Exp->print(O, &MAI);
+  } else {
+    O << "/*INV_OP*/";
+  }
+}
+
+void R600InstPrinter::printRel(const MCInst *MI, unsigned OpNo,
+                               raw_ostream &O) {
+  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '+');
+}
+
+void R600InstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O) {
+  unsigned Sel = MI->getOperand(OpNo).getImm();
+  switch (Sel) {
+  case 0:
+    O << 'X';
+    break;
+  case 1:
+    O << 'Y';
+    break;
+  case 2:
+    O << 'Z';
+    break;
+  case 3:
+    O << 'W';
+    break;
+  case 4:
+    O << '0';
+    break;
+  case 5:
+    O << '1';
+    break;
+  case 7:
+    O << '_';
+    break;
+  default:
+    break;
+  }
+}
+
+void R600InstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "ExecMask,");
+}
+
+void R600InstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "Pred,");
+}
+
+void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.getImm() == 0) {
+    O << " (MASKED)";
+  }
+}
+
+#include "R600GenAsmWriter.inc"
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
new file mode 100644
index 000000000000..b544d1ef3605
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -0,0 +1,268 @@
+//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
+
+#include "AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class AMDGPUInstPrinter : public MCInstPrinter {
+public:
+  AMDGPUInstPrinter(const MCAsmInfo &MAI,
+                    const MCInstrInfo &MII, const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  //Autogenerated by tblgen
+  void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AMDGPU::NoRegAltName);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  static void printRegOperand(unsigned RegNo, raw_ostream &O,
+                              const MCRegisterInfo &MRI);
+
+private:
+  void printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                     StringRef BitName);
+  void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                       raw_ostream &O);
+
+  void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printSMRDOffset8(const MCInst *MI, unsigned OpNo,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printDLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+  void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+  void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+               raw_ostream &O);
+  void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printLWE(const MCInst *MI, unsigned OpNo,
+                const MCSubtargetInfo &STI, raw_ostream &O);
+  void printD16(const MCInst *MI, unsigned OpNo,
+                const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpCompr(const MCInst *MI, unsigned OpNo,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpVM(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printFORMAT(const MCInst *MI, unsigned OpNo,
+                   const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printRegOperand(unsigned RegNo, raw_ostream &O);
+  void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                      raw_ostream &O);
+  void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
+                          raw_ostream &O);
+  void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O);
+  void printDPP8(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printBankMask(const MCInst *MI, unsigned OpNo,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printBoundCtrl(const MCInst *MI, unsigned OpNo,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+  void printFI(const MCInst *MI, unsigned OpNo,
+               const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSDWADstSel(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod,
+                           raw_ostream &O);
+  void printOpSel(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printOpSelHi(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printNegLo(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printNegHi(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printInterpSlot(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printInterpAttr(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printInterpAttrChan(const MCInst *MI, unsigned OpNo,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printBLGP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printCBSZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printDefaultVccOperand(unsigned OpNo, const MCSubtargetInfo &STI,
+                              raw_ostream &O);
+
+
+  template <unsigned N>
+  void printExpSrcN(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc0(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc1(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc2(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc3(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpTgt(const MCInst *MI, unsigned OpNo,
+                   const MCSubtargetInfo &STI, raw_ostream &O);
+
+public:
+  static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                         StringRef Asm, StringRef Default = "");
+  static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                         char Asm);
+protected:
+  void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printHigh(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+  void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printOModSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  void printLiteral(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printLast(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printNeg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printOMOD(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printRel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+  void printUpdatePred(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printWrite(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+  void printBankSwizzle(const MCInst *MI, unsigned OpNo,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printRSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printCT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+               raw_ostream &O);
+  void printKCache(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printWaitFlag(const MCInst *MI, unsigned OpNo,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+  void printEndpgm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+};
+
+class R600InstPrinter : public MCInstPrinter {
+public:
+  R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                  const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 2364e7b7b5fb..9e04ab9bae93 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -1,15 +1,16 @@ //===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// #include "AMDGPUMCAsmInfo.h" #include "llvm/ADT/Triple.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" using namespace llvm; @@ -19,7 +20,10 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// MinInstAlignment = 4; - MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16; + + // This is the maximum instruction encoded size for gfx10. With a known + // subtarget, it can be reduced to 8 bytes. + MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 20 : 16; SeparatorString = "\n"; CommentString = ";"; PrivateLabelPrefix = ""; @@ -45,3 +49,18 @@ bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { SectionName == ".hsarodata_readonly_agent" || MCAsmInfo::shouldOmitSectionDirective(SectionName); } + +unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const { + if (!STI || STI->getTargetTriple().getArch() == Triple::r600) + return MaxInstLength; + + // Maximum for NSA encoded images + if (STI->getFeatureBits()[AMDGPU::FeatureNSAEncoding]) + return 20; + + // 64-bit instruction with 32-bit literal. + if (STI->getFeatureBits()[AMDGPU::FeatureVOP3Literal]) + return 12; + + return 8; +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h index 8cb33a3179cd..71e63ec27a8f 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -1,9 +1,8 @@ //===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -28,6 +27,7 @@ class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: explicit AMDGPUMCAsmInfo(const Triple &TT); bool shouldOmitSectionDirective(StringRef SectionName) const override; + unsigned getMaxInstLength(const MCSubtargetInfo *STI) const override; }; } // namespace llvm #endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index cae7a7a6c7e7..f3d945cc0764 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index dcc10a032afe..62757a707890 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -1,9 +1,8 @@ //===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -64,10 +63,17 @@ public: return 0; } + virtual unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + protected: - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index c579c7d60e16..88df64d18cc5 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,13 +13,15 @@ #include "AMDGPUMCTargetDesc.h" #include "AMDGPUELFStreamer.h" +#include "AMDGPUInstPrinter.h" #include "AMDGPUMCAsmInfo.h" #include "AMDGPUTargetStreamer.h" -#include "InstPrinter/AMDGPUInstPrinter.h" #include "SIDefines.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" @@ -104,6 +105,35 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, std::move(Emitter), RelaxAll); } +namespace { + +class AMDGPUMCInstrAnalysis : public MCInstrAnalysis { +public: + explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info) + : MCInstrAnalysis(Info) {} + + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override { + if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() || + Info->get(Inst.getOpcode()).OpInfo[0].OperandType != + MCOI::OPERAND_PCREL) + return false; + + int64_t Imm = Inst.getOperand(0).getImm(); + // Our branches take a simm16, but we need two extra bits to account for + // the factor of 4. + APInt SignedOffset(18, Imm * 4, true); + Target = (SignedOffset.sext(64) + Addr + Size).getZExtValue(); + return true; + } +}; + +} // end anonymous namespace + +static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) { + return new AMDGPUMCInstrAnalysis(Info); +} + extern "C" void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo); @@ -114,6 +144,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCInstrAnalysis(*T, createAMDGPUMCInstrAnalysis); TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); TargetRegistry::RegisterELFStreamer(*T, createMCStreamer); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index f3628d96d6e9..9754d31fee60 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -34,9 +33,6 @@ class Target; class Triple; class raw_pwrite_stream; -Target &getTheAMDGPUTarget(); -Target &getTheGCNTarget(); - MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); @@ -53,7 +49,7 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, std::unique_ptr<MCObjectTargetWriter> createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend); + bool HasRelocationAddend, uint8_t ABIVersion); } // End llvm namespace #define GET_REGINFO_ENUM diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index c17fe126546c..8f11433476f4 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUTargetStreamer.cpp - AMDGPU Target Streamer Methods ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -19,7 +18,6 @@ #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/BinaryFormat/MsgPackTypes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" @@ -52,51 +50,53 @@ bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) { } bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) { - std::shared_ptr<msgpack::Node> HSAMetadataRoot; - yaml::Input YIn(HSAMetadataString); - YIn >> HSAMetadataRoot; - if (YIn.error()) + msgpack::Document HSAMetadataDoc; + if (!HSAMetadataDoc.fromYAML(HSAMetadataString)) return false; - return EmitHSAMetadata(HSAMetadataRoot, false); + return EmitHSAMetadata(HSAMetadataDoc, false); } StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { AMDGPU::GPUKind AK; switch (ElfMach) { - case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; - case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; - case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; - case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break; - case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break; - case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break; - case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break; - case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break; - case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break; - case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break; - case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break; - case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break; - case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break; - case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break; - case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break; - case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700;
break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; - case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; + case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; + case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; + case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; + case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break; + case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break; + case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break; + case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break; + case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break; + case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break; + case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break; + case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break; + case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break; + case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break; + case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break; + case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break; + case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: AK = GK_GFX908; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; + case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } StringRef GPUName = getArchNameAMDGCN(AK); @@ -142,7 +142,11 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX902: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902; case GK_GFX904: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904; case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; + 
case GK_GFX908: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908; case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; + case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; + case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; + case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } @@ -157,6 +161,14 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) : AMDGPUTargetStreamer(S), OS(OS) { } +// A hook for emitting stuff at the end. +// We use it for emitting the accumulated PAL metadata as directives. +void AMDGPUTargetAsmStreamer::finish() { + std::string S; + getPALMetadata()->toString(S); + OS << S; +} + void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) { OS << "\t.amdgcn_target \"" << Target << "\"\n"; } @@ -196,6 +208,12 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, } } +void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, + unsigned Align) { + OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", " << Align + << '\n'; +} + bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) { OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n"; return true; @@ -214,15 +232,14 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( } bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( - std::shared_ptr &HSAMetadataRoot, bool Strict) { + msgpack::Document &HSAMetadataDoc, bool Strict) { V3::MetadataVerifier Verifier(Strict); - if (!Verifier.verify(*HSAMetadataRoot)) + if (!Verifier.verify(HSAMetadataDoc.getRoot())) return false; std::string HSAMetadataString; raw_string_ostream StrOS(HSAMetadataString); - yaml::Output YOut(StrOS); - YOut << HSAMetadataRoot; + HSAMetadataDoc.toYAML(StrOS); OS << '\t' << V3::AssemblerDirectiveBegin << '\n'; OS << StrOS.str() << '\n'; @@ -230,13 +247,10 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( return true; } -bool AMDGPUTargetAsmStreamer::EmitPALMetadata( - const PALMD::Metadata &PALMetadata) { - std::string PALMetadataString; - if (PALMD::toString(PALMetadata, PALMetadataString)) - return false; - - OS << '\t' << PALMD::AssemblerDirective << PALMetadataString << '\n'; +bool AMDGPUTargetAsmStreamer::EmitCodeEnd() { + const uint32_t Encoded_s_code_end = 0xbf9f0000; + OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n'; + OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n'; return true; } @@ -278,6 +292,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + if (IVersion.Major >= 10) + PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); PRINT_FIELD( OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, compute_pgm_rsrc2, @@ -331,6 +349,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + if (IVersion.Major >= 10) { + PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE); + PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED); + PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS); + } PRINT_FIELD( OS, 
".amdhsa_exception_fp_ieee_invalid_op", KD, compute_pgm_rsrc2, @@ -387,6 +416,19 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return static_cast(Streamer); } +// A hook for emitting stuff at the end. +// We use it for emitting the accumulated PAL metadata as a .note record. +void AMDGPUTargetELFStreamer::finish() { + std::string Blob; + const char *Vendor = getPALMetadata()->getVendor(); + unsigned Type = getPALMetadata()->getType(); + getPALMetadata()->toBlob(Type, Blob); + if (Blob.empty()) + return; + EmitNote(Vendor, MCConstantExpr::create(Blob.size(), getContext()), Type, + [&](MCELFStreamer &OS) { OS.EmitBytes(Blob); }); +} + void AMDGPUTargetELFStreamer::EmitNote( StringRef Name, const MCExpr *DescSZ, unsigned NoteType, function_ref EmitDesc) { @@ -463,6 +505,27 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, Symbol->setType(Type); } +void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, + unsigned Align) { + assert(isPowerOf2_32(Align)); + + MCSymbolELF *SymbolELF = cast(Symbol); + SymbolELF->setType(ELF::STT_OBJECT); + + if (!SymbolELF->isBindingSet()) { + SymbolELF->setBinding(ELF::STB_GLOBAL); + SymbolELF->setExternal(true); + } + + if (SymbolELF->declareCommon(Size, Align, true)) { + report_fatal_error("Symbol: " + Symbol->getName() + + " redeclared as different type"); + } + + SymbolELF->setIndex(ELF::SHN_AMDGPU_LDS); + SymbolELF->setSize(MCConstantExpr::create(Size, getContext())); +} + bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { // Create two labels to mark the beginning and end of the desc field // and a MCExpr to calculate the size of the desc field. @@ -482,16 +545,14 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { return true; } -bool AMDGPUTargetELFStreamer::EmitHSAMetadata( - std::shared_ptr &HSAMetadataRoot, bool Strict) { +bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc, + bool Strict) { V3::MetadataVerifier Verifier(Strict); - if (!Verifier.verify(*HSAMetadataRoot)) + if (!Verifier.verify(HSAMetadataDoc.getRoot())) return false; std::string HSAMetadataString; - raw_string_ostream StrOS(HSAMetadataString); - msgpack::Writer MPWriter(StrOS); - HSAMetadataRoot->write(MPWriter); + HSAMetadataDoc.writeToBlob(HSAMetadataString); // Create two labels to mark the beginning and end of the desc field // and a MCExpr to calculate the size of the desc field. 
@@ -505,7 +566,7 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA, [&](MCELFStreamer &OS) { OS.EmitLabel(DescBegin); - OS.EmitBytes(StrOS.str()); + OS.EmitBytes(HSAMetadataString); OS.EmitLabel(DescEnd); }); return true; @@ -535,15 +596,15 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( return true; } -bool AMDGPUTargetELFStreamer::EmitPALMetadata( - const PALMD::Metadata &PALMetadata) { - EmitNote(ElfNote::NoteNameV2, - MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), - getContext()), - ELF::NT_AMD_AMDGPU_PAL_METADATA, [&](MCELFStreamer &OS) { - for (auto I : PALMetadata) - OS.EmitIntValue(I, sizeof(uint32_t)); - }); +bool AMDGPUTargetELFStreamer::EmitCodeEnd() { + const uint32_t Encoded_s_code_end = 0xbf9f0000; + + MCStreamer &OS = getStreamer(); + OS.PushSection(); + OS.EmitValueToAlignment(64, Encoded_s_code_end, 4); + for (unsigned I = 0; I < 32; ++I) + OS.EmitIntValue(Encoded_s_code_end, 4); + OS.PopSection(); return true; } @@ -555,16 +616,25 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); + MCSymbolELF *KernelCodeSymbol = cast( + Context.getOrCreateSymbol(Twine(KernelName))); MCSymbolELF *KernelDescriptorSymbol = cast( Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd"))); - KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL); + + // Copy kernel descriptor symbol's binding, other and visibility from the + // kernel code symbol. + KernelDescriptorSymbol->setBinding(KernelCodeSymbol->getBinding()); + KernelDescriptorSymbol->setOther(KernelCodeSymbol->getOther()); + KernelDescriptorSymbol->setVisibility(KernelCodeSymbol->getVisibility()); + // Kernel descriptor symbol's type and size are fixed. KernelDescriptorSymbol->setType(ELF::STT_OBJECT); KernelDescriptorSymbol->setSize( MCConstantExpr::create(sizeof(KernelDescriptor), Context)); - MCSymbolELF *KernelCodeSymbol = cast( - Context.getOrCreateSymbol(Twine(KernelName))); - KernelCodeSymbol->setBinding(ELF::STB_LOCAL); + // The visibility of the kernel code symbol must be protected or less to allow + // static relocations from the kernel descriptor to be used. + if (KernelCodeSymbol->getVisibility() == ELF::STV_DEFAULT) + KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED); Streamer.EmitLabel(KernelDescriptorSymbol); Streamer.EmitBytes(StringRef( diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 9a807c804f9f..683b3e363b9a 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -1,9 +1,8 @@ //===-- AMDGPUTargetStreamer.h - AMDGPU Target Streamer --------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
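
Both streamers now share one msgpack::Document flow for V3 HSA metadata: parse the assembler's YAML form, verify it, then serialize back to YAML (assembly output) or to a binary blob for the NT_AMDGPU_METADATA note. A condensed sketch of that round trip, assuming exactly the MsgPackDocument and MetadataVerifier interfaces used in the hunks above:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
    #include "llvm/BinaryFormat/MsgPackDocument.h"
    #include <string>

    using namespace llvm;

    // YAML text in, verified binary msgpack out; mirrors EmitHSAMetadataV3
    // followed by the ELF streamer's EmitHSAMetadata.
    static bool yamlToMetadataBlob(StringRef YAML, std::string &Blob,
                                   bool Strict) {
      msgpack::Document Doc;
      if (!Doc.fromYAML(YAML))             // replaces yaml::Input >> shared_ptr
        return false;
      AMDGPU::HSAMD::V3::MetadataVerifier Verifier(Strict);
      if (!Verifier.verify(Doc.getRoot())) // same check both emitters perform
        return false;
      Doc.writeToBlob(Blob);               // payload for the ELF note record
      return true;
    }

The EmitCodeEnd pair above deserves a similar gloss: both emit the s_code_end encoding 0xbf9f0000 up to a 64-byte boundary (.p2alignl 6 aligns to 2^6 bytes) and then 32 further dwords of it (.fill 32, 4), so a dword-aligned code section gains between 128 and 188 bytes of trailing s_code_end.
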
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -11,7 +10,8 @@ #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #include "AMDKernelCodeT.h" -#include "llvm/BinaryFormat/MsgPackTypes.h" +#include "Utils/AMDGPUPALMetadata.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" @@ -29,12 +29,16 @@ class Module; class Type; class AMDGPUTargetStreamer : public MCTargetStreamer { + AMDGPUPALMetadata PALMetadata; + protected: MCContext &getContext() const { return Streamer.getContext(); } public: AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; } + virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0; virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, @@ -49,6 +53,9 @@ public: virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, + unsigned Align) = 0; + /// \returns True on success, false on failure. virtual bool EmitISAVersion(StringRef IsaVersionString) = 0; @@ -65,14 +72,13 @@ public: /// the \p HSAMetadata structure is updated with the correct types. /// /// \returns True on success, false on failure. - virtual bool EmitHSAMetadata(std::shared_ptr &HSAMetadata, - bool Strict) = 0; + virtual bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) = 0; /// \returns True on success, false on failure. virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0; /// \returns True on success, false on failure. - virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0; + virtual bool EmitCodeEnd() = 0; virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, @@ -89,6 +95,8 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { public: AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + void finish() override; + void EmitDirectiveAMDGCNTarget(StringRef Target) override; void EmitDirectiveHSACodeObjectVersion(uint32_t Major, @@ -102,18 +110,19 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override; + /// \returns True on success, false on failure. bool EmitISAVersion(StringRef IsaVersionString) override; /// \returns True on success, false on failure. - bool EmitHSAMetadata(std::shared_ptr &HSAMetadata, - bool Strict) override; + bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. - bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; + bool EmitCodeEnd() override; void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, @@ -133,6 +142,8 @@ public: MCELFStreamer &getStreamer(); + void finish() override; + void EmitDirectiveAMDGCNTarget(StringRef Target) override; void EmitDirectiveHSACodeObjectVersion(uint32_t Major, @@ -146,18 +157,19 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override; + /// \returns True on success, false on failure. 
bool EmitISAVersion(StringRef IsaVersionString) override; /// \returns True on success, false on failure. - bool EmitHSAMetadata(std::shared_ptr &HSAMetadata, - bool Strict) override; + bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. - bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; + bool EmitCodeEnd() override; void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 28d4bc1829e2..2f1f4e7a0392 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -65,9 +64,10 @@ private: uint64_t getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp index 1c99a708e5ac..a4809af29daa 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 36913bd04274..f8ec3c36f019 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
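
The recurring computeAvailableFeatures/verifyInstructionPredicates change in these code-emitter classes swaps uint64_t for FeatureBitset, the natural fix once a target outgrows 64 subtarget feature bits: a packed integer can no longer hold one bit per feature. A rough sketch of the shape of that check; FeatureBitset itself is LLVM's wide bitset, and the 192-bit cap below is an assumption for illustration, not a value from the patch:

    #include <bitset>

    // Stand-in for llvm::FeatureBitset: one bit per subtarget feature.
    using FeatureBits = std::bitset<192>; // assumed width, see note above

    // The test behind verifyInstructionPredicates: every feature an
    // instruction requires must be in the computed available set.
    static bool featuresAvailable(const FeatureBits &Available,
                                  const FeatureBits &Required) {
      return (Required & ~Available).none();
    }
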
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,9 +13,11 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPURegisterInfo.h" #include "MCTargetDesc/AMDGPUFixupKinds.h" #include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -77,6 +78,10 @@ public: unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const override; + + unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -233,6 +238,8 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: return getLit32Encoding(static_cast(Imm), STI); case AMDGPU::OPERAND_REG_IMM_INT64: @@ -245,12 +252,21 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: // FIXME Is this correct? What do inline immediates do on SI for f16 src // which does not have f16 support? return getLit16Encoding(static_cast(Imm), STI); + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) + return getLit32Encoding(static_cast(Imm), STI); + LLVM_FALLTHROUGH; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { uint16_t Lo16 = static_cast(Imm); uint32_t Encoding = getLit16Encoding(Lo16, STI); return Encoding; @@ -274,7 +290,25 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); } - if (bytes > 4) + // NSA encoding. 
+ if (AMDGPU::isGFX10(STI) && Desc.TSFlags & SIInstrFlags::MIMG) { + int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vaddr0); + int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::srsrc); + assert(vaddr0 >= 0 && srsrc > vaddr0); + unsigned NumExtraAddrs = srsrc - vaddr0 - 1; + unsigned NumPadding = (-NumExtraAddrs) & 3; + + for (unsigned i = 0; i < NumExtraAddrs; ++i) + OS.write((uint8_t)getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), + Fixups, STI)); + for (unsigned i = 0; i < NumPadding; ++i) + OS.write(0); + } + + if ((bytes > 8 && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) || + (bytes > 4 && !STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal])) return; // Check for additional literals in SRC0/1/2 (Op 1/2/3) @@ -366,7 +400,7 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, const MCOperand &MO = MI.getOperand(OpNo); unsigned Reg = MO.getReg(); - if (Reg != AMDGPU::VCC) { + if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) { RegEnc |= MRI.getEncodingValue(Reg); RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; @@ -374,10 +408,31 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, return RegEnc; } +unsigned +SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + unsigned Reg = MI.getOperand(OpNo).getReg(); + uint64_t Enc = MRI.getEncodingValue(Reg); + + // VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma + // instructions use acc[0:1] modifier bits to distinguish. These bits are + // encoded as a virtual 9th bit of the register for these operands. + if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg)) + Enc |= 512; + + return Enc; +} + static bool needsPCRel(const MCExpr *Expr) { switch (Expr->getKind()) { - case MCExpr::SymbolRef: - return true; + case MCExpr::SymbolRef: { + auto *SE = cast(Expr); + MCSymbolRefExpr::VariantKind Kind = SE->getKind(); + return Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_LO && + Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_HI; + } case MCExpr::Binary: { auto *BE = cast(Expr); if (BE->getOpcode() == MCBinaryExpr::Sub) @@ -416,7 +471,13 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, Kind = FK_PCRel_4; else Kind = FK_Data_4; - Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc())); + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint32_t Offset = Desc.getSize(); + assert(Offset == 4 || Offset == 8); + + Fixups.push_back( + MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc())); } // Figure out the operand number, needed for isSrcOperand check @@ -429,7 +490,8 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (AMDGPU::isSISrcOperand(Desc, OpNo)) { uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); - if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) + if (Enc != ~0U && + (Enc != 255 || Desc.getSize() == 4 || Desc.getSize() == 8)) return Enc; } else if (MO.isImm()) diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 1c68dbd78e75..4735e6cb2446 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -1,9 +1,8 @@ //===-- MIMGInstructions.td - MIMG Instruction Defintions -----------------===// // -// The LLVM Compiler Infrastructure -// -// 
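
Two notes on the SIMCCodeEmitter hunks above. The NSA padding expression (-NumExtraAddrs) & 3 is the usual round-up trick: it yields exactly the number of zero bytes that brings the extra byte-sized address operands up to a whole dword. And getAVOperandEncoding ORs in 512, the extra encoding bit that tells an AGPR apart from the identically numbered VGPR in mfma source operands. A quick self-check of the padding identity, illustrative only:

    #include <cassert>

    int main() {
      // (-n) & 3 is the pad count that rounds n up to a multiple of 4.
      for (unsigned n = 0; n < 64; ++n) {
        unsigned Pad = (0u - n) & 3u;
        assert(Pad < 4 && (n + Pad) % 4 == 0);
      }
      return 0;
    }
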
This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -12,10 +11,14 @@ // // - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8) // - MIMGEncGfx8: encoding introduced with gfx8 for atomics +// - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding +// - MIMGEncGfx10NSA: gfx10 NSA encoding class MIMGEncoding; def MIMGEncGfx6 : MIMGEncoding; def MIMGEncGfx8 : MIMGEncoding; +def MIMGEncGfx10Default : MIMGEncoding; +def MIMGEncGfx10NSA : MIMGEncoding; def MIMGEncoding : GenericEnum { let FilterClass = "MIMGEncoding"; @@ -60,13 +63,28 @@ def MIMGDim : GenericEnum { def MIMGDimInfoTable : GenericTable { let FilterClass = "AMDGPUDimProps"; let CppTypeName = "MIMGDimInfo"; - let Fields = ["Dim", "NumCoords", "NumGradients", "DA"]; + let Fields = ["Dim", "NumCoords", "NumGradients", "DA", "Encoding", "AsmSuffix"]; GenericEnum TypeOf_Dim = MIMGDim; let PrimaryKey = ["Dim"]; let PrimaryKeyName = "getMIMGDimInfo"; } +def getMIMGDimInfoByEncoding : SearchIndex { + let Table = MIMGDimInfoTable; + let Key = ["Encoding"]; +} + +def getMIMGDimInfoByAsmSuffix : SearchIndex { + let Table = MIMGDimInfoTable; + let Key = ["AsmSuffix"]; +} + +class mimg <bits<8> si_gfx10, bits<8> vi = si_gfx10> { + field bits<8> SI_GFX10 = si_gfx10; + field bits<8> VI = vi; +} + class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> { MIMGBaseOpcode L = l; MIMGBaseOpcode LZ = lz; @@ -83,12 +101,23 @@ def MIMGLZMappingTable : GenericTable { let PrimaryKeyName = "getMIMGLZMappingInfo"; } -class mimg <bits<7> si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; +class MIMGMIPMapping<MIMGBaseOpcode mip, MIMGBaseOpcode nonmip> { + MIMGBaseOpcode MIP = mip; + MIMGBaseOpcode NONMIP = nonmip; } -class MIMG <dag outs, string dns = ""> +def MIMGMIPMappingTable : GenericTable { + let FilterClass = "MIMGMIPMapping"; + let CppTypeName = "MIMGMIPMappingInfo"; + let Fields = ["MIP", "NONMIP"]; + GenericEnum TypeOf_MIP = MIMGBaseOpcode; + GenericEnum TypeOf_NONMIP = MIMGBaseOpcode; + + let PrimaryKey = ["MIP"]; + let PrimaryKeyName = "getMIMGMIPMappingInfo"; +} + +class MIMG_Base <dag outs, string dns = ""> : InstSI <outs, (ins), "", []> { let VM_CNT = 1; @@ -97,20 +126,24 @@ class MIMG let Uses = [EXEC]; let mayLoad = 1; let mayStore = 0; - let hasPostISelHook = 1; let SchedRW = [WriteVMEM]; let UseNamedOperandTable = 1; let hasSideEffects = 0; // XXX ???? - let SubtargetPredicate = isGCN; let DecoderNamespace = dns; let isAsmParserOnly = !if(!eq(dns,""), 1, 0); - let AsmMatchConverter = "cvtMIMG"; let usesCustomInserter = 1; +} + +class MIMG <dag outs, string dns = ""> + : MIMG_Base <outs, dns> { + + let hasPostISelHook = 1; + let AsmMatchConverter = "cvtMIMG"; Instruction Opcode = !cast<Instruction>(NAME); MIMGBaseOpcode BaseOpcode; - MIMGEncoding MIMGEncoding = MIMGEncGfx6; + MIMGEncoding MIMGEncoding; bits<8> VDataDwords; bits<8> VAddrDwords; } @@ -131,15 +164,66 @@ def getMIMGInfo : SearchIndex { let Key = ["Opcode"]; } -class MIMG_NoSampler_Helper <bits<8> op, string asm, +// This is a separate class so that TableGen memoizes the computations.
+class MIMGNSAHelper { + list AddrAsmNames = + !foldl([], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], lhs, i, + !if(!lt(i, num_addrs), !listconcat(lhs, ["vaddr"#!size(lhs)]), lhs)); + dag AddrIns = !dag(ins, !foreach(arg, AddrAsmNames, VGPR_32), AddrAsmNames); + string AddrAsm = "[" # !foldl("$" # !head(AddrAsmNames), !tail(AddrAsmNames), lhs, rhs, + lhs # ", $" # rhs) # "]"; + + int NSA = !if(!le(num_addrs, 1), ?, + !if(!le(num_addrs, 5), 1, + !if(!le(num_addrs, 9), 2, + !if(!le(num_addrs, 13), 3, ?)))); +} + +// Base class of all pre-gfx10 MIMG instructions. +class MIMG_gfx6789 op, dag outs, string dns = ""> + : MIMG, MIMGe_gfx6789 { + let SubtargetPredicate = isGFX6GFX7GFX8GFX9; + let AssemblerPredicates = [isGFX6GFX7GFX8GFX9]; + + let MIMGEncoding = MIMGEncGfx6; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); +} + +// Base class of all non-NSA gfx10 MIMG instructions. +class MIMG_gfx10 + : MIMG, MIMGe_gfx10 { + let SubtargetPredicate = isGFX10Plus; + let AssemblerPredicates = [isGFX10Plus]; + + let MIMGEncoding = MIMGEncGfx10Default; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let nsa = 0; +} + +// Base class for all NSA MIMG instructions. Note that 1-dword addresses always +// use non-NSA variants. +class MIMG_nsa_gfx10 + : MIMG, MIMGe_gfx10 { + let SubtargetPredicate = isGFX10Plus; + let AssemblerPredicates = [isGFX10Plus]; + + let MIMGEncoding = MIMGEncGfx10NSA; + + MIMGNSAHelper nsah = MIMGNSAHelper; + dag AddrIns = nsah.AddrIns; + string AddrAsm = nsah.AddrAsm; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let nsa = nsah.NSA; +} + +class MIMG_NoSampler_Helper op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, string dns=""> - : MIMG <(outs dst_rc:$vdata), dns>, - MIMGe { - let ssamp = 0; - let d16 = !if(BaseOpcode.HasD16, ?, 0); - + : MIMG_gfx6789 { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -148,23 +232,66 @@ class MIMG_NoSampler_Helper op, string asm, #!if(BaseOpcode.HasD16, "$d16", ""); } -multiclass MIMG_NoSampler_Src_Helper op, string asm, +class MIMG_NoSampler_gfx10 + : MIMG_gfx10 { + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_NoSampler_nsa_gfx10 + : MIMG_nsa_gfx10 { + let InOperandList = !con(AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +multiclass MIMG_NoSampler_Src_Helper op, string asm, RegisterClass dst_rc, bit enableDisasm> { - let VAddrDwords = 1 in - def NAME # _V1 : MIMG_NoSampler_Helper ; - let VAddrDwords = 2 in - def NAME # _V2 : MIMG_NoSampler_Helper ; - let VAddrDwords = 3 in - def NAME # _V3 : MIMG_NoSampler_Helper ; - let VAddrDwords = 4 in - def NAME # _V4 : MIMG_NoSampler_Helper ; -} - -multiclass MIMG_NoSampler op, string asm, bit has_d16, bit mip = 0, + let ssamp = 0 in { + let VAddrDwords = 1 in { + def _V1 : MIMG_NoSampler_Helper ; + def _V1_gfx10 : MIMG_NoSampler_gfx10; + } + + let VAddrDwords = 2 in { + def _V2 : 
MIMG_NoSampler_Helper ; + def _V2_gfx10 : MIMG_NoSampler_gfx10; + def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; + } + + let VAddrDwords = 3 in { + def _V3 : MIMG_NoSampler_Helper ; + def _V3_gfx10 : MIMG_NoSampler_gfx10; + def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; + } + + let VAddrDwords = 4 in { + def _V4 : MIMG_NoSampler_Helper ; + def _V4_gfx10 : MIMG_NoSampler_gfx10; + def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; + } + } +} + +multiclass MIMG_NoSampler op, string asm, bit has_d16, bit mip = 0, bit isResInfo = 0> { - def "" : MIMGBaseOpcode { + def "" : MIMGBaseOpcode, PredicateControl { let Coordinates = !if(isResInfo, 0, 1); let LodOrClampOrMip = mip; let HasD16 = has_d16; @@ -180,26 +307,16 @@ multiclass MIMG_NoSampler op, string asm, bit has_d16, bit mip = 0, defm _V3 : MIMG_NoSampler_Src_Helper ; let VDataDwords = 4 in defm _V4 : MIMG_NoSampler_Src_Helper ; - let VDataDwords = 8 in - defm _V8 : MIMG_NoSampler_Src_Helper ; + let VDataDwords = 5 in + defm _V5 : MIMG_NoSampler_Src_Helper ; } } -class MIMG_Store_Helper op, string asm, +class MIMG_Store_Helper op, string asm, RegisterClass data_rc, RegisterClass addr_rc, string dns = ""> - : MIMG <(outs), dns>, - MIMGe { - let ssamp = 0; - let d16 = !if(BaseOpcode.HasD16, ?, 0); - - let mayLoad = 0; - let mayStore = 1; - let hasSideEffects = 0; - let hasPostISelHook = 0; - let DisableWQM = 1; - + : MIMG_gfx6789 { let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -208,21 +325,63 @@ class MIMG_Store_Helper op, string asm, #!if(BaseOpcode.HasD16, "$d16", ""); } -multiclass MIMG_Store_Addr_Helper op, string asm, +class MIMG_Store_gfx10 + : MIMG_gfx10 { + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, + GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Store_nsa_gfx10 + : MIMG_nsa_gfx10 { + let InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +multiclass MIMG_Store_Addr_Helper { - let VAddrDwords = 1 in - def NAME # _V1 : MIMG_Store_Helper ; - let VAddrDwords = 2 in - def NAME # _V2 : MIMG_Store_Helper ; - let VAddrDwords = 3 in - def NAME # _V3 : MIMG_Store_Helper ; - let VAddrDwords = 4 in - def NAME # _V4 : MIMG_Store_Helper ; -} - -multiclass MIMG_Store op, string asm, bit has_d16, bit mip = 0> { + let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0, + DisableWQM = 1, ssamp = 0 in { + let VAddrDwords = 1 in { + def _V1 : MIMG_Store_Helper ; + def _V1_gfx10 : MIMG_Store_gfx10 ; + } + let VAddrDwords = 2 in { + def _V2 : MIMG_Store_Helper ; + def _V2_gfx10 : MIMG_Store_gfx10 ; + def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 ; + } + let VAddrDwords = 3 in { + def _V3 : MIMG_Store_Helper ; + def _V3_gfx10 : MIMG_Store_gfx10 ; + def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 ; + } + let VAddrDwords = 4 in { + def _V4 : MIMG_Store_Helper ; + def _V4_gfx10 : MIMG_Store_gfx10 ; + def _V4_nsa_gfx10 : 
MIMG_Store_nsa_gfx10 ; + } + } +} + +multiclass MIMG_Store op, string asm, bit has_d16, bit mip = 0> { def "" : MIMGBaseOpcode { let Store = 1; let LodOrClampOrMip = mip; @@ -241,15 +400,9 @@ multiclass MIMG_Store op, string asm, bit has_d16, bit mip = 0> { } } -class MIMG_Atomic_Helper - : MIMG <(outs data_rc:$vdst), !if(enableDasm, dns, "")> { - let mayLoad = 1; - let mayStore = 1; - let hasSideEffects = 1; // FIXME: Remove this - let hasPostISelHook = 0; - let DisableWQM = 1; +class MIMG_Atomic_gfx6789_base op, string asm, RegisterClass data_rc, + RegisterClass addr_rc, string dns=""> + : MIMG_gfx6789 { let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; @@ -259,39 +412,80 @@ class MIMG_Atomic_Helper { - let ssamp = 0, d16 = 0 in { - def _si : MIMG_Atomic_Helper, - SIMCInstr, - MIMGe { - let AssemblerPredicates = [isSICI]; - let DisableDecoder = DisableSIDecoder; - } +class MIMG_Atomic_si + : MIMG_Atomic_gfx6789_base { + let AssemblerPredicates = [isGFX6GFX7]; +} - def _vi : MIMG_Atomic_Helper, - SIMCInstr, - MIMGe { - let AssemblerPredicates = [isVI]; - let DisableDecoder = DisableVIDecoder; - let MIMGEncoding = MIMGEncGfx8; - } - } +class MIMG_Atomic_vi + : MIMG_Atomic_gfx6789_base { + let AssemblerPredicates = [isGFX8GFX9]; + let MIMGEncoding = MIMGEncGfx8; +} + +class MIMG_Atomic_gfx10 + : MIMG_gfx10(op.SI_GFX10), (outs DataRC:$vdst), + !if(enableDisasm, "AMDGPU", "")> { + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, + GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe); + let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"; +} + +class MIMG_Atomic_nsa_gfx10 + : MIMG_nsa_gfx10(op.SI_GFX10), (outs DataRC:$vdst), num_addrs, + !if(enableDisasm, "AMDGPU", "")> { + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe)); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"; } multiclass MIMG_Atomic_Addr_Helper_m { - // _V* variants have different address size, but the size is not encoded. - // So only one variant can be disassembled. V1 looks the safest to decode. 
- let VAddrDwords = 1 in - defm _V1 : MIMG_Atomic_Helper_m ; - let VAddrDwords = 2 in - defm _V2 : MIMG_Atomic_Helper_m ; - let VAddrDwords = 3 in - defm _V3 : MIMG_Atomic_Helper_m ; - let VAddrDwords = 4 in - defm _V4 : MIMG_Atomic_Helper_m ; + let hasSideEffects = 1, // FIXME: remove this + mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, + ssamp = 0 in { + let VAddrDwords = 1 in { + def _V1_si : MIMG_Atomic_si ; + def _V1_vi : MIMG_Atomic_vi ; + def _V1_gfx10 : MIMG_Atomic_gfx10 ; + } + let VAddrDwords = 2 in { + def _V2_si : MIMG_Atomic_si ; + def _V2_vi : MIMG_Atomic_vi ; + def _V2_gfx10 : MIMG_Atomic_gfx10 ; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; + } + let VAddrDwords = 3 in { + def _V3_si : MIMG_Atomic_si ; + def _V3_vi : MIMG_Atomic_vi ; + def _V3_gfx10 : MIMG_Atomic_gfx10 ; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; + } + let VAddrDwords = 4 in { + def _V4_si : MIMG_Atomic_si ; + def _V4_vi : MIMG_Atomic_vi ; + def _V4_gfx10 : MIMG_Atomic_gfx10 ; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; + } + } } multiclass MIMG_Atomic { // 64-bit atomics @@ -311,12 +505,9 @@ multiclass MIMG_Atomic { // 64-bit atom } } -class MIMG_Sampler_Helper op, string asm, RegisterClass dst_rc, +class MIMG_Sampler_Helper op, string asm, RegisterClass dst_rc, RegisterClass src_rc, string dns=""> - : MIMG <(outs dst_rc:$vdata), dns>, - MIMGe { - let d16 = !if(BaseOpcode.HasD16, ?, 0); - + : MIMG_gfx6789 { let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -325,6 +516,33 @@ class MIMG_Sampler_Helper op, string asm, RegisterClass dst_rc, #!if(BaseOpcode.HasD16, "$d16", ""); } +class MIMG_Sampler_gfx10 + : MIMG_gfx10 { + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp, + DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, + GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm" + #"$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Sampler_nsa_gfx10 + : MIMG_nsa_gfx10 { + let InOperandList = !con(AddrIns, + (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm" + #"$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + class MIMGAddrSize { int NumWords = dw; @@ -341,6 +559,11 @@ class MIMGAddrSize { bit Disassemble = enable_disasm; } +// Return whether x is in lst. +class isIntInList lst> { + bit ret = !foldl(0, lst, lhs, y, !or(lhs, !eq(x, y))); +} + // Return whether a value inside the range [min, max] (endpoints inclusive) // is in the given list. class isRangeInList lst> { @@ -376,16 +599,41 @@ class MIMG_Sampler_AddrSizes { !listconcat(lhs.List, [MIMGAddrSize]), !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords lhs)).List; -} -multiclass MIMG_Sampler_Src_Helper op, string asm, + // For NSA, generate machine instructions for all possible numbers of words + // except 1 (which is already covered by the non-NSA case). 
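
The MIMGNSAHelper introduced a few hunks back picks the NSA opcode-field value from the address count: one address still uses the plain encoding, and each extra NSA dword carries up to four more byte-sized addresses, which is where the 1/5/9/13 breakpoints in its !if chain come from. The same mapping as a short reading aid, treating the TableGen definition above as the source of truth:

    #include <cassert>

    // NSA field value for a total address count: 0 extra address dwords for
    // the non-NSA form, otherwise ceil((NumAddrs - 1) / 4) of them.
    static unsigned nsaDwords(unsigned NumAddrs) {
      if (NumAddrs <= 1)
        return 0;                    // plain (non-NSA) encoding
      assert(NumAddrs <= 13 && "beyond what the NSA field can describe");
      return (NumAddrs - 2) / 4 + 1; // 2..5 -> 1, 6..9 -> 2, 10..13 -> 3
    }
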
+ // The disassembler defaults to the largest number of arguments among the + // variants with the same number of NSA words, and custom code then derives + // the exact variant based on the sample variant and the image dimension. + list NSAInstrs = + !foldl([], [[12, 11, 10], [9, 8, 7, 6], [5, 4, 3, 2]], prev, nsa_group, + !listconcat(prev, + !foldl([], nsa_group, lhs, dw, + !if(isIntInList.ret, + !listconcat(lhs, [MIMGAddrSize]), + lhs)))); +} + +multiclass MIMG_Sampler_Src_Helper op, string asm, AMDGPUSampleVariant sample, RegisterClass dst_rc, bit enableDisasm = 0> { foreach addr = MIMG_Sampler_AddrSizes.MachineInstrs in { - let VAddrDwords = addr.NumWords in - def _V # addr.NumWords - : MIMG_Sampler_Helper ; + let VAddrDwords = addr.NumWords in { + def _V # addr.NumWords + : MIMG_Sampler_Helper ; + def _V # addr.NumWords # _gfx10 + : MIMG_Sampler_gfx10 ; + } + } + + foreach addr = MIMG_Sampler_AddrSizes.NSAInstrs in { + let VAddrDwords = addr.NumWords in { + def _V # addr.NumWords # _nsa_gfx10 + : MIMG_Sampler_nsa_gfx10; + } } } @@ -397,7 +645,7 @@ class MIMG_Sampler_BaseOpcode let LodOrClampOrMip = !ne(sample.LodOrClamp, ""); } -multiclass MIMG_Sampler op, AMDGPUSampleVariant sample, bit wqm = 0, +multiclass MIMG_Sampler op, AMDGPUSampleVariant sample, bit wqm = 0, bit isGetLod = 0, string asm = "image_sample"#sample.LowerCaseMod> { def "" : MIMG_Sampler_BaseOpcode { @@ -414,15 +662,15 @@ multiclass MIMG_Sampler op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V3 : MIMG_Sampler_Src_Helper; let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper; - let VDataDwords = 8 in - defm _V8 : MIMG_Sampler_Src_Helper; + let VDataDwords = 5 in + defm _V5 : MIMG_Sampler_Src_Helper; } } -multiclass MIMG_Sampler_WQM op, AMDGPUSampleVariant sample> +multiclass MIMG_Sampler_WQM op, AMDGPUSampleVariant sample> : MIMG_Sampler; -multiclass MIMG_Gather op, AMDGPUSampleVariant sample, bit wqm = 0, +multiclass MIMG_Gather op, AMDGPUSampleVariant sample, bit wqm = 0, string asm = "image_gather4"#sample.LowerCaseMod> { def "" : MIMG_Sampler_BaseOpcode { let HasD16 = 1; @@ -435,12 +683,12 @@ multiclass MIMG_Gather op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V2 : MIMG_Sampler_Src_Helper; /* for packed D16 only */ let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper; - let VDataDwords = 8 in - defm _V8 : MIMG_Sampler_Src_Helper; + let VDataDwords = 5 in + defm _V5 : MIMG_Sampler_Src_Helper; } } -multiclass MIMG_Gather_WQM op, AMDGPUSampleVariant sample> +multiclass MIMG_Gather_WQM op, AMDGPUSampleVariant sample> : MIMG_Gather; //===----------------------------------------------------------------------===// @@ -473,9 +721,11 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic , "image_atomic_or">; defm IMAGE_ATOMIC_XOR : MIMG_Atomic , "image_atomic_xor">; defm IMAGE_ATOMIC_INC : MIMG_Atomic , "image_atomic_inc">; defm IMAGE_ATOMIC_DEC : MIMG_Atomic , "image_atomic_dec">; +//let FPAtomic = 1 in { //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI +//} // End let FPAtomic = 1 defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; @@ -581,3 +831,7 @@ def : MIMGLZMapping; def : MIMGLZMapping; def : MIMGLZMapping; def : MIMGLZMapping; + +// MIP to 
NONMIP Optimization Mapping +def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>; +def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>; diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td index 5c9c1c1ed504..1d11da969474 100644 --- a/lib/Target/AMDGPU/R600.td +++ b/lib/Target/AMDGPU/R600.td @@ -1,9 +1,8 @@ //===-- R600.td - R600 Tablegen files ----------------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp index 68f8c30775b8..3fb18862fca8 100644 --- a/lib/Target/AMDGPU/R600AsmPrinter.cpp +++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- R600AsmPrinter.cpp - R600 Assembly printer -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600AsmPrinter.h b/lib/Target/AMDGPU/R600AsmPrinter.h index 079fc707b03c..0da9526d716e 100644 --- a/lib/Target/AMDGPU/R600AsmPrinter.h +++ b/lib/Target/AMDGPU/R600AsmPrinter.h @@ -1,9 +1,8 @@ //===-- R600AsmPrinter.h - Print R600 assembly code -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 0c62d6a4b3d9..290a960ae901 100644 --- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -1,9 +1,8 @@ //===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index a19020276f35..8098b81d1ea2 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -1,9 +1,8 @@ //===- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h index 0d33d82e8e0f..d72534908dcf 100644 --- a/lib/Target/AMDGPU/R600Defines.h +++ b/lib/Target/AMDGPU/R600Defines.h @@ -1,9 +1,8 @@ //===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 679cf18d2c20..b97e3c8b8dd7 100644 --- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -1,9 +1,8 @@ //===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index b924ff019dd1..c6e8a060d8a0 100644 --- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -1,9 +1,8 @@ //===- R600ExpandSpecialInstrs.cpp - Expand special instructions ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp index 37787b3c5f72..d9aa9ebe878d 100644 --- a/lib/Target/AMDGPU/R600FrameLowering.cpp +++ b/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -1,9 +1,8 @@ //===----------------------- R600FrameLowering.cpp ------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //==-----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h index fe367d73682f..950e238f4979 100644 --- a/lib/Target/AMDGPU/R600FrameLowering.h +++ b/lib/Target/AMDGPU/R600FrameLowering.h @@ -1,9 +1,8 @@ //===--------------------- R600FrameLowering.h ------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index e2a0f05d2b34..f80a53ba1dc6 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1,9 +1,8 @@ //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -1240,11 +1239,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); + const bool TruncatingStore = StoreNode->isTruncatingStore(); + // Neither LOCAL nor PRIVATE can do vectors at the moment - if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS || + TruncatingStore) && VT.isVector()) { - if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && - StoreNode->isTruncatingStore()) { + if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); // TODO: can the chain be replaced without creating a new store? 
@@ -1260,7 +1261,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { unsigned Align = StoreNode->getAlignment(); if (Align < MemVT.getStoreSize() && - !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) { + !allowsMisalignedMemoryAccesses( + MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) { return expandUnalignedStore(StoreNode, DAG); } @@ -1270,7 +1272,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (AS == AMDGPUAS::GLOBAL_ADDRESS) { // It is beneficial to create MSKOR here instead of combiner to avoid // artificial dependencies introduced by RMW - if (StoreNode->isTruncatingStore()) { + if (TruncatingStore) { assert(VT.bitsLE(MVT::i32)); SDValue MaskConstant; if (MemVT == MVT::i8) { @@ -1310,8 +1312,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Convert pointer from byte address to dword address. Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); - if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { - llvm_unreachable("Truncated and indexed stores not supported yet"); + if (StoreNode->isIndexed()) { + llvm_unreachable("Indexed stores not supported yet"); } else { Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); } @@ -1662,10 +1664,9 @@ bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, return true; } -bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - unsigned Align, - bool *IsFast) const { +bool R600TargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *IsFast) const { if (IsFast) *IsFast = false; @@ -1713,6 +1714,12 @@ static SDValue CompactSwizzlableVector( if (NewBldVec[i].isUndef()) continue; + // Fix spurious warning with gcc 7.3 -O3 + // warning: array subscript is above array bounds [-Warray-bounds] + // if (NewBldVec[i] == NewBldVec[j]) { + // ~~~~~~~~~~~^ + if (i >= 4) + continue; for (unsigned j = 0; j < i; j++) { if (NewBldVec[i] == NewBldVec[j]) { NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 767c3c7bd5bf..b560da8e91d9 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -1,9 +1,8 @@ //===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
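As background for the MSKOR path in the hunk above: a truncating byte or halfword store to global memory is lowered to a dword-wide masked-or, so the read-modify-write happens as one operation instead of a combiner-introduced load/store pair. A standalone C++ sketch of the same mask/shift decomposition (hypothetical helper over a flat scalar memory model, not the actual DAG lowering):

#include <cstdint>

// Emulate a 1-byte truncating store into dword-addressed memory with the
// mask-and-or scheme that the MSKOR node encodes.
static void storeByteViaMaskedOr(uint32_t *Mem, uint32_t ByteAddr, uint8_t Val) {
  uint32_t Shift = (ByteAddr & 3u) * 8u;   // bit offset inside the dword
  uint32_t Mask = 0xFFu << Shift;          // selects the destination byte
  uint32_t Src = uint32_t(Val) << Shift;   // source shifted into position
  uint32_t &Dword = Mem[ByteAddr >> 2];    // byte address to dword address
  Dword = (Dword & ~Mask) | Src;           // the fused read-modify-write
}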
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -50,9 +49,10 @@ public: bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override; - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, - unsigned Align, - bool *IsFast) const override; + bool allowsMisalignedMemoryAccesses( + EVT VT, unsigned AS, unsigned Align, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *IsFast = nullptr) const override; private: unsigned Gen; diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td index 687a9affa138..f62e6313b148 100644 --- a/lib/Target/AMDGPU/R600InstrFormats.td +++ b/lib/Target/AMDGPU/R600InstrFormats.td @@ -1,9 +1,8 @@ //===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index 9cc3e5f3c314..d9e839fe2035 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -1,9 +1,8 @@ //===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -402,6 +401,7 @@ Swizzle(std::vector<std::pair<int, unsigned>> Src, } static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { + assert(Op < 3 && "Out of range swizzle index"); switch (Swz) { case R600InstrInfo::ALU_VEC_012_SCL_210: { unsigned Cycles[3] = { 2, 1, 0}; diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index e6e34dc125f4..00d96c9676aa 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -1,9 +1,8 @@ //===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 10e873755222..f40eece859ee 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -1,9 +1,8 @@ //===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -296,6 +295,34 @@ class VTX_READ <string name, dag outs, list<dag> pattern> let VTXInst = 1; } +// FIXME: Deprecated. +class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress; + +class AZExtLoadBase <SDPatternOperator ld_node> : PatFrag<(ops node:$ptr), + (ld_node node:$ptr), [{ + LoadSDNode *L = cast<LoadSDNode>(N); + return L->getExtensionType() == ISD::ZEXTLOAD || + L->getExtensionType() == ISD::EXTLOAD; +}]>; + +def az_extload : AZExtLoadBase <unindexedload>; + +def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +// FIXME: These are deprecated +def az_extloadi8_local : LocalLoad <az_extloadi8>; +def az_extloadi16_local : LocalLoad <az_extloadi16>; + class LoadParamFrag <PatFrag load_type> : PatFrag < (ops node:$ptr), (load_type node:$ptr), [{ return isConstantLoad(cast<LoadSDNode>(N), 0) || diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp index 3ca319c6c6c2..65011a9eadf8 100644 --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h index 29ac0920f997..6a5ac9023329 100644 --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -1,9 +1,8 @@ //===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
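The az_extload family restored above exists because the target is free to implement an any-extending load as a zero-extending one, so a single pattern should match both extension kinds. A standalone C++ model of the predicate the PatFrag wraps (enumerators mirror ISD::LoadExtType):

// Accept zero- or any-extending loads; reject sign- and non-extending ones.
enum LoadExtType { NON_EXTLOAD, EXTLOAD, SEXTLOAD, ZEXTLOAD };

static bool isAZExtLoad(LoadExtType Ty) {
  return Ty == ZEXTLOAD || Ty == EXTLOAD;
}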
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index 7769a35aadce..34267a909b5e 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -1,9 +1,8 @@ //===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h index 8a9a8d3d1e23..bc66f2ef5907 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.h +++ b/lib/Target/AMDGPU/R600MachineScheduler.h @@ -1,9 +1,8 @@ //===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index 7de5e2c9577d..1fe92d2269d3 100644 --- a/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -1,9 +1,8 @@ //===- R600OpenCLImageTypeLoweringPass.cpp ------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 692451cb8fe0..9f1cb6582b5c 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -1,9 +1,8 @@ //===- R600MergeVectorRegisters.cpp ---------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -57,17 +56,12 @@ using namespace llvm; #define DEBUG_TYPE "vec-merger" -static bool -isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { - for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), - E = MRI.def_instr_end(); It != E; ++It) { - return (*It).isImplicitDef(); - } - if (MRI.isReserved(Reg)) { +static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { + assert(MRI.isSSA()); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) return false; - } - llvm_unreachable("Reg without a def"); - return false; + const MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + return MI && MI->isImplicitDef(); } namespace { diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index 612c62b514fd..df200baf11c1 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -1,9 +1,8 @@ //===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -187,8 +186,8 @@ public: // Does MII and MIJ share the same pred_sel ? int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel), OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel); - unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, - PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; + Register PredI = (OpI > -1)?MII->getOperand(OpI).getReg() : Register(), + PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg() : Register(); if (PredI != PredJ) return false; if (SUJ->isSucc(SUI)) { diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td index f39b3dc1bfd4..fff884e4848e 100644 --- a/lib/Target/AMDGPU/R600Processors.td +++ b/lib/Target/AMDGPU/R600Processors.td @@ -1,9 +1,8 @@ //===-- R600Processors.td - R600 Processor definitions --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
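The isImplicitlyDef rewrite above is sound only because the pass now asserts SSA form, where a virtual register has at most one definition, so the old scan-and-return-first loop collapses into a single getUniqueVRegDef query. A standalone model of that contract (hypothetical types, not the MachineRegisterInfo API):

#include <vector>

struct Instr { bool IsImplicitDef; };

// Mirrors getUniqueVRegDef: exactly one def returns it, otherwise null.
static const Instr *uniqueDef(const std::vector<const Instr *> &Defs) {
  return Defs.size() == 1 ? Defs.front() : nullptr;
}

// Mirrors the new query: a register with no def now reports false instead of
// hitting the old llvm_unreachable("Reg without a def").
static bool isImplicitlyDef(const std::vector<const Instr *> &Defs) {
  const Instr *MI = uniqueDef(Defs);
  return MI && MI->IsImplicitDef;
}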
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -41,23 +40,24 @@ def FeatureCFALUBug : SubtargetFeature<"cfalubug", "GPU has CF_ALU bug" >; -class R600SubtargetFeatureGeneration <string Value, - list<SubtargetFeature> Implies> : - SubtargetFeatureGeneration <Value, "R600Subtarget", Implies>; +class R600SubtargetFeatureGeneration <string Value, string FeatureName, + list<SubtargetFeature> Implies> : + SubtargetFeatureGeneration <Value, FeatureName, "R600Subtarget", Implies>; -def FeatureR600 : R600SubtargetFeatureGeneration<"R600", +def FeatureR600 : R600SubtargetFeatureGeneration<"R600", "r600", [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] >; -def FeatureR700 : R600SubtargetFeatureGeneration<"R700", +def FeatureR700 : R600SubtargetFeatureGeneration<"R700", "r700", [FeatureFetchLimit16, FeatureLocalMemorySize0] >; -def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", +def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", "evergreen", [FeatureFetchLimit16, FeatureLocalMemorySize32768] >; def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS", + "northern-islands", [FeatureFetchLimit16, FeatureWavefrontSize64, FeatureLocalMemorySize32768] >; diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index 38933e7616a0..685df74490fe 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -68,7 +67,7 @@ const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( return &CalleeSavedReg; } -unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { return R600::NoRegister; } diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h index c4c77172b299..9378b70ca580 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/lib/Target/AMDGPU/R600RegisterInfo.h @@ -1,9 +1,8 @@ //===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,7 +26,7 @@ struct R600RegisterInfo final : public R600GenRegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; /// get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const; diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td index 70fb46c1a7d6..c998fe848193 100644 --- a/lib/Target/AMDGPU/R600Schedule.td +++ b/lib/Target/AMDGPU/R600Schedule.td @@ -1,9 +1,8 @@ //===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R700Instructions.td b/lib/Target/AMDGPU/R700Instructions.td index 613a0d729bb3..9c9a03209ec2 100644 --- a/lib/Target/AMDGPU/R700Instructions.td +++ b/lib/Target/AMDGPU/R700Instructions.td @@ -1,9 +1,8 @@ //===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp index 69cafef4a351..f8094e35816c 100644 --- a/lib/Target/AMDGPU/SIAddIMGInit.cpp +++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp @@ -1,9 +1,8 @@ //===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 98e9ea662324..b764ca7d7061 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -1,9 +1,8 @@ //===- SIAnnotateControlFlow.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
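A mechanical change that recurs through this import (the R600Packetizer pred_sel compare and getFrameRegister above) is the move from raw unsigned register numbers to the Register wrapper. A simplified standalone model of why call sites keep compiling (the real llvm::Register carries more API than this):

// Register converts implicitly to and from unsigned, so it can replace the
// old sentinel value 0 ("no register") without rewriting comparisons.
class Register {
  unsigned Reg = 0;
public:
  constexpr Register(unsigned R = 0) : Reg(R) {}
  constexpr operator unsigned() const { return Reg; }
  constexpr bool isValid() const { return Reg != 0; }
};

static_assert(Register() == 0u, "default Register is the no-register value");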
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,12 +12,13 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" @@ -38,6 +38,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <utility> @@ -56,13 +57,13 @@ class SIAnnotateControlFlow : public FunctionPass { Type *Boolean; Type *Void; - Type *Int64; + Type *IntMask; Type *ReturnStruct; ConstantInt *BoolTrue; ConstantInt *BoolFalse; UndefValue *BoolUndef; - Constant *Int64Zero; + Constant *IntMaskZero; Function *If; Function *Else; @@ -75,6 +76,8 @@ class SIAnnotateControlFlow : public FunctionPass { LoopInfo *LI; + void initialize(Module &M, const GCNSubtarget &ST); + bool isUniform(BranchInst *T); bool isTopOfStack(BasicBlock *BB); @@ -104,8 +107,6 @@ public: SIAnnotateControlFlow() : FunctionPass(ID) {} - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; StringRef getPassName() const override { return "SI annotate control flow"; } @@ -115,6 +116,7 @@ public: AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LegacyDivergenceAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<TargetPassConfig>(); FunctionPass::getAnalysisUsage(AU); } }; @@ -125,31 +127,34 @@ INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) char SIAnnotateControlFlow::ID = 0; /// Initialize all the types and constants used in the pass -bool SIAnnotateControlFlow::doInitialization(Module &M) { +void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { LLVMContext &Context = M.getContext(); Void = Type::getVoidTy(Context); Boolean = Type::getInt1Ty(Context); - Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64); + IntMask = ST.isWave32() ?
Type::getInt32Ty(Context) + : Type::getInt64Ty(Context); + ReturnStruct = StructType::get(Boolean, IntMask); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); BoolUndef = UndefValue::get(Boolean); - Int64Zero = ConstantInt::get(Int64, 0); - - If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if); - Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else); - IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break); - Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop); - EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf); - return false; + IntMaskZero = ConstantInt::get(IntMask, 0); + + If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if, { IntMask }); + Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else, + { IntMask, IntMask }); + IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break, + { IntMask, IntMask }); + Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask }); + EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask }); } /// Is the branch condition uniform or did the StructurizeCFG pass @@ -259,14 +264,23 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { return; BasicBlock *Target = Term->getSuccessor(1); - PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front()); + PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front()); Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); Value *Arg = handleLoopCondition(Cond, Broken, L, Term); - for (BasicBlock *Pred : predecessors(Target)) - Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred); + for (BasicBlock *Pred : predecessors(Target)) { + Value *PHIValue = IntMaskZero; + if (Pred == BB) // Remember the value of the previous iteration. + PHIValue = Arg; + // If the backedge from Pred to Target could be executed before the exit + // of the loop at BB, it should not reset or change "Broken", which keeps + // track of the number of threads exited the loop at BB. + else if (L->contains(Pred) && DT->dominates(Pred, BB)) + PHIValue = Broken; + Broken->addIncoming(PHIValue, Pred); + } Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); @@ -308,6 +322,10 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DA = &getAnalysis<LegacyDivergenceAnalysis>(); + TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + + initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F)); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp deleted file mode 100644 index 7e884ad93a23..000000000000 --- a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp +++ /dev/null @@ -1,97 +0,0 @@ -//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Inserts one nop instruction for each high level source statement for -/// debugger usage. -/// -/// Tools, such as a debugger, need to pause execution based on user input (i.e. -/// breakpoint). In order to do this, one nop instruction is inserted before the -/// first isa instruction of each high level source statement.
Further, the -/// debugger may replace nop instructions with trap instructions based on user -/// input. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -using namespace llvm; - -#define DEBUG_TYPE "si-debugger-insert-nops" -#define PASS_NAME "SI Debugger Insert Nops" - -namespace { - -class SIDebuggerInsertNops : public MachineFunctionPass { -public: - static char ID; - - SIDebuggerInsertNops() : MachineFunctionPass(ID) { } - StringRef getPassName() const override { return PASS_NAME; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; -}; - -} // anonymous namespace - -INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false) - -char SIDebuggerInsertNops::ID = 0; -char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID; - -FunctionPass *llvm::createSIDebuggerInsertNopsPass() { - return new SIDebuggerInsertNops(); -} - -bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) { - // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not - // specified. - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.debuggerInsertNops()) - return false; - - // Skip machine functions without debug info. - if (!MF.getMMI().hasDebugInfo()) - return false; - - // Target instruction info. - const SIInstrInfo *TII = ST.getInstrInfo(); - - // Set containing line numbers that have nop inserted. - DenseSet<unsigned> NopInserted; - - for (auto &MBB : MF) { - for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { - // Skip debug instructions and instructions without location. - if (MI->isDebugInstr() || !MI->getDebugLoc()) - continue; - - // Insert nop instruction if line number does not have nop inserted. - auto DL = MI->getDebugLoc(); - if (NopInserted.find(DL.getLine()) == NopInserted.end()) { - BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP)) - .addImm(0); - NopInserted.insert(DL.getLine()); - } - } - } - - return true; -} diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 7f6abc34cff3..a0e1ec6ac235 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -1,9 +1,8 @@ //===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// @@ -90,13 +89,22 @@ enum : uint64_t { // Is a D16 buffer instruction. D16Buf = UINT64_C(1) << 50, + // FLAT instruction accesses FLAT_GLBL or FLAT_SCRATCH segment. + IsNonFlatSeg = UINT64_C(1) << 51, + // Uses floating point double precision rounding mode - FPDPRounding = UINT64_C(1) << 51 + FPDPRounding = UINT64_C(1) << 52, + + // Instruction is FP atomic. + FPAtomic = UINT64_C(1) << 53, + + // Is a MFMA instruction.
+ IsMAI = UINT64_C(1) << 54 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. // The result is true if any of these tests are true. -enum ClassFlags { +enum ClassFlags : unsigned { S_NAN = 1 << 0, // Signaling NaN Q_NAN = 1 << 1, // Quiet NaN N_INFINITY = 1 << 2, // Negative infinity @@ -111,7 +119,7 @@ enum ClassFlags { } namespace AMDGPU { - enum OperandType { + enum OperandType : unsigned { /// Operands with register or 32-bit immediate OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET, OPERAND_REG_IMM_INT64, @@ -119,6 +127,8 @@ namespace AMDGPU { OPERAND_REG_IMM_FP32, OPERAND_REG_IMM_FP64, OPERAND_REG_IMM_FP16, + OPERAND_REG_IMM_V2FP16, + OPERAND_REG_IMM_V2INT16, /// Operands with register or inline constant OPERAND_REG_INLINE_C_INT16, @@ -130,11 +140,22 @@ namespace AMDGPU { OPERAND_REG_INLINE_C_V2FP16, OPERAND_REG_INLINE_C_V2INT16, + /// Operands with an AccVGPR register or inline constant + OPERAND_REG_INLINE_AC_INT16, + OPERAND_REG_INLINE_AC_INT32, + OPERAND_REG_INLINE_AC_FP16, + OPERAND_REG_INLINE_AC_FP32, + OPERAND_REG_INLINE_AC_V2FP16, + OPERAND_REG_INLINE_AC_V2INT16, + OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32, - OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16, + OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16, OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16, - OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16, + OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16, + + OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16, + OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16, OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32, OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, @@ -151,17 +172,10 @@ namespace AMDGPU { }; } -namespace SIStackID { -enum StackTypes : uint8_t { - SCRATCH = 0, - SGPR_SPILL = 1 -}; -} - // Input operand modifiers bit-masks // NEG and SEXT share same bit-mask because they can't be set simultaneously. namespace SISrcMods { - enum { + enum : unsigned { NEG = 1 << 0, // Floating-point negate modifier ABS = 1 << 1, // Floating-point absolute modifier SEXT = 1 << 0, // Integer sign-extend modifier @@ -173,7 +187,7 @@ namespace SISrcMods { } namespace SIOutMods { - enum { + enum : unsigned { NONE = 0, MUL2 = 1, MUL4 = 2, @@ -181,17 +195,33 @@ namespace SIOutMods { }; } +namespace AMDGPU { namespace VGPRIndexMode { - enum { - SRC0_ENABLE = 1 << 0, - SRC1_ENABLE = 1 << 1, - SRC2_ENABLE = 1 << 2, - DST_ENABLE = 1 << 3 - }; -} + +enum Id : unsigned { // id of symbolic names + ID_SRC0 = 0, + ID_SRC1, + ID_SRC2, + ID_DST, + + ID_MIN = ID_SRC0, + ID_MAX = ID_DST +}; + +enum EncBits : unsigned { + OFF = 0, + SRC0_ENABLE = 1 << ID_SRC0, + SRC1_ENABLE = 1 << ID_SRC1, + SRC2_ENABLE = 1 << ID_SRC2, + DST_ENABLE = 1 << ID_DST, + ENABLE_MASK = SRC0_ENABLE | SRC1_ENABLE | SRC2_ENABLE | DST_ENABLE +}; + +} // namespace VGPRIndexMode +} // namespace AMDGPU namespace AMDGPUAsmVariants { - enum { + enum : unsigned { DEFAULT = 0, VOP3 = 1, SDWA = 2, @@ -203,13 +233,14 @@ namespace AMDGPUAsmVariants { namespace AMDGPU { namespace EncValues { // Encoding values of enum9/8/7 operands -enum { +enum : unsigned { SGPR_MIN = 0, - SGPR_MAX = 101, + SGPR_MAX_SI = 101, + SGPR_MAX_GFX10 = 105, TTMP_VI_MIN = 112, TTMP_VI_MAX = 123, - TTMP_GFX9_MIN = 108, - TTMP_GFX9_MAX = 123, + TTMP_GFX9_GFX10_MIN = 108, + TTMP_GFX9_GFX10_MAX = 123, INLINE_INTEGER_C_MIN = 128, INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64 INLINE_INTEGER_C_MAX = 208, @@ -231,6 +262,8 @@ enum Id { // Message ID, width(4) [3:0]. 
ID_INTERRUPT = 1, ID_GS, ID_GS_DONE, + ID_GS_ALLOC_REQ = 9, + ID_GET_DOORBELL = 10, ID_SYSMSG = 15, ID_GAPS_LAST_, // Indicate that sequence has gaps. ID_GAPS_FIRST_ = ID_INTERRUPT, @@ -242,27 +275,28 @@ enum Id { // Message ID, width(4) [3:0]. enum Op { // Both GS and SYS operation IDs. OP_UNKNOWN_ = -1, OP_SHIFT_ = 4, - // width(2) [5:4] + OP_NONE_ = 0, + // Bits used for operation encoding + OP_WIDTH_ = 3, + OP_MASK_ = (((1 << OP_WIDTH_) - 1) << OP_SHIFT_), + // GS operations are encoded in bits 5:4 OP_GS_NOP = 0, OP_GS_CUT, OP_GS_EMIT, OP_GS_EMIT_CUT, OP_GS_LAST_, OP_GS_FIRST_ = OP_GS_NOP, - OP_GS_WIDTH_ = 2, - OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_), - // width(3) [6:4] + // SYS operations are encoded in bits 6:4 OP_SYS_ECC_ERR_INTERRUPT = 1, OP_SYS_REG_RD, OP_SYS_HOST_TRAP_ACK, OP_SYS_TTRACE_PC, OP_SYS_LAST_, OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT, - OP_SYS_WIDTH_ = 3, - OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_) }; -enum StreamId { // Stream ID, (2) [9:8]. +enum StreamId : unsigned { // Stream ID, (2) [9:8]. + STREAM_ID_NONE_ = 0, STREAM_ID_DEFAULT_ = 0, STREAM_ID_LAST_ = 4, STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_, @@ -287,23 +321,34 @@ enum Id { // HwRegCode, (6) [5:0] ID_IB_STS = 7, ID_MEM_BASES = 15, ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES, - ID_SYMBOLIC_LAST_ = 16, + ID_TBA_LO = 16, + ID_SYMBOLIC_FIRST_GFX10_ = ID_TBA_LO, + ID_TBA_HI = 17, + ID_TMA_LO = 18, + ID_TMA_HI = 19, + ID_FLAT_SCR_LO = 20, + ID_FLAT_SCR_HI = 21, + ID_XNACK_MASK = 22, + ID_POPS_PACKER = 25, + ID_SYMBOLIC_LAST_ = 26, ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) }; -enum Offset { // Offset, (5) [10:6] +enum Offset : unsigned { // Offset, (5) [10:6] OFFSET_DEFAULT_ = 0, OFFSET_SHIFT_ = 6, OFFSET_WIDTH_ = 5, OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_), + OFFSET_MEM_VIOL = 8, + OFFSET_SRC_SHARED_BASE = 16, OFFSET_SRC_PRIVATE_BASE = 0 }; -enum WidthMinusOne { // WidthMinusOne, (5) [15:11] +enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11] WIDTH_M1_DEFAULT_ = 31, WIDTH_M1_SHIFT_ = 11, WIDTH_M1_WIDTH_ = 5, @@ -313,11 +358,16 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11] WIDTH_M1_SRC_PRIVATE_BASE = 15 }; +// Some values from WidthMinusOne mapped into Width domain. +enum Width : unsigned { + WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1, +}; + } // namespace Hwreg namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32. 
-enum Id { // id of symbolic names +enum Id : unsigned { // id of symbolic names ID_QUAD_PERM = 0, ID_BITMASK_PERM, ID_SWAP, @@ -325,7 +375,7 @@ enum Id { // id of symbolic names ID_BROADCAST }; -enum EncBits { +enum EncBits : unsigned { // swizzle mode encodings @@ -357,7 +407,7 @@ enum EncBits { namespace SDWA { -enum SdwaSel { +enum SdwaSel : unsigned { BYTE_0 = 0, BYTE_1 = 1, BYTE_2 = 2, @@ -367,13 +417,13 @@ enum SdwaSel { DWORD = 6, }; -enum DstUnused { +enum DstUnused : unsigned { UNUSED_PAD = 0, UNUSED_SEXT = 1, UNUSED_PRESERVE = 2, }; -enum SDWA9EncValues{ +enum SDWA9EncValues : unsigned { SRC_SGPR_MASK = 0x100, SRC_VGPR_MASK = 0xFF, VOPC_DST_VCC_MASK = 0x80, @@ -382,7 +432,8 @@ enum SDWA9EncValues{ SRC_VGPR_MIN = 0, SRC_VGPR_MAX = 255, SRC_SGPR_MIN = 256, - SRC_SGPR_MAX = 357, + SRC_SGPR_MAX_SI = 357, + SRC_SGPR_MAX_GFX10 = 361, SRC_TTMP_MIN = 364, SRC_TTMP_MAX = 379, }; @@ -391,7 +442,7 @@ enum SDWA9EncValues{ namespace DPP { -enum DppCtrl { +enum DppCtrl : unsigned { QUAD_PERM_FIRST = 0, QUAD_PERM_LAST = 0xFF, DPP_UNUSED1 = 0x100, @@ -422,7 +473,20 @@ enum DppCtrl { ROW_HALF_MIRROR = 0x141, BCAST15 = 0x142, BCAST31 = 0x143, - DPP_LAST = BCAST31 + DPP_UNUSED8_FIRST = 0x144, + DPP_UNUSED8_LAST = 0x14F, + ROW_SHARE_FIRST = 0x150, + ROW_SHARE_LAST = 0x15F, + ROW_XMASK_FIRST = 0x160, + ROW_XMASK_LAST = 0x16F, + DPP_LAST = ROW_XMASK_LAST +}; + +enum DppFiMode { + DPP_FI_0 = 0, + DPP_FI_1 = 1, + DPP8_FI_0 = 0xE9, + DPP8_FI_1 = 0xEA, }; } // namespace DPP @@ -505,6 +569,15 @@ enum DppCtrl { #define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) #define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) #define C_00B848_IEEE_MODE 0xFF7FFFFF +#define S_00B848_WGP_MODE(x) (((x) & 0x1) << 29) +#define G_00B848_WGP_MODE(x) (((x) >> 29) & 0x1) +#define C_00B848_WGP_MODE 0xDFFFFFFF +#define S_00B848_MEM_ORDERED(x) (((x) & 0x1) << 30) +#define G_00B848_MEM_ORDERED(x) (((x) >> 30) & 0x1) +#define C_00B848_MEM_ORDERED 0xBFFFFFFF +#define S_00B848_FWD_PROGRESS(x) (((x) & 0x1) << 31) +#define G_00B848_FWD_PROGRESS(x) (((x) >> 31) & 0x1) +#define C_00B848_FWD_PROGRESS 0x7FFFFFFF // Helpers for setting FLOAT_MODE @@ -535,6 +608,15 @@ enum DppCtrl { #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 #define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 +#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21) +#define S_028B54_GS_W32_EN(x) (((x) & 0x1) << 22) +#define S_028B54_VS_W32_EN(x) (((x) & 0x1) << 23) +#define R_0286D8_SPI_PS_IN_CONTROL 0x0286D8 +#define S_0286D8_PS_W32_EN(x) (((x) & 0x1) << 15) +#define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800 +#define S_00B800_CS_W32_EN(x) (((x) & 0x1) << 15) + #define R_SPILLED_SGPRS 0x4 #define R_SPILLED_VGPRS 0x8 } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 809f5bab4693..624953963cf4 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -1,9 +1,8 @@ //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
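The Hwreg additions above widen the symbolic id space but keep the immediate layout fixed: id in bits [5:0], offset in [10:6], width-minus-one in [15:11]. A standalone encoder for that layout (a model derived from the shift/width enumerators above, not a declared LLVM helper):

#include <cstdint>

// Pack a hwreg(id, offset, width) triple into the 16-bit immediate.
static uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  return uint16_t((Id & 0x3Fu)                      // ID_WIDTH_ = 6, shift 0
                  | ((Offset & 0x1Fu) << 6)         // OFFSET_WIDTH_ = 5, shift 6
                  | (((Width - 1) & 0x1Fu) << 11)); // WIDTH_M1_WIDTH_ = 5, shift 11
}
// e.g. encodeHwreg(15 /*ID_MEM_BASES*/, 0, 32) selects the whole 32-bit register.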
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -104,7 +103,7 @@ using namespace llvm; static cl::opt<bool> EnableM0Merge( "amdgpu-enable-merge-m0", cl::desc("Merge and hoist M0 initializations"), - cl::init(false)); + cl::init(true)); namespace { @@ -144,14 +143,15 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() { return new SIFixSGPRCopies(); } -static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { +static bool hasVectorOperands(const MachineInstr &MI, + const SIRegisterInfo *TRI) { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isReg() || !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) continue; - if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) + if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg()))) return true; } return false; @@ -184,14 +184,14 @@ static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, const TargetRegisterClass *DstRC, const SIRegisterInfo &TRI) { return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) && - TRI.hasVGPRs(SrcRC); + TRI.hasVectorRegisters(SrcRC); } static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, const TargetRegisterClass *DstRC, const SIRegisterInfo &TRI) { return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) && - TRI.hasVGPRs(DstRC); + TRI.hasVectorRegisters(DstRC); } static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, @@ -278,6 +278,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, // VGPRz = REG_SEQUENCE VGPRx, sub0 MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); + bool IsAGPR = TRI->hasAGPRs(DstRC); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { unsigned SrcReg = MI.getOperand(I).getReg(); @@ -296,6 +297,17 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, TmpReg) .add(MI.getOperand(I)); + if (IsAGPR) { + const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC); + unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC); + unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? + AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY; + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc), + TmpAReg) + .addReg(TmpReg, RegState::Kill); + TmpReg = TmpAReg; + } + MI.getOperand(I).setReg(TmpReg); } @@ -440,18 +452,32 @@ static bool isReachable(const MachineInstr *From, (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); } +// Return the first non-prologue instruction in the block. +static MachineBasicBlock::iterator +getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { + MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); + while (I != MBB->end() && TII->isBasicBlockPrologue(*I)) + ++I; + + return I; +} + // Hoist and merge identical SGPR initializations into a common predecessor. // This is intended to combine M0 initializations, but can work with any // SGPR. A VGPR cannot be processed since we cannot guarantee vector // execution. static bool hoistAndMergeSGPRInits(unsigned Reg, const MachineRegisterInfo &MRI, - MachineDominatorTree &MDT) { + MachineDominatorTree &MDT, + const TargetInstrInfo *TII) { // List of inits by immediate value. using InitListMap = std::map<unsigned, std::list<MachineInstr *>>; InitListMap Inits; // List of clobbering instructions. SmallVector<MachineInstr *, 8> Clobbers; + // List of instructions marked for deletion.
+ SmallSet<MachineInstr *, 8> MergedInstrs; + bool Changed = false; for (auto &MI : MRI.def_instructions(Reg)) { @@ -480,8 +506,8 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, MachineInstr *MI2 = *I2; // Check any possible interference - auto intereferes = [&](MachineBasicBlock::iterator From, - MachineBasicBlock::iterator To) -> bool { + auto interferes = [&](MachineBasicBlock::iterator From, + MachineBasicBlock::iterator To) -> bool { assert(MDT.dominates(&*To, &*From)); @@ -513,23 +539,23 @@ }; if (MDT.dominates(MI1, MI2)) { - if (!intereferes(MI2, MI1)) { + if (!interferes(MI2, MI1)) { LLVM_DEBUG(dbgs() << "Erasing from " << printMBBReference(*MI2->getParent()) << " " << *MI2); - MI2->eraseFromParent(); - Defs.erase(I2++); + MergedInstrs.insert(MI2); Changed = true; + ++I2; continue; } } else if (MDT.dominates(MI2, MI1)) { - if (!intereferes(MI1, MI2)) { + if (!interferes(MI1, MI2)) { LLVM_DEBUG(dbgs() << "Erasing from " << printMBBReference(*MI1->getParent()) << " " << *MI1); - MI1->eraseFromParent(); - Defs.erase(I1++); + MergedInstrs.insert(MI1); Changed = true; + ++I1; break; } } else { @@ -540,8 +566,8 @@ continue; } - MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); - if (!intereferes(MI1, I) && !intereferes(MI2, I)) { + MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII); + if (!interferes(MI1, I) && !interferes(MI2, I)) { LLVM_DEBUG(dbgs() << "Erasing from " << printMBBReference(*MI1->getParent()) << " " << *MI1 @@ -549,9 +575,9 @@ << printMBBReference(*MI2->getParent()) << " to " << printMBBReference(*I->getParent()) << " " << *MI2); I->getParent()->splice(I, MI2->getParent(), MI2); - MI1->eraseFromParent(); - Defs.erase(I1++); + MergedInstrs.insert(MI1); Changed = true; + ++I1; break; } } @@ -561,6 +587,9 @@ } } + for (auto MI : MergedInstrs) + MI->removeFromParent(); + if (Changed) MRI.clearKillFlags(Reg); @@ -679,11 +708,12 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); TII->moveToVALU(MI, MDT); } + break; } case AMDGPU::REG_SEQUENCE: - if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) { + if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) || + !hasVectorOperands(MI, TRI)) { foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); continue; } @@ -698,7 +728,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && - (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { + (TRI->hasVectorRegisters(Src0RC) || + TRI->hasVectorRegisters(Src1RC))) { LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); TII->moveToVALU(MI, MDT); } @@ -709,7 +740,7 @@ } if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) - hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT); + hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII); return true; } diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp index 15ba78edf919..29484668a01d 100644 --- a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -1,9 +1,8 @@ //===-- SIFixVGPRCopies.cpp - Fix VGPR Copies after regalloc --------------===// // -// The LLVM Compiler Infrastructure
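The MergedInstrs set introduced above replaces eraseFromParent calls that used to run while the def lists were still being walked; candidates are only marked during the scan and unlinked once it finishes, so no iterator ever lands on a removed instruction. The pattern in miniature:

#include <list>
#include <set>

// Collect-then-erase: mark elements during the scan, unlink them afterwards.
static void eraseMarked(std::list<int> &Instrs, const std::set<int> &Marked) {
  for (auto It = Instrs.begin(); It != Instrs.end();) {
    if (Marked.count(*It))
      It = Instrs.erase(It); // erase returns the next valid iterator
    else
      ++It;
  }
}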
-// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp deleted file mode 100644 index 7761418c5336..000000000000 --- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp +++ /dev/null @@ -1,418 +0,0 @@ -//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Computations in WWM can overwrite values in inactive channels for -/// variables that the register allocator thinks are dead. This pass adds fake -/// uses of those variables to their def(s) to make sure that they aren't -/// overwritten. -/// -/// As an example, consider this snippet: -/// %vgpr0 = V_MOV_B32_e32 0.0 -/// if (...) { -/// %vgpr1 = ... -/// %vgpr2 = WWM killed %vgpr1 -/// ... = killed %vgpr2 -/// %vgpr0 = V_MOV_B32_e32 1.0 -/// } -/// ... = %vgpr0 -/// -/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally, -/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since -/// writing %vgpr1 would only write to channels that would be clobbered by the -/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled, -/// it would clobber even the inactive channels for which the if-condition is -/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use -/// of %vgpr0 to its def to make sure they aren't allocated to the -/// same register. -/// -/// In general, we need to figure out what registers might have their inactive -/// channels which are eventually used accidentally clobbered by a WWM -/// instruction. We do that by spotting three separate cases of registers: -/// -/// 1. A "then phi": the value resulting from phi elimination of a phi node at -/// the end of an if..endif. If there is WWM code in the "then", then we -/// make the def at the end of the "then" branch a partial def by adding an -/// implicit use of the register. -/// -/// 2. A "loop exit register": a value written inside a loop but used outside the -/// loop, where there is WWM code inside the loop (the case in the example -/// above). We add an implicit_def of the register in the loop pre-header, -/// and make the original def a partial def by adding an implicit use of the -/// register. -/// -/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node -/// in a loop header. If there is WWM code inside the loop, then we make all -/// defs inside the loop partial defs by adding an implicit use of the -/// register on each one. -/// -/// Note that we do not need to consider an if..else..endif phi. We only need to -/// consider non-uniform control flow, and control flow structurization would -/// have transformed a non-uniform if..else..endif into two if..endifs. 
-/// -/// The analysis to detect these cases relies on a property of the MIR -/// arising from this pass running straight after PHIElimination and before any -/// coalescing: that any virtual register with more than one definition must be -/// the new register added to lower a phi node by PHIElimination. -/// -/// FIXME: We should detect whether a register in one of the above categories is -/// already live at the WWM code before deciding to add the implicit uses to -/// synthesize its liveness. -/// -/// FIXME: I believe this whole scheme may be flawed due to the possibility of -/// the register allocator doing live interval splitting. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SparseBitVector.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-fix-wwm-liveness" - -namespace { - -class SIFixWWMLiveness : public MachineFunctionPass { -private: - MachineDominatorTree *DomTree; - MachineLoopInfo *LoopInfo; - LiveIntervals *LIS = nullptr; - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - MachineRegisterInfo *MRI; - - std::vector WWMs; - std::vector ThenDefs; - std::vector> LoopExitDefs; - std::vector> LoopPhiDefs; - -public: - static char ID; - - SIFixWWMLiveness() : MachineFunctionPass(ID) { - initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "SI Fix WWM Liveness"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredID(MachineDominatorsID); - AU.addRequiredID(MachineLoopInfoID); - // Should preserve the same set that TwoAddressInstructions does. - AU.addPreserved(); - AU.addPreserved(); - AU.addPreservedID(LiveVariablesID); - AU.addPreservedID(MachineLoopInfoID); - AU.addPreservedID(MachineDominatorsID); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - void processDef(MachineOperand &DefOpnd); - bool processThenDef(MachineOperand *DefOpnd); - bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop); - bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop); -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE, - "SI fix WWM liveness", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE, - "SI fix WWM liveness", false, false) - -char SIFixWWMLiveness::ID = 0; - -char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID; - -FunctionPass *llvm::createSIFixWWMLivenessPass() { - return new SIFixWWMLiveness(); -} - -bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) { - LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n"); - bool Modified = false; - - // This doesn't actually need LiveIntervals, but we can preserve them. 
- LIS = getAnalysisIfAvailable(); - - const GCNSubtarget &ST = MF.getSubtarget(); - - TII = ST.getInstrInfo(); - TRI = &TII->getRegisterInfo(); - MRI = &MF.getRegInfo(); - - DomTree = &getAnalysis(); - LoopInfo = &getAnalysis(); - - // Scan the function to find the WWM sections and the candidate registers for - // having liveness modified. - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::EXIT_WWM) - WWMs.push_back(&MI); - else { - for (MachineOperand &DefOpnd : MI.defs()) { - if (DefOpnd.isReg()) { - unsigned Reg = DefOpnd.getReg(); - if (TRI->isVGPR(*MRI, Reg)) - processDef(DefOpnd); - } - } - } - } - } - if (!WWMs.empty()) { - // Synthesize liveness over WWM sections as required. - for (auto ThenDef : ThenDefs) - Modified |= processThenDef(ThenDef); - for (auto LoopExitDef : LoopExitDefs) - Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second); - for (auto LoopPhiDef : LoopPhiDefs) - Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second); - } - - WWMs.clear(); - ThenDefs.clear(); - LoopExitDefs.clear(); - LoopPhiDefs.clear(); - - return Modified; -} - -// During the function scan, process an operand that defines a VGPR. -// This categorizes the register and puts it in the appropriate list for later -// use when processing a WWM section. -void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) { - unsigned Reg = DefOpnd.getReg(); - // Get all the defining instructions. For convenience, make Defs[0] the def - // we are on now. - SmallVector Defs; - Defs.push_back(DefOpnd.getParent()); - for (auto &MI : MRI->def_instructions(Reg)) { - if (&MI != DefOpnd.getParent()) - Defs.push_back(&MI); - } - // Check whether this def dominates all the others. If not, ignore this def. - // Either it is going to be processed when the scan encounters its other def - // that dominates all defs, or there is no def that dominates all others. - // The latter case is an eliminated phi from an if..else..endif or similar, - // which must be for uniform control flow so can be ignored. - // Because this pass runs shortly after PHIElimination, we assume that any - // multi-def register is a lowered phi, and thus has each def in a separate - // basic block. - for (unsigned I = 1; I != Defs.size(); ++I) { - if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent())) - return; - } - // Check for the case of an if..endif lowered phi: It has two defs, one - // dominates the other, and there is a single use in a successor of the - // dominant def. - // Later we will spot any WWM code inside - // the "then" clause and turn the second def into a partial def so its - // liveness goes through the WWM code in the "then" clause. - if (Defs.size() == 2) { - auto DomDefBlock = Defs[0]->getParent(); - if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) { - auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent(); - for (auto Succ : DomDefBlock->successors()) { - if (Succ == UseBlock) { - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n"); - ThenDefs.push_back(&DefOpnd); - return; - } - } - } - } - // Check for the case of a non-lowered-phi register (single def) that exits - // a loop, that is, it has a use that is outside a loop that the def is - // inside. We find the outermost loop that the def is inside but a use is - // outside. Later we will spot any WWM code inside that loop and then make - // the def a partial def so its liveness goes round the loop and through the - // WWM code. 
- if (Defs.size() == 1) { - auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent()); - if (!Loop) - return; - bool IsLoopExit = false; - for (auto &Use : MRI->use_instructions(Reg)) { - auto UseBlock = Use.getParent(); - if (Loop->contains(UseBlock)) - continue; - IsLoopExit = true; - while (auto Parent = Loop->getParentLoop()) { - if (Parent->contains(UseBlock)) - break; - Loop = Parent; - } - } - if (!IsLoopExit) - return; - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) - << " is a loop exit reg with loop header at " - << "bb." << Loop->getHeader()->getNumber() << "\n"); - LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>( - &DefOpnd, Loop)); - return; - } - // Check for the case of a lowered single-preheader-loop phi, that is, a - // multi-def register where the dominating def is in the loop pre-header and - // all other defs are in backedges. Later we will spot any WWM code inside - // that loop and then make the backedge defs partial defs so the liveness - // goes through the WWM code. - // Note that we are ignoring multi-preheader loops on the basis that the - // structurizer does not allow that for non-uniform loops. - // There must be a single use in the loop header. - if (!MRI->hasOneUse(Reg)) - return; - auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent(); - auto Loop = LoopInfo->getLoopFor(UseBlock); - if (!Loop || Loop->getHeader() != UseBlock - || Loop->contains(Defs[0]->getParent())) { - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) - << " is multi-def but single use not in loop header\n"); - return; - } - for (unsigned I = 1; I != Defs.size(); ++I) { - if (!Loop->contains(Defs[I]->getParent())) - return; - } - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) - << " is a loop phi reg with loop header at " - << "bb." << Loop->getHeader()->getNumber() << "\n"); - LoopPhiDefs.push_back( - std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop)); -} - -// Process a then phi def: It has two defs, one dominates the other, and there -// is a single use in a successor of the dominant def. Here we spot any WWM -// code inside the "then" clause and turn the second def into a partial def so -// its liveness goes through the WWM code in the "then" clause. -bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) { - LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent()); - if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) { - // Ignore if dominating def is undef. - LLVM_DEBUG(dbgs() << " ignoring as dominating def is undef\n"); - return false; - } - unsigned Reg = DefOpnd->getReg(); - // Get the use block, which is the endif block. - auto UseBlock = MRI->use_instr_begin(Reg)->getParent(); - // Check whether there is WWM code inside the then branch. The WWM code must - // be dominated by the if but not dominated by the endif. - bool ContainsWWM = false; - for (auto WWM : WWMs) { - if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent()) - && !DomTree->dominates(UseBlock, WWM->getParent())) { - LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); - ContainsWWM = true; - break; - } - } - if (!ContainsWWM) - return false; - // Get the other def. - MachineInstr *OtherDef = nullptr; - for (auto &MI : MRI->def_instructions(Reg)) { - if (&MI != DefOpnd->getParent()) - OtherDef = &MI; - } - // Make it a partial def. - OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); - LLVM_DEBUG(dbgs() << *OtherDef); - return true; -} - -// Process a loop exit def, that is, a register with a single def in a loop -// that has a use outside the loop.
Here we spot any WWM code inside that loop -// and then make the def a partial def so its liveness goes round the loop and -// through the WWM code. -bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd, - MachineLoop *Loop) { - LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent()); - // Check whether there is WWM code inside the loop. - bool ContainsWWM = false; - for (auto WWM : WWMs) { - if (Loop->contains(WWM->getParent())) { - LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); - ContainsWWM = true; - break; - } - } - if (!ContainsWWM) - return false; - unsigned Reg = DefOpnd->getReg(); - // Add a new implicit_def in loop preheader(s). - for (auto Pred : Loop->getHeader()->predecessors()) { - if (!Loop->contains(Pred)) { - auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(), - TII->get(TargetOpcode::IMPLICIT_DEF), Reg); - LLVM_DEBUG(dbgs() << *ImplicitDef); - (void)ImplicitDef; - } - } - // Make the original def partial. - DefOpnd->getParent()->addOperand(MachineOperand::CreateReg( - Reg, false, /*isImp=*/true)); - LLVM_DEBUG(dbgs() << *DefOpnd->getParent()); - return true; -} - -// Process a loop phi def, that is, a multi-def register where the dominating -// def is in the loop pre-header and all other defs are in backedges. Here we -// spot any WWM code inside that loop and then make the backedge defs partial -// defs so the liveness goes through the WWM code. -bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd, - MachineLoop *Loop) { - LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent()); - // Check whether there is WWM code inside the loop. - bool ContainsWWM = false; - for (auto WWM : WWMs) { - if (Loop->contains(WWM->getParent())) { - LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); - ContainsWWM = true; - break; - } - } - if (!ContainsWWM) - return false; - unsigned Reg = DefOpnd->getReg(); - // Remove kill mark from uses. - for (auto &Use : MRI->use_operands(Reg)) - Use.setIsKill(false); - // Make all defs except the dominating one partial defs. - SmallVector<MachineInstr *, 4> Defs; - for (auto &Def : MRI->def_instructions(Reg)) - Defs.push_back(&Def); - for (auto Def : Defs) { - if (DefOpnd->getParent() == Def) - continue; - Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); - LLVM_DEBUG(dbgs() << *Def); - } - return true; -} - diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp index ee39eb04d831..5b834c8de13a 100644 --- a/lib/Target/AMDGPU/SIFixupVectorISel.cpp +++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -1,9 +1,8 @@ //===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file /// SIFixupVectorISel pass cleans up post ISEL Vector issues. @@ -198,6 +197,11 @@ static bool fixupGlobalSaddr(MachineBasicBlock &MBB, // Atomics don't have a GLC, so omit the field if not there. if (Glc) NewGlob->addOperand(MF, *Glc); + + MachineOperand *DLC = TII->getNamedOperand(MI, AMDGPU::OpName::dlc); + if (DLC) + NewGlob->addOperand(MF, *DLC); + NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc)); // _D16 have a vdst_in operand, copy it in.
MachineOperand *VDstInOp = TII->getNamedOperand(MI, diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index f4e866958369..74d77d328019 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1,9 +1,8 @@ //===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// @@ -51,7 +50,7 @@ struct FoldCandidate { } else if (FoldOp->isFI()) { FrameIndexToFold = FoldOp->getIndex(); } else { - assert(FoldOp->isReg()); + assert(FoldOp->isReg() || FoldOp->isGlobal()); OpToFold = FoldOp; } } @@ -68,6 +67,8 @@ struct FoldCandidate { return Kind == MachineOperand::MO_Register; } + bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; } + bool isCommuted() const { return Commuted; } @@ -88,10 +89,11 @@ public: const SIInstrInfo *TII; const SIRegisterInfo *TRI; const GCNSubtarget *ST; + const SIMachineFunctionInfo *MFI; void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, - unsigned UseOpIdx, + int UseOpIdx, SmallVectorImpl<FoldCandidate> &FoldList, SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; @@ -160,19 +162,34 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII, } } +// TODO: Add heuristic that the frame index might not fit in the addressing mode +// immediate offset to avoid materializing in loops. +static bool frameIndexMayFold(const SIInstrInfo *TII, + const MachineInstr &UseMI, + int OpNo, + const MachineOperand &OpToFold) { + return OpToFold.isFI() && + (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) && + OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr); +} + FunctionPass *llvm::createSIFoldOperandsPass() { return new SIFoldOperands(); } static bool updateOperand(FoldCandidate &Fold, const SIInstrInfo &TII, - const TargetRegisterInfo &TRI) { + const TargetRegisterInfo &TRI, + const GCNSubtarget &ST) { MachineInstr *MI = Fold.UseMI; MachineOperand &Old = MI->getOperand(Fold.UseOpNo); assert(Old.isReg()); if (Fold.isImm()) { - if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) { + if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked && + !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) && + AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold), + ST.hasInv2PiInlineImm())) { // Set op_sel/op_sel_hi on this operand or bail out if op_sel is // already set. unsigned Opcode = MI->getOpcode(); @@ -190,77 +207,94 @@ static bool updateOperand(FoldCandidate &Fold, unsigned Val = Mod.getImm(); if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) return false; - // If upper part is all zero we do not need op_sel_hi. - if (!isUInt<16>(Fold.ImmToFold)) { - if (!(Fold.ImmToFold & 0xffff)) { - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + // Only apply the following transformation if that operand requires + // a packed immediate.
+ switch (TII.get(Opcode).OpInfo[OpNo].OperandType) { + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + // If upper part is all zero we do not need op_sel_hi. + if (!isUInt<16>(Fold.ImmToFold)) { + if (!(Fold.ImmToFold & 0xffff)) { + Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + return true; + } Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); return true; } - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + break; + default: + break; } } + } - if (Fold.needsShrink()) { - MachineBasicBlock *MBB = MI->getParent(); - auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI); - if (Liveness != MachineBasicBlock::LQR_Dead) - return false; - - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - int Op32 = Fold.getShrinkOpcode(); - MachineOperand &Dst0 = MI->getOperand(0); - MachineOperand &Dst1 = MI->getOperand(1); - assert(Dst0.isDef() && Dst1.isDef()); - - bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg()); + if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { + MachineBasicBlock *MBB = MI->getParent(); + auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI); + if (Liveness != MachineBasicBlock::LQR_Dead) + return false; - const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg()); - unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC); - const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg()); - unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + int Op32 = Fold.getShrinkOpcode(); + MachineOperand &Dst0 = MI->getOperand(0); + MachineOperand &Dst1 = MI->getOperand(1); + assert(Dst0.isDef() && Dst1.isDef()); - MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32); + bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg()); - if (HaveNonDbgCarryUse) { - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg()) - .addReg(AMDGPU::VCC, RegState::Kill); - } + const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg()); + unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC); - // Keep the old instruction around to avoid breaking iterators, but - // replace the outputs with dummy registers. - Dst0.setReg(NewReg0); - Dst1.setReg(NewReg1); + MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32); - if (Fold.isCommuted()) - TII.commuteInstruction(*Inst32, false); - return true; + if (HaveNonDbgCarryUse) { + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg()) + .addReg(AMDGPU::VCC, RegState::Kill); } - Old.ChangeToImmediate(Fold.ImmToFold); + // Keep the old instruction around to avoid breaking iterators, but + // replace it with a dummy instruction to remove uses. + // + // FIXME: We should not invert how this pass looks at operands to avoid + // this. Should track set of foldable movs instead of looking for uses + // when looking at a use. 
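// Editorial aside, not part of the patch: a worked example of the packed fold
// above. For a v2i16/v2f16 operand whose 32-bit literal has a non-zero high
// half, one 16-bit half is folded and op_sel is retargeted instead of keeping
// a full literal. The OP_SEL bit positions below are stand-in assumptions for
// SISrcMods, and the function is a self-contained model, not the pass code.
#include <cstdint>
#include <cstdio>

static const unsigned OP_SEL_0 = 1u << 2; // assumed encoding
static const unsigned OP_SEL_1 = 1u << 3; // assumed encoding

// Precondition: Imm does not fit in 16 bits, mirroring !isUInt<16>(ImmToFold).
static uint16_t splitPackedImm(uint32_t Imm, unsigned &Mods) {
  if ((Imm & 0xffff) == 0) {
    // Low half is zero: fold the high half and point both lanes at it.
    Mods |= OP_SEL_0;
    Mods &= ~OP_SEL_1;
    return uint16_t(Imm >> 16);
  }
  Mods &= ~OP_SEL_1; // Fold the low half; the hi lane must not read hi bits.
  return uint16_t(Imm & 0xffff);
}

int main() {
  unsigned Mods = OP_SEL_1;                            // default op_sel_hi
  uint16_t Lit = splitPackedImm(0x3c000000u, Mods);    // f16 1.0 in high half
  std::printf("folded 0x%04x, mods %u\n", Lit, Mods);  // folded 0x3c00
  return 0;
}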
+ Dst0.setReg(NewReg0); + for (unsigned I = MI->getNumOperands() - 1; I > 0; --I) + MI->RemoveOperand(I); + MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF)); + + if (Fold.isCommuted()) + TII.commuteInstruction(*Inst32, false); return true; } assert(!Fold.needsShrink() && "not handled"); - if (Fold.isFI()) { - Old.ChangeToFrameIndex(Fold.FrameIndexToFold); + if (Fold.isImm()) { + Old.ChangeToImmediate(Fold.ImmToFold); return true; } - MachineOperand *New = Fold.OpToFold; - if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && - TargetRegisterInfo::isVirtualRegister(New->getReg())) { - Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); - - Old.setIsUndef(New->isUndef()); + if (Fold.isGlobal()) { + Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(), + Fold.OpToFold->getTargetFlags()); return true; } - // FIXME: Handle physical registers. + if (Fold.isFI()) { + Old.ChangeToFrameIndex(Fold.FrameIndexToFold); + return true; + } - return false; + MachineOperand *New = Fold.OpToFold; + Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + Old.setIsUndef(New->isUndef()); + return true; } static bool isUseMIInFoldList(ArrayRef FoldList, @@ -277,7 +311,6 @@ static bool tryAddToFoldList(SmallVectorImpl &FoldList, MachineOperand *OpToFold, const SIInstrInfo *TII) { if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { - // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || @@ -344,7 +377,7 @@ static bool tryAddToFoldList(SmallVectorImpl &FoldList, if ((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64 || Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME - OpToFold->isImm()) { + (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) { MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); // Verify the other operand is a VGPR, otherwise we would violate the @@ -357,7 +390,10 @@ static bool tryAddToFoldList(SmallVectorImpl &FoldList, assert(MI->getOperand(1).isDef()); - int Op32 = AMDGPU::getVOPe32(Opc); + // Make sure to get the 32-bit version of the commuted opcode. 
+ unsigned MaybeCommutedOpc = MI->getOpcode(); + int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc); + FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true, Op32)); return true; @@ -384,10 +420,75 @@ static bool isUseSafeToFold(const SIInstrInfo *TII, //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } +static bool tryToFoldACImm(const SIInstrInfo *TII, + const MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl &FoldList) { + const MCInstrDesc &Desc = UseMI->getDesc(); + const MCOperandInfo *OpInfo = Desc.OpInfo; + if (!OpInfo || UseOpIdx >= Desc.getNumOperands()) + return false; + + uint8_t OpTy = OpInfo[UseOpIdx].OperandType; + if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST || + OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) + return false; + + if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) { + UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); + return true; + } + + if (!OpToFold.isReg()) + return false; + + unsigned UseReg = OpToFold.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(UseReg)) + return false; + + if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) { + return FC.UseMI == UseMI; }) != FoldList.end()) + return false; + + MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo(); + const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg); + if (!Def || !Def->isRegSequence()) + return false; + + int64_t Imm; + MachineOperand *Op; + for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) { + const MachineOperand &Sub = Def->getOperand(I); + if (!Sub.isReg() || Sub.getSubReg()) + return false; + MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg()); + while (SubDef && !SubDef->isMoveImmediate() && + !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef)) + SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg()); + if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm()) + return false; + Op = &SubDef->getOperand(1); + auto SubImm = Op->getImm(); + if (I == 1) { + if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy)) + return false; + + Imm = SubImm; + continue; + } + if (Imm != SubImm) + return false; // Can only fold splat constants + } + + FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op)); + return true; +} + void SIFoldOperands::foldOperand( MachineOperand &OpToFold, MachineInstr *UseMI, - unsigned UseOpIdx, + int UseOpIdx, SmallVectorImpl &FoldList, SmallVectorImpl &CopiesToReplace) const { const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); @@ -420,11 +521,18 @@ void SIFoldOperands::foldOperand( unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + MachineRegisterInfo::use_iterator Next; for (MachineRegisterInfo::use_iterator RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); - RSUse != RSE; ++RSUse) { + RSUse != RSE; RSUse = Next) { + Next = std::next(RSUse); MachineInstr *RSUseMI = RSUse->getParent(); + + if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI, + RSUse.getOperandNo(), FoldList)) + continue; + if (RSUse->getSubReg() != RegSeqDstSubReg) continue; @@ -435,10 +543,32 @@ void SIFoldOperands::foldOperand( return; } + if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList)) + return; - bool FoldingImm = OpToFold.isImm(); + if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) { + // Sanity check that this is a stack access. + // FIXME: Should probably use stack pseudos before frame lowering. 
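// Editorial aside, not part of the patch: tryToFoldACImm above walks the
// REG_SEQUENCE feeding an AGPR operand and folds only when every element is
// the same inlinable move-immediate. The splat test it performs reduces to
// this self-contained model (names are ours):
#include <cstdint>
#include <vector>

static bool isFoldableSplat(const std::vector<int64_t> &SubImms,
                            bool (*IsInlinable)(int64_t)) {
  if (SubImms.empty() || !IsInlinable(SubImms.front()))
    return false; // The first element must itself be an inline constant.
  for (int64_t V : SubImms)
    if (V != SubImms.front())
      return false; // Can only fold splat constants.
  return true;
}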
+ MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset); + if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() && + SOff->getReg() != MFI->getStackPtrOffsetReg())) + return; + + if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() != + MFI->getScratchRSrcReg()) + return; - if (FoldingImm && UseMI->isCopy()) { + // A frame index will resolve to a positive constant, so it should always be + // safe to fold the addressing mode, even pre-GFX9. + UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex()); + SOff->setReg(MFI->getStackPtrOffsetReg()); + return; + } + + bool FoldingImmLike = + OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); + + if (FoldingImmLike && UseMI->isCopy()) { unsigned DestReg = UseMI->getOperand(0).getReg(); const TargetRegisterClass *DestRC = TargetRegisterInfo::isVirtualRegister(DestReg) ? @@ -449,7 +579,7 @@ void SIFoldOperands::foldOperand( if (TargetRegisterInfo::isVirtualRegister(DestReg) && TargetRegisterInfo::isVirtualRegister(SrcReg)) { const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg); - if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) { + if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { MachineRegisterInfo::use_iterator NextUse; SmallVector CopyUses; for (MachineRegisterInfo::use_iterator @@ -467,6 +597,14 @@ void SIFoldOperands::foldOperand( } } + if (DestRC == &AMDGPU::AGPR_32RegClass && + TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) { + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32)); + UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); + CopiesToReplace.push_back(UseMI); + return; + } + // In order to fold immediates into copies, we need to change the // copy to a MOV. @@ -479,18 +617,71 @@ void SIFoldOperands::foldOperand( } else { if (UseMI->isCopy() && OpToFold.isReg() && TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) && - TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) && - TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && - TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) && + TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) && !UseMI->getOperand(1).getSubReg()) { + unsigned Size = TII->getOpSize(*UseMI, 1); UseMI->getOperand(1).setReg(OpToFold.getReg()); UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); UseMI->getOperand(1).setIsKill(false); CopiesToReplace.push_back(UseMI); OpToFold.setIsKill(false); + if (Size != 4) + return; + if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg())) + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32)); + else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg())) + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32)); return; } + unsigned UseOpc = UseMI->getOpcode(); + if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 || + (UseOpc == AMDGPU::V_READLANE_B32 && + (int)UseOpIdx == + AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) { + // %vgpr = V_MOV_B32 imm + // %sgpr = V_READFIRSTLANE_B32 %vgpr + // => + // %sgpr = S_MOV_B32 imm + if (FoldingImmLike) { + if (execMayBeModifiedBeforeUse(*MRI, + UseMI->getOperand(UseOpIdx).getReg(), + *OpToFold.getParent(), + *UseMI)) + return; + + UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); + + // FIXME: ChangeToImmediate should clear subreg + UseMI->getOperand(1).setSubReg(0); + if (OpToFold.isImm()) + 
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); + else + UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex()); + UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + return; + } + + if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) { + if (execMayBeModifiedBeforeUse(*MRI, + UseMI->getOperand(UseOpIdx).getReg(), + *OpToFold.getParent(), + *UseMI)) + return; + + // %vgpr = COPY %sgpr0 + // %sgpr1 = V_READFIRSTLANE_B32 %vgpr + // => + // %sgpr1 = COPY %sgpr0 + UseMI->setDesc(TII->get(AMDGPU::COPY)); + UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + return; + } + } + const MCInstrDesc &UseDesc = UseMI->getDesc(); // Don't fold into target independent nodes. Target independent opcodes @@ -501,7 +692,7 @@ void SIFoldOperands::foldOperand( return; } - if (!FoldingImm) { + if (!FoldingImmLike) { tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); // FIXME: We could try to change the instruction from 64-bit to 32-bit @@ -515,14 +706,10 @@ void SIFoldOperands::foldOperand( const TargetRegisterClass *FoldRC = TRI->getRegClass(FoldDesc.OpInfo[0].RegClass); - // Split 64-bit constants into 32-bits for folding. if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI->getRegClass(UseReg) : - TRI->getPhysRegClass(UseReg); + const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg); if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) return; @@ -763,14 +950,23 @@ static bool tryFoldInst(const SIInstrInfo *TII, Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) { const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1); - if (Src1->isIdenticalTo(*Src0)) { + int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); + int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + if (Src1->isIdenticalTo(*Src0) && + (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) && + (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) { LLVM_DEBUG(dbgs() << "Folded " << *MI << " into "); + auto &NewDesc = + TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false)); int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (Src2Idx != -1) MI->RemoveOperand(Src2Idx); MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); - mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY - : getMovOpc(false))); + if (Src1ModIdx != -1) + MI->RemoveOperand(Src1ModIdx); + if (Src0ModIdx != -1) + MI->RemoveOperand(Src0ModIdx); + mutateCopyOp(*MI, NewDesc); LLVM_DEBUG(dbgs() << *MI << '\n'); return true; } @@ -788,7 +984,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, SmallVector FoldList; MachineOperand &Dst = MI.getOperand(0); - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); if (FoldingImm) { unsigned NumLiteralUses = 0; MachineOperand *NonInlineUse = nullptr; @@ -840,6 +1036,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // in some cases. A better heuristic is needed. 
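// Editorial aside, not part of the patch: the literal heuristic that
// foldInstOperand applies around here boils down to "inline constants fold
// into every use; a real literal only folds while it has a single user",
// since each literal use costs an extra dword of encoding. As a model:
static bool shouldFoldImm(bool IsInlinable, unsigned &NumLiteralUses) {
  if (IsInlinable)
    return true; // Free: encoded in the operand field itself.
  return ++NumLiteralUses == 1; // Only the first literal use is folded.
}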
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); + } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, + CopiesToReplace); } else { if (++NumLiteralUses == 1) { NonInlineUse = &*Use; @@ -874,7 +1073,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, Copy->addImplicitDefUseOperands(*MF); for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, *TII, *TRI)) { + if (updateOperand(Fold, *TII, *TRI, *ST)) { // Clear kill flags. if (Fold.isReg()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); @@ -926,7 +1125,8 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { // Having a 0 op_sel_hi would require swizzling the output in the source // instruction, which we can't do. - unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0; + unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 + : 0u; if (Src0Mods != UnsetMods && Src1Mods != UnsetMods) return nullptr; return Src0; @@ -1105,13 +1305,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); - - const SIMachineFunctionInfo *MFI = MF.getInfo(); + MFI = MF.getInfo(); // omod is ignored by hardware if IEEE bit is enabled. omod also does not // correctly handle signed zeros. // - bool IsIEEEMode = ST->enableIEEEBit(MF); + // FIXME: Also need to check strictfp + bool IsIEEEMode = MFI->getMode().IEEE; bool HasNSZ = MFI->hasNoSignedZerosFPMath(); for (MachineBasicBlock *MBB : depth_first(&MF)) { @@ -1132,7 +1332,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { } MachineOperand &OpToFold = MI.getOperand(1); - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + bool FoldingImm = + OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); // FIXME: We could also be folding things like TargetIndexes. if (!FoldingImm && !OpToFold.isReg()) diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index aa976d5141f8..f3c9ad63a80a 100644 --- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -1,9 +1,8 @@ //===-- SIFormMemoryClauses.cpp -------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -119,6 +118,17 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { return false; if (!IsVMEMClause && !isSMEMClauseInst(MI)) return false; + // If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it. + for (const MachineOperand &ResMO : MI.defs()) { + unsigned ResReg = ResMO.getReg(); + for (const MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || MO.isDef()) + continue; + if (MO.getReg() == ResReg) + return false; + } + break; // Only check the first def. 
+ } return true; } @@ -309,6 +319,8 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count(); MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count(); + unsigned FuncMaxClause = AMDGPU::getIntegerAttribute( + MF.getFunction(), "amdgpu-max-memory-clause", MaxClause); for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::instr_iterator Next; @@ -329,7 +341,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { continue; unsigned Length = 1; - for ( ; Next != E && Length < MaxClause; ++Next) { + for ( ; Next != E && Length < FuncMaxClause; ++Next) { if (!isValidClauseInst(*Next, IsVMEM)) break; diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index e4633c88e18f..feab6bed2603 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1,9 +1,8 @@ //===----------------------- SIFrameLowering.cpp --------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //==-----------------------------------------------------------------------===// @@ -22,6 +21,8 @@ using namespace llvm; +#define DEBUG_TYPE "frame-info" + static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST, const MachineFunction &MF) { @@ -35,6 +36,150 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST, ST.getMaxNumSGPRs(MF)); } +// Find a scratch register that we can use at the start of the prologue to +// re-align the stack pointer. We avoid using callee-save registers since they +// may appear to be free when this is called from canUseAsPrologue (during +// shrink wrapping), but then no longer be free when this is called from +// emitPrologue. +// +// FIXME: This is a bit conservative, since in the above case we could use one +// of the callee-save registers as a scratch temp to re-align the stack pointer, +// but we would then have to make sure that we were in fact saving at least one +// callee-save register in the prologue, which is additional complexity that +// doesn't seem worth the benefit. +static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, + LivePhysRegs &LiveRegs, + const TargetRegisterClass &RC, + bool Unused = false) { + // Mark callee saved registers as used so we will not choose them. + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + for (unsigned i = 0; CSRegs[i]; ++i) + LiveRegs.addReg(CSRegs[i]); + + if (Unused) { + // We are looking for a register that can be used throughout the entire + // function, so any use is unacceptable. + for (unsigned Reg : RC) { + if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) + return Reg; + } + } else { + for (unsigned Reg : RC) { + if (LiveRegs.available(MRI, Reg)) + return Reg; + } + } + + // If we require an unused register, this is used in contexts where failure is + // an option and has an alternative plan. In other contexts, this must + // succeed.
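// Editorial aside, not part of the patch: the search above scans a register
// class in allocation order after marking callee-saved registers unavailable,
// because a CSR can look free from canUseAsPrologue during shrink wrapping
// and then stop being free in emitPrologue. Modeled over plain sets (the real
// helper folds the CSR set into LiveRegs instead):
#include <set>
#include <vector>

static int pickScratch(const std::vector<int> &ClassOrder,
                       const std::set<int> &CalleeSaved,
                       const std::set<int> &LiveOrUsed) {
  for (int Reg : ClassOrder)
    if (!CalleeSaved.count(Reg) && !LiveOrUsed.count(Reg))
      return Reg; // First register that is neither a CSR nor live/used.
  return -1; // The caller decides whether failure is fatal.
}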
+ if (!Unused) + report_fatal_error("failed to find free scratch register"); + + return AMDGPU::NoRegister; +} + +static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) { + LivePhysRegs LiveRegs; + LiveRegs.init(*MRI.getTargetRegisterInfo()); + return findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); +} + +// We need to specially emit stack operations here because a different frame +// register is used than in the rest of the function, as getFrameRegister would +// use. +static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const SIInstrInfo *TII, unsigned SpillReg, + unsigned ScratchRsrcReg, unsigned SPReg, int FI) { + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + + int64_t Offset = MFI.getObjectOffset(FI); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, + MFI.getObjectAlignment(FI)); + + if (isUInt<12>(Offset)) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) + .addReg(SpillReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) + .addImm(Offset); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) + .addReg(SpillReg, RegState::Kill) + .addReg(OffsetReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); +} + +static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const SIInstrInfo *TII, unsigned SpillReg, + unsigned ScratchRsrcReg, unsigned SPReg, int FI) { + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + int64_t Offset = MFI.getObjectOffset(FI); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4, + MFI.getObjectAlignment(FI)); + + if (isUInt<12>(Offset)) { + BuildMI(MBB, I, DebugLoc(), + TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) + .addImm(Offset); + + BuildMI(MBB, I, DebugLoc(), + TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg) + .addReg(OffsetReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); +} + void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const { @@ -71,6 +216,24 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, // Do a 64-bit pointer add. 
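// Editorial aside, not part of the patch: the GFX10 path just below programs
// flat_scratch through s_setreg_b32, whose 16-bit hwreg operand packs a
// register ID, a bit offset, and (width - 1). The field positions and the
// FLAT_SCR_LO ID used here are assumptions based on the commonly documented
// layout, not values taken from this patch:
#include <cstdint>
#include <cstdio>

static uint16_t hwreg(unsigned Id, unsigned Offset, unsigned Width) {
  // ID in bits [5:0], offset in [10:6], width-1 in [15:11] (assumed layout).
  return uint16_t(Id | (Offset << 6) | ((Width - 1) << 11));
}

int main() {
  // Mirrors ID_FLAT_SCR_LO | (31 << WIDTH_M1_SHIFT_): whole register, offset 0.
  std::printf("0x%04x\n", hwreg(/*assumed ID_FLAT_SCR_LO=*/20, 0, 32));
  return 0;
}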
if (ST.flatScratchIsPointer()) { + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi) + .addReg(FlatScrInitHi) + .addImm(0); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). + addReg(FlatScrInitLo). + addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | + (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). + addReg(FlatScrInitHi). + addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | + (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); + return; + } + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) .addReg(FlatScrInitLo) .addReg(ScratchWaveOffsetReg); @@ -81,6 +244,8 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, return; } + assert(ST.getGeneration() < AMDGPUSubtarget::GFX10); + // Copy the size in bytes. BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) .addReg(FlatScrInitHi, RegState::Kill); @@ -145,34 +310,30 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( return ScratchRsrcReg; } -// Shift down registers reserved for the scratch wave offset and stack pointer -// SGPRs. -std::pair +// Shift down registers reserved for the scratch wave offset. +std::pair SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const { + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, + SIMachineFunctionInfo *MFI, MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(MFI->isEntryFunction()); + // No replacement necessary. if (ScratchWaveOffsetReg == AMDGPU::NoRegister || - !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { - assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG); - return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); + (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) { + return std::make_pair(AMDGPU::NoRegister, false); } - unsigned SPReg = MFI->getStackPtrOffsetReg(); if (ST.hasSGPRInitBug()) - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return std::make_pair(ScratchWaveOffsetReg, false); unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); ArrayRef AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return std::make_pair(ScratchWaveOffsetReg, false); AllSGPRs = AllSGPRs.slice(NumPreloaded); @@ -193,10 +354,11 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( unsigned ReservedRegCount = 13; if (AllSGPRs.size() < ReservedRegCount) - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return std::make_pair(ScratchWaveOffsetReg, false); bool HandledScratchWaveOffsetReg = ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + bool FPAdjusted = false; for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { // Pick the first unallocated SGPR. 
Be careful not to pick an alias of the @@ -206,24 +368,25 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( HandledScratchWaveOffsetReg = true; MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) { + assert(!hasFP(MF)); + MFI->setStackPtrOffsetReg(Reg); + } + MFI->setScratchWaveOffsetReg(Reg); + MFI->setFrameOffsetReg(Reg); ScratchWaveOffsetReg = Reg; + FPAdjusted = true; break; } } } - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return std::make_pair(ScratchWaveOffsetReg, FPAdjusted); } void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was - // specified. - const GCNSubtarget &ST = MF.getSubtarget(); - if (ST.debuggerEmitPrologue()) - emitDebuggerPrologue(MF, MBB); - assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); SIMachineFunctionInfo *MFI = MF.getInfo(); @@ -234,6 +397,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // FIXME: We should be cleaning up these unused SGPR spill frame indices // somewhere. + const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -251,38 +415,13 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, if (MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); - unsigned SPReg = MFI->getStackPtrOffsetReg(); - if (SPReg != AMDGPU::SP_REG) { - assert(MRI.isReserved(SPReg) && "SPReg used but not reserved"); - - DebugLoc DL; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - int64_t StackSize = FrameInfo.getStackSize(); - - if (StackSize == 0) { - BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()); - } else { - BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()) - .addImm(StackSize * ST.getWavefrontSize()); - } - } - unsigned ScratchRsrcReg = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); unsigned ScratchWaveOffsetReg; - std::tie(ScratchWaveOffsetReg, SPReg) - = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); - - // It's possible to have uses of only ScratchWaveOffsetReg without - // ScratchRsrcReg if it's only used for the initialization of flat_scratch, - // but the inverse is not true. - if (ScratchWaveOffsetReg == AMDGPU::NoRegister) { - assert(ScratchRsrcReg == AMDGPU::NoRegister); - return; - } + bool FPAdjusted; + std::tie(ScratchWaveOffsetReg, FPAdjusted) = + getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); // We need to insert initialization of the scratch resource descriptor. unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( @@ -294,18 +433,19 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); } - bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg); + bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister && + MRI.isPhysRegUsed(ScratchWaveOffsetReg); bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && MRI.isPhysRegUsed(ScratchRsrcReg); + // FIXME: Hack to not crash in situations which emitted an error. + if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister) + return; + // We added live-ins during argument lowering, but since they were not used // they were deleted. 
We're adding the uses now, so add them back. - if (OffsetRegUsed) { - assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister && - "scratch wave offset input is required"); - MRI.addLiveIn(PreloadedScratchWaveOffsetReg); - MBB.addLiveIn(PreloadedScratchWaveOffsetReg); - } + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F)); @@ -318,7 +458,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, if (&OtherBB == &MBB) continue; - if (OffsetRegUsed) + if (OffsetRegUsed || FPAdjusted) OtherBB.addLiveIn(ScratchWaveOffsetReg); if (ResourceRegUsed) @@ -346,11 +486,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, .addReg(PreloadedPrivateBufferReg, RegState::Kill); } - if (OffsetRegUsed && - PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + unsigned SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + + // FIXME: Remove the isPhysRegUsed checks + const bool HasFP = hasFP(MF); + + if (HasFP || OffsetRegUsed) { + assert(ScratchWaveOffsetReg); BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, - MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill); + .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0); } if (CopyBuffer && !CopyBufferFirst) { @@ -358,9 +503,26 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, .addReg(PreloadedPrivateBufferReg, RegState::Kill); } - if (ResourceRegUsed) + if (ResourceRegUsed) { emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I, PreloadedPrivateBufferReg, ScratchRsrcReg); + } + + if (HasFP) { + DebugLoc DL; + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + int64_t StackSize = FrameInfo.getStackSize(); + + // On kernel entry, the private scratch wave offset is the SP value. + if (StackSize == 0) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } else { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(StackSize * ST.getWavefrontSize()); + } + } } // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. @@ -405,7 +567,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, } } MF.getRegInfo().addLiveIn(GitPtrLo); - MF.front().addLiveIn(GitPtrLo); + MBB.addLiveIn(GitPtrLo); BuildMI(MBB, I, DL, SMovB32, RsrcLo) .addReg(GitPtrLo) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); @@ -421,12 +583,15 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable, - 0, 0); + 16, 4); unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 
16 : 0; + const GCNSubtarget &Subtarget = MF.getSubtarget(); + unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset); BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) .addReg(Rsrc01) - .addImm(Offset) // offset + .addImm(EncodedOffset) // offset .addImm(0) // glc + .addImm(0) // dlc .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); return; @@ -462,13 +627,17 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable, - 0, 0); + 8, 4); BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset .addImm(0) // glc + .addImm(0) // dlc .addMemOperand(MMO) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); + MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); } } else { unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); @@ -494,38 +663,14 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, } } -// Find a scratch register that we can use at the start of the prologue to -// re-align the stack pointer. We avoid using callee-save registers since they -// may appear to be free when this is called from canUseAsPrologue (during -// shrink wrapping), but then no longer be free when this is called from -// emitPrologue. -// -// FIXME: This is a bit conservative, since in the above case we could use one -// of the callee-save registers as a scratch temp to re-align the stack pointer, -// but we would then have to make sure that we were in fact saving at least one -// callee-save register in the prologue, which is additional complexity that -// doesn't seem worth the benefit. -static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) { - MachineFunction *MF = MBB.getParent(); - - const GCNSubtarget &Subtarget = MF->getSubtarget(); - const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo(); - LivePhysRegs LiveRegs(TRI); - LiveRegs.addLiveIns(MBB); - - // Mark callee saved registers as used so we will not choose them. - const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); - for (unsigned i = 0; CSRegs[i]; ++i) - LiveRegs.addReg(CSRegs[i]); - - MachineRegisterInfo &MRI = MF->getRegInfo(); - - for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) { - if (LiveRegs.available(MRI, Reg)) - return Reg; +bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { + switch (ID) { + case TargetStackID::Default: + case TargetStackID::NoAlloc: + case TargetStackID::SGPRSpill: + return true; } - - return AMDGPU::NoRegister; + llvm_unreachable("Invalid TargetStackID::Value"); } void SIFrameLowering::emitPrologue(MachineFunction &MF, @@ -537,31 +682,105 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, } const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); + LivePhysRegs LiveRegs; MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL; - // XXX - Is this the right predicate? 
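// Editorial aside, not part of the patch: the prologue and epilogue code
// below scale every stack-size adjustment by the wavefront size. Frame sizes
// are per-lane byte counts, while the scalar SP/FP walk the combined scratch
// allocation for all lanes of the wave, hence StackSize * getWavefrontSize().
// A one-line worked example:
static unsigned long long spDelta(unsigned long long PerLaneBytes,
                                  unsigned WaveSize) {
  return PerLaneBytes * WaveSize; // e.g. 16 bytes/lane * 64 lanes = 1024
}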
- - bool NeedFP = hasFP(MF); + bool HasFP = false; uint32_t NumBytes = MFI.getStackSize(); uint32_t RoundedSize = NumBytes; - const bool NeedsRealignment = TRI.needsStackRealignment(MF); + // To avoid clobbering VGPRs in lanes that weren't active on function entry, + // turn on all lanes before doing the spill to memory. + unsigned ScratchExecCopy = AMDGPU::NoRegister; + + // Emit the copy if we need an FP, and are using a free SGPR to save it. + if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy) + .addReg(FramePtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg + : FuncInfo->getSGPRSpillVGPRs()) { + if (!Reg.FI.hasValue()) + continue; + + if (ScratchExecCopy == AMDGPU::NoRegister) { + if (LiveRegs.empty()) { + LiveRegs.init(TRI); + LiveRegs.addLiveIns(MBB); + if (FuncInfo->SGPRForFPSaveRestoreCopy) + LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy); + } + + ScratchExecCopy + = findScratchNonCalleeSaveRegister(MRI, LiveRegs, + *TRI.getWaveMaskRegClass()); + assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy); + + const unsigned OrSaveExec = ST.isWave32() ? + AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; + BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), + ScratchExecCopy) + .addImm(-1); + } - if (NeedsRealignment) { - assert(NeedFP); + buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + FuncInfo->getScratchRSrcReg(), + StackPtrReg, + Reg.FI.getValue()); + } + + if (ScratchExecCopy != AMDGPU::NoRegister) { + // FIXME: Split block and make terminator. + unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(ScratchExecCopy, RegState::Kill); + LiveRegs.addReg(ScratchExecCopy); + } + + + if (FuncInfo->FramePointerSaveIndex) { + const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + assert(!MFI.isDeadObjectIndex(FI) && + MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef Spill + = FuncInfo->getSGPRToVGPRSpills(FI); + assert(Spill.size() == 1); + + // Save FP before setting it up. + // FIXME: This should respect spillSGPRToVGPR; + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill[0].VGPR) + .addReg(FramePtrReg) + .addImm(Spill[0].Lane) + .addReg(Spill[0].VGPR, RegState::Undef); + } + + if (TRI.needsStackRealignment(MF)) { + HasFP = true; const unsigned Alignment = MFI.getMaxAlignment(); RoundedSize += Alignment; + if (LiveRegs.empty()) { + LiveRegs.init(TRI); + LiveRegs.addLiveIns(MBB); + LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); + } - unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB); - assert(ScratchSPReg != AMDGPU::NoRegister); + unsigned ScratchSPReg = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass); + assert(ScratchSPReg != AMDGPU::NoRegister && + ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy); // s_add_u32 tmp_reg, s32, NumBytes // s_and_b32 s32, tmp_reg, 0b111...0000 @@ -574,7 +793,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .addImm(-Alignment * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameSetup); FuncInfo->setIsStackRealigned(true); - } else if (NeedFP) { + } else if ((HasFP = hasFP(MF))) { // If we need a base pointer, set it up here. It's whatever the value of // the stack pointer is at this point. 
Any variable size objects will be // allocated after this, so we can still use the base pointer to reference @@ -584,21 +803,20 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - if (RoundedSize != 0 && hasSP(MF)) { + if (HasFP && RoundedSize != 0) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) .addImm(RoundedSize * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameSetup); } - for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg - : FuncInfo->getSGPRSpillVGPRs()) { - if (!Reg.FI.hasValue()) - continue; - TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true, - Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass, - &TII->getRegisterInfo()); - } + assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister || + FuncInfo->FramePointerSaveIndex)) && + "Needed to save FP but didn't save it anywhere"); + + assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister && + !FuncInfo->FramePointerSaveIndex)) && + "Saved FP but didn't need it"); } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -609,39 +827,87 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + LivePhysRegs LiveRegs; + DebugLoc DL; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint32_t NumBytes = MFI.getStackSize(); + uint32_t RoundedSize = FuncInfo->isStackRealigned() ? + NumBytes + MFI.getMaxAlignment() : NumBytes; + + if (RoundedSize != 0 && hasFP(MF)) { + const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(RoundedSize * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameDestroy); + } + + if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg()) + .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (FuncInfo->FramePointerSaveIndex) { + const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + + assert(!MF.getFrameInfo().isDeadObjectIndex(FI) && + MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill); + + ArrayRef Spill + = FuncInfo->getSGPRToVGPRSpills(FI); + assert(Spill.size() == 1); + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + FuncInfo->getFrameOffsetReg()) + .addReg(Spill[0].VGPR) + .addImm(Spill[0].Lane); + } + unsigned ScratchExecCopy = AMDGPU::NoRegister; for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : FuncInfo->getSGPRSpillVGPRs()) { if (!Reg.FI.hasValue()) continue; - TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR, - Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass, - &TII->getRegisterInfo()); - } - unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); - if (StackPtrReg == AMDGPU::NoRegister) - return; + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + if (ScratchExecCopy == AMDGPU::NoRegister) { + // See emitPrologue + if (LiveRegs.empty()) { + LiveRegs.init(*ST.getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + LiveRegs.stepBackward(*MBBI); + } - const MachineFrameInfo &MFI = MF.getFrameInfo(); - uint32_t NumBytes = MFI.getStackSize(); + ScratchExecCopy = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, *TRI.getWaveMaskRegClass()); + LiveRegs.removeReg(ScratchExecCopy); - DebugLoc DL; + 
const unsigned OrSaveExec = + ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; - // FIXME: Clarify distinction between no set SP and SP. For callee functions, - // it's really whether we need SP to be accurate or not. + BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy) + .addImm(-1); + } - if (NumBytes != 0 && hasSP(MF)) { - uint32_t RoundedSize = FuncInfo->isStackRealigned() ? - NumBytes + MFI.getMaxAlignment() : NumBytes; + buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + FuncInfo->getScratchRSrcReg(), + FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue()); + } - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getWavefrontSize()); + if (ScratchExecCopy != AMDGPU::NoRegister) { + // FIXME: Split block and make terminator. + unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(ScratchExecCopy, RegState::Kill); } } +// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not +// memory. They should have been removed by now. static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { @@ -652,6 +918,22 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { return true; } +#ifndef NDEBUG +static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI, + Optional FramePointerSaveIndex) { + for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); + I != E; ++I) { + if (!MFI.isDeadObjectIndex(I) && + MFI.getStackID(I) == TargetStackID::SGPRSpill && + FramePointerSaveIndex && I != FramePointerSaveIndex) { + return false; + } + } + + return true; +} +#endif + int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const SIRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); @@ -665,81 +947,145 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( RegScavenger *RS) const { MachineFrameInfo &MFI = MF.getFrameInfo(); - if (!MFI.hasStackObjects()) - return; - const GCNSubtarget &ST = MF.getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - bool AllSGPRSpilledToVGPRs = false; - - if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) { - AllSGPRSpilledToVGPRs = true; - - // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs - // are spilled to VGPRs, in which case we can eliminate the stack usage. - // - // XXX - This operates under the assumption that only other SGPR spills are - // users of the frame index. I'm not 100% sure this is correct. The - // StackColoring pass has a comment saying a future improvement would be to - // merging of allocas with spill slots, but for now according to - // MachineFrameInfo isSpillSlot can't alias any other object. 
-    for (MachineBasicBlock &MBB : MF) {
-      MachineBasicBlock::iterator Next;
-      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
-        MachineInstr &MI = *I;
-        Next = std::next(I);
-
-        if (TII->isSGPRSpill(MI)) {
-          int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
-          assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
-          if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
-            bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
-            (void)Spilled;
-            assert(Spilled && "failed to spill SGPR to VGPR when allocated");
-          } else
-            AllSGPRSpilledToVGPRs = false;
-        }
-      }
-    }
-
-    FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
-  }
+  FuncInfo->removeDeadFrameIndices(MFI);
+  assert(allSGPRSpillsAreDead(MFI, None) &&
+         "SGPR spill should have been removed in SILowerSGPRSpills");
 
   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
   // but currently hasNonSpillStackObjects is set only from source
   // allocas. Stack temps produced from legalization are not counted currently.
-  if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
-      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
+  if (!allStackObjectsAreDead(MFI)) {
     assert(RS && "RegScavenger required if spilling");
 
-    // We force this to be at offset 0 so no user object ever has 0 as an
-    // address, so we may use 0 as an invalid pointer value. This is because
-    // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
-    // is required to be address space 0, we are forced to accept this for
-    // now. Ideally we could have the stack in another address space with 0 as a
-    // valid pointer, and -1 as the null value.
-    //
-    // This will also waste additional space when user stack objects require > 4
-    // byte alignment.
-    //
-    // The main cost here is losing the offset for addressing modes. However
-    // this also ensures we shouldn't need a register for the offset when
-    // emergency scavenging.
-    int ScavengeFI = MFI.CreateFixedObject(
-      TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
-    RS->addScavengingFrameIndex(ScavengeFI);
+    if (FuncInfo->isEntryFunction()) {
+      int ScavengeFI = MFI.CreateFixedObject(
+        TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+      RS->addScavengingFrameIndex(ScavengeFI);
+    } else {
+      int ScavengeFI = MFI.CreateStackObject(
+        TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
+        TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
+        false);
+      RS->addScavengingFrameIndex(ScavengeFI);
+    }
   }
 }
 
-void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+// Only report VGPRs to generic code.
+void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                           BitVector &SavedVGPRs,
                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  if (MFI->isEntryFunction())
+    return;
+
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  // Ignore the SGPRs the default implementation found.
+  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
+
+  // hasFP only knows about stack objects that already exist. We're now
+  // determining the stack slots that will be created, so we have to predict
+  // them. Stack objects force FP usage with calls.
+  //
+  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
+  // don't want to report it here.
+  //
+  // FIXME: Is this really hasReservedCallFrame?
+  const bool WillHaveFP =
+      FrameInfo.hasCalls() &&
+      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
+
+  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
+  // so don't allow the default insertion to handle them.
+  for (auto SSpill : MFI->getSGPRSpillVGPRs())
+    SavedVGPRs.reset(SSpill.VGPR);
+
+  const bool HasFP = WillHaveFP || hasFP(MF);
+  if (!HasFP)
+    return;
+
+  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+                                                    TargetStackID::SGPRSpill);
+
+    // If there is already a VGPR with free lanes, use it. We may already have
+    // to pay the penalty for spilling a CSR VGPR.
+    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+      llvm_unreachable("allocate SGPR spill should have worked");
+
+    MFI->FramePointerSaveIndex = NewFI;
+
+    LLVM_DEBUG(
+      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+      dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
+             << ':' << Spill.Lane << '\n');
+    return;
+  }
+
+  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
+
+  if (!MFI->SGPRForFPSaveRestoreCopy) {
+    // There's no free lane to spill, and no free register to save FP, so we're
+    // forced to spill another VGPR to use for the spill.
+    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                    TargetStackID::SGPRSpill);
+    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+      llvm_unreachable("allocate SGPR spill should have worked");
+    MFI->FramePointerSaveIndex = NewFI;
+
+    LLVM_DEBUG(
+      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
+             << ':' << Spill.Lane << '\n';);
+  } else {
+    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
+               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+  }
+}
+
+void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
+                                               BitVector &SavedRegs,
+                                               RegScavenger *RS) const {
   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  if (MFI->isEntryFunction())
+    return;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
   // The SP is specifically managed and we don't want extra spills of it.
   SavedRegs.reset(MFI->getStackPtrOffsetReg());
+  SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
+}
+
+bool SIFrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return true; // Early exit if no callee saved registers are modified!
+
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+    return false;
+
+  for (auto &CS : CSI) {
+    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
+      if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
+        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      break;
    }
  }

  return false;
}
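[Editorial sketch, not part of the imported patch.] Condensed, determineCalleeSaves above tries three ways to preserve the frame pointer, in order of decreasing cheapness. A standalone C++ sketch of just that decision; the enum and parameter names are illustrative, not LLVM API:

// Mirrors the order of checks in determineCalleeSaves: a free lane in an
// already-reserved spill VGPR is cheapest, a copy to an unused non-CSR SGPR
// is next, and dedicating (and therefore saving) another VGPR is last.
enum class FPSaveKind {
  SpillToFreeVGPRLane, // reuse a lane in an existing SGPR-spill VGPR
  CopyToFreeSGPR,      // copy FP into an unused non-callee-saved SGPR
  SpillToNewVGPR       // fallback: allocate a fresh spill VGPR lane
};

FPSaveKind chooseFPSave(bool HaveFreeSpillLane, bool HaveFreeSGPR) {
  if (HaveFreeSpillLane)
    return FPSaveKind::SpillToFreeVGPRLane;
  if (HaveFreeSGPR)
    return FPSaveKind::CopyToFreeSGPR;
  return FPSaveKind::SpillToNewVGPR;
}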
 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
@@ -757,8 +1103,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
 
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
-  if (!TFI->hasReservedCallFrame(MF)) {
+  if (!hasReservedCallFrame(MF)) {
     unsigned Align = getStackAlignment();
 
     Amount = alignTo(Amount, Align);
@@ -777,60 +1122,25 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
   return MBB.erase(I);
 }
 
-void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
-                                           MachineBasicBlock &MBB) const {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  MachineBasicBlock::iterator I = MBB.begin();
-  DebugLoc DL;
-
-  // For each dimension:
-  for (unsigned i = 0; i < 3; ++i) {
-    // Get work group ID SGPR, and make it live-in again.
-    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
-    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
-    MBB.addLiveIn(WorkGroupIDSGPR);
-
-    // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
-    // order to spill it to scratch.
-    unsigned WorkGroupIDVGPR =
-      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
-      .addReg(WorkGroupIDSGPR);
-
-    // Spill work group ID.
-    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
-    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
-                             WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
-
-    // Get work item ID VGPR, and make it live-in again.
-    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
-    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
-    MBB.addLiveIn(WorkItemIDVGPR);
-
-    // Spill work item ID.
-    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
-    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
-                             WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
-  }
-}
-
 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
-  // All stack operations are relative to the frame offset SGPR.
-  // TODO: Still want to eliminate sometimes.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (MFI.hasCalls()) {
+    // All offsets are unsigned, so they need to be addressed in the same
+    // direction as stack growth.
+
+    // FIXME: This function is pretty broken, since it can be called before the
+    // frame layout is determined or CSR spills are inserted.
+    if (MFI.getStackSize() != 0)
+      return true;
+
+    // For the entry point, the input wave scratch offset must be copied to the
+    // API SP if there are calls.
+    if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
+      return true;
+  }
 
-  // XXX - Is this only called after frame is finalized? Should be able to check
-  // frame size.
-  return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
-}
-
-bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
-  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
-  // All stack operations are relative to the frame offset SGPR.
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
+  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+         MFI.hasStackMap() || MFI.hasPatchPoint() ||
+         MF.getSubtarget().getRegisterInfo()->needsStackRealignment(MF) ||
+         MF.getTarget().Options.DisableFramePointerElim(MF);
 }
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 2f35b3631cdc..c644f4726e2c 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -1,9 +1,8 @@
 //===--------------------- SIFrameLowering.h --------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -37,6 +36,14 @@ public:
   void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
                             RegScavenger *RS = nullptr) const override;
+  void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
+                                RegScavenger *RS = nullptr) const;
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
+
+  bool isSupportedStackID(TargetStackID::Value ID) const override;
 
   void processFunctionBeforeFrameFinalized(
     MachineFunction &MF,
@@ -59,15 +66,9 @@ private:
     SIMachineFunctionInfo *MFI,
     MachineFunction &MF) const;
 
-  std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
-    const GCNSubtarget &ST,
-    const SIInstrInfo *TII,
-    const SIRegisterInfo *TRI,
-    SIMachineFunctionInfo *MFI,
-    MachineFunction &MF) const;
-
-  /// Emits debugger prologue.
-  void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
+      const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+      SIMachineFunctionInfo *MFI, MachineFunction &MF) const;
 
   // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
   void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
@@ -77,7 +78,6 @@ private:
 
 public:
   bool hasFP(const MachineFunction &MF) const override;
-  bool hasSP(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ba921647097..db0782e2bf3e 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1,9 +1,8 @@
 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -19,7 +18,6 @@
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "SIDefines.h"
@@ -95,11 +93,10 @@ static cl::opt<bool> EnableVGPRIndexMode(
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
   cl::init(false));
 
-static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
-  "amdgpu-frame-index-zero-bits",
-  cl::desc("High bits of frame index assumed to be zero"),
-  cl::init(5),
-  cl::ReallyHidden);
+static cl::opt<bool> DisableLoopAlignment(
+  "amdgpu-disable-loop-alignment",
+  cl::desc("Do not align and prefetch loops"),
+  cl::init(false));
 
 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
@@ -125,12 +122,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
 
+  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
+  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+
   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
 
   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
 
+  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
+  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+
   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
 
@@ -148,18 +151,27 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
   }
 
+  if (Subtarget->hasMAIInsts()) {
+    addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
+    addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+  }
+
   computeRegisterProperties(Subtarget->getRegisterInfo());
 
   // We need to custom lower vector stores from local memory
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
   setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
 
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
   setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -218,11 +230,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
 
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -248,8 +264,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
-        MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
+  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+                  MVT::v32i32, MVT::v32f32 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -323,6 +340,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
 
+  // Deal with vec3 vector operations when widened to vec4.
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
+
+  // Deal with vec5 vector operations when widened to vec8.
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
+
   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
   // and output demarshalling
   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -400,7 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
 
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+  if (Subtarget->haveRoundOpsF64()) {
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
     setOperationAction(ISD::FRINT, MVT::f64, Legal);
@@ -492,7 +521,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // F16 - VOP3 Actions.
   setOperationAction(ISD::FMA, MVT::f16, Legal);
-  if (!Subtarget->hasFP16Denormals())
+  if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
     setOperationAction(ISD::FMAD, MVT::f16, Legal);
 
   for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -607,6 +636,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
 
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+
   setOperationAction(ISD::SHL, MVT::v4i16, Custom);
   setOperationAction(ISD::SRA, MVT::v4i16, Custom);
   setOperationAction(ISD::SRL, MVT::v4i16, Custom);
@@ -679,6 +711,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FCANONICALIZE);
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
@@ -701,13 +734,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
 
   setSchedulingPreference(Sched::RegPressure);
-
-  // SI at least has hardware support for floating point exceptions, but no way
-  // of using or handling them is implemented. They are also optional in OpenCL
-  // (Section 7.3)
-  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
 }
 
 const GCNSubtarget *SITargetLowering::getSubtarget() const {
@@ -910,6 +939,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
@@ -919,13 +950,75 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.align = 0;
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 
+    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
+    if (!Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+
+    return true;
+  }
+  case Intrinsic::amdgcn_buffer_atomic_fadd: {
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
+    Info.ptrVal = MFI->getBufferPSV(
+      *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+      CI.getArgOperand(1));
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
     const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
     if (!Vol || !Vol->isZero())
       Info.flags |= MachineMemOperand::MOVolatile;
 
     return true;
   }
+  case Intrinsic::amdgcn_global_atomic_fadd: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
+                            ->getPointerElementType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+    return true;
+  }
+  case Intrinsic::amdgcn_ds_append:
+  case Intrinsic::amdgcn_ds_consume: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
+    if (!Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+
+    return true;
+  }
+  case Intrinsic::amdgcn_ds_gws_init:
+  case Intrinsic::amdgcn_ds_gws_barrier:
+  case Intrinsic::amdgcn_ds_gws_sema_v:
+  case Intrinsic::amdgcn_ds_gws_sema_br:
+  case Intrinsic::amdgcn_ds_gws_sema_p:
+  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
+    Info.opc = ISD::INTRINSIC_VOID;
+
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    Info.ptrVal =
+        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+
+    // This is an abstract access, but we need to specify a type and size.
+    Info.memVT = MVT::i32;
+    Info.size = 4;
+    Info.align = 4;
+
+    Info.flags = MachineMemOperand::MOStore;
+    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
+      Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
   default:
     return false;
   }
@@ -937,6 +1030,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
@@ -960,6 +1055,13 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
   // GFX9 added a 13-bit signed offset. When using regular flat instructions,
   // the sign bit is ignored and is treated as a 12-bit unsigned offset.
 
+  // GFX10 shrank the signed offset to 12 bits. When using regular flat
+  // instructions, the sign bit is also ignored and it is treated as an 11-bit
+  // unsigned offset.
+
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
+    return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;
+
   // Just r + i
   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
 }
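[Editorial sketch, not part of the imported patch.] As a quick worked check of the bounds above: a 12-bit unsigned immediate covers offsets 0-4095, while the 11-bit GFX10 range only covers 0-2047. A standalone C++ snippet; isUInt is re-declared locally so it compiles on its own (in-tree it comes from llvm/Support/MathExtras.h):

#include <cassert>
#include <cstdint>

// Local stand-in for llvm::isUInt<N>: true if X fits in N unsigned bits.
template <unsigned N> bool isUInt(int64_t X) {
  return X >= 0 && X < (INT64_C(1) << N);
}

int main() {
  // Pre-GFX10: immediate flat offsets are treated as 12-bit unsigned.
  assert(isUInt<12>(3000));  // 3000 < 4096, so the offset is legal
  // GFX10: the usable unsigned range shrinks to 11 bits.
  assert(!isUInt<11>(3000)); // 3000 >= 2048, so it must be materialized
  return 0;
}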
@@ -1030,7 +1132,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
     return isLegalGlobalAddressingMode(AM);
 
   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
-      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+      AS == AMDGPUAS::BUFFER_FAT_POINTER) {
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
     // FIXME: Can we get the real alignment here?
@@ -1106,16 +1209,15 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
   } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
     return (MemVT.getSizeInBits() <= MaxPrivateBits);
-  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
     return (MemVT.getSizeInBits() <= 2 * 32);
   }
   return true;
 }
 
-bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
-                                                      unsigned AddrSpace,
-                                                      unsigned Align,
-                                                      bool *IsFast) const {
+bool SITargetLowering::allowsMisalignedMemoryAccesses(
+    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+    bool *IsFast) const {
   if (IsFast)
     *IsFast = false;
 
@@ -1178,11 +1280,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
 }
 
-EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
-                                          unsigned SrcAlign, bool IsMemset,
-                                          bool ZeroMemset,
-                                          bool MemcpyStrSrc,
-                                          MachineFunction &MF) const {
+EVT SITargetLowering::getOptimalMemOpType(
+    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+    bool ZeroMemset, bool MemcpyStrSrc,
+    const AttributeList &FuncAttributes) const {
   // FIXME: Should account for address space here.
 
   // The default fallback uses the private pointer size as a guess for a type to
@@ -1201,7 +1302,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
 static bool isFlatGlobalAddrSpace(unsigned AS) {
   return AS == AMDGPUAS::GLOBAL_ADDRESS ||
          AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS;
+         AS == AMDGPUAS::CONSTANT_ADDRESS ||
+         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
 }
 
 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -1216,8 +1318,8 @@ bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
   return I && I->getMetadata("amdgpu.noclobber");
 }
 
-bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
-                                            unsigned DestAS) const {
+bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
+                                           unsigned DestAS) const {
   // Flat -> private/local is a simple truncate.
   // Flat -> global is no-op
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
@@ -1305,6 +1407,17 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          const SDLoc &SL, SDValue Val,
                                          bool Signed,
                                          const ISD::InputArg *Arg) const {
+  // First, if it is a widened vector, narrow it.
+  if (VT.isVector() &&
+      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
+    EVT NarrowedVT =
+        EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
+                         VT.getVectorNumElements());
+    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
+                      DAG.getConstant(0, SL, MVT::i32));
+  }
+
+  // Then convert the vector elements or scalar value.
   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
       VT.bitsLT(MemVT)) {
     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
@@ -1441,8 +1554,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
 
     // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS &&
-        !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
-
+        !Arg->Flags.isInReg() && PSInputNum <= 15) {
       bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
 
       // Inconveniently only the first part of the split is marked as isSplit,
@@ -1508,7 +1620,13 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
 
 // Try to allocate a VGPR at the end of the argument list, or if no argument
 // VGPRs are left allocating a stack slot.
-static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+// If \p Mask is given, it indicates the bitfield position in the register.
+// If \p Arg is given, use it with the new \p Mask instead of allocating a new
+// one.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
+                                         ArgDescriptor Arg = ArgDescriptor()) {
+  if (Arg.isSet())
+    return ArgDescriptor::createArg(Arg, Mask);
+
   ArrayRef<MCPhysReg> ArgVGPRs
     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
@@ -1516,7 +1634,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
     // Spill to stack required.
     int64_t Offset = CCInfo.AllocateStack(4, 4);
 
-    return ArgDescriptor::createStack(Offset);
+    return ArgDescriptor::createStack(Offset, Mask);
   }
 
   unsigned Reg = ArgVGPRs[RegIdx];
@@ -1525,7 +1643,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
   MachineFunction &MF = CCInfo.getMachineFunction();
   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-  return ArgDescriptor::createRegister(Reg);
+  return ArgDescriptor::createRegister(Reg, Mask);
 }
 
 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
@@ -1557,14 +1675,21 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
                                       MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) {
-  if (Info.hasWorkItemIDX())
-    Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+  const unsigned Mask = 0x3ff;
+  ArgDescriptor Arg;
 
-  if (Info.hasWorkItemIDY())
-    Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+  if (Info.hasWorkItemIDX()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask);
+    Info.setWorkItemIDX(Arg);
+  }
+
+  if (Info.hasWorkItemIDY()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
+    Info.setWorkItemIDY(Arg);
+  }
 
   if (Info.hasWorkItemIDZ())
-    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
 }
 
 static void allocateSpecialInputSGPRs(CCState &CCInfo,
@@ -1714,6 +1839,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
   // should reserve the arguments and use them directly.
   MachineFrameInfo &MFI = MF.getFrameInfo();
   bool HasStackObjects = MFI.hasStackObjects();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
   // Record that we know we have non-spill stack objects so we don't need to
   // check all stack objects later.
@@ -1729,65 +1855,89 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
   // the scratch registers to pass in.
   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
 
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  if (ST.isAmdHsaOrMesa(MF.getFunction())) {
-    if (RequiresStackAccess) {
-      // If we have stack objects, we unquestionably need the private buffer
-      // resource. For the Code Object V2 ABI, this will be the first 4 user
-      // SGPR inputs. We can reserve those and use them directly.
-
-      unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
-
-      if (MFI.hasCalls()) {
-        // If we have calls, we need to keep the frame register in a register
-        // that won't be clobbered by a call, so ensure it is copied somewhere.
-
-        // This is not a problem for the scratch wave offset, because the same
-        // registers are reserved in all functions.
-
-        // FIXME: Nothing is really ensuring this is a call preserved register,
-        // it's just selected from the end so it happens to be.
-        unsigned ReservedOffsetReg
-          = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-        Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-      } else {
-        unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
-          AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-        Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
-      }
-    } else {
-      unsigned ReservedBufferReg
-        = TRI.reservedPrivateSegmentBufferReg(MF);
-      unsigned ReservedOffsetReg
-        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-
-      // We tentatively reserve the last registers (skipping the last two
-      // which may contain VCC). After register allocation, we'll replace
-      // these with the ones immediately after those which were really
-      // allocated. In the prologue copies will be inserted from the argument
-      // to these reserved registers.
-      Info.setScratchRSrcReg(ReservedBufferReg);
-      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-    }
+  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+    // If we have stack objects, we unquestionably need the private buffer
+    // resource. For the Code Object V2 ABI, this will be the first 4 user
+    // SGPR inputs. We can reserve those and use them directly.
+
+    unsigned PrivateSegmentBufferReg =
+        Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
   } else {
     unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+    // We tentatively reserve the last registers (skipping the last registers
+    // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
+    // we'll replace these with the ones immediately after those which were
+    // really allocated. In the prologue copies will be inserted from the
+    // argument to these reserved registers.
 
     // Without HSA, relocations are used for the scratch pointer and the
     // buffer resource setup is always inserted in the prologue. Scratch wave
     // offset is still in an input SGPR.
     Info.setScratchRSrcReg(ReservedBufferReg);
+  }
 
-    if (HasStackObjects && !MFI.hasCalls()) {
-      unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-      Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+  // hasFP should be accurate for kernels even before the frame is finalized.
+  if (ST.getFrameLowering()->hasFP(MF)) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    // Try to use s32 as the SP, but move it if it would interfere with input
+    // arguments. This won't work with calls though.
+    //
+    // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+    // registers.
+    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
     } else {
-      unsigned ReservedOffsetReg
-        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+
+      if (MFI.hasCalls())
+        report_fatal_error("call in graphics shader with too many input SGPRs");
+
+      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+        if (!MRI.isLiveIn(Reg)) {
+          Info.setStackPtrOffsetReg(Reg);
+          break;
+        }
+      }
+
+      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+        report_fatal_error("failed to find register for SP");
+    }
+
+    if (MFI.hasCalls()) {
+      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
+      Info.setFrameOffsetReg(AMDGPU::SGPR33);
+    } else {
+      unsigned ReservedOffsetReg =
+        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+      Info.setFrameOffsetReg(ReservedOffsetReg);
     }
+  } else if (RequiresStackAccess) {
+    assert(!MFI.hasCalls());
+    // We know there are accesses and they will be done relative to SP, so just
+    // pin it to the input.
+    //
+    // FIXME: Should not do this if inline asm is reading/writing these
+    // registers.
+    unsigned PreloadedSP = Info.getPreloadedReg(
+        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+    Info.setStackPtrOffsetReg(PreloadedSP);
+    Info.setScratchWaveOffsetReg(PreloadedSP);
+    Info.setFrameOffsetReg(PreloadedSP);
+  } else {
+    assert(!MFI.hasCalls());
+
+    // There may not be stack access at all. There may still be spills, or
+    // access of a constant pointer (in which cases an extra copy will be
+    // emitted in the prolog).
+    unsigned ReservedOffsetReg
+      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+    Info.setStackPtrOffsetReg(ReservedOffsetReg);
+    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+    Info.setFrameOffsetReg(ReservedOffsetReg);
   }
 }
 
@@ -1845,7 +1995,6 @@ SDValue SITargetLowering::LowerFormalArguments(
   const Function &Fn = MF.getFunction();
   FunctionType *FType = MF.getFunction().getFunctionType();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
     DiagnosticInfoUnsupported NoGraphicsHSA(
@@ -1854,11 +2003,6 @@ SDValue SITargetLowering::LowerFormalArguments(
     return DAG.getEntryNode();
   }
 
-  // Create stack objects that are used for emitting debugger prologue if
-  // "amdgpu-debugger-emit-prologue" attribute was specified.
-  if (ST.debuggerEmitPrologue())
-    createDebuggerPrologueStackObjects(MF);
-
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
   BitVector Skipped(Ins.size());
@@ -1869,12 +2013,6 @@ SDValue SITargetLowering::LowerFormalArguments(
   bool IsKernel = AMDGPU::isKernel(CallConv);
   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
 
-  if (!IsEntryFunc) {
-    // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
-    // this when allocating argument fixed offsets.
-    CCInfo.AllocateStack(4, 4);
-  }
-
   if (IsShader) {
     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
 
@@ -1975,7 +2113,8 @@ SDValue SITargetLowering::LowerFormalArguments(
       auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
         // On SI local pointers are just offsets into LDS, so they are always
         // less than 16-bits.  On CI and newer they could potentially be
         // real pointers, so we can't guarantee their size.
@@ -2002,13 +2141,14 @@ SDValue SITargetLowering::LowerFormalArguments(
       Reg = MF.addLiveIn(Reg, RC);
       SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
 
-      if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+      if (Arg.Flags.isSRet()) {
         // The return object should be reasonably addressable.
+
         // FIXME: This helps when the return is a real sret. If it is a
         // automatically inserted sret (i.e. CanLowerReturn returns false), an
         // extra copy is inserted in SelectionDAGBuilder which obscures this.
-        unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+        unsigned NumBits
+          = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
         Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
           DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
       }
@@ -2126,16 +2266,13 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     SDValue ReturnAddrReg = CreateLiveInRegister(
       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
 
-    // FIXME: Should be able to use a vreg here, but need a way to prevent it
-    // from being allocated to a CSR.
-
-    SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
-                                                MVT::i64);
-
-    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+    SDValue ReturnAddrVirtualReg = DAG.getRegister(
+        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
+        MVT::i64);
+    Chain =
+        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
     Flag = Chain.getValue(1);
-
-    RetOps.push_back(PhysReturnAddrReg);
+    RetOps.push_back(ReturnAddrVirtualReg);
   }
 
   // Copy the result values into the output registers.
@@ -2295,9 +2432,6 @@ void SITargetLowering::passSpecialInputs(
     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
   };
 
@@ -2337,6 +2471,71 @@ void SITargetLowering::passSpecialInputs(
       MemOpChains.push_back(ArgStore);
     }
   }
+
+  // Pack workitem IDs into a single register or pass it as is if already
+  // packed.
+  const ArgDescriptor *OutgoingArg;
+  const TargetRegisterClass *ArgRC;
+
+  std::tie(OutgoingArg, ArgRC) =
+    CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+  if (!OutgoingArg)
+    return;
+
+  const ArgDescriptor *IncomingArgX
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
+  const ArgDescriptor *IncomingArgY
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
+  const ArgDescriptor *IncomingArgZ
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+
+  SDValue InputReg;
+  SDLoc SL;
+
+  // If incoming ids are not packed we need to pack them.
+  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+
+  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
+    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
+                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
+  }
+
+  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
+    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
+                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
+  }
+
+  if (!InputReg.getNode()) {
+    // Workitem ids are already packed, any of present incoming arguments
+    // will carry all required fields.
+    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+      IncomingArgX ? *IncomingArgX :
+      IncomingArgY ? *IncomingArgY :
+                     *IncomingArgZ, ~0u);
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+  }
+
+  if (OutgoingArg->isRegister()) {
+    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+  } else {
+    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+                                            SpecialArgOffset);
+    MemOpChains.push_back(ArgStore);
+  }
 }
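[Editorial sketch, not part of the imported patch.] The packed layout built above puts X in bits 0-9, Y in bits 10-19, and Z in bits 20-29, matching the 0x3ff masks and the 10- and 20-bit shifts in these hunks. A standalone C++ sanity check of that scheme (names are illustrative):

#include <cassert>
#include <cstdint>

// Pack three workitem IDs into one 32-bit value, 10 bits per component.
uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  assert(X < 1024 && Y < 1024 && Z < 1024 && "each ID must fit in 10 bits");
  return (X & 0x3ff) | ((Y & 0x3ff) << 10) | ((Z & 0x3ff) << 20);
}

// Unpack one component; Shift is 0, 10, or 20 for X, Y, or Z respectively.
uint32_t unpackWorkItemID(uint32_t Packed, unsigned Shift) {
  return (Packed >> Shift) & 0x3ff;
}

int main() {
  uint32_t P = packWorkItemIDs(5, 7, 9);
  assert(unpackWorkItemID(P, 0) == 5);
  assert(unpackWorkItemID(P, 10) == 7);
  assert(unpackWorkItemID(P, 20) == 9);
  return 0;
}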
 
 static bool canGuaranteeTCO(CallingConv::ID CC) {
@@ -2478,7 +2677,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                       "unsupported call from graphics shader of function ");
   }
 
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
   if (IsTailCall) {
     IsTailCall = isEligibleForTailCallOptimization(
       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2505,9 +2703,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
 
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
-  CCInfo.AllocateStack(4, 4);
-
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -2528,31 +2723,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   MachineFrameInfo &MFI = MF.getFrameInfo();
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
 
-  SDValue CallerSavedFP;
-
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
   if (!IsSibCall) {
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
 
-    unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+    SmallVector<SDValue, 4> CopyFromChains;
 
     // In the HSA case, this should be an identity copy.
     SDValue ScratchRSrcReg
       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-
-    // TODO: Don't hardcode these registers and get from the callee function.
-    SDValue ScratchWaveOffsetReg
-      = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
-    RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
-
-    if (!Info->isEntryFunction()) {
-      // Avoid clobbering this function's FP value. In the current convention
-      // callee will overwrite this, so do save/restore around the call site.
-      CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
-                                         Info->getFrameOffsetReg(), MVT::i32);
-    }
+    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+    Chain = DAG.getTokenFactor(DL, CopyFromChains);
   }
 
   SmallVector<SDValue, 8> MemOpChains;
@@ -2694,6 +2877,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
+  // Add a redundant copy of the callee global which will not be legalized, as
+  // we need direct access to the callee later.
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
+  const GlobalValue *GV = GSD->getGlobal();
+  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
 
   if (IsTailCall) {
     // Each tail call may have to adjust the stack by a different amount, so
@@ -2735,12 +2923,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   Chain = Call.getValue(0);
   InFlag = Call.getValue(1);
 
-  if (CallerSavedFP) {
-    SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
-    Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   uint64_t CalleePopBytes = NumBytes;
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
@@ -2773,8 +2955,8 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
 
   }
 
-  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+  if (!Subtarget->hasFlatScrRegister() &&
+       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
     report_fatal_error(Twine("invalid register \""
                              + StringRef(RegName) + "\" for subtarget."));
   }
@@ -2830,6 +3012,107 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
   return SplitBB;
 }
 
+// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is
+// true, \p MI will be the only instruction in the loop body block. Otherwise,
+// it will be the first instruction in the remainder block.
+//
+/// \returns { LoopBody, Remainder }
+static std::pair<MachineBasicBlock *, MachineBasicBlock *>
+splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock::iterator I(&MI);
+
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RemainderBB);
+
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(RemainderBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  if (InstInLoop) {
+    auto Next = std::next(I);
+
+    // Move instruction to loop body.
+    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
+
+    // Move the rest of the block.
+    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
+  } else {
+    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+  }
+
+  MBB.addSuccessor(LoopBB);
+
+  return std::make_pair(LoopBB, RemainderBB);
+}
+
+MachineBasicBlock *
+SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
+                                         MachineBasicBlock *BB) const {
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  MachineBasicBlock *LoopBB;
+  MachineBasicBlock *RemainderBB;
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
+
+  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
+
+  MachineBasicBlock::iterator I = LoopBB->end();
+  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+
+  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
+      AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+
+  // Clear TRAP_STS.MEM_VIOL
+  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+    .addImm(0)
+    .addImm(EncodedReg);
+
+  // This is a pain, but we're not allowed to have physical register live-ins
+  // yet. Insert a pair of copies if the VGPR0 hack is necessary.
+  if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
+    unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
+      .add(*Src);
+
+    BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
+      .addReg(Data0);
+
+    MRI.setSimpleHint(Data0, Src->getReg());
+  }
+
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
+    .addImm(0);
+
+  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  // Load and check TRAP_STS.MEM_VIOL
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
+    .addImm(EncodedReg);
+
+  // FIXME: Do we need to use an isel pseudo that may clobber scc?
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+    .addReg(Reg, RegState::Kill)
+    .addImm(0);
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+    .addMBB(LoopBB);
+
+  return RemainderBB;
+}
+
 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
 // wavefront. If the value is uniform and just happens to be in a VGPR, this
 // will only do one iteration. In the worst case, this will loop 64 times.
@@ -2849,12 +3132,16 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
   int Offset,
   bool UseGPRIdxMode,
   bool IsIndirectSrc) {
+  MachineFunction *MF = OrigBB.getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineBasicBlock::iterator I = LoopBB.begin();
 
-  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+  unsigned PhiExec = MRI.createVirtualRegister(BoolRC);
+  unsigned NewExec = MRI.createVirtualRegister(BoolRC);
   unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CondReg = MRI.createVirtualRegister(BoolRC);
 
   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
     .addReg(InitReg)
@@ -2878,7 +3165,9 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
     .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
 
   // Update EXEC, save the original EXEC value to VCC.
-  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
+                                                : AMDGPU::S_AND_SAVEEXEC_B64),
+          NewExec)
     .addReg(CondReg, RegState::Kill);
 
   MRI.setSimpleHint(NewExec, CondReg);
@@ -2894,7 +3183,7 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
       .addImm(Offset);
   }
   unsigned IdxMode = IsIndirectSrc ?
-    VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+    AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
   MachineInstr *SetOn =
     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
     .addReg(IdxReg, RegState::Kill)
@@ -2913,10 +3202,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
   }
 
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   MachineInstr *InsertPt =
-    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-      .addReg(AMDGPU::EXEC)
-      .addReg(NewExec);
+    BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
+                                                  : AMDGPU::S_XOR_B64_term), Exec)
+      .addReg(Exec)
+      .addReg(NewExec);
 
   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
   // s_cbranch_scc0?
@@ -2942,38 +3233,28 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                                   bool UseGPRIdxMode,
                                                   bool IsIndirectSrc) {
   MachineFunction *MF = MBB.getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const DebugLoc &DL = MI.getDebugLoc();
   MachineBasicBlock::iterator I(&MI);
 
+  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   unsigned DstReg = MI.getOperand(0).getReg();
-  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
-  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+  unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC);
+  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
 
   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
 
   // Save the EXEC mask
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
-    .addReg(AMDGPU::EXEC);
-
-  // To insert the loop we need to split the block. Move everything after this
-  // point to a new block, and insert a new empty block between the two.
-  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
-  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
-
-  MF->insert(MBBI, LoopBB);
-  MF->insert(MBBI, RemainderBB);
-
-  LoopBB->addSuccessor(LoopBB);
-  LoopBB->addSuccessor(RemainderBB);
-
-  // Move the rest of the block into a new block.
-  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
-  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
+    .addReg(Exec);
 
-  MBB.addSuccessor(LoopBB);
+  MachineBasicBlock *LoopBB;
+  MachineBasicBlock *RemainderBB;
+  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
 
   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 
@@ -2982,7 +3263,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                       Offset, UseGPRIdxMode, IsIndirectSrc);
 
   MachineBasicBlock::iterator First = RemainderBB->begin();
-  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+  BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
    .addReg(SaveExec);
 
   return InsPt;
 }
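[Editorial sketch, not part of the imported patch.] Scalarized, the waterfall loop emitted above does the following: pick the index held by the first still-active lane, run the operation for every lane holding that same index, retire those lanes from EXEC, and repeat until no lanes remain. A standalone C++ model with wave64 EXEC as a 64-bit mask (__builtin_ctzll assumes GCC/Clang; all names are illustrative):

#include <array>
#include <cstdint>

// Simulate one waterfall loop over a divergent per-lane index register.
// Each set bit of Exec is an active lane; IdxReg holds that lane's index.
void waterfall(uint64_t Exec, const std::array<uint32_t, 64> &IdxReg) {
  uint64_t SavedExec = Exec;                 // s_mov from EXEC before the loop
  while (Exec) {
    unsigned FirstLane = __builtin_ctzll(Exec);
    uint32_t CurrentIdx = IdxReg[FirstLane]; // v_readfirstlane_b32

    // v_cmp_eq + s_and_saveexec: lanes whose index matches this iteration.
    uint64_t Match = 0;
    for (unsigned L = 0; L < 64; ++L)
      if (((Exec >> L) & 1) && IdxReg[L] == CurrentIdx)
        Match |= UINT64_C(1) << L;

    // <loop body runs here with EXEC == Match and M0 == CurrentIdx>

    Exec ^= Match;                           // s_xor: retire matched lanes
  }                                          // s_cbranch back while lanes remain
  Exec = SavedExec;                          // restore EXEC in the remainder block
  (void)Exec;
}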
AMDGPU::EXEC_LO : AMDGPU::EXEC; BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) .addReg(InputReg) - .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000); - BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64), - AMDGPU::EXEC) + .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); + BuildMI(*BB, FirstMI, DebugLoc(), + TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), + Exec) .addReg(CountReg) .addImm(0); BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) .addReg(CountReg, RegState::Kill) - .addImm(64); - BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64), - AMDGPU::EXEC) + .addImm(getSubtarget()->getWavefrontSize()); + BuildMI(*BB, FirstMI, DebugLoc(), + TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + Exec) .addImm(-1); MI.eraseFromParent(); return BB; } case AMDGPU::GET_GROUPSTATICSIZE: { + assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) .add(MI.getOperand(0)) @@ -3405,6 +3704,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return splitKillBlock(MI, BB); case AMDGPU::V_CNDMASK_B64_PSEUDO: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const GCNSubtarget &ST = MF->getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); unsigned Dst = MI.getOperand(0).getReg(); unsigned Src0 = MI.getOperand(1).getReg(); @@ -3414,16 +3715,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC); BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) .addReg(SrcCond); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) + .addImm(0) .addReg(Src0, 0, AMDGPU::sub0) + .addImm(0) .addReg(Src1, 0, AMDGPU::sub0) .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) + .addImm(0) .addReg(Src0, 0, AMDGPU::sub1) + .addImm(0) .addReg(Src1, 0, AMDGPU::sub1) .addReg(SrcCondCopy); @@ -3457,40 +3763,60 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( .addReg(Info->getFrameOffsetReg(), RegState::Implicit); return BB; } - case AMDGPU::SI_CALL_ISEL: - case AMDGPU::SI_TCRETURN_ISEL: { + case AMDGPU::SI_CALL_ISEL: { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); - MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned GlobalAddrReg = MI.getOperand(0).getReg(); - MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg); - assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET); + MachineInstrBuilder MIB; + MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); - const GlobalValue *G = PCRel->getOperand(1).getGlobal(); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); - MachineInstrBuilder MIB; - if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { - MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg) - .add(MI.getOperand(0)) - .addGlobalAddress(G); - } else { - MIB = BuildMI(*BB, MI, DL, 
TII->get(AMDGPU::SI_TCRETURN)) - .add(MI.getOperand(0)) - .addGlobalAddress(G); + MIB.cloneMemRefs(MI); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::V_ADD_I32_e32: + case AMDGPU::V_SUB_I32_e32: + case AMDGPU::V_SUBREV_I32_e32: { + // TODO: Define distinct V_*_I32_Pseudo instructions instead. + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc = MI.getOpcode(); - // There is an additional imm operand for tcreturn, but it should be in the - // right place already. + bool NeedClampOperand = false; + if (TII->pseudoToMCOpcode(Opc) == -1) { + Opc = AMDGPU::getVOPe64(Opc); + NeedClampOperand = true; } - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); + auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg()); + if (TII->isVOP3(*I)) { + const GCNSubtarget &ST = MF->getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + I.addReg(TRI->getVCC(), RegState::Define); + } + I.add(MI.getOperand(1)) + .add(MI.getOperand(2)); + if (NeedClampOperand) + I.addImm(0); // clamp bit for e64 encoding + + TII->legalizeOperands(*I); - MIB.cloneMemRefs(MI); MI.eraseFromParent(); return BB; } + case AMDGPU::DS_GWS_INIT: + case AMDGPU::DS_GWS_SEMA_V: + case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_SEMA_P: + case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: + case AMDGPU::DS_GWS_BARRIER: + if (getSubtarget()->hasGWSAutoReplay()) + return BB; + return emitGWSMemViolTestLoop(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } @@ -3617,6 +3943,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::LOAD: { SDValue Result = LowerLOAD(Op, DAG); assert((!Result.getNode() || @@ -3641,10 +3968,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); + case ISD::INSERT_SUBVECTOR: + return lowerINSERT_SUBVECTOR(Op, DAG); case ISD::INSERT_VECTOR_ELT: return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return lowerVECTOR_SHUFFLE(Op, DAG); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: @@ -3742,10 +4073,7 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - const auto *CD = dyn_cast(N->getOperand(3)); - if (!CD) - return DAG.getUNDEF(VT); - + const auto *CD = cast(N->getOperand(3)); int CondCode = CD->getSExtValue(); if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE) @@ -3753,7 +4081,6 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, ICmpInst::Predicate IcInput = static_cast(CondCode); - SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); @@ -3769,16 +4096,20 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, ISD::CondCode CCOpcode = getICmpCondCode(IcInput); - return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS, - DAG.getCondCode(CCOpcode)); + unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); + EVT 
CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); + + SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS, + DAG.getCondCode(CCOpcode)); + if (VT.bitsEq(CCVT)) + return SetCC; + return DAG.getZExtOrTrunc(SetCC, DL, VT); } static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - const auto *CD = dyn_cast(N->getOperand(3)); - if (!CD) - return DAG.getUNDEF(VT); + const auto *CD = cast(N->getOperand(3)); int CondCode = CD->getSExtValue(); if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE || @@ -3798,8 +4129,13 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, FCmpInst::Predicate IcInput = static_cast(CondCode); ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); - return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0, - Src1, DAG.getCondCode(CCOpcode)); + unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); + EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); + SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, + Src1, DAG.getCondCode(CCOpcode)); + if (VT.bitsEq(CCVT)) + return SetCC; + return DAG.getZExtOrTrunc(SetCC, SL, VT); } void SITargetLowering::ReplaceNodeResults(SDNode *N, @@ -3957,32 +4293,6 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { return 0; } -void SITargetLowering::createDebuggerPrologueStackObjects( - MachineFunction &MF) const { - // Create stack objects that are used for emitting debugger prologue. - // - // Debugger prologue writes work group IDs and work item IDs to scratch memory - // at fixed location in the following format: - // offset 0: work group ID x - // offset 4: work group ID y - // offset 8: work group ID z - // offset 16: work item ID x - // offset 20: work item ID y - // offset 24: work item ID z - SIMachineFunctionInfo *Info = MF.getInfo(); - int ObjectIdx = 0; - - // For each dimension: - for (unsigned i = 0; i < 3; ++i) { - // Create fixed stack object for work group ID. - ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true); - Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); - // Create fixed stack object for work item ID. - ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true); - Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); - } -} - bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || @@ -3991,7 +4301,10 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { - return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + // FIXME: Either avoid relying on address space here or change the default + // address space for functions to avoid the explicit check. 
+ return (GV->getValueType()->isFunctionTy() || + GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && !shouldEmitFixup(GV) && @@ -4103,6 +4416,31 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + // Checking the depth + if (cast(Op.getOperand(0))->getZExtValue() != 0) + return DAG.getConstant(0, DL, VT); + + MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + // Check for kernel and shader functions + if (Info->isEntryFunction()) + return DAG.getConstant(0, DL, VT); + + MachineFrameInfo &MFI = MF.getFrameInfo(); + // There is a call to @llvm.returnaddress in this function + MFI.setReturnAddressIsTaken(true); + + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + // Get the return address reg and mark it as an implicit live-in + unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent())); + + return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); +} + SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, @@ -4131,7 +4469,9 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + bool IsIEEEMode = Info->getMode().IEEE; // FIXME: Assert during eslection that this is only selected for // ieee_mode. Currently a combine can produce the ieee version for non-ieee @@ -4302,6 +4642,32 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, return DAG.getUNDEF(ASC->getValueType(0)); } +// This lowers an INSERT_SUBVECTOR by extracting the individual elements from +// the small vector and inserting them into the big vector. That is better than +// the default expansion of doing it via a stack slot. Even though the use of +// the stack slot would be optimized away afterwards, the stack slot itself +// remains. +SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDValue Vec = Op.getOperand(0); + SDValue Ins = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + EVT VecVT = Vec.getValueType(); + EVT InsVT = Ins.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned InsNumElts = InsVT.getVectorNumElements(); + unsigned IdxVal = cast(Idx)->getZExtValue(); + SDLoc SL(Op); + + for (unsigned I = 0; I != InsNumElts; ++I) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins, + DAG.getConstant(I, SL, MVT::i32)); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt, + DAG.getConstant(IdxVal + I, SL, MVT::i32)); + } + return Vec; +} + SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getOperand(0); @@ -4352,12 +4718,12 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, MVT IntVT = MVT::getIntegerVT(VecSize); // Avoid stack access for dynamic indexing. 
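For the dynamic case below, the rewritten code splats the inserted value across the whole vector and merges it under a per-lane mask (the v_bfi_b32/v_bfm_b32 pattern the old comment named), instead of zero-extending a single element. A minimal standalone model of that masked merge, with illustrative names that are not part of the patch:

    #include <cstdint>
    #include <cstdio>

    // Insert a 16-bit element into a packed <2 x i16> held in a 32-bit
    // register, the way v_bfi_b32 does it: splat the value, build a mask
    // for the selected lane, then merge (mask & splat) | (~mask & vec).
    uint32_t insertElt16(uint32_t Vec, uint16_t Val, unsigned Idx) {
      uint32_t Splat = (uint32_t(Val) << 16) | Val; // value in both lanes
      uint32_t Mask  = 0xffffu << (Idx * 16);       // selects the target lane
      return (Splat & Mask) | (Vec & ~Mask);
    }

    int main() {
      printf("%08x\n", insertElt16(0x11112222, 0xabcd, 1)); // prints abcd2222
    }

Splatting first is what lets the same mask-and-merge sequence work for any power-of-two element size; only the mask depends on the index.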
- SDValue Val = InsVal; - if (InsVal.getValueType() == MVT::f16) - Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); - // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val); + + // Create a congruent vector with the target value in each element so that + // the required element can be masked and ORed into the target vector. + SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, + DAG.getSplatBuildVector(VecVT, SL, InsVal)); assert(isPowerOf2_32(EltSize)); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); @@ -4419,6 +4785,63 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); } +static bool elementPairIsContiguous(ArrayRef Mask, int Elt) { + assert(Elt % 2 == 0); + return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0); +} + +SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT ResultVT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast(Op); + + EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16; + EVT EltVT = PackVT.getVectorElementType(); + int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements(); + + // vector_shuffle <0,1,6,7> lhs, rhs + // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2) + // + // vector_shuffle <6,7,2,3> lhs, rhs + // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2) + // + // vector_shuffle <6,7,0,1> lhs, rhs + // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0) + + // Avoid scalarizing when both halves are reading from consecutive elements. + SmallVector Pieces; + for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) { + if (elementPairIsContiguous(SVN->getMask(), I)) { + const int Idx = SVN->getMaskElt(I); + int VecIdx = Idx < SrcNumElts ? 0 : 1; + int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts; + SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, + PackVT, SVN->getOperand(VecIdx), + DAG.getConstant(EltIdx, SL, MVT::i32)); + Pieces.push_back(SubVec); + } else { + const int Idx0 = SVN->getMaskElt(I); + const int Idx1 = SVN->getMaskElt(I + 1); + int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1; + int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1; + int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts; + int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts; + + SDValue Vec0 = SVN->getOperand(VecIdx0); + SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32)); + + SDValue Vec1 = SVN->getOperand(VecIdx1); + SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32)); + Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 })); + } + } + + return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); +} + SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -4512,11 +4935,18 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. - SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, - GAFlags); - SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, - GAFlags == SIInstrInfo::MO_NONE ? 
- GAFlags : GAFlags + 1); + unsigned LoFlags = GAFlags; + if (LoFlags == SIInstrInfo::MO_NONE) + LoFlags = SIInstrInfo::MO_REL32; + SDValue PtrLo = + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags); + SDValue PtrHi; + if (GAFlags == SIInstrInfo::MO_NONE) { + PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); + } else { + PtrHi = + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1); + } return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); } @@ -4525,7 +4955,10 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast(Op); const GlobalValue *GV = GSD->getGlobal(); - if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + (!GV->hasExternalLinkage() || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) || GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); @@ -4533,7 +4966,12 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDLoc DL(GSD); EVT PtrVT = Op.getValueType(); - // FIXME: Should not make address space based decisions here. + if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(), + SIInstrInfo::MO_ABS32_LO); + return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA); + } + if (shouldEmitFixup(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); else if (shouldEmitPCReloc(GV)) @@ -4641,10 +5079,8 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, } static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, - SDValue *GLC, SDValue *SLC) { - auto CachePolicyConst = dyn_cast(CachePolicy.getNode()); - if (!CachePolicyConst) - return false; + SDValue *GLC, SDValue *SLC, SDValue *DLC) { + auto CachePolicyConst = cast(CachePolicy.getNode()); uint64_t Value = CachePolicyConst->getZExtValue(); SDLoc DL(CachePolicy); @@ -4656,6 +5092,10 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); Value &= ~(uint64_t)0x2; } + if (DLC) { + *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x4; + } return Value == 0; } @@ -4689,14 +5129,14 @@ static SDValue constructRetValue(SelectionDAG &DAG, EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) : AdjEltVT; - // Special case for v8f16. Rather than add support for this, use v4i32 to + // Special case for v6f16. 
Rather than add support for this, use v3i32 to // extract the data elements - bool V8F16Special = false; - if (CastVT == MVT::v8f16) { - CastVT = MVT::v4i32; + bool V6F16Special = false; + if (NumElts == 6) { + CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2); DMaskPop >>= 1; ReqRetNumElts >>= 1; - V8F16Special = true; + V6F16Special = true; AdjVT = MVT::v2i32; } @@ -4726,7 +5166,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, PreTFCRes = BVElts[0]; } - if (V8F16Special) + if (V6F16Special) PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); if (!IsTexFail) { @@ -4745,9 +5185,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, SDValue *LWE, bool &IsTexFail) { - auto TexFailCtrlConst = dyn_cast(TexFailCtrl.getNode()); - if (!TexFailCtrlConst) - return false; + auto TexFailCtrlConst = cast(TexFailCtrl.getNode()); uint64_t Value = TexFailCtrlConst->getZExtValue(); if (Value) { @@ -4774,7 +5212,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); + const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = + AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; + bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10; SmallVector ResultTypes(Op->value_begin(), Op->value_end()); SmallVector OrigResultTypes(Op->value_begin(), Op->value_end()); @@ -4810,9 +5251,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } else { unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa(Op) ? 2 : 1; - auto DMaskConst = dyn_cast(Op.getOperand(DMaskIdx)); - if (!DMaskConst) - return Op; + auto DMaskConst = cast(Op.getOperand(DMaskIdx)); DMask = DMaskConst->getZExtValue(); DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); @@ -4821,8 +5260,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT StoreVT = VData.getSimpleValueType(); if (StoreVT.getScalarType() == MVT::f16) { - if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || - !BaseOpcode->HasD16) + if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) return Op; // D16 is unsupported for this instruction IsD16 = true; @@ -4835,8 +5273,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // and whether packing is supported. MVT LoadVT = ResultTypes[0].getSimpleVT(); if (LoadVT.getScalarType() == MVT::f16) { - if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || - !BaseOpcode->HasD16) + if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) return Op; // D16 is unsupported for this instruction IsD16 = true; @@ -4878,6 +5315,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } + // Optimize _mip away, when 'lod' is zero + if (MIPMappingInfo) { + if (auto ConstantLod = + dyn_cast(Op.getOperand(AddrIdx+NumVAddrs-1))) { + if (ConstantLod->isNullValue()) { + IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip + NumMIVAddrs--; // remove 'lod' + } + } + } + // Check for 16 bit addresses and pack if true. 
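Where the trailing comment says 16-bit addresses may be packed, two coordinates end up sharing one 32-bit address register, halving the address operands an a16 MIMG instruction consumes. A rough standalone model of that packing, assuming simple low/high placement (the helper name is mine; the DAG code itself builds v2i16 values rather than raw dwords):

    #include <cstdint>

    // Pack two 16-bit texture coordinates into one dword for the a16 path.
    uint32_t packA16(uint16_t Lo, uint16_t Hi) {
      return (uint32_t(Hi) << 16) | Lo;
    }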
unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); @@ -4915,7 +5363,22 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VAddrs.push_back(Op.getOperand(AddrIdx + i)); } - SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs); + // If the register allocator cannot place the address registers contiguously + // without introducing moves, then using the non-sequential address encoding + // is always preferable, since it saves VALU instructions and is usually a + // wash in terms of code size or even better. + // + // However, we currently have no way of hinting to the register allocator that + // MIMG addresses should be placed contiguously when it is possible to do so, + // so force non-NSA for the common 2-address case as a heuristic. + // + // SIShrinkInstructions will convert NSA encodings to non-NSA after register + // allocation when possible. + bool UseNSA = + ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3; + SDValue VAddr; + if (!UseNSA) + VAddr = getBuildDwordsVector(DAG, DL, VAddrs); SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); @@ -4926,9 +5389,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, CtrlIdx = AddrIdx + NumVAddrs + 1; } else { auto UnormConst = - dyn_cast(Op.getOperand(AddrIdx + NumVAddrs + 2)); - if (!UnormConst) - return Op; + cast(Op.getOperand(AddrIdx + NumVAddrs + 2)); Unorm = UnormConst->getZExtValue() ? True : False; CtrlIdx = AddrIdx + NumVAddrs + 3; @@ -4965,9 +5426,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, return Undef; } - // Have to use a power of 2 number of dwords - NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords); - EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) : MVT::f32; @@ -4983,45 +5441,66 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDValue GLC; SDValue SLC; + SDValue DLC; if (BaseOpcode->Atomic) { GLC = True; // TODO no-return optimization - if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC)) + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC, + IsGFX10 ? &DLC : nullptr)) return Op; } else { - if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC)) + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC, + IsGFX10 ? &DLC : nullptr)) return Op; } - SmallVector Ops; + SmallVector Ops; if (BaseOpcode->Store || BaseOpcode->Atomic) Ops.push_back(VData); // vdata - Ops.push_back(VAddr); + if (UseNSA) { + for (const SDValue &Addr : VAddrs) + Ops.push_back(Addr); + } else { + Ops.push_back(VAddr); + } Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc if (BaseOpcode->Sampler) Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); + if (IsGFX10) + Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); Ops.push_back(Unorm); + if (IsGFX10) + Ops.push_back(DLC); Ops.push_back(GLC); Ops.push_back(SLC); Ops.push_back(IsA16 && // a16 or r128 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); Ops.push_back(TFE); // tfe Ops.push_back(LWE); // lwe - Ops.push_back(DimInfo->DA ? True : False); + if (!IsGFX10) + Ops.push_back(DimInfo->DA ? True : False); if (BaseOpcode->HasD16) Ops.push_back(IsD16 ? True : False); if (isa(Op)) Ops.push_back(Op.getOperand(0)); // chain - int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32; + int NumVAddrDwords = + UseNSA ? 
VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; int Opcode = -1; - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, - NumVDataDwords, NumVAddrDwords); - if (Opcode == -1) - Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, + if (IsGFX10) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? AMDGPU::MIMGEncGfx10NSA + : AMDGPU::MIMGEncGfx10Default, NumVDataDwords, NumVAddrDwords); + } else { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, + NumVDataDwords, NumVAddrDwords); + } assert(Opcode != -1); MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); @@ -5046,7 +5525,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, - SDValue Offset, SDValue GLC, + SDValue Offset, SDValue GLC, SDValue DLC, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand( @@ -5059,7 +5538,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Ops[] = { Rsrc, Offset, // Offset - GLC // glc + GLC, + DLC, }; return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(VT), Ops, VT, MMO); @@ -5263,16 +5743,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); - case SIIntrinsic::SI_load_const: { - SDValue Load = - lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2), - DAG.getTargetConstant(0, DL, MVT::i1), DAG); - return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load); - } + case Intrinsic::amdgcn_wavefrontsize: + return DAG.getConstant(MF.getSubtarget().getWavefrontSize(), + SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { - unsigned Cache = cast(Op.getOperand(3))->getZExtValue(); - return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), - DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG); + bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10; + SDValue GLC; + SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1); + if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr, + IsGFX10 ? 
&DLC : nullptr)) + return Op; + return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC, + DAG); } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); @@ -5295,12 +5777,70 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Glue); } + case Intrinsic::amdgcn_interp_p1_f16: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = M0.getValue(1); + if (getSubtarget()->getLDSBankCount() == 16) { + // 16 bank LDS + SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + Glue); + SDValue Ops[] = { + Op.getOperand(1), // Src0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + S, // Src2 - holds two f16 values selected by high + DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + Op.getOperand(4), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + DAG.getConstant(0, DL, MVT::i32) // $omod + }; + return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); + } else { + // 32 bank LDS + SDValue Ops[] = { + Op.getOperand(1), // Src0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + Op.getOperand(4), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + DAG.getConstant(0, DL, MVT::i32), // $omod + Glue + }; + return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); + } + } + case Intrinsic::amdgcn_interp_p2_f16: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6)); + SDValue Glue = SDValue(M0.getNode(), 1); + SDValue Ops[] = { + Op.getOperand(2), // Src0 + Op.getOperand(3), // Attrchan + Op.getOperand(4), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + Op.getOperand(1), // Src2 + DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + Op.getOperand(5), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + Glue + }; + return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops); + } case Intrinsic::amdgcn_sin: return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_cos: return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_mul_u24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_mul_i24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_log_clamp: { if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return SDValue(); @@ -5334,10 +5874,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_div_scale: { - // 3rd parameter required to be a constant. - const ConstantSDNode *Param = dyn_cast(Op.getOperand(3)); - if (!Param) - return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL); + const ConstantSDNode *Param = cast(Op.getOperand(3)); // Translate to the operands expected by the machine instruction. The // first parameter must be the same as the first instruction. 
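parseCachePolicy, extended in this change to take a DLC output on gfx10, peels the cachepolicy immediate one bit at a time and rejects any leftover bits. A standalone sketch of the same decoding (struct and function names are mine):

    #include <cstdint>
    #include <optional>

    struct CachePolicy { bool GLC, SLC, DLC; };

    // glc = bit 0, slc = bit 1, dlc = bit 2 (gfx10 only); any other set bit
    // means the operand is not a valid cache policy.
    std::optional<CachePolicy> decodeCachePolicy(uint64_t Value, bool IsGFX10) {
      CachePolicy P{(Value & 1) != 0, (Value & 2) != 0,
                    IsGFX10 && (Value & 4) != 0};
      Value &= ~uint64_t(IsGFX10 ? 7 : 3);
      return Value == 0 ? std::optional<CachePolicy>(P) : std::nullopt;
    }

On pre-gfx10 targets a set dlc bit is left in Value and the decode fails, which is why callers only pass a DLC slot when IsGFX10 is true.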
@@ -5423,6 +5960,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fmad_ftz: return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::amdgcn_if_break: + return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT, + Op->getOperand(1), Op->getOperand(2)), 0); + + case Intrinsic::amdgcn_groupstaticsize: { + Triple::OSType OS = getTargetMachine().getTargetTriple().getOS(); + if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) + return Op; + + const Module *M = MF.getFunction().getParent(); + const GlobalValue *GV = + M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize)); + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, + SIInstrInfo::MO_ABS32_LO); + return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -5438,9 +5992,99 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDLoc DL(Op); switch (IntrID) { + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: { + MemSDNode *M = cast(Op); + SDValue Chain = M->getOperand(0); + SDValue M0 = M->getOperand(2); + SDValue Value = M->getOperand(3); + unsigned IndexOperand = M->getConstantOperandVal(7); + unsigned WaveRelease = M->getConstantOperandVal(8); + unsigned WaveDone = M->getConstantOperandVal(9); + unsigned ShaderType; + unsigned Instruction; + + unsigned OrderedCountIndex = IndexOperand & 0x3f; + IndexOperand &= ~0x3f; + unsigned CountDw = 0; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { + CountDw = (IndexOperand >> 24) & 0xf; + IndexOperand &= ~(0xf << 24); + + if (CountDw < 1 || CountDw > 4) { + report_fatal_error( + "ds_ordered_count: dword count must be between 1 and 4"); + } + } + + if (IndexOperand) + report_fatal_error("ds_ordered_count: bad index operand"); + + switch (IntrID) { + case Intrinsic::amdgcn_ds_ordered_add: + Instruction = 0; + break; + case Intrinsic::amdgcn_ds_ordered_swap: + Instruction = 1; + break; + } + + if (WaveDone && !WaveRelease) + report_fatal_error("ds_ordered_count: wave_done requires wave_release"); + + switch (DAG.getMachineFunction().getFunction().getCallingConv()) { + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + ShaderType = 0; + break; + case CallingConv::AMDGPU_PS: + ShaderType = 1; + break; + case CallingConv::AMDGPU_VS: + ShaderType = 2; + break; + case CallingConv::AMDGPU_GS: + ShaderType = 3; + break; + default: + report_fatal_error("ds_ordered_count unsupported for this calling conv"); + } + + unsigned Offset0 = OrderedCountIndex << 2; + unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | + (Instruction << 4); + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) + Offset1 |= (CountDw - 1) << 6; + + unsigned Offset = Offset0 | (Offset1 << 8); + + SDValue Ops[] = { + Chain, + Value, + DAG.getTargetConstant(Offset, DL, MVT::i16), + copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue + }; + return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL, + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } + case Intrinsic::amdgcn_ds_fadd: { + MemSDNode *M = cast(Op); + unsigned Opc; + switch (IntrID) { + case Intrinsic::amdgcn_ds_fadd: + Opc = ISD::ATOMIC_LOAD_FADD; + break; + } + + return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), + M->getOperand(0), M->getOperand(2), M->getOperand(3), + M->getMemOperand()); + } 
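The ds_ordered_add/ds_ordered_swap lowering above folds all of its mode bits into the 16-bit DS offset field. The same arithmetic, mirrored as a standalone helper (the function name is mine; the field layout is taken from this hunk):

    #include <cstdint>

    // offset[7:2]   ordered-count index (index * 4)
    // offset[8]     wave_release          offset[9]   wave_done
    // offset[11:10] shader type           offset[12]  instruction (add=0, swap=1)
    // offset[15:14] dword count - 1 (gfx10 only)
    uint32_t encodeOrderedCountOffset(unsigned Index, bool WaveRelease,
                                      bool WaveDone, unsigned ShaderType,
                                      unsigned Instruction, unsigned CountDw,
                                      bool IsGFX10) {
      unsigned Offset0 = Index << 2;
      unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                         (Instruction << 4);
      if (IsGFX10)
        Offset1 |= (CountDw - 1) << 6;
      return Offset0 | (Offset1 << 8);
    }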
case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast(Op); @@ -5452,9 +6096,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_atomic_dec: Opc = AMDGPUISD::ATOMIC_DEC; break; - case Intrinsic::amdgcn_ds_fadd: - Opc = AMDGPUISD::ATOMIC_LOAD_FADD; - break; case Intrinsic::amdgcn_ds_fmin: Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; break; @@ -5503,8 +6144,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: { @@ -5531,8 +6178,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: { @@ -5559,8 +6212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast(Op); @@ -5588,9 +6247,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_raw_tbuffer_load: { MemSDNode *M = cast(Op); @@ -5612,9 +6271,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case 
Intrinsic::amdgcn_struct_tbuffer_load: { MemSDNode *M = cast(Op); @@ -5636,9 +6295,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -5913,6 +6572,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } } +// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to +// dwordx4 if on SI. +SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, + SDVTList VTList, + ArrayRef Ops, EVT MemVT, + MachineMemOperand *MMO, + SelectionDAG &DAG) const { + EVT VT = VTList.VTs[0]; + EVT WidenedVT = VT; + EVT WidenedMemVT = MemVT; + if (!Subtarget->hasDwordx3LoadStores() && + (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) { + WidenedVT = EVT::getVectorVT(*DAG.getContext(), + WidenedVT.getVectorElementType(), 4); + WidenedMemVT = EVT::getVectorVT(*DAG.getContext(), + WidenedMemVT.getVectorElementType(), 4); + MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16); + } + + assert(VTList.NumVTs == 2); + SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]); + + auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops, + WidenedMemVT, MMO); + if (WidenedVT != VT) { + auto Extract = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp, + DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL); + } + return NewOp; +} + SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG) const { EVT StoreVT = VData.getValueType(); @@ -6129,6 +6821,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast(Op); + + // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) + return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } @@ -6155,6 +6853,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast(Op); + + // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) + return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } @@ -6181,10 +6885,63 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast(Op); + + // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) + return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_buffer_atomic_fadd: { + unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast(Op.getOperand(4))) + IdxEn = Idx->getZExtValue() != 0; + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + }; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + EVT VT = Op.getOperand(2).getValueType(); + + auto *M = cast(Op); + unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD + : AMDGPUISD::BUFFER_ATOMIC_FADD; + + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } + + case Intrinsic::amdgcn_global_atomic_fadd: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // ptr + Op.getOperand(3) // vdata + }; + EVT VT = Op.getOperand(3).getValueType(); + + auto *M = cast(Op); + unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD + : AMDGPUISD::ATOMIC_FADD; + + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } + + case Intrinsic::amdgcn_end_cf: + return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, + Op->getOperand(2), Chain), 0); + default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -6283,6 +7040,38 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, Offsets[2] = DAG.getConstant(0, DL, MVT::i32); } +// Handle 8 bit and 16 bit buffer loads +SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, + EVT LoadVT, SDLoc DL, + ArrayRef Ops, + MemSDNode *M) const { + EVT IntVT = LoadVT.changeTypeToInteger(); + unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ? + AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT; + + SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other); + SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList, + Ops, IntVT, + M->getMemOperand()); + SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL, + LoadVT.getScalarType(), BufferLoad); + return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL); +} + +// Handle 8 bit and 16 bit buffer stores +SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, + EVT VDataType, SDLoc DL, + SDValue Ops[], + MemSDNode *M) const { + SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); + Ops[1] = BufferStoreExt; + unsigned Opc = (VDataType == MVT::i8) ? 
AMDGPUISD::BUFFER_STORE_BYTE : + AMDGPUISD::BUFFER_STORE_SHORT; + ArrayRef OpsRef = makeArrayRef(&Ops[0], 9); + return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType, + M->getMemOperand()); +} + static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT) { @@ -6395,8 +7184,25 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, RealMemVT, MMO); + if (!MemVT.isVector()) { + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + SmallVector Elts; + for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) { + SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD, + DAG.getConstant(I, DL, MVT::i32)); + + Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt)); + } + SDValue Ops[] = { - DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1) }; @@ -6409,15 +7215,21 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned Alignment = Load->getAlignment(); - unsigned AS = Load->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - AS, Alignment)) { + *Load->getMemOperand())) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); } + unsigned Alignment = Load->getAlignment(); + unsigned AS = Load->getAddressSpace(); + if (Subtarget->hasLDSMisalignedBug() && + AS == AMDGPUAS::FLAT_ADDRESS && + Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { + return SplitVectorLoad(Op, DAG); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); // If there is a possibilty that flat instruction access scratch memory @@ -6430,8 +7242,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) - return SDValue(); + if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) { + if (MemVT.isPow2VectorType()) + return SDValue(); + if (NumElements == 3) + return WidenVectorLoad(Op, DAG); + return SplitVectorLoad(Op, DAG); + } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. @@ -6443,8 +7260,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && - Alignment >= 4 && NumElements < 32) - return SDValue(); + Alignment >= 4 && NumElements < 32) { + if (MemVT.isPow2VectorType()) + return SDValue(); + if (NumElements == 3) + return WidenVectorLoad(Op, DAG); + return SplitVectorLoad(Op, DAG); + } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. 
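For <N x i1> loads, the LowerLOAD change above performs one extending i32 load and then recovers lane I as bit I with an srl and a truncate. The unpacking, modeled on plain integers (the template helper is mine):

    #include <array>
    #include <cstdint>

    // Recover an <N x i1> vector from the single integer that was loaded:
    // element I of the vector is bit I of the loaded value.
    template <unsigned N>
    std::array<bool, N> unpackI1Vector(uint32_t Loaded) {
      std::array<bool, N> Elts{};
      for (unsigned I = 0; I != N; ++I)
        Elts[I] = (Loaded >> I) & 1;
      return Elts;
    }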
@@ -6456,7 +7278,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); - // v4 loads are supported for private and global memory. + // v3 loads not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return WidenVectorLoad(Op, DAG); + // v3 and v4 loads are supported for private and global memory. return SDValue(); } if (AS == AMDGPUAS::PRIVATE_ADDRESS) { @@ -6474,11 +7299,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // Same as global/flat if (NumElements > 4) return SplitVectorLoad(Op, DAG); + // v3 loads not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return WidenVectorLoad(Op, DAG); return SDValue(); default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { // Use ds_read_b128 if possible. if (Subtarget->useDS128() && Load->getAlignment() >= 16 && MemVT.getStoreSize() == 16) @@ -6794,7 +7622,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (!Subtarget->hasUsableDivScaleConditionOutput()) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -6856,12 +7684,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); - unsigned AS = Store->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - AS, Store->getAlignment())) { + *Store->getMemOperand())) { return expandUnalignedStore(Store, DAG); } + unsigned AS = Store->getAddressSpace(); + if (Subtarget->hasLDSMisalignedBug() && + AS == AMDGPUAS::FLAT_ADDRESS && + Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) { + return SplitVectorStore(Op, DAG); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); // If there is a possibilty that flat instruction access scratch memory @@ -6875,6 +7709,9 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorStore(Op, DAG); + // v3 stores not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return SplitVectorStore(Op, DAG); return SDValue(); } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { @@ -6885,16 +7722,16 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SplitVectorStore(Op, DAG); return SDValue(); case 16: - if (NumElements > 4) + if (NumElements > 4 || NumElements == 3) return SplitVectorStore(Op, DAG); return SDValue(); default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { // Use ds_write_b128 if possible. if (Subtarget->useDS128() && Store->getAlignment() >= 16 && - VT.getStoreSize() == 16) + VT.getStoreSize() == 16 && NumElements != 3) return SDValue(); if (NumElements > 2) @@ -6905,7 +7742,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // out-of-bounds even if base + offsets is in bounds. 
Split vectorized // stores here to avoid emitting ds_write2_b32. We may re-combine the // store later in the SILoadStoreOptimizer. - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + if (!Subtarget->hasUsableDSOffset() && NumElements == 2 && VT.getStoreSize() == 8 && Store->getAlignment() < 8) { return SplitVectorStore(Op, DAG); @@ -7614,6 +8451,43 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N, + DAGCombinerInfo &DCI) + const { + SDValue Src = N->getOperand(0); + auto *VTSign = cast(N->getOperand(1)); + + if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && + VTSign->getVT() == MVT::i8) || + (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && + VTSign->getVT() == MVT::i16)) && + Src.hasOneUse()) { + auto *M = cast(Src); + SDValue Ops[] = { + Src.getOperand(0), // Chain + Src.getOperand(1), // rsrc + Src.getOperand(2), // vindex + Src.getOperand(3), // voffset + Src.getOperand(4), // soffset + Src.getOperand(5), // offset + Src.getOperand(6), + Src.getOperand(7) + }; + // replace with BUFFER_LOAD_BYTE/SHORT + SDVTList ResList = DCI.DAG.getVTList(MVT::i32, + Src.getOperand(0).getValueType()); + unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ? + AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT; + SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N), + ResList, + Ops, M->getMemoryVT(), + M->getMemOperand()); + return DCI.DAG.getMergeValues({BufferLoadSignExt, + BufferLoadSignExt.getValue(1)}, SDLoc(N)); + } + return SDValue(); +} + SDValue SITargetLowering::performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -8013,9 +8887,12 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, if (Cmp == APFloat::cmpGreaterThan) return SDValue(); + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + // TODO: Check IEEE bit enabled? EVT VT = Op0.getValueType(); - if (Subtarget->enableDX10Clamp()) { + if (Info->getMode().DX10Clamp) { // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the // hardware fmed3 behavior converting to a min. // FIXME: Should this be allowing -0.0? @@ -8059,10 +8936,10 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. - if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && - !VT.isVector() && VT != MVT::f64 && - ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) { + !VT.isVector() && + (VT == MVT::i32 || VT == MVT::f32 || + ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) { // max(max(a, b), c) -> max3(a, b, c) // min(min(a, b), c) -> min3(a, b, c) if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { @@ -8149,9 +9026,12 @@ SDValue SITargetLowering::performFMed3Combine(SDNode *N, return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); } + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother // handling no dx10-clamp? - if (Subtarget->enableDX10Clamp()) { + if (Info->getMode().DX10Clamp) { // If NaNs is clamped to 0, we are free to reorder the inputs. 
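performSignExtendInRegCombine, added earlier in this change, relies on a simple identity: sign-extending the low 8 (or 16) bits of a zero-extended byte (or short) load yields exactly what the signed load variant returns, so buffer_load_ubyte + sign_extend_inreg folds to buffer_load_byte. The scalar identity it depends on, stated as runnable code:

    #include <cstdint>

    // sign_extend_inreg(i8) applied to a zero-extended byte load: the result
    // matches what BUFFER_LOAD_BYTE (the signed form) would have produced.
    int32_t signExtendInReg8(uint32_t ZExtLoaded) {
      return int32_t(int8_t(ZExtLoaded & 0xff));
    }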
if (isa(Src0) && !isa(Src1)) @@ -8342,8 +9222,10 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, // Only do this if we are not trying to support denormals. v_mad_f32 does not // support denormals ever. - if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || - (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) + if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || + (VT == MVT::f16 && !Subtarget->hasFP16Denormals() && + getSubtarget()->hasMadF16())) && + isOperationLegal(ISD::FMAD, VT)) return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; @@ -8357,6 +9239,46 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return 0; } +// For a reassociatable opcode perform: +// op x, (op y, z) -> op (op x, z), y, if x and z are uniform +SDValue SITargetLowering::reassociateScalarOps(SDNode *N, + SelectionDAG &DAG) const { + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + unsigned Opc = N->getOpcode(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + if (!(Op0->isDivergent() ^ Op1->isDivergent())) + return SDValue(); + + if (Op0->isDivergent()) + std::swap(Op0, Op1); + + if (Op1.getOpcode() != Opc || !Op1.hasOneUse()) + return SDValue(); + + SDValue Op2 = Op1.getOperand(1); + Op1 = Op1.getOperand(0); + if (!(Op1->isDivergent() ^ Op2->isDivergent())) + return SDValue(); + + if (Op1->isDivergent()) + std::swap(Op1, Op2); + + // If either operand is constant this will conflict with + // DAGCombiner::ReassociateOps(). + if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) || + DAG.isConstantIntBuildVectorOrConstantInt(Op1)) + return SDValue(); + + SDLoc SL(N); + SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); + return DAG.getNode(Opc, SL, VT, Add1, Op2); +} + static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, @@ -8405,6 +9327,10 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } + if (SDValue V = reassociateScalarOps(N, DAG)) { + return V; + } + if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); @@ -8452,14 +9378,10 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - unsigned Opc = LHS.getOpcode(); - if (Opc != ISD::SUBCARRY) - std::swap(RHS, LHS); - if (LHS.getOpcode() == ISD::SUBCARRY) { // sub (subcarry x, 0, cc), y => subcarry x, y, cc auto C = dyn_cast(LHS.getOperand(1)); - if (!C || C->getZExtValue() != 0) + if (!C || !C->isNullValue()) return SDValue(); SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args); @@ -8587,7 +9509,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, EVT VT = N->getValueType(0); SDLoc SL(N); - if (!Subtarget->hasDotInsts() || VT != MVT::f32) + if (!Subtarget->hasDot2Insts() || VT != MVT::f32) return SDValue(); // FMA((F32)S0.x, (F32)S1. 
@@ -8801,11 +9723,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
   if (!CSrc)
     return SDValue();
 
+  const MachineFunction &MF = DCI.DAG.getMachineFunction();
   const APFloat &F = CSrc->getValueAPF();
   APFloat Zero = APFloat::getZero(F.getSemantics());
   APFloat::cmpResult Cmp0 = F.compare(Zero);
   if (Cmp0 == APFloat::cmpLessThan ||
-      (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+      (Cmp0 == APFloat::cmpUnordered &&
+       MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
   }
 
@@ -8822,7 +9746,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
     return SDValue();
-
   switch (N->getOpcode()) {
   default:
     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -8873,11 +9796,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ATOMIC_LOAD_MAX:
   case ISD::ATOMIC_LOAD_UMIN:
   case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_LOAD_FADD:
   case AMDGPUISD::ATOMIC_INC:
   case AMDGPUISD::ATOMIC_DEC:
-  case AMDGPUISD::ATOMIC_LOAD_FADD:
   case AMDGPUISD::ATOMIC_LOAD_FMIN:
-  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
+  case AMDGPUISD::ATOMIC_LOAD_FMAX:  // TODO: Target mem intrinsics.
     if (DCI.isBeforeLegalize())
       break;
     return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
@@ -8889,6 +9812,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performXorCombine(N, DCI);
   case ISD::ZERO_EXTEND:
     return performZeroExtendCombine(N, DCI);
+  case ISD::SIGN_EXTEND_INREG:
+    return performSignExtendInRegCombine(N, DCI);
   case AMDGPUISD::FP_CLASS:
     return performClassCombine(N, DCI);
   case ISD::FCANONICALIZE:
@@ -9034,6 +9959,10 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 
   // Don't allow 0 dmask, as hardware assumes one channel enabled.
   bool NoChannels = !NewDmask;
   if (NoChannels) {
+    if (!UsesTFC) {
+      // No uses of the result and not using TFC. Then do nothing.
+      return Node;
+    }
     // If the original dmask has one channel - then nothing to do
     if (OldBitsSet == 1)
       return Node;
@@ -9205,7 +10134,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
       break;
 
     MVT VT = Src0.getValueType().getSimpleVT();
-    const TargetRegisterClass *RC = getRegClassFor(VT);
+    const TargetRegisterClass *RC =
+        getRegClassFor(VT, Src0.getNode()->isDivergent());
 
     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
@@ -9238,6 +10168,24 @@
     Ops.push_back(ImpDef.getValue(1));
     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   }
+  case AMDGPU::V_PERMLANE16_B32:
+  case AMDGPU::V_PERMLANEX16_B32: {
+    ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
+    ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
+    if (!FI->getZExtValue() && !BC->getZExtValue())
+      break;
+    SDValue VDstIn = Node->getOperand(6);
+    if (VDstIn.isMachineOpcode()
+        && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
+      break;
+    MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                               SDLoc(Node), MVT::i32);
+    SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
+                                    SDValue(BC, 0), Node->getOperand(3),
+                                    Node->getOperand(4), Node->getOperand(5),
+                                    SDValue(ImpDef, 0), Node->getOperand(7) };
+    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+  }
   default:
     break;
   }
@@ -9256,6 +10204,36 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   if (TII->isVOP3(MI.getOpcode())) {
     // Make sure constant bus requirements are respected.
     TII->legalizeOperandsVOP3(MRI, MI);
+
+    // Prefer VGPRs over AGPRs in mAI instructions where possible.
+    // This saves a chain-copy of registers and better balances register
+    // use between vgpr and agpr as agpr tuples tend to be big.
+    if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
+      unsigned Opc = MI.getOpcode();
+      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+      for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+                      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
+        if (I == -1)
+          break;
+        MachineOperand &Op = MI.getOperand(I);
+        if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
+             OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
+            !TargetRegisterInfo::isVirtualRegister(Op.getReg()) ||
+            !TRI->isAGPR(MRI, Op.getReg()))
+          continue;
+        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
+        if (!Src || !Src->isCopy() ||
+            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
+          continue;
+        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
+        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
+        // All uses of agpr64 and agpr32 can also accept vgpr except for
+        // v_accvgpr_read, but we do not produce agpr reads during selection,
+        // so no use checks are needed.
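+        // Demoting the operand to the equivalent VGPR class turns the
+        // incoming SGPR-to-AGPR copy into a plain SGPR-to-VGPR copy.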
+ MRI.setRegClass(Op.getReg(), NewRC); + } + } + return; } @@ -9391,9 +10369,15 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 64: RC = &AMDGPU::SGPR_64RegClass; break; + case 96: + RC = &AMDGPU::SReg_96RegClass; + break; case 128: RC = &AMDGPU::SReg_128RegClass; break; + case 160: + RC = &AMDGPU::SReg_160RegClass; + break; case 256: RC = &AMDGPU::SReg_256RegClass; break; @@ -9419,6 +10403,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 128: RC = &AMDGPU::VReg_128RegClass; break; + case 160: + RC = &AMDGPU::VReg_160RegClass; + break; case 256: RC = &AMDGPU::VReg_256RegClass; break; @@ -9427,6 +10414,29 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, break; } break; + case 'a': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + case 16: + RC = &AMDGPU::AGPR_32RegClass; + break; + case 64: + RC = &AMDGPU::AReg_64RegClass; + break; + case 128: + RC = &AMDGPU::AReg_128RegClass; + break; + case 512: + RC = &AMDGPU::AReg_512RegClass; + break; + case 1024: + RC = &AMDGPU::AReg_1024RegClass; + // v32 types are not legal but we support them here. + return std::make_pair(0U, RC); + } + break; } // We actually support i128, i16 and f16 as inline parameters // even if they are not reported as legal @@ -9440,6 +10450,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, RC = &AMDGPU::VGPR_32RegClass; } else if (Constraint[1] == 's') { RC = &AMDGPU::SGPR_32RegClass; + } else if (Constraint[1] == 'a') { + RC = &AMDGPU::AGPR_32RegClass; } if (RC) { @@ -9459,6 +10471,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { default: break; case 's': case 'v': + case 'a': return C_RegisterClass; } } @@ -9471,7 +10484,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Info->isEntryFunction()) { @@ -9479,31 +10492,45 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); } - // We have to assume the SP is needed in case there are calls in the function - // during lowering. Calls are only detected after the function is - // lowered. We're about to reserve registers, so don't bother using it if we - // aren't really going to use it. - bool NeedSP = !Info->isEntryFunction() || - MFI.hasVarSizedObjects() || - MFI.hasCalls(); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), + Info->getStackPtrOffsetReg())); + if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) + MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); - if (NeedSP) { - unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); - Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + // We need to worry about replacing the default register with itself in case + // of MIR testcases missing the MFI. 
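+  // (MachineRegisterInfo::replaceRegWith asserts if the source and
+  // destination registers are identical, hence the guards below.)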
+  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
+    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
 
-    assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
-    assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
-                               Info->getStackPtrOffsetReg()));
-    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
-  }
+  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
+    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
 
-  MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
-  MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
-  MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
-                     Info->getScratchWaveOffsetReg());
+  if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
+    MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
+                       Info->getScratchWaveOffsetReg());
+  }
 
   Info->limitOccupancy(MF);
 
+  if (ST.isWave32() && !MF.empty()) {
+    // Add a VCC_HI def because many instructions are marked as implicitly
+    // using VCC, while we may define only VCC_LO. If nothing defines VCC_HI
+    // we may end up with a use of undef.
+
+    const SIInstrInfo *TII = ST.getInstrInfo();
+    DebugLoc DL;
+
+    MachineBasicBlock &MBB = MF.front();
+    MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
+    BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);
+
+    for (auto &MBB : MF) {
+      for (auto &MI : MBB) {
+        TII->fixImplicitOperands(MI);
+      }
+    }
+  }
+
   TargetLoweringBase::finalizeLowering(MF);
 }
 
@@ -9515,14 +10542,81 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
   TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
                                                 DAG, Depth);
 
-  if (getSubtarget()->enableHugePrivateBuffer())
-    return;
-
-  // Technically it may be possible to have a dispatch with a single workitem
-  // that uses the full private memory size, but that's not really useful. We
-  // can't use vaddr in MUBUF instructions if we don't know the address
+  // Set the high bits to zero based on the maximum allowed scratch size per
+  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
   // calculation won't overflow, so assume the sign bit is never set.
-  Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
+  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
+}
+
+unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+  const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
+  const unsigned CacheLineAlign = 6; // log2(64)
+
+  // Pre-GFX10 targets did not benefit from loop alignment.
+  if (!ML || DisableLoopAlignment ||
+      (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
+      getSubtarget()->hasInstFwdPrefetchBug())
+    return PrefAlign;
+
+  // On GFX10 the I$ has 4 x 64-byte cache lines.
+  // By default the prefetcher keeps one cache line behind and reads two ahead.
+  // We can modify it with S_INST_PREFETCH so that larger loops have two lines
+  // behind and one ahead.
+  // Therefore we can benefit from aligning loop headers if the loop fits in
+  // 192 bytes.
+  // If the loop fits in 64 bytes it always spans no more than two cache lines
+  // and does not need an alignment.
+  // Else, if the loop is at most 128 bytes, we do not need to modify the
+  // prefetch.
+  // Else, if the loop is at most 192 bytes, we need two lines behind.
+
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  const MachineBasicBlock *Header = ML->getHeader();
+  if (Header->getAlignment() != PrefAlign)
+    return Header->getAlignment(); // Already processed.
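+  // Sum the instruction sizes in the loop; once it exceeds the 192-byte
+  // prefetch window there is nothing to gain from extra alignment.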
+ + unsigned LoopSize = 0; + for (const MachineBasicBlock *MBB : ML->blocks()) { + // If inner loop block is aligned assume in average half of the alignment + // size to be added as nops. + if (MBB != Header) + LoopSize += (1 << MBB->getAlignment()) / 2; + + for (const MachineInstr &MI : *MBB) { + LoopSize += TII->getInstSizeInBytes(MI); + if (LoopSize > 192) + return PrefAlign; + } + } + + if (LoopSize <= 64) + return PrefAlign; + + if (LoopSize <= 128) + return CacheLineAlign; + + // If any of parent loops is surrounded by prefetch instructions do not + // insert new for inner loop, which would reset parent's settings. + for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) { + if (MachineBasicBlock *Exit = P->getExitBlock()) { + auto I = Exit->getFirstNonDebugInstr(); + if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH) + return CacheLineAlign; + } + } + + MachineBasicBlock *Pre = ML->getLoopPreheader(); + MachineBasicBlock *Exit = ML->getExitBlock(); + + if (Pre && Exit) { + BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(), + TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(1); // prefetch 2 lines behind PC + + BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(), + TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(2); // prefetch 1 line behind PC + } + + return CacheLineAlign; } LLVM_ATTRIBUTE_UNUSED @@ -9531,7 +10625,8 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) { do { // Follow the chain until we find an INLINEASM node. N = N->getOperand(0).getNode(); - if (N->getOpcode() == ISD::INLINEASM) + if (N->getOpcode() == ISD::INLINEASM || + N->getOpcode() == ISD::INLINEASM_BR) return true; } while (N->getOpcode() == ISD::CopyFromReg); return false; @@ -9616,7 +10711,10 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, bool SNaN, unsigned Depth) const { if (Op.getOpcode() == AMDGPUISD::CLAMP) { - if (Subtarget->enableDX10Clamp()) + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + + if (Info->getMode().DX10Clamp) return true; // Clamped to 0. return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); } @@ -9624,3 +10722,29 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN, Depth); } + +TargetLowering::AtomicExpansionKind +SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + switch (RMW->getOperation()) { + case AtomicRMWInst::FAdd: { + Type *Ty = RMW->getType(); + + // We don't have a way to support 16-bit atomics now, so just leave them + // as-is. + if (Ty->isHalfTy()) + return AtomicExpansionKind::None; + + if (!Ty->isFloatTy()) + return AtomicExpansionKind::CmpXChg; + + // TODO: Do have these for flat. Older targets also had them for buffers. + unsigned AS = RMW->getPointerAddressSpace(); + return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ? + AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg; + } + default: + break; + } + + return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index bcef519ee663..21a215e16ce7 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -1,9 +1,8 @@ //===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -61,7 +60,7 @@ private: SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const; SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset, - SDValue GLC, SelectionDAG &DAG) const; + SDValue GLC, SDValue DLC, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; @@ -90,11 +89,17 @@ private: SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG, ArrayRef Ops, bool IsIntrinsic = false) const; + // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to + // dwordx4 if on SI. + SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, + ArrayRef Ops, EVT MemVT, + MachineMemOperand *MMO, SelectionDAG &DAG) const; + SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; /// Converts \p Op, which must be of floating point type, to the @@ -116,8 +121,10 @@ private: SelectionDAG &DAG) const; SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const; @@ -141,6 +148,7 @@ private: SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const; @@ -156,6 +164,7 @@ private: SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -174,8 +183,6 @@ private: unsigned isCFIntrinsic(const SDNode *Intr) const; - void createDebuggerPrologueStackObjects(MachineFunction &MF) const; - /// \returns True if fixup needs to be emitted for given global value \p GV, /// false otherwise. 
bool shouldEmitFixup(const GlobalValue *GV) const; @@ -194,6 +201,15 @@ private: void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, unsigned Align = 4) const; + // Handle 8 bit and 16 bit buffer loads + SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, + ArrayRef Ops, MemSDNode *M) const; + + // Handle 8 bit and 16 bit buffer stores + SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType, + SDLoc DL, SDValue Ops[], + MemSDNode *M) const; + public: SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI); @@ -219,20 +235,21 @@ public: bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override; - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, - unsigned Align, - bool *IsFast) const override; + bool allowsMisalignedMemoryAccesses( + EVT VT, unsigned AS, unsigned Align, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *IsFast = nullptr) const override; EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; + const AttributeList &FuncAttributes) const override; bool isMemOpUniform(const SDNode *N) const; bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const; bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; - bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; @@ -298,6 +315,9 @@ public: MachineBasicBlock *splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; @@ -352,6 +372,9 @@ public: const SelectionDAG &DAG, bool SNaN = false, unsigned Depth = 0) const override; + AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + + unsigned getPrefLoopAlignment(MachineLoop *ML) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp index ba21a5ce1293..87e63fcc4a04 100644 --- a/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -1,9 +1,8 @@ //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -93,15 +92,13 @@ INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE, char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; -static bool opcodeEmitsNoInsts(unsigned Opc) { - switch (Opc) { - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case TargetOpcode::BUNDLE: - case TargetOpcode::CFI_INSTRUCTION: - case TargetOpcode::EH_LABEL: - case TargetOpcode::GC_LABEL: - case TargetOpcode::DBG_VALUE: +static bool opcodeEmitsNoInsts(const MachineInstr &MI) { + if (MI.isMetaInstruction()) + return true; + + // Handle target specific opcodes. 
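+  // SI_MASK_BRANCH is a pseudo terminator that is lowered to no machine
+  // code, so it can never count against the skip threshold.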
+ switch (MI.getOpcode()) { + case AMDGPU::SI_MASK_BRANCH: return true; default: return false; @@ -110,9 +107,6 @@ static bool opcodeEmitsNoInsts(unsigned Opc) { bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, const MachineBasicBlock &To) const { - if (From.succ_empty()) - return false; - unsigned NumInstr = 0; const MachineFunction *MF = From.getParent(); @@ -122,7 +116,7 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); NumInstr < SkipThreshold && I != E; ++I) { - if (opcodeEmitsNoInsts(I->getOpcode())) + if (opcodeEmitsNoInsts(*I)) continue; // FIXME: Since this is required for correctness, this should be inserted @@ -138,6 +132,11 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) return true; + // These instructions are potentially expensive even if EXEC = 0. + if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || + I->getOpcode() == AMDGPU::S_WAITCNT) + return true; + ++NumInstr; if (NumInstr >= SkipThreshold) return true; @@ -177,7 +176,7 @@ bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { .addImm(0); // en // ... and terminate wavefront. - BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0); return true; } @@ -245,6 +244,10 @@ void SIInsertSkips::kill(MachineInstr &MI) { llvm_unreachable("invalid ISD:SET cond code"); } + const GCNSubtarget &ST = MBB.getParent()->getSubtarget(); + if (ST.hasNoSdstCMPX()) + Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode); + assert(MI.getOperand(0).isReg()); if (TRI->isVGPR(MBB.getParent()->getRegInfo(), @@ -254,17 +257,23 @@ void SIInsertSkips::kill(MachineInstr &MI) { .add(MI.getOperand(1)) .add(MI.getOperand(0)); } else { - BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .addReg(AMDGPU::VCC, RegState::Define) - .addImm(0) // src0 modifiers - .add(MI.getOperand(1)) - .addImm(0) // src1 modifiers - .add(MI.getOperand(0)) - .addImm(0); // omod + auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode)); + if (!ST.hasNoSdstCMPX()) + I.addReg(AMDGPU::VCC, RegState::Define); + + I.addImm(0) // src0 modifiers + .add(MI.getOperand(1)) + .addImm(0) // src1 modifiers + .add(MI.getOperand(0)); + + I.addImm(0); // omod } break; } case AMDGPU::SI_KILL_I1_TERMINATOR: { + const MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget(); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; const MachineOperand &Op = MI.getOperand(0); int64_t KillVal = MI.getOperand(1).getImm(); assert(KillVal == 0 || KillVal == -1); @@ -275,14 +284,17 @@ void SIInsertSkips::kill(MachineInstr &MI) { assert(Imm == 0 || Imm == -1); if (Imm == KillVal) - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32 + : AMDGPU::S_MOV_B64), Exec) .addImm(0); break; } unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64; - BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + if (ST.isWave32()) + Opcode = KillVal ? 
AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32; + BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec) + .addReg(Exec) .add(Op); break; } @@ -331,9 +343,11 @@ bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const { // S_CBRANCH_EXEC[N]Z bool Changed = false; MachineBasicBlock &MBB = *MI.getParent(); - const unsigned CondReg = AMDGPU::VCC; - const unsigned ExecReg = AMDGPU::EXEC; - const unsigned And = AMDGPU::S_AND_B64; + const GCNSubtarget &ST = MBB.getParent()->getSubtarget(); + const bool IsWave32 = ST.isWave32(); + const unsigned CondReg = TRI->getVCC(); + const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), E = MBB.rend(); diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index afc0b4467610..c89d5b71ec5c 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1,9 +1,8 @@ //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -69,10 +68,10 @@ DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm", DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm", "Force emit s_waitcnt vmcnt(0) instrs"); -static cl::opt ForceEmitZeroFlag( +static cl::opt ForceEmitZeroFlag( "amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), - cl::init(0), cl::Hidden); + cl::init(false), cl::Hidden); namespace { @@ -101,7 +100,7 @@ public: #define CNT_MASK(t) (1u << (t)) -enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS }; +enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; iterator_range> inst_counter_types() { return make_range(enum_iterator(VM_CNT), @@ -114,6 +113,7 @@ struct { uint32_t VmcntMax; uint32_t ExpcntMax; uint32_t LgkmcntMax; + uint32_t VscntMax; int32_t NumVGPRsMax; int32_t NumSGPRsMax; } HardwareLimits; @@ -127,6 +127,8 @@ struct { enum WaitEventType { VMEM_ACCESS, // vector-memory read & write + VMEM_READ_ACCESS, // vector-memory read + VMEM_WRITE_ACCESS,// vector-memory write LDS_ACCESS, // lds read & write GDS_ACCESS, // gds read & write SQ_MESSAGE, // send message @@ -140,11 +142,12 @@ enum WaitEventType { }; static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = { - (1 << VMEM_ACCESS), + (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | (1 << SQ_MESSAGE), (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS), + (1 << VMEM_WRITE_ACCESS) }; // The mapping is: @@ -172,6 +175,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { case LGKM_CNT: Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); break; + case VS_CNT: + Wait.VsCnt = std::min(Wait.VsCnt, Count); + break; default: llvm_unreachable("bad InstCounterType"); } @@ -200,6 +206,8 @@ public: return HardwareLimits.LgkmcntMax; case EXP_CNT: return HardwareLimits.ExpcntMax; + case VS_CNT: + 
return HardwareLimits.VscntMax; default: break; } @@ -222,10 +230,12 @@ public: // Mapping from event to counter. InstCounterType eventCounter(WaitEventType E) { - if (E == VMEM_ACCESS) + if (WaitEventMaskForInst[VM_CNT] & (1 << E)) return VM_CNT; if (WaitEventMaskForInst[LGKM_CNT] & (1 << E)) return LGKM_CNT; + if (WaitEventMaskForInst[VS_CNT] & (1 << E)) + return VS_CNT; assert(WaitEventMaskForInst[EXP_CNT] & (1 << E)); return EXP_CNT; } @@ -453,7 +463,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, unsigned OpNo, bool Def) const { const MachineOperand &Op = MI->getOperand(OpNo); if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) || - (Def && !Op.isDef())) + (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg())) return {-1, -1}; // A use via a PW operand does not need a waitcnt. @@ -526,20 +536,22 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, // Put score on the source vgprs. If this is a store, just use those // specific register(s). if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { + int AddrOpIdx = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr); // All GDS operations must protect their address register (same as // export.) - if (Inst.getOpcode() != AMDGPU::DS_APPEND && - Inst.getOpcode() != AMDGPU::DS_CONSUME) { - setExpScore( - &Inst, TII, TRI, MRI, - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr), - CurrScore); + if (AddrOpIdx != -1) { + setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore); } + if (Inst.mayStore()) { - setExpScore( - &Inst, TII, TRI, MRI, - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), - CurrScore); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::data0) != -1) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), + CurrScore); + } if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data1) != -1) { setExpScore(&Inst, TII, TRI, MRI, @@ -663,6 +675,9 @@ void WaitcntBrackets::print(raw_ostream &OS) { case EXP_CNT: OS << " EXP_CNT(" << UB - LB << "): "; break; + case VS_CNT: + OS << " VS_CNT(" << UB - LB << "): "; + break; default: OS << " UNKNOWN(" << UB - LB << "): "; break; @@ -702,7 +717,8 @@ void WaitcntBrackets::print(raw_ostream &OS) { bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { return simplifyWaitcnt(VM_CNT, Wait.VmCnt) | simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) | - simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); + simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) | + simplifyWaitcnt(VS_CNT, Wait.VsCnt); } bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, @@ -745,6 +761,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { applyWaitcnt(VM_CNT, Wait.VmCnt); applyWaitcnt(EXP_CNT, Wait.ExpCnt); applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); + applyWaitcnt(VS_CNT, Wait.VsCnt); } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { @@ -790,6 +807,21 @@ static bool readsVCCZ(const MachineInstr &MI) { !MI.getOperand(1).isUndef(); } +/// \returns true if the callee inserts an s_waitcnt 0 on function entry. +static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { + // Currently all conventions wait, but this may not always be the case. + // + // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make + // senses to omit the wait and do it in the caller. + return true; +} + +/// \returns true if the callee is expected to wait for any outstanding waits +/// before returning. 
+static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { + return true; +} + /// Generate s_waitcnt instruction to be placed before cur_Inst. /// Instructions of a given type are returned in order, /// but instructions of different types can complete out of order. @@ -815,7 +847,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // TODO: Handle other cases of NeedsWaitcntVmBefore() if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || + MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || + MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { Wait.VmCnt = 0; } @@ -823,8 +857,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || - MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { - Wait = AMDGPU::Waitcnt::allZero(); + MI.getOpcode() == AMDGPU::S_SETPC_B64_return || + (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { + Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV)); } // Resolve vm waits before gs-done. else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || @@ -903,91 +938,91 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( } } -#if 0 // TODO: the following code to handle CALL. - // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT. - // However, there is a problem with EXP_CNT, because the call cannot - // easily tell if a register is used in the function, and if it did, then - // the referring instruction would have to have an S_WAITCNT, which is - // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs - // before the call. - if (MI.getOpcode() == SC_CALL) { - if (ScoreBrackets->getScoreUB(EXP_CNT) > - ScoreBrackets->getScoreLB(EXP_CNT)) { - ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitWaitcnt |= CNT_MASK(EXP_CNT); - } - } -#endif - - // FIXME: Should not be relying on memoperands. - // Look at the source operands of every instruction to see if - // any of them results from a previous memory operation that affects - // its current usage. If so, an s_waitcnt instruction needs to be - // emitted. - // If the source operand was defined by a load, add the s_waitcnt - // instruction. - for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - continue; - unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } + if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { + // Don't bother waiting on anything except the call address. The function + // is going to insert a wait on everything in its prolog. This still needs + // to be careful if the call target is a load (e.g. a GOT load). 
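+    // Start from an empty wait and add only the lgkmcnt wait needed for the
+    // register that holds the call address, scanned just below.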
+ Wait = AMDGPU::Waitcnt(); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &Op = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; - RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); + int CallAddrOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, + CallAddrOpIdx, false); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Op.getReg())) { - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } - } - // End of for loop that looks at all source operands to decide vm_wait_cnt - // and lgk_wait_cnt. - - // Two cases are handled for destination operands: - // 1) If the destination operand was defined by a load, add the s_waitcnt - // instruction to guarantee the right WAW order. - // 2) If a destination operand that was used by a recent export/store ins, - // add s_waitcnt on exp_cnt to guarantee the WAR order. - if (MI.mayStore()) { + } else { // FIXME: Should not be relying on memoperands. + // Look at the source operands of every instruction to see if + // any of them results from a previous memory operation that affects + // its current usage. If so, an s_waitcnt instruction needs to be + // emitted. + // If the source operand was defined by a load, add the s_waitcnt + // instruction. for (const MachineMemOperand *Memop : MI.memoperands()) { unsigned AS = Memop->getAddrSpace(); if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; + // VM_CNT is only relevant to vgpr or LDS. ScoreBrackets.determineWait( VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - ScoreBrackets.determineWait( - EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } - } - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - MachineOperand &Def = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; - RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Def.getReg())) { + + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + const MachineOperand &Op = MI.getOperand(I); + const MachineRegisterInfo &MRIA = *MRI; + RegInterval Interval = + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(MRIA, Op.getReg())) { + // VM_CNT is only relevant to vgpr or LDS. + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + } + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + } + } + // End of for loop that looks at all source operands to decide vm_wait_cnt + // and lgk_wait_cnt. + + // Two cases are handled for destination operands: + // 1) If the destination operand was defined by a load, add the s_waitcnt + // instruction to guarantee the right WAW order. + // 2) If a destination operand that was used by a recent export/store ins, + // add s_waitcnt on exp_cnt to guarantee the WAR order. + if (MI.mayStore()) { + // FIXME: Should not be relying on memoperands. 
+ for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS != AMDGPUAS::LOCAL_ADDRESS) + continue; + unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; ScoreBrackets.determineWait( VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); ScoreBrackets.determineWait( EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } - } // End of for loop that looks at all dest operands. + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + MachineOperand &Def = MI.getOperand(I); + const MachineRegisterInfo &MRIA = *MRI; + RegInterval Interval = + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(MRIA, Def.getReg())) { + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + } + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + } + } // End of for loop that looks at all dest operands. + } } // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0 @@ -996,13 +1031,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // requiring a WAITCNT beforehand. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier()) { - Wait = AMDGPU::Waitcnt::allZero(); + Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV)); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. - if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { + if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { if (ScoreBrackets.getScoreLB(LGKM_CNT) < ScoreBrackets.getScoreUB(LGKM_CNT) && ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { @@ -1014,21 +1049,31 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) { bool Modified = false; if (OldWaitcntInstr) { - if (TrackedWaitcntSet.count(OldWaitcntInstr)) { - TrackedWaitcntSet.erase(OldWaitcntInstr); - OldWaitcntInstr->eraseFromParent(); - Modified = true; - } else { - int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); - ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); + for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II); + &*II != &MI; II = NextI, ++NextI) { + if (II->isDebugInstr()) + continue; + + if (TrackedWaitcntSet.count(&*II)) { + TrackedWaitcntSet.erase(&*II); + II->eraseFromParent(); + Modified = true; + } else if (II->getOpcode() == AMDGPU::S_WAITCNT) { + int64_t Imm = II->getOperand(0).getImm(); + ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); + } else { + assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); + assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); + ScoreBrackets.applyWaitcnt( + AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm())); + } } - Modified = true; } return Modified; } if (ForceEmitZeroWaitcnts) - Wait = AMDGPU::Waitcnt::allZero(); + Wait = AMDGPU::Waitcnt::allZero(IV); if (ForceEmitWaitcnt[VM_CNT]) Wait.VmCnt = 0; @@ -1036,39 +1081,88 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( Wait.ExpCnt = 0; if (ForceEmitWaitcnt[LGKM_CNT]) Wait.LgkmCnt = 0; + if (ForceEmitWaitcnt[VS_CNT]) + Wait.VsCnt = 0; ScoreBrackets.applyWaitcnt(Wait); AMDGPU::Waitcnt OldWait; + bool Modified = 
false; + if (OldWaitcntInstr) { - OldWait = - AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm()); - } - if (OldWait.dominates(Wait)) - return false; + for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II); + &*II != &MI; II = NextI, NextI++) { + if (II->isDebugInstr()) + continue; - if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr)) - Wait = Wait.combined(OldWait); + if (II->getOpcode() == AMDGPU::S_WAITCNT) { + unsigned IEnc = II->getOperand(0).getImm(); + AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc); + OldWait = OldWait.combined(IWait); + if (!TrackedWaitcntSet.count(&*II)) + Wait = Wait.combined(IWait); + unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait); + if (IEnc != NewEnc) { + II->getOperand(0).setImm(NewEnc); + Modified = true; + } + Wait.VmCnt = ~0u; + Wait.LgkmCnt = ~0u; + Wait.ExpCnt = ~0u; + } else { + assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); + assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); + + unsigned ICnt = II->getOperand(1).getImm(); + OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt); + if (!TrackedWaitcntSet.count(&*II)) + Wait.VsCnt = std::min(Wait.VsCnt, ICnt); + if (Wait.VsCnt != ICnt) { + II->getOperand(1).setImm(Wait.VsCnt); + Modified = true; + } + Wait.VsCnt = ~0u; + } - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - if (OldWaitcntInstr) { - OldWaitcntInstr->getOperand(0).setImm(Enc); + LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *II << '\n'); - LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" - << "Old Instr: " << MI << '\n' - << "New Instr: " << *OldWaitcntInstr << '\n'); - } else { + if (!Wait.hasWait()) + return Modified; + } + } + + if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) { + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) .addImm(Enc); TrackedWaitcntSet.insert(SWaitInst); + Modified = true; LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" << "Old Instr: " << MI << '\n' << "New Instr: " << *SWaitInst << '\n'); } - return true; + if (Wait.VsCnt != ~0u) { + assert(ST->hasVscnt()); + + auto SWaitInst = + BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.VsCnt); + TrackedWaitcntSet.insert(SWaitInst); + Modified = true; + + LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *SWaitInst << '\n'); + } + + return Modified; } // This is a flat memory operation. Check to see if it has memory @@ -1093,7 +1187,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. 
   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
-    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
+    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
+        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
     } else {
@@ -1102,8 +1197,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
   } else if (TII->isFLAT(Inst)) {
     assert(Inst.mayLoad() || Inst.mayStore());
 
-    if (TII->usesVM_CNT(Inst))
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    if (TII->usesVM_CNT(Inst)) {
+      if (!ST->hasVscnt())
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+      else if (Inst.mayLoad() &&
+               AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+      else
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+    }
 
     if (TII->usesLGKM_CNT(Inst)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
@@ -1118,14 +1220,33 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
              // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
-             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
-    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
+             Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
+             Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
+    if (!ST->hasVscnt())
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    else if ((Inst.mayLoad() &&
+              AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
+             /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
+             (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+    else if (Inst.mayStore())
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+
     if (ST->vmemWriteNeedsExpWaitcnt() &&
         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
     }
   } else if (TII->isSMRD(Inst)) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+  } else if (Inst.isCall()) {
+    if (callWaitsOnFunctionReturn(Inst)) {
+      // Act as a wait on everything.
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+    } else {
+      // May need to wait for anything.
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+    }
   } else {
     switch (Inst.getOpcode()) {
     case AMDGPU::S_SENDMSG:
@@ -1236,31 +1357,18 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
   // Walk over the instructions.
   MachineInstr *OldWaitcntInstr = nullptr;
-  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
+  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
+                                         E = Block.instr_end();
        Iter != E;) {
     MachineInstr &Inst = *Iter;
 
-    // Remove any previously existing waitcnts.
-    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
-      if (OldWaitcntInstr) {
-        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
-          TrackedWaitcntSet.erase(OldWaitcntInstr);
-          OldWaitcntInstr->eraseFromParent();
-          OldWaitcntInstr = nullptr;
-        } else if (!TrackedWaitcntSet.count(&Inst)) {
-          // Two successive s_waitcnt's, both of which are pre-existing and
-          // are therefore preserved.
- int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); - ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); - } else { - ++Iter; - Inst.eraseFromParent(); - Modified = true; - continue; - } - } - - OldWaitcntInstr = &Inst; + // Track pre-existing waitcnts from earlier iterations. + if (Inst.getOpcode() == AMDGPU::S_WAITCNT || + (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && + Inst.getOperand(0).isReg() && + Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) { + if (!OldWaitcntInstr) + OldWaitcntInstr = &Inst; ++Iter; continue; } @@ -1299,27 +1407,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ScoreBrackets.dump(); }); - // Check to see if this is a GWS instruction. If so, and if this is CI or - // VI, then the generated code sequence will include an S_WAITCNT 0. - // TODO: Are these the only GWS instructions? - if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT || - Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V || - Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || - Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || - Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { - // TODO: && context->target_info->GwsRequiresMemViolTest() ) { - ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero()); - } - // TODO: Remove this work-around after fixing the scheduler and enable the // assert above. if (VCCZBugWorkAround) { // Restore the vccz bit. Any time a value is written to vcc, the vcc // bit is updated, so we can restore the bit by reading the value of // vcc and then writing it back to the register. - BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), - AMDGPU::VCC) - .addReg(AMDGPU::VCC); + BuildMI(Block, Inst, Inst.getDebugLoc(), + TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), + TRI->getVCC()) + .addReg(TRI->getVCC()); VCCZBugHandledSet.insert(&Inst); Modified = true; } @@ -1345,6 +1442,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); + HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0; HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs(); HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs(); @@ -1480,6 +1578,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. MachineBasicBlock &EntryBB = MF.front(); + if (ST->hasVscnt()) + BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) .addImm(0); diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 65ffc27b8b60..561a16c3e351 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -1,9 +1,8 @@ //===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,19 +10,9 @@ // //===----------------------------------------------------------------------===// -def isGCN : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureGCN">; -def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureSouthernIslands">; - - class InstSI pattern = []> : AMDGPUInst, GCNPredicateControl { - let SubtargetPredicate = isGCN; - // Low bits - basic encoding information. field bit SALU = 0; field bit VALU = 0; @@ -121,10 +110,20 @@ class InstSI op> : Enc32 { let Inst{31-26} = 0x32; // encoding } -class MIMGe op> : Enc64 { +class MIMGe : Enc64 { bits<8> vdata; bits<4> dmask; bits<1> unorm; bits<1> glc; - bits<1> da; bits<1> r128; bits<1> tfe; bits<1> lwe; bits<1> slc; bit d16; - bits<8> vaddr; bits<7> srsrc; bits<7> ssamp; let Inst{11-8} = dmask; let Inst{12} = unorm; let Inst{13} = glc; - let Inst{14} = da; let Inst{15} = r128; let Inst{16} = tfe; let Inst{17} = lwe; - let Inst{24-18} = op; let Inst{25} = slc; let Inst{31-26} = 0x3c; - let Inst{39-32} = vaddr; let Inst{47-40} = vdata; let Inst{52-48} = srsrc{6-2}; let Inst{57-53} = ssamp{6-2}; let Inst{63} = d16; } +class MIMGe_gfx6789 op> : MIMGe { + bits<8> vaddr; + bits<1> da; + + let Inst{0} = op{7}; + let Inst{14} = da; + let Inst{24-18} = op{6-0}; + let Inst{39-32} = vaddr; +} + +class MIMGe_gfx10 op> : MIMGe { + bits<8> vaddr0; + bits<3> dim; + bits<2> nsa; + bits<1> dlc; + bits<1> a16 = 0; // TODO: this should be an operand + + let Inst{0} = op{7}; + let Inst{2-1} = nsa; + let Inst{5-3} = dim; + let Inst{7} = dlc; + let Inst{24-18} = op{6-0}; + let Inst{39-32} = vaddr0; + let Inst{62} = a16; +} + class EXPe : Enc64 { bits<4> en; bits<6> tgt; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 2370d5fa7b27..ba8ed6993a56 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1,9 +1,8 @@ //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,7 +13,6 @@ #include "SIInstrInfo.h" #include "AMDGPU.h" -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "GCNHazardRecognizer.h" #include "SIDefines.h" @@ -100,12 +98,6 @@ static unsigned getNumOperandsNoGlue(SDNode *Node) { return N; } -static SDValue findChainOperand(SDNode *Load) { - SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); - assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); - return LastOp; -} - /// Returns true if both nodes have the same value for the given /// operand \p Op, or if both nodes do not have this operand. 
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
@@ -142,7 +134,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
   case AMDGPU::V_MOV_B32_e32:
   case AMDGPU::V_MOV_B32_e64:
   case AMDGPU::V_MOV_B64_PSEUDO:
-    return true;
+    // No implicit operands.
+    return MI.getNumOperands() == MI.getDesc().getNumOperands();
   default:
     return false;
   }
@@ -168,22 +161,25 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
       return false;
 
     // Check base reg.
-    if (Load0->getOperand(1) != Load1->getOperand(1))
-      return false;
-
-    // Check chain.
-    if (findChainOperand(Load0) != findChainOperand(Load1))
+    if (Load0->getOperand(0) != Load1->getOperand(0))
       return false;
 
     // Skip read2 / write2 variants for simplicity.
     // TODO: We should report true if the used offsets are adjacent (excluded
     // st64 versions).
-    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
-        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
+    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
+    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
+    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;
 
-    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
-    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
+    // XXX - be careful of dataless loads.
+    // getNamedOperandIdx returns the index for MachineInstrs. Since they
+    // include the output in the operand list, but SDNodes don't, we need to
+    // subtract the index by one.
+    Offset0Idx -= get(Opc0).NumDefs;
+    Offset1Idx -= get(Opc1).NumDefs;
+    Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
     return true;
   }
 
@@ -207,10 +203,6 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
     if (!Load0Offset || !Load1Offset)
       return false;
 
-    // Check chain.
-    if (findChainOperand(Load0) != findChainOperand(Load1))
-      return false;
-
     Offset0 = Load0Offset->getZExtValue();
     Offset1 = Load1Offset->getZExtValue();
     return true;
@@ -221,7 +213,6 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
 
     // MUBUF and MTBUF have vaddr at different indices.
     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
-        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;
@@ -233,10 +224,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
      return false;
 
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
-    // inlcude the output in the operand list, but SDNodes don't, we need to
+    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
-    --OffIdx0;
-    --OffIdx1;
+    OffIdx0 -= get(Opc0).NumDefs;
+    OffIdx1 -= get(Opc1).NumDefs;
 
    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);
@@ -265,8 +256,8 @@ static bool isStride64(unsigned Opc) {
   }
 }
 
-bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
-                                          MachineOperand *&BaseOp,
+bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
+                                          const MachineOperand *&BaseOp,
                                           int64_t &Offset,
                                           const TargetRegisterInfo *TRI) const {
   unsigned Opc = LdSt.getOpcode();
@@ -277,6 +268,11 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
     if (OffsetImm) {
       // Normal, single offset LDS instruction.
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); + // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to + // report that here? + if (!BaseOp) + return false; + Offset = OffsetImm->getImm(); assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " "operands of type register."); @@ -325,7 +321,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, if (SOffset && SOffset->isReg()) return false; - MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (!AddrReg) return false; @@ -348,7 +344,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, if (!OffsetImm) return false; - MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); + const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); BaseOp = SBaseReg; Offset = OffsetImm->getImm(); assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " @@ -357,7 +353,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, } if (isFLAT(LdSt)) { - MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (VAddr) { // Can't analyze 2 offsets. if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) @@ -413,11 +409,11 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, return Base1 == Base2; } -bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, - MachineOperand &BaseOp2, +bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, + const MachineOperand &BaseOp2, unsigned NumLoads) const { - MachineInstr &FirstLdSt = *BaseOp1.getParent(); - MachineInstr &SecondLdSt = *BaseOp2.getParent(); + const MachineInstr &FirstLdSt = *BaseOp1.getParent(); + const MachineInstr &SecondLdSt = *BaseOp2.getParent(); if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2)) return false; @@ -461,7 +457,12 @@ bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, const MachineRegisterInfo &MRI = FirstLdSt.getParent()->getParent()->getRegInfo(); - const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); + + const unsigned Reg = FirstDst->getReg(); + + const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg) + ? MRI.getRegClass(Reg) + : RI.getPhysRegClass(Reg); return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; } @@ -511,8 +512,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (RC == &AMDGPU::VGPR_32RegClass) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || - AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + AMDGPU::SReg_32RegClass.contains(SrcReg) || + AMDGPU::AGPR_32RegClass.contains(SrcReg)); + unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? + AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32; + BuildMI(MBB, MI, DL, get(Opc), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } @@ -526,6 +530,21 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (DestReg == AMDGPU::VCC_LO) { + if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + // FIXME: Hack until VReg_1 removed. 
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + + return; + } + if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; @@ -570,10 +589,83 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (RC == &AMDGPU::AGPR_32RegClass) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || + AMDGPU::SReg_32RegClass.contains(SrcReg) || + AMDGPU::AGPR_32RegClass.contains(SrcReg)); + if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) { + // First try to find defining accvgpr_write to avoid temporary registers. + for (auto Def = MI, E = MBB.begin(); Def != E; ) { + --Def; + if (!Def->definesRegister(SrcReg, &RI)) + continue; + if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32) + break; + + MachineOperand &DefOp = Def->getOperand(1); + assert(DefOp.isReg() || DefOp.isImm()); + + if (DefOp.isReg()) { + // Check that register source operand if not clobbered before MI. + // Immediate operands are always safe to propagate. + bool SafeToPropagate = true; + for (auto I = Def; I != MI && SafeToPropagate; ++I) + if (I->modifiesRegister(DefOp.getReg(), &RI)) + SafeToPropagate = false; + + if (!SafeToPropagate) + break; + + DefOp.setIsKill(false); + } + + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + .add(DefOp); + return; + } + + RegScavenger RS; + RS.enterBasicBlock(MBB); + RS.forward(MI); + + // Ideally we want to have three registers for a long reg_sequence copy + // to hide 2 waitstates between v_mov_b32 and accvgpr_write. + unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, + *MBB.getParent()); + + // Registers in the sequence are allocated contiguously so we can just + // use register number to pick one of three round-robin temps. + unsigned RegNo = DestReg % 3; + unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp) + report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); + RS.setRegUsed(Tmp); + // Only loop through if there are any free registers left, otherwise + // scavenger may report a fatal error without emergency spill slot + // or spill with the slot. + while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { + unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) + break; + Tmp = Tmp2; + RS.setRegUsed(Tmp); + } + copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + .addReg(Tmp, RegState::Kill); + return; + } + + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { - if (RI.getRegSizeInBits(*RC) > 32) { + // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32. + if (!(RI.getRegSizeInBits(*RC) % 64)) { Opcode = AMDGPU::S_MOV_B64; EltSize = 8; } else { @@ -585,6 +677,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } + } else if (RI.hasAGPRs(RC)) { + Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ? 
+ AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY; + } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { + Opcode = AMDGPU::V_ACCVGPR_READ_B32; } ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); @@ -597,6 +694,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + if (Opcode == TargetOpcode::COPY) { + copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), + RI.getSubReg(SrcReg, SubIdx), KillSrc); + continue; + } + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)); @@ -696,38 +799,50 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction *MF = MBB.getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const TargetRegisterClass *BoolXExecRC = + RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && "Not a VGPR32 reg"); if (Cond.size() == 1) { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(Cond[0]); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); } else if (Cond.size() == 2) { assert(Cond[0].isImm() && "Cond[0] is not an immediate"); switch (Cond[0].getImm()) { case SIInstrInfo::SCC_TRUE: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), SReg) .addImm(-1) .addImm(0); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); break; } case SIInstrInfo::SCC_FALSE: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + BuildMI(MBB, I, DL, get(ST.isWave32() ?
AMDGPU::S_CSELECT_B32 + : AMDGPU::S_CSELECT_B64), SReg) .addImm(0) .addImm(-1); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); break; @@ -735,11 +850,13 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::VCCNZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); break; @@ -747,39 +864,49 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::VCCZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(TrueReg) + .addImm(0) .addReg(FalseReg) .addReg(SReg); break; } case SIInstrInfo::EXECNZ: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); + BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 + : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); - BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 + : AMDGPU::S_CSELECT_B64), SReg) .addImm(-1) .addImm(0); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); break; } case SIInstrInfo::EXECZ: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); + BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 + : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); - BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 + : AMDGPU::S_CSELECT_B64), SReg) .addImm(0) .addImm(-1); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); llvm_unreachable("Unhandled branch predicate EXECZ"); @@ -798,7 +925,7 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, const DebugLoc &DL, unsigned SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) .addImm(Value) .addReg(SrcReg); @@ -811,7 +938,7 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, const DebugLoc &DL, unsigned SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) .addImm(Value) .addReg(SrcReg); @@ -821,6 +948,8 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { + if (RI.hasAGPRs(DstRC)) + return AMDGPU::COPY; if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { @@ -837,12 +966,18 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S32_SAVE; case 8: return AMDGPU::SI_SPILL_S64_SAVE; + case 12: + return AMDGPU::SI_SPILL_S96_SAVE; case 16: return AMDGPU::SI_SPILL_S128_SAVE; + case 20: + return AMDGPU::SI_SPILL_S160_SAVE; case 32: return AMDGPU::SI_SPILL_S256_SAVE; case 64: return AMDGPU::SI_SPILL_S512_SAVE; + case 128: + return AMDGPU::SI_SPILL_S1024_SAVE; default: llvm_unreachable("unknown register size"); } @@ -858,10 +993,31 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V96_SAVE; case 16: return AMDGPU::SI_SPILL_V128_SAVE; + case 20: + return AMDGPU::SI_SPILL_V160_SAVE; case 32: return AMDGPU::SI_SPILL_V256_SAVE; case 64: return AMDGPU::SI_SPILL_V512_SAVE; + case 128: + return AMDGPU::SI_SPILL_V1024_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getAGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_A32_SAVE; + case 8: + return AMDGPU::SI_SPILL_A64_SAVE; + case 16: + return AMDGPU::SI_SPILL_A128_SAVE; + case 64: + return AMDGPU::SI_SPILL_A512_SAVE; + case 128: + return AMDGPU::SI_SPILL_A1024_SAVE; default: llvm_unreachable("unknown register size"); } @@ -906,12 +1062,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); + .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); // Add the scratch resource registers as implicit uses because we may end up // needing them, and need to ensure that the reserved registers are // correctly handled. - - FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); + if (RI.spillSGPRToVGPR()) + FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); @@ -920,17 +1076,22 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - - unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg, getKillRegState(isKill)) // data - .addFrameIndex(FrameIndex) // addr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getFrameOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); + + auto MIB = BuildMI(MBB, MI, DL, get(Opcode)); + if (RI.hasAGPRs(RC)) { + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MIB.addReg(Tmp, RegState::Define); + } + MIB.addReg(SrcReg, getKillRegState(isKill)) // data + .addFrameIndex(FrameIndex) // addr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { @@ -939,12 +1100,18 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S32_RESTORE; case 8: return AMDGPU::SI_SPILL_S64_RESTORE; + case 12: + return AMDGPU::SI_SPILL_S96_RESTORE; case 16: return AMDGPU::SI_SPILL_S128_RESTORE; + case 20: + return AMDGPU::SI_SPILL_S160_RESTORE; case 32: return AMDGPU::SI_SPILL_S256_RESTORE; case 64: return AMDGPU::SI_SPILL_S512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_S1024_RESTORE; default: llvm_unreachable("unknown register size"); } @@ -960,10 +1127,31 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V96_RESTORE; case 16: return AMDGPU::SI_SPILL_V128_RESTORE; + case 20: + return AMDGPU::SI_SPILL_V160_RESTORE; case 32: return AMDGPU::SI_SPILL_V256_RESTORE; case 64: return AMDGPU::SI_SPILL_V512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_V1024_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_A32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_A64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_A128_RESTORE; + case 64: + return AMDGPU::SI_SPILL_A512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_A1024_RESTORE; default: llvm_unreachable("unknown register size"); } @@ -999,12 +1187,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } - FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); + if (RI.spillSGPRToVGPR()) + FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); + .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
@@ -1014,15 +1203,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - - unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); - BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getFrameOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); + unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize) : getVGPRSpillRestoreOpcode(SpillSize); + auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg); + if (RI.hasAGPRs(RC)) { + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MIB.addReg(Tmp, RegState::Define); + } + MIB.addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled @@ -1089,7 +1282,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z getAddNoCarry(Entry, Insert, DL, TIDReg) .addReg(TIDReg) - .addReg(TIDIGZReg); + .addReg(TIDIGZReg) + .addImm(0); // clamp bit } else { // Get the wave id BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), @@ -1114,7 +1308,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); getAddNoCarry(MBB, MI, DL, TmpReg) .addImm(LDSOffset) - .addReg(TIDReg); + .addReg(TIDReg) + .addImm(0); // clamp bit return TmpReg; } @@ -1148,13 +1343,17 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { if (MBB.succ_empty()) { bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); - if (HasNoTerminator) - BuildMI(MBB, MBB.end(), DebugLoc(), - get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG)); + if (HasNoTerminator) { + if (Info->returnsVoid()) { + BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); + } else { + BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); + } + } } } -unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { +unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return 1; // FIXME: Do wait states equal cycles? @@ -1174,18 +1373,42 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_MOV_B64)); break; + case AMDGPU::S_MOV_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_MOV_B32)); + break; + case AMDGPU::S_XOR_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. MI.setDesc(get(AMDGPU::S_XOR_B64)); break; + case AMDGPU::S_XOR_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_XOR_B32)); + break; + + case AMDGPU::S_OR_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_OR_B32)); + break; + case AMDGPU::S_ANDN2_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation.
MI.setDesc(get(AMDGPU::S_ANDN2_B64)); break; + case AMDGPU::S_ANDN2_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_ANDN2_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -1215,24 +1438,28 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; } case AMDGPU::V_SET_INACTIVE_B32: { - BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MI, DL, get(NotOpc), Exec) + .addReg(Exec); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) .add(MI.getOperand(2)); - BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, MI, DL, get(NotOpc), Exec) + .addReg(Exec); MI.eraseFromParent(); break; } case AMDGPU::V_SET_INACTIVE_B64: { - BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MI, DL, get(NotOpc), Exec) + .addReg(Exec); MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), MI.getOperand(0).getReg()) .add(MI.getOperand(2)); expandPostRAPseudo(*Copy); - BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, MI, DL, get(NotOpc), Exec) + .addReg(Exec); MI.eraseFromParent(); break; } @@ -1282,10 +1509,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) .addReg(RegHi); - if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) - MIB.addImm(0); - else - MIB.add(MI.getOperand(2)); + MIB.add(MI.getOperand(2)); Bundler.append(MIB); finalizeBundle(MBB, Bundler.begin()); @@ -1293,10 +1517,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::ENTER_WWM: { + // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when + // WWM is entered. + MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 + : AMDGPU::S_OR_SAVEEXEC_B64)); + break; + } case AMDGPU::EXIT_WWM: { - // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM - // is exited. - MI.setDesc(get(AMDGPU::S_MOV_B64)); + // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when + // WWM is exited. + MI.setDesc(get(ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } case TargetOpcode::BUNDLE: { @@ -1492,7 +1723,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub0) .addReg(PCReg, 0, AMDGPU::sub0) - .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); + .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD); BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub1) .addReg(PCReg, 0, AMDGPU::sub1) @@ -1502,7 +1733,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub0) .addReg(PCReg, 0, AMDGPU::sub0) - .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); + .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD); BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub1) .addReg(PCReg, 0, AMDGPU::sub1) @@ -1659,6 +1890,10 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_ANDN2_B64_term: + case AMDGPU::S_MOV_B32_term: + case AMDGPU::S_XOR_B32_term: + case AMDGPU::S_OR_B32_term: + case AMDGPU::S_ANDN2_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: @@ -1826,7 +2061,7 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? // Limit to equal cost for branch vs. N v_cndmask_b32s. - return !RI.isSGPRClass(RC) && NumInsts <= 6; + return RI.hasVGPRs(RC) && NumInsts <= 6; } case SCC_TRUE: case SCC_FALSE: { @@ -1907,14 +2142,18 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, const int16_t *SubIndices = Sub0_15; int NElts = DstSize / 32; - // 64-bit select is only avaialble for SALU. + // 64-bit select is only available for SALU. + // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. if (Pred == SCC_TRUE) { - SelOp = AMDGPU::S_CSELECT_B64; - EltRC = &AMDGPU::SGPR_64RegClass; - SubIndices = Sub0_15_64; - - assert(NElts % 2 == 0); - NElts /= 2; + if (NElts % 2) { + SelOp = AMDGPU::S_CSELECT_B32; + EltRC = &AMDGPU::SGPR_32RegClass; + } else { + SelOp = AMDGPU::S_CSELECT_B64; + EltRC = &AMDGPU::SGPR_64RegClass; + SubIndices = Sub0_15_64; + NElts /= 2; + } } MachineInstrBuilder MIB = BuildMI( @@ -1934,6 +2173,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, .addReg(FalseReg, 0, SubIdx) .addReg(TrueReg, 0, SubIdx); preserveCondRegFlags(Select->getOperand(3), Cond[1]); + fixImplicitOperands(*Select); MIB.addReg(DstElt) .addImm(SubIdx); @@ -1955,6 +2195,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: case AMDGPU::COPY: + case AMDGPU::V_ACCVGPR_WRITE_B32: + case AMDGPU::V_ACCVGPR_READ_B32: return true; default: return false; @@ -2007,6 +2249,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, case AMDGPU::V_MOV_B32_e32: case AMDGPU::S_MOV_B32: + case AMDGPU::V_ACCVGPR_WRITE_B32: break; } @@ -2020,6 +2263,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::COPY) { bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); unsigned NewOpc = isVGPRCopy ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; + if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { + if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32)) + return false; + NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; + } UseMI.setDesc(get(NewOpc)); UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); @@ -2027,7 +2275,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { + Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. if (hasAnyModifiersSet(UseMI)) @@ -2042,7 +2292,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (isInlineConstant(UseMI, *Src0, *ImmOp)) return false; - bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; + bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -2055,6 +2308,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) return false; + unsigned NewOpc = + IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) + : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); + if (pseudoToMCOpcode(NewOpc) == -1) + return false; + // We need to swap operands 0 and 1 since madmk constant is at operand 1. const int64_t Imm = ImmOp->getImm(); @@ -2075,14 +2334,16 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->setIsKill(Src1->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_MAC_F16_e64) + Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); Src1->ChangeToImmediate(Imm); removeModOperands(UseMI); - UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); + UseMI.setDesc(get(NewOpc)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -2107,9 +2368,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->ChangeToImmediate(Def->getOperand(1).getImm()); Src0Inlined = true; } else if ((RI.isPhysicalRegister(Src0->getReg()) && - RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) || + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || (RI.isVirtualRegister(Src0->getReg()) && - RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) return false; // VGPR is okay as Src0 - fallthrough } @@ -2130,6 +2393,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // VGPR is okay as Src1 - fallthrough } + unsigned NewOpc = + IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) + : (IsF32 ? 
AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); + if (pseudoToMCOpcode(NewOpc) == -1) + return false; + const int64_t Imm = ImmOp->getImm(); // FIXME: This would be a lot easier if we could return a new instruction @@ -2142,7 +2411,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); if (Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_MAC_F16_e64) + Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -2151,7 +2422,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // These come before src2. removeModOperands(UseMI); - UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); + UseMI.setDesc(get(NewOpc)); + // It might happen that UseMI was commuted + // and we now have SGPR as SRC1. If so 2 inlined + // constant and SGPR are illegal. + legalizeOperands(UseMI); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -2172,9 +2447,9 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA, return LowOffset + LowWidth <= HighOffset; } -bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, - MachineInstr &MIb) const { - MachineOperand *BaseOp0, *BaseOp1; +bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, + const MachineInstr &MIb) const { + const MachineOperand *BaseOp0, *BaseOp1; int64_t Offset0, Offset1; if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) && @@ -2196,8 +2471,8 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, return false; } -bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, - MachineInstr &MIb, +bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, AliasAnalysis *AA) const { assert((MIa.mayLoad() || MIa.mayStore()) && "MIa must load from or modify a memory location"); @@ -2211,17 +2486,6 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; - if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { - const MachineMemOperand *MMOa = *MIa.memoperands_begin(); - const MachineMemOperand *MMOb = *MIb.memoperands_begin(); - if (MMOa->getValue() && MMOb->getValue()) { - MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); - MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); - if (!AA->alias(LocA, LocB)) - return true; - } - } - // TODO: Should we check the address space from the MachineMemOperand? 
That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -2275,18 +2539,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, LiveVariables *LV) const { unsigned Opc = MI.getOpcode(); bool IsF16 = false; - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; switch (Opc) { default: return nullptr; case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_FMAC_F16_e64: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_FMAC_F32_e64: break; case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_FMAC_F16_e32: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e32: @@ -2315,30 +2582,38 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod && + if (!Src0Mods && !Src1Mods && !Clamp && !Omod && // If we have an SGPR input, we will violate the constant bus restriction. - (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { + (ST.getConstantBusLimit(Opc) > 1 || + !Src0->isReg() || + !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { if (auto Imm = getFoldableImm(Src2)) { - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32)) - .add(*Dst) - .add(*Src0) - .add(*Src1) - .addImm(Imm); + unsigned NewOpc = + IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) + : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); + if (pseudoToMCOpcode(NewOpc) != -1) + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) + .add(*Dst) + .add(*Src0) + .add(*Src1) + .addImm(Imm); } + unsigned NewOpc = + IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) + : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); if (auto Imm = getFoldableImm(Src1)) { - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) - .add(*Dst) - .add(*Src0) - .addImm(Imm) - .add(*Src2); + if (pseudoToMCOpcode(NewOpc) != -1) + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) + .add(*Dst) + .add(*Src0) + .addImm(Imm) + .add(*Src2); } if (auto Imm = getFoldableImm(Src0)) { - if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32, + if (pseudoToMCOpcode(NewOpc) != -1 && + isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), Src1)) - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) .add(*Src1) .addImm(Imm) @@ -2346,9 +2621,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, } } - assert((!IsFMA || !IsF16) && "fmac only expected with f32"); - unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 : - (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32) + : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + if (pseudoToMCOpcode(NewOpc) == -1) + return nullptr; + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) .addImm(Src0Mods ? 
Src0Mods->getImm() : 0) @@ -2390,12 +2667,26 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, changesVGPRIndexingMode(MI); } +bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { + return Opcode == AMDGPU::DS_ORDERED_COUNT || + Opcode == AMDGPU::DS_GWS_INIT || + Opcode == AMDGPU::DS_GWS_SEMA_V || + Opcode == AMDGPU::DS_GWS_SEMA_BR || + Opcode == AMDGPU::DS_GWS_SEMA_P || + Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || + Opcode == AMDGPU::DS_GWS_BARRIER; +} + bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); if (MI.mayStore() && isSMRD(MI)) return true; // scalar store or atomic + // This will terminate the function when other lanes may need to continue. + if (MI.isReturn()) + return true; + // These instructions cause shader I/O that may cause hardware lockups // when executed with an empty EXEC mask. // @@ -2403,10 +2694,12 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const // EXEC = 0, but checking for that case here seems not worth it // given the typical code patterns. if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || - Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE) + Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE || + Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || + Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) return true; - if (MI.isInlineAsm()) + if (MI.isCall() || MI.isInlineAsm()) return true; // conservative assumption // These are like SALU instructions in terms of effects, so it's questionable @@ -2420,8 +2713,36 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const return false; } +bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, + const MachineInstr &MI) const { + if (MI.isMetaInstruction()) + return false; + + // This won't read exec if this is an SGPR->SGPR copy. + if (MI.isCopyLike()) { + if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) + return true; + + // Make sure this isn't copying exec as a normal operand + return MI.readsRegister(AMDGPU::EXEC, &RI); + } + + // Make a conservative assumption about the callee. + if (MI.isCall()) + return true; + + // Be conservative with any unhandled generic opcodes. + if (!isTargetSpecificOpcode(MI.getOpcode())) + return true; + + return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); +} + bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { switch (Imm.getBitWidth()) { + case 1: // This likely will be a condition code mask. 
+ return true; + case 32: return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), ST.hasInv2PiInlineImm()); @@ -2454,7 +2775,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: - case AMDGPU::OPERAND_REG_INLINE_C_FP32: { + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { int32_t Trunc = static_cast<int32_t>(Imm); return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); } @@ -2467,7 +2790,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. @@ -2480,19 +2805,14 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, return false; } + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { - if (isUInt<16>(Imm)) { - int16_t Trunc = static_cast<int16_t>(Imm); - return ST.has16BitInsts() && - AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); - } - if (!(Imm & 0xffff)) { - return ST.has16BitInsts() && - AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm()); - } + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { uint32_t Trunc = static_cast<uint32_t>(Imm); - return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); } default: llvm_unreachable("invalid bitwidth"); @@ -2534,9 +2854,10 @@ static bool compareMachineOp(const MachineOperand &Op0, bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const { - const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; + const MCInstrDesc &InstDesc = MI.getDesc(); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) return true; @@ -2547,7 +2868,15 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (MO.isImm() && isInlineConstant(MO, OpInfo)) return RI.opCanUseInlineConstant(OpInfo.OperandType); - return RI.opCanUseLiteralConstant(OpInfo.OperandType); + if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) + return false; + + if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) + return true; + + const MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + return ST.hasVOP3Literal(); } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { @@ -2586,7 +2915,8 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, // Can't shrink instruction with three operands. // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add // a special case for it. It can only be shrunk if the third operand - // is vcc. 
We should handle this the same way we handle vopc, by adding + is vcc, and src0_modifiers and src1_modifiers are not set. + // We should handle this the same way we handle vopc, by adding // a register allocation hint pre-regalloc and then do the shrinking // post-regalloc. if (Src2) { @@ -2606,6 +2936,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_F16_e64: if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; @@ -2662,7 +2993,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, // dst Inst32.add(MI.getOperand(0)); } else { - assert(MI.getOperand(0).getReg() == AMDGPU::VCC && + assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || + (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && "Unexpected case"); } @@ -2707,19 +3039,19 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); - // FLAT_SCR is just an SGPR pair. - if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) - return true; - - // EXEC register uses the constant bus. - if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) - return true; + // Null is free + if (MO.getReg() == AMDGPU::SGPR_NULL) + return false; // SGPRs use the constant bus - return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || - (!MO.isImplicit() && - (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || - AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); + if (MO.isImplicit()) { + return MO.getReg() == AMDGPU::M0 || + MO.getReg() == AMDGPU::VCC || + MO.getReg() == AMDGPU::VCC_LO; + } else { + return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || + AMDGPU::SReg_64RegClass.contains(MO.getReg()); + } } static unsigned findImplicitSGPRRead(const MachineInstr &MI) { @@ -2730,6 +3062,8 @@ static unsigned findImplicitSGPRRead(const MachineInstr &MI) { switch (MO.getReg()) { case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: case AMDGPU::M0: case AMDGPU::FLAT_SCR: return MO.getReg(); @@ -2746,10 +3080,12 @@ static bool shouldReadExec(const MachineInstr &MI) { if (SIInstrInfo::isVALU(MI)) { switch (MI.getOpcode()) { case AMDGPU::V_READLANE_B32: - case AMDGPU::V_READLANE_B32_si: + case AMDGPU::V_READLANE_B32_gfx6_gfx7: + case AMDGPU::V_READLANE_B32_gfx10: case AMDGPU::V_READLANE_B32_vi: case AMDGPU::V_WRITELANE_B32: - case AMDGPU::V_WRITELANE_B32_si: + case AMDGPU::V_WRITELANE_B32_gfx6_gfx7: + case AMDGPU::V_WRITELANE_B32_gfx10: case AMDGPU::V_WRITELANE_B32_vi: return false; } @@ -2830,7 +3166,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: - if (MI.getOperand(i).isImm()) { + if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { ErrInfo = "Illegal immediate value for operand."; return false; } @@ -2843,7 +3179,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() && (!MO.isImm() ||
!isInlineConstant(MI, i))) { ErrInfo = "Illegal immediate value for operand."; return false; } @@ -3022,9 +3362,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) ++ConstantBusCount; + SmallVector<unsigned, 2> SGPRsUsed; unsigned SGPRUsed = findImplicitSGPRRead(MI); - if (SGPRUsed != AMDGPU::NoRegister) + if (SGPRUsed != AMDGPU::NoRegister) { ++ConstantBusCount; + SGPRsUsed.push_back(SGPRUsed); + } for (int OpIdx : OpIndices) { if (OpIdx == -1) @@ -3032,23 +3375,37 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { - if (MO.getReg() != SGPRUsed) - ++ConstantBusCount; SGPRUsed = MO.getReg(); + if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { + return !RI.regsOverlap(SGPRUsed, SGPR); + })) { + ++ConstantBusCount; + SGPRsUsed.push_back(SGPRUsed); + } } else { ++ConstantBusCount; ++LiteralCount; } } } - if (ConstantBusCount > 1) { - ErrInfo = "VOP* instruction uses the constant bus more than once"; + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + // v_writelane_b32 is an exception from constant bus restriction: + // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const + if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && + Opcode != AMDGPU::V_WRITELANE_B32) { + ErrInfo = "VOP* instruction violates constant bus restriction"; return false; } if (isVOP3(MI) && LiteralCount) { - ErrInfo = "VOP3 instruction uses literal"; - return false; + if (LiteralCount && !ST.hasVOP3Literal()) { + ErrInfo = "VOP3 instruction uses literal"; + return false; + } + if (LiteralCount > 1) { + ErrInfo = "VOP3 instruction uses more than one literal"; + return false; + } } } @@ -3067,17 +3424,43 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isSOP2(MI) || isSOPC(MI)) { + const MachineOperand &Src0 = MI.getOperand(Src0Idx); + const MachineOperand &Src1 = MI.getOperand(Src1Idx); + unsigned Immediates = 0; + + if (!Src0.isReg() && + !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) + Immediates++; + if (!Src1.isReg() && + !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) + Immediates++; + + if (Immediates > 1) { + ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; + return false; + } + } + if (isSOPK(MI)) { - int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); - if (sopkIsZext(MI)) { - if (!isUInt<16>(Imm)) { - ErrInfo = "invalid immediate for SOPK instruction"; + auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); + if (Desc.isBranch()) { + if (!Op->isMBB()) { + ErrInfo = "invalid branch target for SOPK instruction"; return false; } } else { - if (!isInt<16>(Imm)) { - ErrInfo = "invalid immediate for SOPK instruction"; - return false; + uint64_t Imm = Op->getImm(); + if (sopkIsZext(MI)) { + if (!isUInt<16>(Imm)) { + ErrInfo = "invalid immediate for SOPK instruction"; + return false; + } + } else { + if (!isInt<16>(Imm)) { + ErrInfo = "invalid immediate for SOPK instruction"; + return false; + } } } } @@ -3155,6 +3538,53 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isMIMG(MI)) { + const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); + if (DimOp) { + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, + AMDGPU::OpName::vaddr0); + int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); + 
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + const AMDGPU::MIMGDimInfo *Dim = + AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); + + if (!Dim) { + ErrInfo = "dim is out of range"; + return false; + } + + bool IsNSA = SRsrcIdx - VAddr0Idx > 1; + unsigned AddrWords = BaseOpcode->NumExtraArgs + + (BaseOpcode->Gradients ? Dim->NumGradients : 0) + + (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + + unsigned VAddrWords; + if (IsNSA) { + VAddrWords = SRsrcIdx - VAddr0Idx; + } else { + const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); + VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; + if (AddrWords > 8) + AddrWords = 16; + else if (AddrWords > 4) + AddrWords = 8; + else if (AddrWords == 3 && VAddrWords == 4) { + // CodeGen uses the V4 variant of instructions for three addresses, + // because the selection DAG does not support non-power-of-two types. + AddrWords = 4; + } + } + + if (VAddrWords != AddrWords) { + ErrInfo = "bad vaddr size"; + return false; + } + } + } + const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); if (DppCt) { using namespace AMDGPU::DPP; @@ -3165,10 +3595,29 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || - (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) { + (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || + (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { ErrInfo = "Invalid dpp_ctrl value"; return false; } + if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && + ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + ErrInfo = "Invalid dpp_ctrl value: " + "wavefront shifts are not supported on GFX10+"; + return false; + } + if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && + ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + ErrInfo = "Invalid dpp_ctrl value: " + "broadcasts are not supported on GFX10+"; + return false; + } + if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && + ST.getGeneration() < AMDGPUSubtarget::GFX10) { + ErrInfo = "Invalid dpp_ctrl value: " + "row_share and row_xmask are not supported before GFX10"; + return false; + } } return true; @@ -3183,9 +3632,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::WQM: return AMDGPU::WQM; case AMDGPU::WWM: return AMDGPU::WWM; - case AMDGPU::S_MOV_B32: - return MI.getOperand(1).isReg() ? + case AMDGPU::S_MOV_B32: { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + return MI.getOperand(1).isReg() || + RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; + } case AMDGPU::S_ADD_I32: return ST.hasAddNoCarry() ?
AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32; case AMDGPU::S_ADDC_U32: @@ -3199,7 +3651,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; - case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; + case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32; + case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32; + case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32; case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; @@ -3244,6 +3698,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; } + llvm_unreachable( + "Unexpected scalar opcode without corresponding vector one!"); } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, @@ -3263,30 +3719,21 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, return RI.getRegClass(RCID); } -bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { - switch (MI.getOpcode()) { - case AMDGPU::COPY: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::PHI: - case AMDGPU::INSERT_SUBREG: - return RI.hasVGPRs(getOpRegClass(MI, 0)); - default: - return RI.hasVGPRs(getOpRegClass(MI, OpNo)); - } -} - void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MachineBasicBlock::iterator I = MI; MachineBasicBlock *MBB = MI.getParent(); MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); - unsigned Opcode = AMDGPU::V_MOV_B32_e32; + unsigned Size = TRI->getRegSizeInBits(*RC); + unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; if (MO.isReg()) Opcode = AMDGPU::COPY; else if (RI.isSGPRClass(RC)) - Opcode = AMDGPU::S_MOV_B32; + Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) @@ -3396,37 +3843,53 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, return isLegalRegOperand(MRI, OpInfo, MO); // Handle non-register types that are treated like immediates. - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); return true; } bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const TargetRegisterClass *DefinedRC = OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; if (!MO) MO = &MI.getOperand(OpIdx); + int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); + int VOP3LiteralLimit = ST.hasVOP3Literal() ?
1 : 0; if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { + if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) + return false; - RegSubRegPair SGPRUsed; + SmallDenseSet<RegSubRegPair> SGPRsUsed; if (MO->isReg()) - SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); + SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (i == OpIdx) continue; const MachineOperand &Op = MI.getOperand(i); if (Op.isReg()) { - if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && + RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); + if (!SGPRsUsed.count(SGPR) && usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { - return false; + if (--ConstantBusLimit <= 0) + return false; + SGPRsUsed.insert(SGPR); } } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { - return false; + if (--ConstantBusLimit <= 0) + return false; + } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && + isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { + if (!VOP3LiteralLimit--) + return false; + if (--ConstantBusLimit <= 0) + return false; } } } @@ -3437,7 +3900,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } // Handle non-register types that are treated like immediates. - assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); + assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); if (!DefinedRC) { // This operand expects an immediate. @@ -3452,30 +3915,24 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, unsigned Opc = MI.getOpcode(); const MCInstrDesc &InstrDesc = get(Opc); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); MachineOperand &Src1 = MI.getOperand(Src1Idx); // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 - // we need to only have one constant bus use. - // - // Note we do not need to worry about literal constants here. They are - // disabled for the operand type for instructions because they will always - // violate the one constant bus use rule. + // we need to only have one constant bus use before GFX10. bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; - if (HasImplicitSGPR) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - - if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) - legalizeOpWithMove(MI, Src0Idx); - } + if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && + Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || + isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) + legalizeOpWithMove(MI, Src0Idx); // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for // both the value to write (src0) and lane select (src1). Fix up non-SGPR // src0/src1 with V_READFIRSTLANE. if (Opc == AMDGPU::V_WRITELANE_B32) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); const DebugLoc &DL = MI.getDebugLoc(); if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -3493,6 +3950,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, return; } + // No VOP2 instructions support AGPRs.
+ if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) + legalizeOpWithMove(MI, Src0Idx); + + if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) + legalizeOpWithMove(MI, Src1Idx); + // VOP2 src0 instructions support all operand types, so we don't need to check // their legality. If src1 is already legal, we don't need to do anything. if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) @@ -3520,9 +3984,6 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, return; } - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - // If src0 can be used as src1, commuting will make the operands legal. // Otherwise we have to give up and insert a move. // @@ -3556,12 +4017,11 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); Src1.setSubReg(Src0SubReg); + fixImplicitOperands(MI); } -// Legalize VOP3 operands. Because all operand types are supported for any -// operand, and since literal constants are not allowed and should never be -// seen, we only need to worry about inserting copies if we use multiple SGPR -// operands. +// Legalize VOP3 operands. All operand types are supported for any operand +// but only one literal constant and only starting from GFX10. void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); @@ -3572,8 +4032,35 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) }; + if (Opc == AMDGPU::V_PERMLANE16_B32 || + Opc == AMDGPU::V_PERMLANEX16_B32) { + // src1 and src2 must be scalar + MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); + MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); + const DebugLoc &DL = MI.getDebugLoc(); + if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); + } + if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src2); + Src2.ChangeToRegister(Reg, false); + } + } + // Find the one SGPR operand we are allowed to use. + int ConstantBusLimit = ST.getConstantBusLimit(Opc); + int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; + SmallDenseSet<unsigned> SGPRsUsed; unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + if (SGPRReg != AMDGPU::NoRegister) { + SGPRsUsed.insert(SGPRReg); + --ConstantBusLimit; + } for (unsigned i = 0; i < 3; ++i) { int Idx = VOP3Idx[i]; @@ -3581,16 +4068,38 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, break; MachineOperand &MO = MI.getOperand(Idx); - // We should never see a VOP3 instruction with an illegal immediate operand.
- if (!MO.isReg()) + if (!MO.isReg()) { + if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) + continue; + + if (LiteralLimit > 0 && ConstantBusLimit > 0) { + --LiteralLimit; + --ConstantBusLimit; + continue; + } + + --LiteralLimit; + --ConstantBusLimit; + legalizeOpWithMove(MI, Idx); continue; + } + + if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) && + !isOperandLegal(MI, Idx, &MO)) { + legalizeOpWithMove(MI, Idx); + continue; + } if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) continue; // VGPRs are legal - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. + // We can use one SGPR in each VOP3 instruction prior to GFX10 + // and two starting from GFX10. + if (SGPRsUsed.count(MO.getReg())) + continue; + if (ConstantBusLimit > 0) { + SGPRsUsed.insert(MO.getReg()); + --ConstantBusLimit; continue; } @@ -3607,6 +4116,15 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, unsigned DstReg = MRI.createVirtualRegister(SRC); unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; + if (RI.hasAGPRs(VRC)) { + VRC = RI.getEquivalentVGPRClass(VRC); + unsigned NewSrcReg = MRI.createVirtualRegister(VRC); + BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), + get(TargetOpcode::COPY), NewSrcReg) + .addReg(SrcReg); + SrcReg = NewSrcReg; + } + if (SubRegs == 1) { BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(AMDGPU::V_READFIRSTLANE_B32), DstReg) @@ -3691,15 +4209,27 @@ static void emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, MachineOperand &Rsrc) { + MachineFunction &MF = *OrigBB.getParent(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned SaveExecOpc = + ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + unsigned XorTermOpc = + ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; + unsigned AndOpc = + ST.isWave32() ?
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + MachineBasicBlock::iterator I = LoopBB.begin(); unsigned VRsrc = Rsrc.getReg(); unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); - unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); + unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC); + unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC); + unsigned AndCond = MRI.createVirtualRegister(BoolXExecRC); unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); @@ -3737,22 +4267,22 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1) .addReg(SRsrc, 0, AMDGPU::sub2_sub3) .addReg(VRsrc, 0, AMDGPU::sub2_sub3); - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond) + BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond) .addReg(CondReg0) .addReg(CondReg1); MRI.setSimpleHint(SaveExec, AndCond); // Update EXEC to matching lanes, saving original to SaveExec. - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec) + BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) .addReg(AndCond, RegState::Kill); // The original instruction is here; we insert the terminators after it. I = LoopBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) + .addReg(Exec) .addReg(SaveExec); BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); } @@ -3763,15 +4293,19 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc, MachineDominatorTree *MDT) { MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock::iterator I(&MI); const DebugLoc &DL = MI.getDebugLoc(); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); // Save the EXEC mask - BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); // Killed uses in the instruction we are waterfalling around will be // incorrect due to the added control-flow.
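// Editor's note: illustrative sketch, not part of the imported patch. The
// waterfall-loop hunks above parameterize the EXEC-mask bookkeeping over the
// wave size: wave32 subtargets use EXEC_LO and the 32-bit scalar opcodes,
// wave64 the 64-bit ones. A minimal helper capturing that selection could
// look like the following (the struct and function names are hypothetical):
//
//   struct WaveExecOps {
//     unsigned Exec;        // AMDGPU::EXEC_LO on wave32, AMDGPU::EXEC on wave64
//     unsigned SaveExecOpc; // S_AND_SAVEEXEC_B32 / S_AND_SAVEEXEC_B64
//     unsigned XorTermOpc;  // S_XOR_B32_term / S_XOR_B64_term
//     unsigned AndOpc;      // S_AND_B32 / S_AND_B64
//     unsigned MovOpc;      // S_MOV_B32 / S_MOV_B64
//   };
//
//   static WaveExecOps getWaveExecOps(const GCNSubtarget &ST) {
//     if (ST.isWave32())
//       return {AMDGPU::EXEC_LO, AMDGPU::S_AND_SAVEEXEC_B32,
//               AMDGPU::S_XOR_B32_term, AMDGPU::S_AND_B32, AMDGPU::S_MOV_B32};
//     return {AMDGPU::EXEC, AMDGPU::S_AND_SAVEEXEC_B64,
//             AMDGPU::S_XOR_B64_term, AMDGPU::S_AND_B64, AMDGPU::S_MOV_B64};
//   }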
@@ -3820,8 +4354,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, // Restore the EXEC mask MachineBasicBlock::iterator First = RemainderBB->begin(); - BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(SaveExec); + BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); } // Extract pointer from Rsrc and return a zero-value Rsrc replacement. @@ -3901,7 +4434,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, continue; const TargetRegisterClass *OpRC = MRI.getRegClass(MI.getOperand(i).getReg()); - if (RI.hasVGPRs(OpRC)) { + if (RI.hasVectorRegisters(OpRC)) { VRC = OpRC; } else { SRC = OpRC; @@ -3914,7 +4447,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); - VRC = RI.getEquivalentVGPRClass(SRC); + VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC) + : RI.getEquivalentVGPRClass(SRC); } RC = VRC; } else { @@ -3983,7 +4517,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, // Legalize SI_INIT_M0 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { MachineOperand &Src = MI.getOperand(0); - if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg()))) + if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); return; } @@ -4047,19 +4581,28 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC); + unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC); + unsigned RsrcPtr, NewSRsrc; std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 - DebugLoc DL = MI.getDebugLoc(); - BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) - .addReg(RsrcPtr, 0, AMDGPU::sub0) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo) + .addDef(CondReg0) + .addReg(RsrcPtr, 0, AMDGPU::sub0) + .addReg(VAddr->getReg(), 0, AMDGPU::sub0) + .addImm(0); // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 - BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) - .addReg(RsrcPtr, 0, AMDGPU::sub1) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) + .addDef(CondReg1, RegState::Dead) + .addReg(RsrcPtr, 0, AMDGPU::sub1) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1) + .addReg(CondReg0, RegState::Kill) + .addImm(0); // NewVaddr = {NewVaddrHi, NewVaddrLo} BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) @@ -4106,6 +4649,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, getNamedOperand(MI, AMDGPU::OpName::glc)) { MIB.addImm(GLC->getImm()); } + if (const MachineOperand *DLC = + getNamedOperand(MI, AMDGPU::OpName::dlc)) { + MIB.addImm(DLC->getImm()); + } MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); @@ -4235,37 +4782,37 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, continue; case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_LSHLREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= 
AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_ASHRREV_I32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_LSHRREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_LSHLREV_B64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_ASHRREV_I64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_LSHRREV_B64; swapOperands(Inst); } @@ -4279,10 +4826,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, case AMDGPU::S_CBRANCH_SCC0: case AMDGPU::S_CBRANCH_SCC1: // Clear unused bits of vcc - BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), - AMDGPU::VCC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); + if (ST.isWave32()) + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32), + AMDGPU::VCC_LO) + .addReg(AMDGPU::EXEC_LO) + .addReg(AMDGPU::VCC_LO); + else + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), + AMDGPU::VCC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); break; case AMDGPU::S_BFE_U64: @@ -4339,8 +4892,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { MachineOperand &Op = Inst.getOperand(i); if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { + // Only propagate through live-def of SCC. + if (Op.isDef() && !Op.isDead()) + addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); Inst.RemoveOperand(i); - addSCCDefUsersToVALUWorklist(Inst, Worklist); } } @@ -4358,6 +4913,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, } Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); + fixImplicitOperands(Inst); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { const MachineOperand &OffsetWidthOp = Inst.getOperand(2); @@ -4445,6 +5001,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, Inst.RemoveOperand(3); Inst.setDesc(get(NewOpc)); + Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit Inst.addImplicitDefUseOperands(*MBB.getParent()); MRI.replaceRegWith(OldDstReg, ResultReg); legalizeOperands(Inst, MDT); @@ -4514,8 +5071,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); bool Src1IsSGPR = Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); - MachineInstr *Not = nullptr; - MachineInstr *Xor = nullptr; + MachineInstr *Xor; unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -4523,14 +5079,12 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, // The next iteration over the work list will lower these to the vector // unit as necessary. 
if (Src0IsSGPR) { - Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) - .add(Src0); + BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) .addReg(Temp) .add(Src1); } else if (Src1IsSGPR) { - Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) - .add(Src1); + BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) .add(Src0) .addReg(Temp); @@ -4538,8 +5092,8 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) .add(Src0) .add(Src1); - Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) - .addReg(Temp); + MachineInstr *Not = + BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); Worklist.insert(Not); } @@ -4670,13 +5224,14 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned CarryReg = MRI.createVirtualRegister(CarryRC); + unsigned DeadCarryReg = MRI.createVirtualRegister(CarryRC); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); @@ -4705,7 +5260,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) .addReg(CarryReg, RegState::Define) .add(SrcReg0Sub0) - .add(SrcReg1Sub0); + .add(SrcReg1Sub0) + .addImm(0); // clamp bit unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; MachineInstr *HiHalf = @@ -4713,7 +5269,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, .addReg(DeadCarryReg, RegState::Define | RegState::Dead) .add(SrcReg0Sub1) .add(SrcReg1Sub1) - .addReg(CarryReg, RegState::Kill); + .addReg(CarryReg, RegState::Kill) + .addImm(0); // clamp bit BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) @@ -4943,7 +5500,23 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { + + unsigned OpNo = 0; + + switch (UseMI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::WQM: + case AMDGPU::WWM: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: + break; + default: + OpNo = I.getOperandNo(); + break; + } + + if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { Worklist.insert(&UseMI); do { @@ -5017,19 +5590,23 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist( - MachineInstr &SCCDefInst, SetVectorType &Worklist) const { +void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, + MachineInstr &SCCDefInst, + SetVectorType &Worklist) const { + // Ensure that def inst defines SCC, which is still live. 
+ assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && + !Op.isDead() && Op.getParent() == &SCCDefInst); // This assumes that all the users of SCC are in the same block // as the SCC def. - for (MachineInstr &MI : - make_range(MachineBasicBlock::iterator(SCCDefInst), - SCCDefInst.getParent()->end())) { + for (MachineInstr &MI : // Skip the def inst itself. + make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), + SCCDefInst.getParent()->end())) { + // Check if SCC is used first. + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) + Worklist.insert(&MI); // Exit if we find another SCC def. if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) return; - - if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) - Worklist.insert(&MI); } } @@ -5046,14 +5623,26 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::REG_SEQUENCE: case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: - case AMDGPU::WWM: - if (RI.hasVGPRs(NewDstRC)) - return nullptr; + case AMDGPU::WWM: { + const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); + if (RI.hasAGPRs(SrcRC)) { + if (RI.hasAGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + } else { + if (RI.hasVGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + } - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - return nullptr; return NewDstRC; + } default: return NewDstRC; } @@ -5139,6 +5728,12 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, } uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + return (22ULL << 44) | // IMG_FORMAT_32_FLOAT + (1ULL << 56) | // RESOURCE_LEVEL = 1 + (3ULL << 60); // OOB_SELECT = 3 + } + uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; if (ST.isAmdHsaOS()) { // Set ATC = 1. GFX9 doesn't have this bit. @@ -5165,12 +5760,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; } - // IndexStride = 64. - Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; + // IndexStride = 64 / 32. + uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2; + Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && + ST.getGeneration() <= AMDGPUSubtarget::GFX9) Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; return Rsrc23; @@ -5267,25 +5864,35 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return DescSize; // No operands. if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) - return DescSize + 4; + return isVOP3(MI) ? 12 : (DescSize + 4); int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return DescSize; if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) - return DescSize + 4; + return isVOP3(MI) ? 12 : (DescSize + 4); int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (Src2Idx == -1) return DescSize; if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx])) - return DescSize + 4; + return isVOP3(MI) ? 12 : (DescSize + 4); return DescSize; } + // Check whether we have extra NSA words. 
+ if (isMIMG(MI)) { + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); + if (VAddr0Idx < 0) + return 8; + + int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); + } + switch (Opc) { case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: @@ -5294,10 +5901,12 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return 0; case TargetOpcode::BUNDLE: return getInstBundleSize(MI); - case TargetOpcode::INLINEASM: { + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: { const MachineFunction *MF = MI.getParent()->getParent(); const char *AsmStr = MI.getOperand(0).getSymbolName(); - return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); + return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), + &MF->getSubtarget()); } default: return DescSize; @@ -5332,7 +5941,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstr *SIIF = BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) .add(Branch->getOperand(0)) @@ -5359,8 +5968,8 @@ void SIInstrInfo::convertNonUniformLoopRegion( if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); + unsigned BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstrBuilder HeaderPHIBuilder = BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), @@ -5370,7 +5979,7 @@ void SIInstrInfo::convertNonUniformLoopRegion( HeaderPHIBuilder.addReg(BackEdgeReg); } else { MachineBasicBlock *PMBB = *PI; - unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), ZeroReg, 0); HeaderPHIBuilder.addReg(ZeroReg); @@ -5432,7 +6041,9 @@ SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, { MO_REL32_LO, "amdgpu-rel32-lo" }, - { MO_REL32_HI, "amdgpu-rel32-hi" } + { MO_REL32_HI, "amdgpu-rel32-hi" }, + { MO_ABS32_LO, "amdgpu-abs32-lo" }, + { MO_ABS32_HI, "amdgpu-abs32-hi" }, }; return makeArrayRef(TargetFlags); @@ -5452,8 +6063,8 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC); + unsigned UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); + MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) .addReg(UnusedCarry, RegState::Define | RegState::Dead); @@ -5480,6 +6091,20 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con } } +void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { + MachineBasicBlock *MBB = 
MI.getParent(); + MachineFunction *MF = MBB->getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + + if (!ST.isWave32()) + return; + + for (auto &Op : MI.implicit_operands()) { + if (Op.isReg() && Op.getReg() == AMDGPU::VCC) + Op.setReg(AMDGPU::VCC_LO); + } +} + bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { if (!isSMRD(MI)) return false; @@ -5493,6 +6118,25 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { return RCID == AMDGPU::SReg_128RegClassID; } +bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, + bool Signed) const { + // TODO: Should 0 be special cased? + if (!ST.hasFlatInstOffsets()) + return false; + + if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) + return false; + + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + return (Signed && isInt<12>(Offset)) || + (!Signed && isUInt<11>(Offset)); + } + + return (Signed && isInt<13>(Offset)) || + (!Signed && isUInt<12>(Offset)); +} + + // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td enum SIEncodingFamily { SI = 0, @@ -5500,7 +6144,9 @@ enum SIEncodingFamily { SDWA = 2, SDWA9 = 3, GFX80 = 4, - GFX9 = 5 + GFX9 = 5, + GFX10 = 6, + SDWA10 = 7 }; static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -5513,6 +6159,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { case AMDGPUSubtarget::VOLCANIC_ISLANDS: case AMDGPUSubtarget::GFX9: return SIEncodingFamily::VI; + case AMDGPUSubtarget::GFX10: + return SIEncodingFamily::GFX10; } llvm_unreachable("Unknown subtarget generation!"); } @@ -5521,18 +6169,29 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { SIEncodingFamily Gen = subtargetEncodingFamily(ST); if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && - ST.getGeneration() >= AMDGPUSubtarget::GFX9) + ST.getGeneration() == AMDGPUSubtarget::GFX9) Gen = SIEncodingFamily::GFX9; - if (get(Opcode).TSFlags & SIInstrFlags::SDWA) - Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 - : SIEncodingFamily::SDWA; // Adjust the encoding family to GFX80 for D16 buffer instructions when the // subtarget has UnpackedD16VMem feature. // TODO: remove this when we discard GFX80 encoding. if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) Gen = SIEncodingFamily::GFX80; + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { + switch (ST.getGeneration()) { + default: + Gen = SIEncodingFamily::SDWA; + break; + case AMDGPUSubtarget::GFX9: + Gen = SIEncodingFamily::SDWA9; + break; + case AMDGPUSubtarget::GFX10: + Gen = SIEncodingFamily::SDWA10; + break; + } + } + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. @@ -5627,3 +6286,77 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, } return nullptr; } + +bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI, + const MachineInstr &UseMI) { + assert(MRI.isSSA() && "Must be run on SSA"); + + auto *TRI = MRI.getTargetRegisterInfo(); + auto *DefBB = DefMI.getParent(); + + // Don't bother searching between blocks, although it is possible this block + // doesn't modify exec. + if (UseMI.getParent() != DefBB) + return true; + + const int MaxInstScan = 20; + int NumInst = 0; + + // Stop scan at the use.
+ auto E = UseMI.getIterator(); + for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { + if (I->isDebugInstr()) + continue; + + if (++NumInst > MaxInstScan) + return true; + + if (I->modifiesRegister(AMDGPU::EXEC, TRI)) + return true; + } + + return false; +} + +bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI) { + assert(MRI.isSSA() && "Must be run on SSA"); + + auto *TRI = MRI.getTargetRegisterInfo(); + auto *DefBB = DefMI.getParent(); + + const int MaxUseInstScan = 10; + int NumUseInst = 0; + + for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) { + // Don't bother searching between blocks, although it is possible this block + // doesn't modify exec. + if (UseInst.getParent() != DefBB) + return true; + + if (++NumUseInst > MaxUseInstScan) + return true; + } + + const int MaxInstScan = 20; + int NumInst = 0; + + // Stop scan when we have seen all the uses. + for (auto I = std::next(DefMI.getIterator()); ; ++I) { + if (I->isDebugInstr()) + continue; + + if (++NumInst > MaxInstScan) + return true; + + if (I->readsRegister(VReg)) + if (--NumUseInst == 0) + return false; + + if (I->modifiesRegister(AMDGPU::EXEC, TRI)) + return true; + } +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 5b1a05f3785e..3ff35da0b963 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -1,9 +1,8 @@ //===- SIInstrInfo.h - SI Instruction Info Interface ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -121,14 +120,15 @@ private: void addUsersToMoveToVALUWorklist(unsigned Reg, MachineRegisterInfo &MRI, SetVectorType &Worklist) const; - void - addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, - SetVectorType &Worklist) const; + void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + MachineInstr &SCCDefInst, + SetVectorType &Worklist) const; const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; - bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const; + bool checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, + const MachineInstr &MIb) const; unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; @@ -143,7 +143,7 @@ protected: public: enum TargetOperandFlags { - MO_MASK = 0x7, + MO_MASK = 0xf, MO_NONE = 0, // MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL. @@ -157,7 +157,13 @@ public: MO_REL32 = 4, MO_REL32_LO = 4, // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI. 
- MO_REL32_HI = 5 + MO_REL32_HI = 5, + + MO_LONG_BRANCH_FORWARD = 6, + MO_LONG_BRANCH_BACKWARD = 7, + + MO_ABS32_LO = 8, + MO_ABS32_HI = 9, }; explicit SIInstrInfo(const GCNSubtarget &ST); @@ -173,11 +179,13 @@ public: int64_t &Offset1, int64_t &Offset2) const override; - bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + bool getMemOperandWithOffset(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const final; - bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2, + bool shouldClusterMemOps(const MachineOperand &BaseOp1, + const MachineOperand &BaseOp2, unsigned NumLoads) const override; bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, @@ -294,7 +302,8 @@ public: unsigned Kind) const override; bool - areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, + areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; bool isFoldableCopy(const MachineInstr &MI) const; @@ -376,6 +385,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::SOPP; } + static bool isPacked(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsPacked; + } + + bool isPacked(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsPacked; + } + static bool isVOP1(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VOP1; } @@ -450,6 +467,8 @@ public: return get(Opcode).TSFlags & SIInstrFlags::DS; } + bool isAlwaysGDS(uint16_t Opcode) const; + static bool isMIMG(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::MIMG; } @@ -477,6 +496,11 @@ public: return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT); } + // FIXME: Make this more precise + static bool isFLATScratch(const MachineInstr &MI) { + return isSegmentSpecificFLAT(MI); + } + // Any FLAT encoded instruction, including global_* and scratch_*. bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; @@ -546,6 +570,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VINTRP; } + static bool isMAI(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsMAI; + } + + bool isMAI(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsMAI; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } @@ -612,6 +644,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding; } + static bool isFPAtomic(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FPAtomic; + } + + bool isFPAtomic(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FPAtomic; + } + bool isVGPRCopy(const MachineInstr &MI) const { assert(MI.isCopy()); unsigned Dest = MI.getOperand(0).getReg(); @@ -620,9 +660,21 @@ public: return !RI.isSGPRReg(MRI, Dest); } + bool hasVGPRUses(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return llvm::any_of(MI.explicit_uses(), + [&MRI, this](const MachineOperand &MO) { + return MO.isReg() && RI.isVGPR(MRI, MO.getReg());}); + } + /// Whether we must prevent this instruction from executing with EXEC = 0. bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const; + /// Returns true if the instruction could potentially depend on the value of + /// exec. 
If false, exec dependencies may safely be ignored. + bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const; + bool isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const; @@ -761,10 +813,6 @@ public: return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; } - /// \returns true if it is legal for the operand at index \p OpNo - /// to read a VGPR. - bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const; - /// Legalize the \p OpIndex operand of this instruction by inserting /// a MOV. For example: /// ADD_I32_e32 VGPR0, 15 @@ -836,7 +884,7 @@ public: void insertReturn(MachineBasicBlock &MBB) const; /// Return the number of wait states that result from executing this /// instruction. - unsigned getNumWaitStates(const MachineInstr &MI) const; + static unsigned getNumWaitStates(const MachineInstr &MI); /// Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. @@ -922,10 +970,27 @@ public: return isUInt<12>(Imm); } + /// Returns if \p Offset is legal for the subtarget as the offset to a FLAT + /// encoded instruction. If \p Signed, this is for an instruction that + /// interprets the offset as signed. + bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, + bool Signed) const; + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. int pseudoToMCOpcode(int Opcode) const; + + const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum, + const TargetRegisterInfo *TRI, + const MachineFunction &MF) + const override { + if (OpNum >= TID.getNumOperands()) + return nullptr; + return RI.getRegClass(TID.OpInfo[OpNum].RegClass); + } + + void fixImplicitOperands(MachineInstr &MI) const; }; /// \brief Returns true if a reg:subreg pair P has a TRC class @@ -956,6 +1021,21 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI); +/// \brief Return false if EXEC is not changed between the def of \p VReg at \p +/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not +/// attempt to track between blocks. +bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI, + const MachineInstr &UseMI); + +/// \brief Return false if EXEC is not changed between the def of \p VReg at \p +/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to +/// track between blocks. +bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI); + namespace AMDGPU { LLVM_READONLY @@ -1003,17 +1083,14 @@ namespace AMDGPU { LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode); + LLVM_READONLY + int getVCMPXNoSDstOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23); - // For MachineOperands. 
- enum TargetFlags { - TF_LONG_BRANCH_FORWARD = 1 << 0, - TF_LONG_BRANCH_BACKWARD = 1 << 1 - }; - } // end namespace AMDGPU namespace SI { diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 13afa4d4974b..c382c816e0b4 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1,25 +1,21 @@ //===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -def isCI : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SEA_ISLANDS">; -def isCIOnly : Predicate<"Subtarget->getGeneration() ==" - "AMDGPUSubtarget::SEA_ISLANDS">, - AssemblerPredicate <"FeatureSeaIslands">; -def isVIOnly : Predicate<"Subtarget->getGeneration() ==" - "AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate <"FeatureVolcanicIslands">; + +def isWave32 : Predicate<"Subtarget->getWavefrontSize() == 32">, + AssemblerPredicate <"FeatureWavefrontSize32">; +def isWave64 : Predicate<"Subtarget->getWavefrontSize() == 64">, + AssemblerPredicate <"FeatureWavefrontSize64">; def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; class GCNPredicateControl : PredicateControl { - Predicate SIAssemblerPredicate = isSICI; - Predicate VIAssemblerPredicate = isVI; + Predicate SIAssemblerPredicate = isGFX6GFX7; + Predicate VIAssemblerPredicate = isGFX8GFX9; } // Except for the NONE field, this must be kept in sync with the @@ -32,6 +28,8 @@ def SIEncodingFamily { int SDWA9 = 3; int GFX80 = 4; int GFX9 = 5; + int GFX10 = 6; + int SDWA10 = 7; } //===----------------------------------------------------------------------===// @@ -41,10 +39,16 @@ def SIEncodingFamily { def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", - SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>, + SDTypeProfile<1, 4, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>, + SDTCisVT<4, i1>]>, [SDNPMayLoad, SDNPMemOperand] >; +def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i16>]>, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain, SDNPInGlue] +>; + def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; @@ -57,10 +61,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> ]>; -def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; @@ -69,6 +69,13 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; +// load_d16_{lo|hi} ptr, tied_input +def SIload_d16 : SDTypeProfile<1, 2, [ + SDTCisPtrTy<1>, + SDTCisSameAs<0, 2> +]>; + + def SDTtbuffer_load : SDTypeProfile<1, 8, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -101,9 +108,6 @@ def
SDTtbuffer_store : SDTypeProfile<0, 9, def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; -def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", - SDTtbuffer_store, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; @@ -120,6 +124,14 @@ def SDTBufferLoad : SDTypeProfile<1, 7, def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_ubyte : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_ushort : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_byte : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_short: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16", @@ -138,6 +150,12 @@ def SDTBufferStore : SDTypeProfile<0, 8, def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_byte: SDNode <"AMDGPUISD::BUFFER_STORE_BYTE", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_short : SDNode <"AMDGPUISD::BUFFER_STORE_SHORT", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; @@ -147,9 +165,7 @@ def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16", class SDBufferAtomic<string opcode> : SDNode <opcode, SDTypeProfile<1, 8, - [SDTCisVT<0, i32>, // dst - SDTCisVT<1, i32>, // vdata - SDTCisVT<2, v4i32>, // rsrc + [SDTCisVT<2, v4i32>, // rsrc SDTCisVT<3, i32>, // vindex(VGPR) SDTCisVT<4, i32>, // voffset(VGPR) SDTCisVT<5, i32>, // soffset(SGPR) @@ -159,6 +175,19 @@ class SDBufferAtomic<string opcode> : SDNode <opcode, >; +class SDBufferAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode, + SDTypeProfile<0, 8, + [SDTCisVT<0, ty>, // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<7, i1>]>, // idxen(imm) + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] +>; + def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">; def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">; def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">; @@ -169,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; +def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>; +def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>; def SIbuffer_atomic_cmpswap :
SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; +class SDGlobalAtomicNoRtn : SDNode , // vaddr + SDTCisVT<1, ty>]>, // vdata + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] +>; + +def SIglobal_atomic_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_FADD", f32>; +def SIglobal_atomic_pk_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_PK_FADD", v2f16>; + def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; +def SIlds : SDNode<"AMDGPUISD::LDS", + SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> +>; + +def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_lo_i8 : SDNode<"AMDGPUISD::LOAD_D16_LO_I8", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_hi : SDNode<"AMDGPUISD::LOAD_D16_HI", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_hi_u8 : SDNode<"AMDGPUISD::LOAD_D16_HI_U8", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + //===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// @@ -201,7 +276,8 @@ class isFloatType { !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, !if(!eq(SrcVT.Value, v2f16.Value), 1, - 0)))); + !if(!eq(SrcVT.Value, v4f16.Value), 1, + 0))))); } class isIntType { @@ -215,8 +291,9 @@ class isIntType { class isPackedType { bit ret = !if(!eq(SrcVT.Value, v2i16.Value), 1, - !if(!eq(SrcVT.Value, v2f16.Value), 1, 0) - ); + !if(!eq(SrcVT.Value, v2f16.Value), 1, + !if(!eq(SrcVT.Value, v4f16.Value), 1, 0) + )); } //===----------------------------------------------------------------------===// @@ -228,7 +305,7 @@ defm atomic_dec_global : global_binary_atomic_op; def atomic_inc_local : local_binary_atomic_op; def atomic_dec_local : local_binary_atomic_op; -def atomic_load_fadd_local : local_binary_atomic_op; +def atomic_load_fadd_local : local_binary_atomic_op; def atomic_load_fmin_local : local_binary_atomic_op; def atomic_load_fmax_local : local_binary_atomic_op; @@ -250,13 +327,13 @@ def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; -def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{ - return cast(N)->getAddressingMode() == ISD::UNINDEXED; -}]>; +def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr)> { + let IsUnindexed = 1; +} -def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{ - return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; -}]>; +def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsNonExtLoad = 1; +} def atomic_load_32_glue : PatFrag<(ops node:$ptr), (AMDGPUatomic_ld_glue node:$ptr)> { @@ -270,35 +347,49 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr), let MemoryVT = i64; } -def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{ - return cast(N)->getExtensionType() == ISD::EXTLOAD; -}]>; +def extload_glue : PatFrag<(ops node:$ptr), (load_glue 
node:$ptr)> { + let IsLoad = 1; + let IsAnyExtLoad = 1; +} def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{ return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; }]>; -def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; -}]>; +def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsLoad = 1; + let IsZeroExtLoad = 1; +} -def az_extload_glue : AZExtLoadBase <unindexedload_glue>; +def extloadi8_glue : PatFrag<(ops node:$ptr), (extload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} -def az_extloadi8_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +def zextloadi8_glue : PatFrag<(ops node:$ptr), (zextload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} -def az_extloadi16_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +def extloadi16_glue : PatFrag<(ops node:$ptr), (extload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} -def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +def zextloadi16_glue : PatFrag<(ops node:$ptr), (zextload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} -def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} + +def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} def load_glue_align8 : Aligned8Bytes < (ops node:$ptr), (load_glue node:$ptr) @@ -311,8 +402,10 @@ def load_glue_align16 : Aligned16Bytes < def load_local_m0 : LoadFrag<load_glue>, LocalAddress; def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress; def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress; -def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress; -def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress; +def extloadi8_local_m0 : LoadFrag<extloadi8_glue>, LocalAddress; +def zextloadi8_local_m0 : LoadFrag<zextloadi8_glue>, LocalAddress; +def extloadi16_local_m0 : LoadFrag<extloadi16_glue>, LocalAddress; +def zextloadi16_local_m0 : LoadFrag<zextloadi16_glue>, LocalAddress; def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress; def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress; def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress; @@ -386,6 +479,51 @@ def si_setcc_uniform : PatFrag < return true; }]>; +//===----------------------------------------------------------------------===// +// SDNodes PatFrags for d16 loads +//===----------------------------------------------------------------------===// + +class LoadD16Frag <SDPatternOperator op> : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>; +class LocalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, LocalAddress; +class GlobalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, GlobalLoadAddress; +class PrivateLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, PrivateAddress; +class FlatLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, FlatLoadAddress; + +def load_d16_hi_local : LocalLoadD16 <SIload_d16_hi>; +def az_extloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_u8>; +def sextloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_i8>; + +def load_d16_hi_global : GlobalLoadD16 <SIload_d16_hi>; +def az_extloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_u8>; +def sextloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_i8>; + +def load_d16_hi_private : PrivateLoadD16 <SIload_d16_hi>; +def az_extloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_u8>; +def sextloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_i8>; + +def load_d16_hi_flat : FlatLoadD16 <SIload_d16_hi>; +def
az_extloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_u8>; def sextloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_i8>; + + def load_d16_lo_local : LocalLoadD16 <SIload_d16_lo>; +def az_extloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_u8>; +def sextloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_i8>; + +def load_d16_lo_global : GlobalLoadD16 <SIload_d16_lo>; +def az_extloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_u8>; +def sextloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_i8>; + +def load_d16_lo_private : PrivateLoadD16 <SIload_d16_lo>; +def az_extloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_u8>; +def sextloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_i8>; + +def load_d16_lo_flat : FlatLoadD16 <SIload_d16_lo>; +def az_extloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_u8>; +def sextloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_i8>; + + + def lshr_rev : PatFrag < (ops node:$src1, node:$src0), (srl $src0, $src1) @@ -410,6 +548,7 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, >; def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; } defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; @@ -424,7 +563,7 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; -defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>; +defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32>; defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>; defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>; @@ -433,6 +572,7 @@ def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, >; def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal <atomic_cmp_swap_glue>; +def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion <atomic_cmp_swap_glue>; def as_i1imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1); }]>; -def SIMM16bit : PatLeaf <(imm), - [{return isInt<16>(N->getSExtValue());}] +def SIMM16bit : ImmLeaf <i32, + [{return isInt<16>(Imm);}] +>; + +def UIMM16bit : ImmLeaf <i32, + [{return isUInt<16>(Imm); }] >; class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ @@ -515,6 +659,22 @@ def ShiftAmt32Imm : PatLeaf <(imm), [{ return N->getZExtValue() < 32; }]>; +def getNegV2I16Imm : SDNodeXForm<build_vector, [{ + return SDValue(packNegConstantV2I16(N, *CurDAG), 0); +}]>; + +def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ + assert(N->getNumOperands() == 2); + assert(N->getOperand(0).getValueType().getSizeInBits() == 16); + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + if (Src0 == Src1) + return isNegInlineImmediate(Src0.getNode()); + + return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) || + (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); +}], getNegV2I16Imm>; + //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// @@ -588,6 +748,14 @@ def SwizzleMatchClass : AsmOperandClass { let IsOptional = 1; } +def EndpgmMatchClass : AsmOperandClass { + let Name = "EndpgmImm"; + let PredicateMethod = "isEndpgm"; + let ParserMethod = "parseEndpgmOp"; + let RenderMethod = "addImmOperands"; + let IsOptional = 1; +} + def ExpTgtMatchClass : AsmOperandClass { let Name = "ExpTgt"; let PredicateMethod = "isExpTgt"; @@ -605,6 +773,11 @@ def SwizzleImm : Operand<i16> { let ParserMatchClass = SwizzleMatchClass; } +def EndpgmImm : Operand<i16> { + let PrintMethod = "printEndpgm"; + let ParserMatchClass = EndpgmMatchClass; +} + def SWaitMatchClass : AsmOperandClass { let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; @@ -619,11 +792,41 @@ def VReg32OrOffClass : AsmOperandClass { def WAIT_FLAG : Operand <i32> { let ParserMatchClass =
SWaitMatchClass; let PrintMethod = "printWaitFlag"; + let OperandType = "OPERAND_IMMEDIATE"; } include "SIInstrFormats.td" include "VIInstrFormats.td" +def BoolReg : AsmOperandClass { + let Name = "BoolReg"; + let ParserMethod = "parseBoolReg"; + let RenderMethod = "addRegOperands"; +} + +class BoolRC : RegisterOperand { + let ParserMatchClass = BoolReg; + let DecoderMethod = "decodeBoolReg"; +} + +def SSrc_i1 : RegisterOperand { + let ParserMatchClass = BoolReg; + let DecoderMethod = "decodeBoolReg"; +} + +def VOPDstS64orS32 : BoolRC { + let PrintMethod = "printVOPDst"; +} + +// SCSrc_i1 is the operand for pseudo instructions only. +// Boolean immeadiates shall not be exposed to codegen instructions. +def SCSrc_i1 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM_INT32"; + let ParserMatchClass = BoolReg; + let DecoderMethod = "decodeBoolReg"; +} + // ===----------------------------------------------------------------------===// // ExpSrc* Special cases for exp src operands which are printed as // "off" depending on en operand. @@ -662,11 +865,12 @@ def SDWASrc_i16 : SDWASrc; def SDWASrc_f32 : SDWASrc; def SDWASrc_f16 : SDWASrc; -def SDWAVopcDst : VOPDstOperand { +def SDWAVopcDst : BoolRC { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_SDWA_VOPC_DST"; let EncoderMethod = "getSDWAVopcDstEncoding"; let DecoderMethod = "decodeSDWAVopcDst"; + let PrintMethod = "printVOPDst"; } class NamedMatchClass : AsmOperandClass { @@ -688,21 +892,11 @@ class NamedOperandU8 : Operand { let ParserMatchClass = MatchClass; } -class NamedOperandU12 : Operand { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; -} - class NamedOperandU16 : Operand { let PrintMethod = "print"#Name; let ParserMatchClass = MatchClass; } -class NamedOperandS13 : Operand { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; -} - class NamedOperandU32 : Operand { let PrintMethod = "print"#Name; let ParserMatchClass = MatchClass; @@ -720,8 +914,7 @@ def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>; def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>; def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; -def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>; -def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>; +def flat_offset : NamedOperandU16<"FlatOffset", NamedMatchClass<"FlatOffset">>; def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; @@ -732,6 +925,7 @@ def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>; def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>; def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>; +def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; @@ -746,11 +940,15 @@ def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>; def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; +def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>; + +def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; def row_mask : 
NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>; +def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>; def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>; def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; @@ -762,6 +960,10 @@ def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; +def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>; +def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>; +def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>; + def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>; def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { @@ -793,9 +995,6 @@ def f32kimm : kimmOperand; def KImmFP16MatchClass : KImmMatchClass<16>; def f16kimm : kimmOperand; - -def VOPDstS64 : VOPDstOperand ; - class FPInputModsMatchClass : AsmOperandClass { let Name = "RegOrImmWithFP"#opSize#"InputMods"; let ParserMethod = "parseRegOrImmWithFPInputMods"; @@ -863,7 +1062,7 @@ def FP32SDWAInputMods : FPSDWAInputMods; def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isVReg"; + let PredicateMethod = "isVReg32"; } def FPVRegInputMods : InputMods { @@ -890,7 +1089,7 @@ def Int32SDWAInputMods : IntSDWAInputMods; def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isVReg"; + let PredicateMethod = "isVReg32"; } def IntVRegInputMods : InputMods { @@ -941,6 +1140,8 @@ def VOP3Mods : ComplexPattern; def VOP3NoMods : ComplexPattern; // VOP3Mods, but the input source is known to never be NaN. def VOP3Mods_nnan : ComplexPattern; +// VOP3Mods, but only allowed for f32 operands. +def VOP3Mods_f32 : ComplexPattern; def VOP3OMods : ComplexPattern; @@ -995,6 +1196,31 @@ def TRAPID{ int LLVM_DEBUG_TRAP = 3; } +def HWREG { + int MODE = 1; + int STATUS = 2; + int TRAPSTS = 3; + int HW_ID = 4; + int GPR_ALLOC = 5; + int LDS_ALLOC = 6; + int IB_STS = 7; + int MEM_BASES = 15; + int TBA_LO = 16; + int TBA_HI = 17; + int TMA_LO = 18; + int TMA_HI = 19; + int FLAT_SCR_LO = 20; + int FLAT_SCR_HI = 21; + int XNACK_MASK = 22; + int POPS_PACKER = 25; +} + +class getHwRegImm { + int ret = !or(Reg, + !or(!shl(Offset, 6), + !shl(!add(Size, -1), 11))); +} + //===----------------------------------------------------------------------===// // // SI Instruction multiclass helpers. 
@@ -1045,18 +1271,26 @@ multiclass EXP_m { def _si : EXP_Helper, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>, EXPe { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; + let AssemblerPredicates = [isGFX6GFX7]; + let DecoderNamespace = "GFX6GFX7"; let DisableDecoder = DisableSIDecoder; } def _vi : EXP_Helper, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>, EXPe_vi { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; + let AssemblerPredicates = [isGFX8GFX9]; + let DecoderNamespace = "GFX8"; let DisableDecoder = DisableVIDecoder; } + + def _gfx10 : EXP_Helper, + SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.GFX10>, + EXPe { + let AssemblerPredicates = [isGFX10Plus]; + let DecoderNamespace = "GFX10"; + let DisableDecoder = DisableSIDecoder; + } } } } @@ -1080,7 +1314,19 @@ class getVALUDstForVT { !if(!eq(VT.Size, 128), VOPDstOperand, !if(!eq(VT.Size, 64), VOPDstOperand, !if(!eq(VT.Size, 16), VOPDstOperand, - VOPDstOperand)))); // else VT == i1 + VOPDstS64orS32)))); // else VT == i1 +} + +// Returns true if VT is floating point. +class getIsFP { + bit ret = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, v2f16.Value), 1, + !if(!eq(VT.Value, v4f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, v2f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + !if(!eq(VT.Value, v2f64.Value), 1, + 0))))))); } // Returns the register class to use for the destination of VOP[12C] @@ -1094,11 +1340,7 @@ class getSDWADstForVT { // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, v2f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0)))); + bit isFP = getIsFP.ret; RegisterOperand ret = !if(isFP, @@ -1107,8 +1349,11 @@ class getVOPSrc0ForVT { !if(!eq(VT.Value, f16.Value), VSrc_f16, !if(!eq(VT.Value, v2f16.Value), - VCSrc_v2f16, - VSrc_f32 + VSrc_v2f16, + !if(!eq(VT.Value, v4f16.Value), + AVSrc_64, + VSrc_f32 + ) ) ) ), @@ -1117,7 +1362,7 @@ class getVOPSrc0ForVT { !if(!eq(VT.Value, i16.Value), VSrc_b16, !if(!eq(VT.Value, v2i16.Value), - VCSrc_v2b16, + VSrc_v2b16, VSrc_b32 ) ) @@ -1132,9 +1377,7 @@ class getVregSrcForVT { } class getSDWASrcForVT { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - 0)); + bit isFP = getIsFP.ret; RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); RegisterOperand ret = !if(isFP, retFlt, retInt); @@ -1143,33 +1386,32 @@ class getSDWASrcForVT { // Returns the register class to use for sources of VOP3 instructions for the // given VT. 
class getVOP3SrcForVT { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, v2f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0)))); + bit isFP = getIsFP.ret; RegisterOperand ret = !if(!eq(VT.Size, 128), VSrc_128, !if(!eq(VT.Size, 64), !if(isFP, - VCSrc_f64, - VCSrc_b64), + VSrc_f64, + VSrc_b64), !if(!eq(VT.Value, i1.Value), - SCSrc_i1, + SSrc_i1, !if(isFP, !if(!eq(VT.Value, f16.Value), - VCSrc_f16, + VSrc_f16, !if(!eq(VT.Value, v2f16.Value), - VCSrc_v2f16, - VCSrc_f32 + VSrc_v2f16, + !if(!eq(VT.Value, v4f16.Value), + AVSrc_64, + VSrc_f32 + ) ) ), !if(!eq(VT.Value, i16.Value), - VCSrc_b16, + VSrc_b16, !if(!eq(VT.Value, v2i16.Value), - VCSrc_v2b16, - VCSrc_b32 + VSrc_v2b16, + VSrc_b32 ) ) ) @@ -1190,11 +1432,8 @@ class isModifierType { } // Return type of input modifiers operand for specified input operand -class getSrcMod { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0))); +class getSrcMod { + bit isFP = getIsFP.ret; bit isPacked = isPackedType.ret; Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), @@ -1203,7 +1442,7 @@ class getSrcMod { FP16InputMods, FP32InputMods ), - Int32InputMods) + !if(EnableF32SrcMods, FP32InputMods, Int32InputMods)) ); } @@ -1213,10 +1452,7 @@ class getOpSelMod { // Return type of input modifiers operand specified input operand for DPP class getSrcModExt { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0))); + bit isFP = getIsFP.ret; Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } @@ -1238,7 +1474,7 @@ class getIns32 { // Returns the input arguments for VOP3 instructions for the given SrcVT. 
class getIns64 { dag ret = @@ -1276,16 +1512,33 @@ class getIns64 { + dag ret = !con(getInsDPP.ret, + (ins FI:$fi)); +} + +class getInsDPP8 { + dag ret = !if (!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins dpp8:$dpp8, FI:$fi), + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1_DPP with modifiers + (ins DstRC:$old, Src0Mod:$src0_modifiers, + Src0RC:$src0, dpp8:$dpp8, FI:$fi) + /* else */, + // VOP1_DPP without modifiers + (ins DstRC:$old, Src0RC:$src0, dpp8:$dpp8, FI:$fi) + /* endif */) + /* NumSrcArgs == 2 */, + !if (!eq(HasModifiers, 1), + // VOP2_DPP with modifiers + (ins DstRC:$old, + Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + dpp8:$dpp8, FI:$fi) + /* else */, + // VOP2_DPP without modifiers + (ins DstRC:$old, + Src0RC:$src0, Src1RC:$src1, dpp8:$dpp8, FI:$fi) + /* endif */))); +} // Ins for SDWA @@ -1556,6 +1845,26 @@ class getAsmDPP { + string ret = getAsmDPP.ret#"$fi"; +} + +class getAsmDPP8 { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", + "$vdst"), + ""); // use $sdst for VOPC + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string args = !if(!eq(HasModifiers, 0), + getAsm32<0, NumSrcArgs, DstVT>.ret, + ", "#src0#src1); + string ret = dst#args#"$dpp8$fi"; +} + class getAsmSDWA { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), @@ -1650,9 +1959,12 @@ def PatGenMode { int Pattern = 1; } -class VOPProfile _ArgVT> { +class VOPProfile _ArgVT, bit _EnableF32SrcMods = 0, + bit _EnableClamp = 0> { field list ArgVT = _ArgVT; + field bit EnableF32SrcMods = _EnableF32SrcMods; + field bit EnableClamp = _EnableClamp; field ValueType DstVT = ArgVT[0]; field ValueType Src0VT = ArgVT[1]; @@ -1670,9 +1982,9 @@ class VOPProfile _ArgVT> { field RegisterClass Src1DPP = getVregSrcForVT.ret; field RegisterOperand Src0SDWA = getSDWASrcForVT.ret; field RegisterOperand Src1SDWA = getSDWASrcForVT.ret; - field Operand Src0Mod = getSrcMod.ret; - field Operand Src1Mod = getSrcMod.ret; - field Operand Src2Mod = getSrcMod.ret; + field Operand Src0Mod = getSrcMod.ret; + field Operand Src1Mod = getSrcMod.ret; + field Operand Src2Mod = getSrcMod.ret; field Operand Src0ModDPP = getSrcModExt.ret; field Operand Src1ModDPP = getSrcModExt.ret; field Operand Src0ModSDWA = getSrcModSDWA.ret; @@ -1688,12 +2000,16 @@ class VOPProfile _ArgVT> { field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1); // TODO: Modifiers logic is somewhat adhoc here, to be refined later - field bit HasModifiers = isModifierType.ret; + // HasModifiers affects the normal and DPP encodings. We take note of EnableF32SrcMods, which + // enables modifiers for i32 type. + field bit HasModifiers = BitOr.ret, EnableF32SrcMods>.ret; + // HasSrc*FloatMods affects the SDWA encoding. We ignore EnableF32SrcMods. field bit HasSrc0FloatMods = isFloatType.ret; field bit HasSrc1FloatMods = isFloatType.ret; field bit HasSrc2FloatMods = isFloatType.ret; + // HasSrc*IntMods affects the SDWA encoding. We ignore EnableF32SrcMods. 
field bit HasSrc0IntMods = isIntType.ret; field bit HasSrc1IntMods = isIntType.ret; field bit HasSrc2IntMods = isIntType.ret; @@ -1702,7 +2018,7 @@ class VOPProfile _ArgVT> { field bit HasSrc1Mods = !if(HasModifiers, BitOr.ret, 0); field bit HasSrc2Mods = !if(HasModifiers, BitOr.ret, 0); - field bit HasClamp = HasModifiers; + field bit HasClamp = BitOr.ret, EnableClamp>.ret; field bit HasSDWAClamp = EmitDst; field bit HasFPClamp = BitAnd.ret, HasClamp>.ret; field bit HasIntClamp = !if(isFloatType.ret, 0, HasClamp); @@ -1721,6 +2037,8 @@ class VOPProfile _ArgVT> { field bit HasExtSDWA9 = HasExt; field int NeedPatGen = PatGenMode.NoPattern; + field bit IsMAI = 0; + field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -1732,12 +2050,13 @@ class VOPProfile _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt.ret; + field dag OutsDPP8 = getOutsExt.ret; field dag OutsSDWA = getOutsSDWA.ret; field dag Ins32 = getIns32.ret; field dag Ins64 = getIns64.ret; + HasIntClamp, HasModifiers, HasSrc2Mods, + HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; field dag InsVOP3P = getInsVOP3P.ret; @@ -1751,6 +2070,10 @@ class VOPProfile _ArgVT> { getInsDPP.ret, (ins)); + field dag InsDPP16 = getInsDPP16.ret; + field dag InsDPP8 = getInsDPP8.ret; field dag InsSDWA = getInsSDWA.ret; @@ -1766,8 +2089,12 @@ class VOPProfile _ArgVT> { HasSrc2FloatMods>.ret; field string AsmDPP = !if(HasExtDPP, getAsmDPP.ret, ""); + field string AsmDPP16 = getAsmDPP16.ret; + field string AsmDPP8 = getAsmDPP8.ret; field string AsmSDWA = getAsmSDWA.ret; field string AsmSDWA9 = getAsmSDWA9.ret; + + field string TieRegDPP = "$old"; } class VOP_NO_EXT : VOPProfile { @@ -1828,6 +2155,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], 0, /*EnableClamp=*/1>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; @@ -1848,6 +2176,19 @@ def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>; def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>; def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>; +def VOP_V4F32_F32_F32_V4F32 : VOPProfile <[v4f32, f32, f32, v4f32]>; +def VOP_V16F32_F32_F32_V16F32 : VOPProfile <[v16f32, f32, f32, v16f32]>; +def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32f32, f32, f32, v32f32]>; +def VOP_V4F32_V4F16_V4F16_V4F32 : VOPProfile <[v4f32, v4f16, v4f16, v4f32]>; +def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>; +def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>; +def VOP_V4F32_V2I16_V2I16_V4F32 : VOPProfile <[v4f32, v2i16, v2i16, v4f32]>; +def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>; +def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>; +def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>; +def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>; +def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>; + class 
Commutable_REV { string RevOp = revOp; bit IsOrig = isOrig; @@ -1871,13 +2212,12 @@ class VINTRP_Pseudo pattern> : let isCodeGenOnly = 1; } +// FIXME-GFX10: WIP. class VINTRP_Real_si op, string opName, dag outs, dag ins, - string asm> : + string asm, int encodingFamily> : VINTRPCommon , VINTRPe , - SIMCInstr { - let AssemblerPredicate = SIAssemblerPredicate; - let DecoderNamespace = "SICI"; + SIMCInstr { let DisableDecoder = DisableSIDecoder; } @@ -1887,19 +2227,25 @@ class VINTRP_Real_vi op, string opName, dag outs, dag ins, VINTRPe_vi , SIMCInstr { let AssemblerPredicate = VIAssemblerPredicate; - let DecoderNamespace = "VI"; + let DecoderNamespace = "GFX8"; let DisableDecoder = DisableVIDecoder; } +// FIXME-GFX10: WIP. multiclass VINTRP_m op, dag outs, dag ins, string asm, list pattern = []> { def "" : VINTRP_Pseudo ; - def _si : VINTRP_Real_si ; + let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + def _si : VINTRP_Real_si ; + } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" def _vi : VINTRP_Real_vi ; -} + let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + def _gfx10 : VINTRP_Real_si; + } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} //===----------------------------------------------------------------------===// // Vector instruction mappings //===----------------------------------------------------------------------===// @@ -1981,7 +2327,9 @@ def getMCOpcodeGen : InstrMapping { // does not actually change the encoding, and thus may be // removed later. [!cast(SIEncodingFamily.GFX80)], - [!cast(SIEncodingFamily.GFX9)]]; + [!cast(SIEncodingFamily.GFX9)], + [!cast(SIEncodingFamily.GFX10)], + [!cast(SIEncodingFamily.SDWA10)]]; } // Get equivalent SOPK instruction. @@ -2044,6 +2392,24 @@ def getGlobalSaddrOp : InstrMapping { let ValueCols = [["1"]]; } +// Maps a v_cmpx opcode with sdst to opcode without sdst. +def getVCMPXNoSDstOp : InstrMapping { + let FilterClass = "VCMPXNoSDstTable"; + let RowFields = ["NoSDstOp"]; + let ColFields = ["HasSDst"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +// Maps a SOPP to a SOPP with S_NOP +def getSOPPWithRelaxation : InstrMapping { + let FilterClass = "Base_SOPP"; + let RowFields = ["AsmString"]; + let ColFields = ["Size"]; + let KeyCol = ["4"]; + let ValueCols = [["8"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index b6b00c2e4257..70f20bb69370 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1,9 +1,8 @@ //===-- SIInstructions.td - SI Instruction Defintions ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This file was originally auto-generated from a GPU register header file and @@ -12,7 +11,7 @@ //===----------------------------------------------------------------------===// class GCNPat : Pat, GCNPredicateControl { - let SubtargetPredicate = isGCN; + } include "SOPInstructions.td" @@ -122,7 +121,14 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] -def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> { +def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { + let Defs = [EXEC]; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + +def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { let hasSideEffects = 0; let mayLoad = 0; let mayStore = 0; @@ -155,13 +161,12 @@ def S_SUB_U64_PSEUDO : SPseudoInstSI < >; def S_ADD_U64_CO_PSEUDO : SPseudoInstSI < - (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) + (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) >; def S_SUB_U64_CO_PSEUDO : SPseudoInstSI < - (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) + (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) >; - } // End usesCustomInserter = 1, Defs = [SCC] let usesCustomInserter = 1 in { @@ -169,23 +174,30 @@ def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; } // End let usesCustomInserter = 1, SALU = 1 -def S_MOV_B64_term : SPseudoInstSI<(outs SReg_64:$dst), - (ins SSrc_b64:$src0)> { - let isAsCheapAsAMove = 1; +// Wrap an instruction by duplicating it, except for setting isTerminator. 
+class WrapTerminatorInst : SPseudoInstSI< + base_inst.OutOperandList, + base_inst.InOperandList> { + let Uses = base_inst.Uses; + let Defs = base_inst.Defs; let isTerminator = 1; + let isAsCheapAsAMove = base_inst.isAsCheapAsAMove; + let hasSideEffects = base_inst.hasSideEffects; + let UseNamedOperandTable = base_inst.UseNamedOperandTable; + let CodeSize = base_inst.CodeSize; } -def S_XOR_B64_term : SPseudoInstSI<(outs SReg_64:$dst), - (ins SSrc_b64:$src0, SSrc_b64:$src1)> { - let isAsCheapAsAMove = 1; - let isTerminator = 1; - let Defs = [SCC]; +let WaveSizePredicate = isWave64 in { +def S_MOV_B64_term : WrapTerminatorInst; +def S_XOR_B64_term : WrapTerminatorInst; +def S_ANDN2_B64_term : WrapTerminatorInst; } -def S_ANDN2_B64_term : SPseudoInstSI<(outs SReg_64:$dst), - (ins SSrc_b64:$src0, SSrc_b64:$src1)> { - let isAsCheapAsAMove = 1; - let isTerminator = 1; +let WaveSizePredicate = isWave32 in { +def S_MOV_B32_term : WrapTerminatorInst; +def S_XOR_B32_term : WrapTerminatorInst; +def S_OR_B32_term : WrapTerminatorInst; +def S_ANDN2_B32_term : WrapTerminatorInst; } def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), @@ -195,7 +207,6 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), let hasSideEffects = 1; let mayLoad = 1; let mayStore = 1; - let isBarrier = 1; let isConvergent = 1; let FixedSize = 1; let Size = 0; @@ -222,30 +233,30 @@ let isTerminator = 1 in { let OtherPredicates = [EnableLateCFGStructurize] in { def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < (outs), - (ins SReg_64:$vcc, brtarget:$target), + (ins SReg_1:$vcc, brtarget:$target), [(brcond i1:$vcc, bb:$target)]> { let Size = 12; } } def SI_IF: CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), - [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { + (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target), + [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { let Constraints = ""; let Size = 12; let hasSideEffects = 1; } def SI_ELSE : CFPseudoInstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { + (outs SReg_1:$dst), + (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { let Size = 12; let hasSideEffects = 1; } def SI_LOOP : CFPseudoInstSI < - (outs), (ins SReg_64:$saved, brtarget:$target), - [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> { + (outs), (ins SReg_1:$saved, brtarget:$target), + [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> { let Size = 8; let isBranch = 1; let hasSideEffects = 1; @@ -254,8 +265,7 @@ def SI_LOOP : CFPseudoInstSI < } // End isTerminator = 1 def SI_END_CF : CFPseudoInstSI < - (outs), (ins SReg_64:$saved), - [(int_amdgcn_end_cf i64:$saved)], 1, 1> { + (outs), (ins SReg_1:$saved), [], 1, 1> { let Size = 4; let isAsCheapAsAMove = 1; let isReMaterializable = 1; @@ -265,8 +275,7 @@ def SI_END_CF : CFPseudoInstSI < } def SI_IF_BREAK : CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), - [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> { + (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> { let Size = 4; let isAsCheapAsAMove = 1; let isReMaterializable = 1; @@ -292,7 +301,7 @@ multiclass PseudoInstKill { } } -defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>; +defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>; defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; let Defs = [EXEC,VCC] in @@ -311,7 +320,7 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { } def 
SI_PS_LIVE : PseudoInstSI < - (outs SReg_64:$dst), (ins), + (outs SReg_1:$dst), (ins), [(set i1:$dst, (int_amdgcn_ps_live))]> { let SALU = 1; } @@ -340,6 +349,15 @@ def SI_INIT_EXEC : SPseudoInstSI < let Defs = [EXEC]; let usesCustomInserter = 1; let isAsCheapAsAMove = 1; + let WaveSizePredicate = isWave64; +} + +def SI_INIT_EXEC_LO : SPseudoInstSI < + (outs), (ins i32imm:$src), []> { + let Defs = [EXEC_LO]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; + let WaveSizePredicate = isWave32; } def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < @@ -374,11 +392,14 @@ def SI_RETURN : SPseudoInstSI < // This version is only needed so we can fill in the output regiter in // the custom inserter. def SI_CALL_ISEL : SPseudoInstSI < - (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> { + (outs), (ins SSrc_b64:$src0, unknown:$callee), + [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> { let Size = 4; let isCall = 1; let SchedRW = [WriteBranch]; let usesCustomInserter = 1; + // TODO: Should really base this on the call target + let isConvergent = 1; } // Wrapper around s_swappc_b64 with extra $callee parameter to track @@ -389,23 +410,14 @@ def SI_CALL : SPseudoInstSI < let isCall = 1; let UseNamedOperandTable = 1; let SchedRW = [WriteBranch]; + // TODO: Should really base this on the call target + let isConvergent = 1; } // Tail call handling pseudo -def SI_TCRETURN_ISEL : SPseudoInstSI<(outs), - (ins SSrc_b64:$src0, i32imm:$fpdiff), - [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> { - let isCall = 1; - let isTerminator = 1; - let isReturn = 1; - let isBarrier = 1; - let SchedRW = [WriteBranch]; - let usesCustomInserter = 1; -} - -def SI_TCRETURN : SPseudoInstSI < - (outs), - (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> { +def SI_TCRETURN : SPseudoInstSI <(outs), + (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff), + [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { let Size = 4; let isCall = 1; let isTerminator = 1; @@ -413,6 +425,8 @@ def SI_TCRETURN : SPseudoInstSI < let isBarrier = 1; let UseNamedOperandTable = 1; let SchedRW = [WriteBranch]; + // TODO: Should really base this on the call target + let isConvergent = 1; } @@ -424,6 +438,8 @@ def ADJCALLSTACKUP : SPseudoInstSI< let FixedSize = 1; let hasSideEffects = 1; let usesCustomInserter = 1; + let SchedRW = [WriteSALU]; + let Defs = [SCC]; } def ADJCALLSTACKDOWN : SPseudoInstSI< @@ -433,6 +449,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI< let Size = 8; // Worst case. (s_add_u32 + constant) let hasSideEffects = 1; let usesCustomInserter = 1; + let SchedRW = [WriteSALU]; + let Defs = [SCC]; } let Defs = [M0, EXEC, SCC], @@ -490,9 +508,12 @@ multiclass SI_SPILL_SGPR { // SI_SPILL_32_* instructions. defm SI_SPILL_S32 : SI_SPILL_SGPR ; defm SI_SPILL_S64 : SI_SPILL_SGPR ; +defm SI_SPILL_S96 : SI_SPILL_SGPR ; defm SI_SPILL_S128 : SI_SPILL_SGPR ; +defm SI_SPILL_S160 : SI_SPILL_SGPR ; defm SI_SPILL_S256 : SI_SPILL_SGPR ; defm SI_SPILL_S512 : SI_SPILL_SGPR ; +defm SI_SPILL_S1024 : SI_SPILL_SGPR ; multiclass SI_SPILL_VGPR { let UseNamedOperandTable = 1, VGPRSpill = 1, @@ -504,7 +525,9 @@ multiclass SI_SPILL_VGPR { let mayStore = 1; let mayLoad = 0; // (2 * 4) + (8 * num_subregs) bytes maximum - let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + // Size field is unsigned char and cannot fit more. 
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252); } def _RESTORE : VPseudoInstSI < @@ -515,7 +538,9 @@ multiclass SI_SPILL_VGPR { let mayLoad = 1; // (2 * 4) + (8 * num_subregs) bytes maximum - let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + // Size field is unsigned char and cannot fit more. + let Size = !if(!le(MaxSize, 256), MaxSize, 252); } } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] } @@ -524,21 +549,74 @@ defm SI_SPILL_V32 : SI_SPILL_VGPR ; defm SI_SPILL_V64 : SI_SPILL_VGPR ; defm SI_SPILL_V96 : SI_SPILL_VGPR ; defm SI_SPILL_V128 : SI_SPILL_VGPR ; +defm SI_SPILL_V160 : SI_SPILL_VGPR ; defm SI_SPILL_V256 : SI_SPILL_VGPR ; defm SI_SPILL_V512 : SI_SPILL_VGPR ; +defm SI_SPILL_V1024 : SI_SPILL_VGPR ; + +multiclass SI_SPILL_AGPR { + let UseNamedOperandTable = 1, VGPRSpill = 1, + Constraints = "@earlyclobber $tmp", + SchedRW = [WriteVMEM] in { + def _SAVE : VPseudoInstSI < + (outs VGPR_32:$tmp), + (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc, + SReg_32:$soffset, i32imm:$offset)> { + let mayStore = 1; + let mayLoad = 0; + // (2 * 4) + (16 * num_subregs) bytes maximum + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8); + // Size field is unsigned char and cannot fit more. + let Size = !if(!le(MaxSize, 256), MaxSize, 252); + } + + def _RESTORE : VPseudoInstSI < + (outs vgpr_class:$vdata, VGPR_32:$tmp), + (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset, + i32imm:$offset)> { + let mayStore = 0; + let mayLoad = 1; + + // (2 * 4) + (16 * num_subregs) bytes maximum + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8); + // Size field is unsigned char and cannot fit more. + let Size = !if(!le(MaxSize, 256), MaxSize, 252); + } + } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] +} + +defm SI_SPILL_A32 : SI_SPILL_AGPR ; +defm SI_SPILL_A64 : SI_SPILL_AGPR ; +defm SI_SPILL_A128 : SI_SPILL_AGPR ; +defm SI_SPILL_A512 : SI_SPILL_AGPR ; +defm SI_SPILL_A1024 : SI_SPILL_AGPR ; def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), [(set SReg_64:$dst, - (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> { + (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> { let Defs = [SCC]; } +def : GCNPat < + (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0), + (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) +>; + def : GCNPat < (AMDGPUinit_exec i64:$src), (SI_INIT_EXEC (as_i64imm $src)) ->; +> { + let WaveSizePredicate = isWave64; +} + +def : GCNPat < + (AMDGPUinit_exec i64:$src), + (SI_INIT_EXEC_LO (as_i32imm $src)) +> { + let WaveSizePredicate = isWave32; +} def : GCNPat < (AMDGPUinit_exec_from_input i32:$input, i32:$shift), @@ -551,7 +629,7 @@ def : GCNPat< >; def : GCNPat< - (AMDGPUelse i64:$src, bb:$target), + (AMDGPUelse i1:$src, bb:$target), (SI_ELSE $src, $target, 0) >; @@ -584,7 +662,12 @@ def : Pat < // TODO: we could add more variants for other types of conditionals def : Pat < - (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)), + (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))), + (COPY $src) // Return the SGPRs representing i1 src +>; + +def : Pat < + (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))), (COPY $src) // Return the SGPRs representing i1 src >; @@ -592,7 +675,7 @@ def : Pat < // VOP1 Patterns //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] in { +let OtherPredicates = [UnsafeFPMath] 
in { //def : RcpPat; //defm : RsqPat; @@ -615,7 +698,7 @@ def : GCNPat < (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) >; -} // End SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] +} // End OtherPredicates = [UnsafeFPMath] // f16_to_fp patterns @@ -706,17 +789,18 @@ def : FMADModsPat { let SubtargetPredicate = Has16BitInsts; } -multiclass SelectPat { +multiclass SelectPat { def : GCNPat < - (vt (select i1:$src0, vt:$src1, vt:$src2)), - (inst $src2, $src1, $src0) + (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods), + (VOP3Mods_f32 vt:$src2, i32:$src2_mods))), + (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0) >; } -defm : SelectPat ; -defm : SelectPat ; -defm : SelectPat ; -defm : SelectPat ; +defm : SelectPat ; +defm : SelectPat ; +defm : SelectPat ; +defm : SelectPat ; let AddedComplexity = 1 in { def : GCNPat < @@ -749,6 +833,22 @@ foreach Index = 0-2 in { >; } +foreach Index = 0-2 in { + def Extract_Element_v3i32_#Index : Extract_Element < + i32, v3i32, Index, !cast(sub#Index) + >; + def Insert_Element_v3i32_#Index : Insert_Element < + i32, v3i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v3f32_#Index : Extract_Element < + f32, v3f32, Index, !cast(sub#Index) + >; + def Insert_Element_v3f32_#Index : Insert_Element < + f32, v3f32, Index, !cast(sub#Index) + >; +} + foreach Index = 0-3 in { def Extract_Element_v4i32_#Index : Extract_Element < i32, v4i32, Index, !cast(sub#Index) @@ -765,6 +865,22 @@ foreach Index = 0-3 in { >; } +foreach Index = 0-4 in { + def Extract_Element_v5i32_#Index : Extract_Element < + i32, v5i32, Index, !cast(sub#Index) + >; + def Insert_Element_v5i32_#Index : Insert_Element < + i32, v5i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v5f32_#Index : Extract_Element < + f32, v5f32, Index, !cast(sub#Index) + >; + def Insert_Element_v5f32_#Index : Insert_Element < + f32, v5f32, Index, !cast(sub#Index) + >; +} + foreach Index = 0-7 in { def Extract_Element_v8i32_#Index : Extract_Element < i32, v8i32, Index, !cast(sub#Index) @@ -818,7 +934,23 @@ def : Pat < (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) >; -let SubtargetPredicate = isGCN in { +foreach Index = 0-31 in { + def Extract_Element_v32i32_#Index : Extract_Element < + i32, v32i32, Index, !cast(sub#Index) + >; + + def Insert_Element_v32i32_#Index : Insert_Element < + i32, v32i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v32f32_#Index : Extract_Element < + f32, v32f32, Index, !cast(sub#Index) + >; + + def Insert_Element_v32f32_#Index : Insert_Element < + f32, v32f32, Index, !cast(sub#Index) + >; +} // FIXME: Why do only some of these type combinations for SReg and // VReg? 
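Several threads run through this stretch. getIsFP<VT> replaces the f16/f32/f64 !if chains that were duplicated across getVOPSrc0ForVT, getSDWASrcForVT, getVOP3SrcForVT, getSrcMod and getSrcModExt, and extends them to v4f16, v2f32 and v2f64; for the types the VOP profiles use, it answers the same question MVT::isFloatingPoint does in C++:

    #include "llvm/Support/MachineValueType.h"

    static bool isFPOperandType(llvm::MVT VT) { return VT.isFloatingPoint(); }
    // isFPOperandType(llvm::MVT::v2f16) and isFPOperandType(llvm::MVT::f64)
    // are true; isFPOperandType(llvm::MVT::v2i16) is false.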
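The new getVCMPXNoSDstOp and getSOPPWithRelaxation InstrMappings become generated lookup functions, like getGlobalSaddrOp before them. A hedged usage sketch — the int(uint16_t) signature returning -1 for "no mapping" is assumed from how TableGen emits the existing AMDGPU mappings:

    #include "SIInstrInfo.h"
    #include "llvm/CodeGen/MachineInstr.h"

    // Rewrite a v_cmpx that writes an sdst into the form without one.
    static bool dropVCMPXSDst(llvm::MachineInstr &MI,
                              const llvm::SIInstrInfo &TII) {
      int NewOpc = llvm::AMDGPU::getVCMPXNoSDstOp(MI.getOpcode());
      if (NewOpc == -1)
        return false;
      MI.setDesc(TII.get(NewOpc));
      return true;
    }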
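WrapTerminatorInst stamps out terminator twins of existing scalar instructions instead of hand-copying their properties, and does so in both wave-64 and wave-32 flavors under WaveSizePredicate. What this buys the lowering passes (see the SILowerControlFlow changes later in this patch) is opcode choice by wave size rather than hard-coded B64 forms; a minimal sketch:

    #include "AMDGPUSubtarget.h"

    static unsigned getMovTermOpc(const llvm::GCNSubtarget &ST) {
      return ST.isWave32() ? llvm::AMDGPU::S_MOV_B32_term
                           : llvm::AMDGPU::S_MOV_B64_term;
    }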
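The spill pseudos' worst-case Size gains a saturation step (the new AGPR variant is the same shape with 16 bytes per subregister). Spelling out the !add/!shl/!srl/!if arithmetic:

    // Worst-case byte size of a VGPR spill pseudo: one 8-byte setup plus
    // 8 bytes per 32-bit subregister, saturated at 252 because, per the
    // in-tree comment, the Size field is an unsigned char.
    constexpr unsigned vgprSpillPseudoSize(unsigned RegClassBits) {
      unsigned MaxSize = ((RegClassBits >> 5) << 3) + 8;
      return MaxSize <= 256 ? MaxSize : 252;
    }

    static_assert(vgprSpillPseudoSize(512) == 136, "v512 spill");
    static_assert(vgprSpillPseudoSize(1024) == 252, "v1024 spill saturates");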
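And SelectPat stops taking a per-type instruction and folds source modifiers: all four select types now lower to V_CNDMASK_B32_e64 through the new f32-only VOP3Mods_f32 ComplexPattern. Note the operand order: the false value ($src2) lands in the cndmask's src0 slot, because v_cndmask returns src1 for lanes where the condition is set. A hedged sketch of the five-source-operand e64 form being selected (the helper and its context are invented for illustration):

    #include "SIInstrInfo.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"

    static void emitSelectAsCndmask(llvm::MachineBasicBlock &MBB,
                                    llvm::MachineBasicBlock::iterator I,
                                    const llvm::DebugLoc &DL,
                                    const llvm::SIInstrInfo *TII,
                                    unsigned DstReg, unsigned TrueReg,
                                    unsigned FalseReg, unsigned CondReg) {
      llvm::BuildMI(MBB, I, DL, TII->get(llvm::AMDGPU::V_CNDMASK_B32_e64),
                    DstReg)
          .addImm(0)        // src0_modifiers
          .addReg(FalseReg) // src0: result when the condition lane is clear
          .addImm(0)        // src1_modifiers
          .addReg(TrueReg)  // src1: result when the condition lane is set
          .addReg(CondReg); // src2: the i1 lane mask
    }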
@@ -882,6 +1014,10 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; +// 96-bit bitcast +def : BitConvert ; +def : BitConvert ; + // 128-bit bitcast def : BitConvert ; def : BitConvert ; @@ -892,6 +1028,10 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; +// 160-bit bitcast +def : BitConvert ; +def : BitConvert ; + // 256-bit bitcast def : BitConvert ; def : BitConvert ; @@ -902,7 +1042,9 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; -} // End SubtargetPredicate = isGCN +// 1024-bit bitcast +def : BitConvert ; +def : BitConvert ; /********** =================== **********/ /********** Src & Dst modifiers **********/ @@ -1070,6 +1212,16 @@ def : GCNPat < (S_MOV_B32 imm:$imm) >; +def : GCNPat < + (VGPRImm<(SIlds tglobaladdr:$ga)>), + (V_MOV_B32_e32 $ga) +>; + +def : GCNPat < + (SIlds tglobaladdr:$ga), + (S_MOV_B32 $ga) +>; + // FIXME: Workaround for ordering issue with peephole optimizer where // a register class copy interferes with immediate folding. Should // use s_mov_b32, which can be shrunk to s_movk_i32 @@ -1104,7 +1256,16 @@ def : GCNPat < def : GCNPat < (i1 imm:$imm), (S_MOV_B64 (i64 (as_i64imm $imm))) ->; +> { + let WaveSizePredicate = isWave64; +} + +def : GCNPat < + (i1 imm:$imm), + (S_MOV_B32 (i32 (as_i32imm $imm))) +> { + let WaveSizePredicate = isWave32; +} def : GCNPat < (f64 InlineFPImm:$imm), @@ -1115,18 +1276,18 @@ def : GCNPat < /********** Intrinsic Patterns **********/ /********** ================== **********/ -let SubtargetPredicate = isGCN in { def : POW_Common ; -} def : GCNPat < (i32 (sext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0) >; class Ext32Pat : GCNPat < (i32 (ext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 1), $src0) >; def : Ext32Pat ; @@ -1144,8 +1305,6 @@ def : GCNPat < // VOP3 Patterns //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGCN in { - def : IMad24Pat; def : UMad24Pat; @@ -1153,8 +1312,6 @@ def : UMad24Pat; defm : BFIPatterns ; def : ROTRPattern ; -} - def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; @@ -1261,8 +1418,9 @@ def : GCNPat < class ZExt_i64_i1_Pat : GCNPat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, - (S_MOV_B32 (i32 0)), sub1) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 1), $src), + sub0, (S_MOV_B32 (i32 0)), sub1) >; @@ -1280,8 +1438,10 @@ def : GCNPat < def : GCNPat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0, - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1) >; class FPToI1Pat : GCNPat < @@ -1296,10 +1456,12 @@ def : FPToI1Pat; // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. Vector -// comparisons still write to a pair of SGPRs, so treat these as -// 64-bit comparisons. 
When legalizing SGPR copies, instructions -// resulting in the copies from SCC to these instructions will be -// moved to the VALU. +// comparisons may write to a pair of SGPRs or a single SGPR, so treat +// these as 32 or 64-bit comparisons. When legalizing SGPR copies, +// instructions resulting in the copies from SCC to these instructions +// will be moved to the VALU. + +let WaveSizePredicate = isWave64 in { def : GCNPat < (i1 (and i1:$src0, i1:$src1)), (S_AND_B64 $src0, $src1) @@ -1336,35 +1498,89 @@ def : GCNPat < (S_NOT_B64 $src0) >; } +} // end isWave64 + +let WaveSizePredicate = isWave32 in { +def : GCNPat < + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B32 $src0, $src1) +>; + +def : GCNPat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B32 $src0, $src1) +>; + +def : GCNPat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B32 $src0, $src1) +>; + +def : GCNPat < + (i1 (add i1:$src0, i1:$src1)), + (S_XOR_B32 $src0, $src1) +>; + +def : GCNPat < + (i1 (sub i1:$src0, i1:$src1)), + (S_XOR_B32 $src0, $src1) +>; + +let AddedComplexity = 1 in { +def : GCNPat < + (i1 (add i1:$src0, (i1 -1))), + (S_NOT_B32 $src0) +>; + +def : GCNPat < + (i1 (sub i1:$src0, (i1 -1))), + (S_NOT_B32 $src0) +>; +} +} // end isWave32 def : GCNPat < (f16 (sint_to_fp i1:$src)), - (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)) + (V_CVT_F16_F32_e32 ( + V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), + $src)) >; def : GCNPat < (f16 (uint_to_fp i1:$src)), - (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)) + (V_CVT_F16_F32_e32 ( + V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), + $src)) >; def : GCNPat < (f32 (sint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), + $src) >; def : GCNPat < (f32 (uint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), + $src) >; def : GCNPat < (f64 (sint_to_fp i1:$src)), - (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), + $src)) >; def : GCNPat < (f64 (uint_to_fp i1:$src)), - (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) + (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 1), + $src)) >; //===----------------------------------------------------------------------===// @@ -1417,7 +1633,7 @@ def : GCNPat< def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), - (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) + (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) >; } @@ -1478,6 +1694,14 @@ def : GCNPat < >; } // End OtherPredicates = [HasDLInsts] +let SubtargetPredicate = isGFX10Plus in +def : GCNPat < + (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)), + (f16 (VOP3NoMods f32:$src2))), + (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + SRCMODS.NONE, $src2, $clamp, $omod) +>; // Allow integer inputs class ExpPattern : GCNPat< @@ -1568,7 +1792,7 @@ def : GCNPat < // Fract Patterns 
//===----------------------------------------------------------------------===// -let SubtargetPredicate = isSI in { +let SubtargetPredicate = isGFX6 in { // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is // used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient @@ -1595,7 +1819,7 @@ def : GCNPat < DSTCLAMP.NONE, DSTOMOD.NONE) >; -} // End SubtargetPredicates = isSI +} // End SubtargetPredicates = isGFX6 //============================================================================// // Miscellaneous Optimization Patterns @@ -1609,6 +1833,13 @@ def : GCNPat< (S_SUB_I32 $src0, NegSubInlineConst32:$src1) >; +// Avoid pointlessly materializing a constant in VGPR. +// FIXME: Should also do this for readlane, but tablegen crashes on +// the ignored src1. +def : GCNPat< + (int_amdgcn_readfirstlane (i32 imm:$src)), + (S_MOV_B32 $src) +>; multiclass BFMPatterns { def : GCNPat < @@ -1622,8 +1853,6 @@ multiclass BFMPatterns { >; } -let SubtargetPredicate = isGCN in { - defm : BFMPatterns ; // FIXME: defm : BFMPatterns ; @@ -1633,8 +1862,6 @@ defm : SHA256MaPattern ; defm : IntMed3Pat; defm : IntMed3Pat; -} - // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) class FPMed3Pat; -let OtherPredicates = [isGFX9] in { +let OtherPredicates = [isGFX9Plus] in { def : FP16Med3Pat; defm : Int16Med3Pat; defm : Int16Med3Pat; -} // End Predicates = [isGFX9] +} // End Predicates = [isGFX9Plus] diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td deleted file mode 100644 index e51ff4b4bc50..000000000000 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ /dev/null @@ -1,19 +0,0 @@ -//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Backend internal SI Intrinsic Definitions. User code should not -// directly use these. -// -//===----------------------------------------------------------------------===// - - -let TargetPrefix = "SI", isTarget = 1 in { - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - -} // End TargetPrefix = "SI", isTarget = 1 diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index be291b127301..ae8b967893a2 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1,9 +1,8 @@ //===- SILoadStoreOptimizer.cpp -------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -132,6 +131,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool GLC1; bool SLC0; bool SLC1; + bool DLC0; + bool DLC1; bool UseST64; SmallVector InstsToMove; }; @@ -257,13 +258,11 @@ static void addDefsUsesToList(const MachineInstr &MI, static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, MachineBasicBlock::iterator B, - const SIInstrInfo *TII, AliasAnalysis *AA) { // RAW or WAR - cannot reorder // WAW - cannot reorder // RAR - safe to reorder - return !(A->mayStore() || B->mayStore()) || - TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); + return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); } // Add MI and its defs to the lists if MI reads one of the defs that are @@ -282,6 +281,7 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet &RegDefs, // registers are in SSA form. if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || + (Use.isDef() && RegDefs.count(Use.getReg())) || (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) && PhysRegUses.count(Use.getReg())))) { Insts.push_back(&MI); @@ -295,13 +295,13 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet &RegDefs, static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, ArrayRef InstsToMove, - const SIInstrInfo *TII, AliasAnalysis *AA) { + AliasAnalysis *AA) { assert(MemOp.mayLoadOrStore()); for (MachineInstr *InstToMove : InstsToMove) { if (!InstToMove->mayLoadOrStore()) continue; - if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA)) + if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) return false; } return true; @@ -326,7 +326,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { return (EltOffset0 + CI.Width0 == EltOffset1 || EltOffset1 + CI.Width1 == EltOffset0) && - CI.GLC0 == CI.GLC1 && + CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 && (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1); } @@ -567,8 +567,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { + (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. @@ -640,6 +640,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm(); CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm(); } + CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm(); + CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm(); } // Check both offsets fit in the reduced range. @@ -647,7 +649,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // move and make sure they are all safe to move down past the merged // instruction. 
if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) - if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) + if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) return true; } @@ -656,8 +658,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // it was safe to move I and also all the instruction in InstsToMove // down past this instruction. // check if we can move I across MBBI and if we can move all I's users - if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) + if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) break; } return false; @@ -726,7 +728,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(AddrReg->getReg(), 0, BaseSubReg) + .addImm(0); // clamp bit BaseSubReg = 0; } @@ -819,7 +822,8 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(AddrReg->getReg(), 0, BaseSubReg) + .addImm(0); // clamp bit BaseSubReg = 0; } @@ -858,6 +862,7 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) .addImm(MergedOffset) // offset .addImm(CI.GLC0) // glc + .addImm(CI.DLC0) // dlc .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); std::pair SubRegIdx = getSubRegIdxs(CI); @@ -910,6 +915,7 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { .addImm(CI.GLC0) // glc .addImm(CI.SLC0) // slc .addImm(0) // tfe + .addImm(CI.DLC0) // dlc .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); std::pair SubRegIdx = getSubRegIdxs(CI); @@ -1089,9 +1095,10 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); moveInstsAfter(MIB, CI.InstsToMove); @@ -1137,9 +1144,10 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, MachineOperand OffsetLo = createRegOrImm(static_cast(Addr.Offset), MI); MachineOperand OffsetHi = createRegOrImm(static_cast(Addr.Offset >> 32), MI); - unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned DeadCarryReg = - MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + unsigned CarryReg = MRI->createVirtualRegister(CarryRC); + unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC); unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -1147,7 +1155,8 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) .addReg(CarryReg, RegState::Define) .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) - .add(OffsetLo); + .add(OffsetLo) + .addImm(0); // clamp bit (void)LoHalf; LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); @@ -1156,7 +1165,8 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, .addReg(DeadCarryReg, RegState::Define | 
RegState::Dead) .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) .add(OffsetHi) - .addReg(CarryReg, RegState::Kill); + .addReg(CarryReg, RegState::Kill) + .addImm(0); // clamp bit (void)HiHalf; LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 1aa1feebbdae..78f409cd9555 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -1,9 +1,8 @@ //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -83,6 +82,16 @@ private: LiveIntervals *LIS = nullptr; MachineRegisterInfo *MRI = nullptr; + const TargetRegisterClass *BoolRC = nullptr; + unsigned AndOpc; + unsigned OrOpc; + unsigned XorOpc; + unsigned MovTermOpc; + unsigned Andn2TermOpc; + unsigned XorTermrOpc; + unsigned OrSaveExecOpc; + unsigned Exec; + void emitIf(MachineInstr &MI); void emitElse(MachineInstr &MI); void emitIfBreak(MachineInstr &MI); @@ -176,7 +185,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister && Cond.getSubReg() == AMDGPU::NoSubRegister); - unsigned SaveExecReg = SaveExec.getReg(); + Register SaveExecReg = SaveExec.getReg(); MachineOperand &ImpDefSCC = MI.getOperand(4); assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); @@ -188,26 +197,26 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // Add an implicit def of exec to discourage scheduling VALU after this which // will interfere with trying to form s_and_saveexec_b64 later. - unsigned CopyReg = SimpleIf ? SaveExecReg - : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register CopyReg = SimpleIf ? SaveExecReg + : MRI->createVirtualRegister(BoolRC); MachineInstr *CopyExec = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::EXEC, RegState::ImplicitDefine); + .addReg(Exec) + .addReg(Exec, RegState::ImplicitDefine); - unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned Tmp = MRI->createVirtualRegister(BoolRC); MachineInstr *And = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp) + BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) .addReg(CopyReg) - //.addReg(AMDGPU::EXEC) - .addReg(Cond.getReg()); + .add(Cond); + setImpSCCDefDead(*And, true); MachineInstr *Xor = nullptr; if (!SimpleIf) { Xor = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg) + BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg) .addReg(Tmp) .addReg(CopyReg); setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); @@ -216,7 +225,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // Use a copy that is a terminator to get correct spill code placement it with // fast regalloc. MachineInstr *SetExec = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC) + BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) .addReg(Tmp, RegState::Kill); // Insert a pseudo terminator to help keep the verifier happy. 
This will also @@ -240,7 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { LIS->InsertMachineInstrInMaps(*SetExec); LIS->InsertMachineInstrInMaps(*NewBr); - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); MI.eraseFromParent(); // FIXME: Is there a better way of adjusting the liveness? It shouldn't be @@ -257,7 +266,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister); bool ExecModified = MI.getOperand(3).getImm() != 0; @@ -266,17 +275,17 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { // We are running before TwoAddressInstructions, and si_else's operands are // tied. In order to correctly tie the registers, split this into a copy of // the src like it does. - unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register CopyReg = MRI->createVirtualRegister(BoolRC); MachineInstr *CopyExec = BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg) .add(MI.getOperand(1)); // Saved EXEC // This must be inserted before phis and any spill code inserted before the // else. - unsigned SaveReg = ExecModified ? - MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg; + Register SaveReg = ExecModified ? + MRI->createVirtualRegister(BoolRC) : DstReg; MachineInstr *OrSaveExec = - BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg) + BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg) .addReg(CopyReg); MachineBasicBlock *DestBB = MI.getOperand(2).getMBB(); @@ -285,8 +294,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { if (ExecModified) { MachineInstr *And = - BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg) + .addReg(Exec) .addReg(SaveReg); if (LIS) @@ -294,8 +303,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { } MachineInstr *Xor = - BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec) + .addReg(Exec) .addReg(DstReg); MachineInstr *Branch = @@ -324,7 +333,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { LIS->createAndComputeVirtRegInterval(SaveReg); // Let this be recomputed. - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); } void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { @@ -348,14 +357,14 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { // exit" mask. 
MachineInstr *And = nullptr, *Or = nullptr; if (!SkipAnding) { - And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst) - .addReg(AMDGPU::EXEC) + And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Dst) + .addReg(Exec) .add(MI.getOperand(1)); - Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) .addReg(Dst) .add(MI.getOperand(2)); } else - Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) .add(MI.getOperand(1)) .add(MI.getOperand(2)); @@ -373,8 +382,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *AndN2 = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec) + .addReg(Exec) .add(MI.getOperand(0)); MachineInstr *Branch = @@ -395,8 +404,8 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock::iterator InsPt = MBB.begin(); MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) + .addReg(Exec) .add(MI.getOperand(0)); if (LIS) @@ -428,13 +437,13 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, // does not really modify exec. for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) if (I->modifiesRegister(AMDGPU::EXEC, TRI) && - !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC)) + !(I->isCopy() && I->getOperand(0).getReg() != Exec)) return; for (const auto &SrcOp : Def->explicit_operands()) if (SrcOp.isReg() && SrcOp.isUse() && (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || - SrcOp.getReg() == AMDGPU::EXEC)) + SrcOp.getReg() == Exec)) Src.push_back(SrcOp); } @@ -472,6 +481,27 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { // This doesn't actually need LiveIntervals, but we can preserve them. LIS = getAnalysisIfAvailable(); MRI = &MF.getRegInfo(); + BoolRC = TRI->getBoolRC(); + + if (ST.isWave32()) { + AndOpc = AMDGPU::S_AND_B32; + OrOpc = AMDGPU::S_OR_B32; + XorOpc = AMDGPU::S_XOR_B32; + MovTermOpc = AMDGPU::S_MOV_B32_term; + Andn2TermOpc = AMDGPU::S_ANDN2_B32_term; + XorTermrOpc = AMDGPU::S_XOR_B32_term; + OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; + Exec = AMDGPU::EXEC_LO; + } else { + AndOpc = AMDGPU::S_AND_B64; + OrOpc = AMDGPU::S_OR_B64; + XorOpc = AMDGPU::S_XOR_B64; + MovTermOpc = AMDGPU::S_MOV_B64_term; + Andn2TermOpc = AMDGPU::S_ANDN2_B64_term; + XorTermrOpc = AMDGPU::S_XOR_B64_term; + OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; + Exec = AMDGPU::EXEC; + } MachineFunction::iterator NextBB; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -508,6 +538,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::S_AND_B64: case AMDGPU::S_OR_B64: + case AMDGPU::S_AND_B32: + case AMDGPU::S_OR_B32: // Cleanup bit manipulations on exec mask combineMasks(MI); Last = I; diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index eb038bb5d5fc..1c0f836f07e6 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -1,15 +1,14 @@ //===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
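The SI_IF_BREAK/SI_LOOP pair lowered above implements divergent loop exits with the same mask algebra: the break mask accumulates lanes that have left the loop, S_ANDN2_term removes them from EXEC, and the back-edge is taken while EXEC is nonzero. A standalone model, with illustrative names and wave64 masks:

#include <cstdint>
#include <cassert>

// SI_IF_BREAK: add the lanes whose break condition fired to the carried mask.
uint64_t ifBreak(uint64_t Exec, uint64_t BreakCond, uint64_t CarriedBreak) {
  return (Exec & BreakCond) | CarriedBreak;   // S_AND + S_OR
}

// SI_LOOP: drop finished lanes from EXEC, loop while any lane is still live.
bool loopTerm(uint64_t &Exec, uint64_t BreakMask) {
  Exec &= ~BreakMask;                         // S_ANDN2_term
  return Exec != 0;                           // conditional back-edge branch
}

int main() {
  uint64_t Exec = 0xFF, Carried = 0;
  Carried = ifBreak(Exec, /*BreakCond=*/0x0F, Carried);
  assert(loopTerm(Exec, Carried) && Exec == 0xF0); // 4 lanes keep looping
  Carried = ifBreak(Exec, /*BreakCond=*/0xF0, Carried);
  assert(!loopTerm(Exec, Carried));                // all lanes done
}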
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass lowers all occurrences of i1 values (with a vreg_1 register class) -// to lane masks (64-bit scalar registers). The pass assumes machine SSA form -// and a wave-level control flow graph. +// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA +// form and a wave-level control flow graph. // // Before this pass, values that are semantically i1 and are defined and used // within the same basic block are already represented as lane masks in scalar @@ -51,6 +50,7 @@ public: static char ID; private: + bool IsWave32 = false; MachineFunction *MF = nullptr; MachineDominatorTree *DT = nullptr; MachinePostDominatorTree *PDT = nullptr; @@ -58,6 +58,14 @@ private: const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; + unsigned ExecReg; + unsigned MovOp; + unsigned AndOp; + unsigned OrOp; + unsigned XorOp; + unsigned AndN2Op; + unsigned OrN2Op; + DenseSet ConstrainRegs; public: @@ -87,6 +95,11 @@ private: MachineBasicBlock::iterator getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; + bool isVreg1(unsigned Reg) const { + return TargetRegisterInfo::isVirtualRegister(Reg) && + MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass; + } + bool isLaneMaskReg(unsigned Reg) const { return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) && TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) == @@ -412,8 +425,10 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { } static unsigned createLaneMaskReg(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); MachineRegisterInfo &MRI = MF.getRegInfo(); - return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass + : &AMDGPU::SReg_64RegClass); } static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) { @@ -443,13 +458,32 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { ST = &MF->getSubtarget(); TII = ST->getInstrInfo(); + IsWave32 = ST->isWave32(); + + if (IsWave32) { + ExecReg = AMDGPU::EXEC_LO; + MovOp = AMDGPU::S_MOV_B32; + AndOp = AMDGPU::S_AND_B32; + OrOp = AMDGPU::S_OR_B32; + XorOp = AMDGPU::S_XOR_B32; + AndN2Op = AMDGPU::S_ANDN2_B32; + OrN2Op = AMDGPU::S_ORN2_B32; + } else { + ExecReg = AMDGPU::EXEC; + MovOp = AMDGPU::S_MOV_B64; + AndOp = AMDGPU::S_AND_B64; + OrOp = AMDGPU::S_OR_B64; + XorOp = AMDGPU::S_XOR_B64; + AndN2Op = AMDGPU::S_ANDN2_B64; + OrN2Op = AMDGPU::S_ORN2_B64; + } lowerCopiesFromI1(); lowerPhis(); lowerCopiesToI1(); for (unsigned Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass); + MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); ConstrainRegs.clear(); return true; @@ -465,13 +499,10 @@ void SILowerI1Copies::lowerCopiesFromI1() { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass) + if (!isVreg1(SrcReg)) continue; - if (isLaneMaskReg(DstReg) || - (TargetRegisterInfo::isVirtualRegister(DstReg) && - MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass)) + if (isLaneMaskReg(DstReg) || isVreg1(DstReg)) continue; // Copy into a 32-bit vector register. 
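The key idea of this pass, restated: a divergent i1 is one bit per lane, stored in a scalar register whose width matches the wave size, which is exactly why createLaneMaskReg below picks SReg_32 on wave32 and SReg_64 on wave64. A toy model of such a value (stand-in type, not LLVM code):

#include <cstdint>
#include <cassert>

// One bool per lane, packed into a scalar as wide as the wave.
struct LaneMask {
  uint64_t Bits;
  unsigned WaveSize; // 32 or 64

  bool get(unsigned Lane) const {
    assert(Lane < WaveSize);
    return (Bits >> Lane) & 1;
  }
  void set(unsigned Lane, bool V) {
    assert(Lane < WaveSize);
    Bits = V ? (Bits | (1ull << Lane)) : (Bits & ~(1ull << Lane));
  }
};

int main() {
  LaneMask M{0, 32};   // SReg_32 on wave32, SReg_64 on wave64
  M.set(5, true);
  assert(M.get(5) && !M.get(6));
}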
@@ -483,6 +514,8 @@ void SILowerI1Copies::lowerCopiesFromI1() { ConstrainRegs.insert(SrcReg); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) + .addImm(0) .addImm(0) .addImm(-1) .addReg(SrcReg); @@ -503,18 +536,22 @@ void SILowerI1Copies::lowerPhis() { SmallVector IncomingBlocks; SmallVector IncomingRegs; SmallVector IncomingUpdated; +#ifndef NDEBUG + DenseSet PhiRegisters; +#endif for (MachineBasicBlock &MBB : *MF) { LF.initialize(MBB); for (MachineInstr &MI : MBB.phis()) { unsigned DstReg = MI.getOperand(0).getReg(); - if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass) + if (!isVreg1(DstReg)) continue; LLVM_DEBUG(dbgs() << "Lower PHI: " << MI); - MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass + : &AMDGPU::SReg_64RegClass); // Collect incoming values. for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { @@ -525,18 +562,22 @@ void SILowerI1Copies::lowerPhis() { if (IncomingDef->getOpcode() == AMDGPU::COPY) { IncomingReg = IncomingDef->getOperand(1).getReg(); - assert(isLaneMaskReg(IncomingReg)); + assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg)); assert(!IncomingDef->getOperand(1).getSubReg()); } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) { continue; } else { - assert(IncomingDef->isPHI()); + assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg)); } IncomingBlocks.push_back(IncomingMBB); IncomingRegs.push_back(IncomingReg); } +#ifndef NDEBUG + PhiRegisters.insert(DstReg); +#endif + // Phis in a loop that are observed outside the loop receive a simple but // conservatively correct treatment. MachineBasicBlock *PostDomBound = &MBB; @@ -629,8 +670,7 @@ void SILowerI1Copies::lowerCopiesToI1() { continue; unsigned DstReg = MI.getOperand(0).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DstReg) || - MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass) + if (!isVreg1(DstReg)) continue; if (MRI->use_empty(DstReg)) { @@ -640,7 +680,8 @@ void SILowerI1Copies::lowerCopiesToI1() { LLVM_DEBUG(dbgs() << "Lower Other: " << MI); - MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + MRI->setRegClass(DstReg, IsWave32 ? 
&AMDGPU::SReg_32RegClass + : &AMDGPU::SReg_64RegClass); if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) continue; @@ -649,7 +690,7 @@ void SILowerI1Copies::lowerCopiesToI1() { assert(!MI.getOperand(1).getSubReg()); if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - !isLaneMaskReg(SrcReg)) { + (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) { assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); unsigned TmpReg = createLaneMaskReg(*MF); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg) @@ -699,7 +740,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const { return false; } - if (MI->getOpcode() != AMDGPU::S_MOV_B64) + if (MI->getOpcode() != MovOp) return false; if (!MI->getOperand(1).isImm()) @@ -774,10 +815,10 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, if (PrevVal == CurVal) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg); } else if (CurVal) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC); + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg); } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, I, DL, TII->get(XorOp), DstReg) + .addReg(ExecReg) .addImm(-1); } return; @@ -790,9 +831,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, PrevMaskedReg = PrevReg; } else { PrevMaskedReg = createLaneMaskReg(*MF); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg) + BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg) .addReg(PrevReg) - .addReg(AMDGPU::EXEC); + .addReg(ExecReg); } } if (!CurConstant) { @@ -801,9 +842,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, CurMaskedReg = CurReg; } else { CurMaskedReg = createLaneMaskReg(*MF); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg) + BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg) .addReg(CurReg) - .addReg(AMDGPU::EXEC); + .addReg(ExecReg); } } @@ -814,12 +855,12 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) .addReg(PrevMaskedReg); } else if (PrevConstant && PrevVal) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg) + BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg) .addReg(CurMaskedReg) - .addReg(AMDGPU::EXEC); + .addReg(ExecReg); } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg) + BuildMI(MBB, I, DL, TII->get(OrOp), DstReg) .addReg(PrevMaskedReg) - .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC); + .addReg(CurMaskedReg ? CurMaskedReg : ExecReg); } } diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp new file mode 100644 index 000000000000..a82047473370 --- /dev/null +++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -0,0 +1,323 @@ +//===-- SILowerSGPRSPills.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Handle SGPR spills. This pass takes the place of PrologEpilogInserter for all +// SGPR spills, so must insert CSR SGPR spills as well as expand them. +// +// This pass must never create new SGPR virtual registers. +// +// FIXME: Must stop RegScavenger spills in later passes. 
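Two lane-mask conversions from this file, modeled on plain integers: V_CNDMASK_B32 dst, 0, -1, mask rebuilds a per-lane vector value from a lane mask (the two extra .addImm(0) operands added above are the _e64 encoding's source-modifier fields), and buildMergeLaneMasks combines an old and a new mask under EXEC. An illustrative sketch, not LLVM API:

#include <array>
#include <cstdint>
#include <cassert>

// V_CNDMASK_B32 dst, 0, -1, mask: each lane materializes 0 or -1 from its
// bit of the scalar lane mask.
std::array<int32_t, 64> cndmask(uint64_t Mask, unsigned WaveSize) {
  std::array<int32_t, 64> Dst{};
  for (unsigned L = 0; L < WaveSize; ++L)
    Dst[L] = ((Mask >> L) & 1) ? -1 : 0;
  return Dst;
}

// buildMergeLaneMasks: lanes in EXEC take the new value, inactive lanes keep
// the previous one; the S_ANDN2/S_AND/S_OR(N2) forms in the pass are this
// expression with constant inputs folded away.
uint64_t mergeLaneMasks(uint64_t Prev, uint64_t Cur, uint64_t Exec) {
  return (Prev & ~Exec) | (Cur & Exec);
}

int main() {
  auto V = cndmask(0b101, 32);
  assert(V[0] == -1 && V[1] == 0 && V[2] == -1);
  assert(mergeLaneMasks(0xF0, 0xFF, 0x0F) == 0xFF);
}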
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-lower-sgpr-spills" + +using MBBVector = SmallVector; + +namespace { + +static cl::opt EnableSpillVGPRToAGPR( + "amdgpu-spill-vgpr-to-agpr", + cl::desc("Enable spilling VGPRs to AGPRs"), + cl::ReallyHidden, + cl::init(true)); + +class SILowerSGPRSpills : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + VirtRegMap *VRM = nullptr; + LiveIntervals *LIS = nullptr; + + // Save and Restore blocks of the current function. Typically there is a + // single save block, unless Windows EH funclets are involved. + MBBVector SaveBlocks; + MBBVector RestoreBlocks; + +public: + static char ID; + + SILowerSGPRSpills() : MachineFunctionPass(ID) {} + + void calculateSaveRestoreBlocks(MachineFunction &MF); + bool spillCalleeSavedRegs(MachineFunction &MF); + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char SILowerSGPRSpills::ID = 0; + +INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, + "SI lower SGPR spill instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, + "SI lower SGPR spill instructions", false, false) + +char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID; + +/// Insert restore code for the callee-saved registers used in the function. +static void insertCSRSaves(MachineBasicBlock &SaveBlock, + ArrayRef CSI, + LiveIntervals *LIS) { + MachineFunction &MF = *SaveBlock.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + MachineBasicBlock::iterator I = SaveBlock.begin(); + if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + for (const CalleeSavedInfo &CS : CSI) { + // Insert the spill to the stack frame. + unsigned Reg = CS.getReg(); + + MachineInstrSpan MIS(I, &SaveBlock); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, + TRI); + + if (LIS) { + assert(std::distance(MIS.begin(), I) == 1); + MachineInstr &Inst = *std::prev(I); + + LIS->InsertMachineInstrInMaps(Inst); + LIS->removeAllRegUnitsForPhysReg(Reg); + } + } + } +} + +/// Insert restore code for the callee-saved registers used in the function. 
+static void insertCSRRestores(MachineBasicBlock &RestoreBlock, + std::vector &CSI, + LiveIntervals *LIS) { + MachineFunction &MF = *RestoreBlock.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + // Restore all registers immediately before the return and any + // terminators that precede it. + MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator(); + + // FIXME: Just emit the readlane/writelane directly + if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { + for (const CalleeSavedInfo &CI : reverse(CSI)) { + unsigned Reg = CI.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI); + assert(I != RestoreBlock.begin() && + "loadRegFromStackSlot didn't insert any code!"); + // Insert in reverse order. loadRegFromStackSlot can insert + // multiple instructions. + + if (LIS) { + MachineInstr &Inst = *std::prev(I); + LIS->InsertMachineInstrInMaps(Inst); + LIS->removeAllRegUnitsForPhysReg(Reg); + } + } + } +} + +/// Compute the sets of entry and return blocks for saving and restoring +/// callee-saved registers, and placing prolog and epilog code. +void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Even when we do not change any CSR, we still want to insert the + // prologue and epilogue of the function. + // So set the save points for those. + + // Use the points found by shrink-wrapping, if any. + if (MFI.getSavePoint()) { + SaveBlocks.push_back(MFI.getSavePoint()); + assert(MFI.getRestorePoint() && "Both restore and save must be set"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + // If RestoreBlock does not have any successor and is not a return block + // then the end point is unreachable and we do not need to insert any + // epilogue. + if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock()) + RestoreBlocks.push_back(RestoreBlock); + return; + } + + // Save refs to entry and return blocks. + SaveBlocks.push_back(&MF.front()); + for (MachineBasicBlock &MBB : MF) { + if (MBB.isEHFuncletEntry()) + SaveBlocks.push_back(&MBB); + if (MBB.isReturnBlock()) + RestoreBlocks.push_back(&MBB); + } +} + +bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIFrameLowering *TFI = ST.getFrameLowering(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + RegScavenger *RS = nullptr; + + // Determine which of the registers in the callee save list should be saved. + BitVector SavedRegs; + TFI->determineCalleeSavesSGPR(MF, SavedRegs, RS); + + // Add the code to save and restore the callee saved registers. + if (!F.hasFnAttribute(Attribute::Naked)) { + // FIXME: This is a lie. The CalleeSavedInfo is incomplete, but this is + // necessary for verifier liveness checks. 
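The save/restore placement logic above mirrors PrologEpilogInserter: prefer the points chosen by shrink-wrapping, otherwise save in the entry block (plus EH funclet entries) and restore in every return block. Reduced to a toy CFG; Block and the function are stand-ins, and the reachability check on the shrink-wrapped restore point is omitted:

#include <vector>

struct Block { bool IsReturn = false; bool IsEHFuncletEntry = false; };

struct SaveRestore {
  std::vector<Block *> Saves, Restores;
};

SaveRestore compute(std::vector<Block> &Fn, Block *SavePt, Block *RestorePt) {
  SaveRestore SR;
  if (SavePt) {                  // points picked by shrink-wrapping
    SR.Saves.push_back(SavePt);
    SR.Restores.push_back(RestorePt);
    return SR;
  }
  SR.Saves.push_back(&Fn.front());   // entry block
  for (Block &B : Fn) {
    if (B.IsEHFuncletEntry)
      SR.Saves.push_back(&B);
    if (B.IsReturn)
      SR.Restores.push_back(&B);
  }
  return SR;
}

int main() {
  std::vector<Block> Fn(3);
  Fn[2].IsReturn = true;
  SaveRestore SR = compute(Fn, nullptr, nullptr);
  return SR.Saves.size() == 1 && SR.Restores.size() == 1 ? 0 : 1;
}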
+ MFI.setCalleeSavedInfoValid(true); + + std::vector CSI; + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + + for (unsigned I = 0; CSRegs[I]; ++I) { + unsigned Reg = CSRegs[I]; + if (SavedRegs.test(Reg)) { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC), + TRI->getSpillAlignment(*RC), + true); + + CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); + } + } + + if (!CSI.empty()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) + insertCSRSaves(*SaveBlock, CSI, LIS); + + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) + insertCSRRestores(*RestoreBlock, CSI, LIS); + return true; + } + } + + return false; +} + +bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + + VRM = getAnalysisIfAvailable(); + + assert(SaveBlocks.empty() && RestoreBlocks.empty()); + + // First, expose any CSR SGPR spills. This is mostly the same as what PEI + // does, but somewhat simpler. + calculateSaveRestoreBlocks(MF); + bool HasCSRs = spillCalleeSavedRegs(MF); + + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.hasStackObjects() && !HasCSRs) { + SaveBlocks.clear(); + RestoreBlocks.clear(); + return false; + } + + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() + && EnableSpillVGPRToAGPR; + + bool MadeChange = false; + + const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts(); + + // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be + // handled as SpilledToReg in regular PrologEpilogInserter. + if ((TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) || + SpillVGPRToAGPR) { + // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs + // are spilled to VGPRs, in which case we can eliminate the stack usage. + // + // This operates under the assumption that only other SGPR spills are users + // of the frame index. + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator Next; + for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + if (SpillToAGPR && TII->isVGPRSpill(MI)) { + // Try to eliminate stack used by VGPR spills before frame + // finalization. 
+ unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vaddr); + int FI = MI.getOperand(FIOp).getIndex(); + unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata) + ->getReg(); + if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, + TRI->isAGPR(MRI, VReg))) { + TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr); + continue; + } + } + + if (!TII->isSGPRSpill(MI)) + continue; + + int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr); + (void)Spilled; + assert(Spilled && "failed to spill SGPR to VGPR when allocated"); + } + } + } + + for (MachineBasicBlock &MBB : MF) { + for (auto SSpill : FuncInfo->getSGPRSpillVGPRs()) + MBB.addLiveIn(SSpill.VGPR); + + for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) + MBB.addLiveIn(Reg); + + for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) + MBB.addLiveIn(Reg); + + MBB.sortUniqueLiveIns(); + } + + MadeChange = true; + } + + SaveBlocks.clear(); + RestoreBlocks.clear(); + + return MadeChange; +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 181cc41bd5ff..46da974a2f45 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -29,6 +28,7 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + Mode(MF.getFunction()), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), @@ -46,7 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr(false), ImplicitArgPtr(false), GITPtrHigh(0xffffffff), - HighBitsOf32BitAddress(0) { + HighBitsOf32BitAddress(0), + GDSSize(0) { const GCNSubtarget &ST = MF.getSubtarget(); const Function &F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); @@ -69,8 +70,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // Non-entry functions have no special inputs for now, other registers // required for scratch access. ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; - ScratchWaveOffsetReg = AMDGPU::SGPR4; - FrameOffsetReg = AMDGPU::SGPR5; + ScratchWaveOffsetReg = AMDGPU::SGPR33; + + // TODO: Pick a high register, and shift down, similar to a kernel. + FrameOffsetReg = AMDGPU::SGPR34; StackPtrOffsetReg = AMDGPU::SGPR32; ArgInfo.PrivateSegmentBuffer = @@ -88,33 +91,23 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } } - if (ST.debuggerEmitPrologue()) { - // Enable everything. 
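What eliminateSGPRToVGPRSpillFrameIndex ultimately relies on: each 32-bit SGPR spill slot maps to one lane of a reserved VGPR, WaveSize lanes per VGPR, so most SGPR spills never reach memory. A toy allocator under that assumption; the types and the fresh-VGPR numbering are illustrative only:

#include <map>
#include <vector>
#include <cassert>

struct SpillLane { unsigned VGPR, Lane; };

class SGPRSpills {
  unsigned WaveSize, NextLane = 0;
  std::vector<unsigned> VGPRs;     // reserved spill VGPRs
  std::map<int, SpillLane> Slots;  // frame index -> assigned lane
public:
  explicit SGPRSpills(unsigned WS) : WaveSize(WS) {}

  SpillLane allocate(int FI, unsigned FreshVGPR) {
    auto It = Slots.find(FI);
    if (It != Slots.end())         // slot already has a lane
      return It->second;
    if (NextLane % WaveSize == 0)  // current VGPR is full, start a new one
      VGPRs.push_back(FreshVGPR);
    SpillLane L{VGPRs.back(), NextLane++ % WaveSize};
    Slots[FI] = L;
    return L;
  }
};

int main() {
  SGPRSpills S(/*WaveSize=*/32);
  assert(S.allocate(0, 100).Lane == 0);
  assert(S.allocate(1, 100).Lane == 1); // same VGPR, next lane
}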
+ if (F.hasFnAttribute("amdgpu-work-group-id-x")) WorkGroupIDX = true; - WorkGroupIDY = true; - WorkGroupIDZ = true; - WorkItemIDX = true; - WorkItemIDY = true; - WorkItemIDZ = true; - } else { - if (F.hasFnAttribute("amdgpu-work-group-id-x")) - WorkGroupIDX = true; - if (F.hasFnAttribute("amdgpu-work-group-id-y")) - WorkGroupIDY = true; + if (F.hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; - if (F.hasFnAttribute("amdgpu-work-group-id-z")) - WorkGroupIDZ = true; + if (F.hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; - if (F.hasFnAttribute("amdgpu-work-item-id-x")) - WorkItemIDX = true; + if (F.hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; - if (F.hasFnAttribute("amdgpu-work-item-id-y")) - WorkItemIDY = true; + if (F.hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; - if (F.hasFnAttribute("amdgpu-work-item-id-z")) - WorkItemIDZ = true; - } + if (F.hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool HasStackObjects = FrameInfo.hasStackObjects(); @@ -154,9 +147,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) KernargSegmentPtr = true; if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) { + auto hasNonSpillStackObjects = [&]() { + // Avoid expensive checking if there's no stack objects. + if (!HasStackObjects) + return false; + for (auto OI = FrameInfo.getObjectIndexBegin(), + OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI) + if (!FrameInfo.isSpillSlotObjectIndex(OI)) + return true; + // All stack objects are spill slots. + return false; + }; // TODO: This could be refined a lot. The attribute is a poor way of // detecting calls that may require it before argument lowering. - if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch")) + if (hasNonSpillStackObjects() || F.hasFnAttribute("amdgpu-flat-scratch")) FlatScratchInit = true; } @@ -169,6 +173,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) S = A.getValueAsString(); if (!S.empty()) S.consumeInteger(0, HighBitsOf32BitAddress); + + S = F.getFnAttribute("amdgpu-gds-size").getValueAsString(); + if (!S.empty()) + S.consumeInteger(0, GDSSize); } void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { @@ -239,6 +247,17 @@ static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) { return false; } +/// \p returns true if \p NumLanes slots are available in VGPRs already used for +/// SGPR spilling. +// +// FIXME: This only works after processFunctionBeforeFrameFinalized +bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF, + unsigned NumNeed) const { + const GCNSubtarget &ST = MF.getSubtarget(); + unsigned WaveSize = ST.getWavefrontSize(); + return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size(); +} + /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, int FI) { @@ -260,7 +279,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, int NumLanes = Size / 4; - const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); // Make sure to handle the case where a wide SGPR spill may span between two // VGPRs. 
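The hasNonSpillStackObjects lambda above guards FlatScratchInit: flat scratch setup is only needed when some frame object is not a spill slot. The same check over a plain container, as a stand-in for MachineFrameInfo:

#include <vector>
#include <cassert>

// True if at least one stack object is not a spill slot; spill slots alone
// may be eliminated (e.g. SGPR->VGPR) and then need no scratch at all.
bool hasNonSpillStackObjects(const std::vector<bool> &IsSpillSlot) {
  if (IsSpillSlot.empty())   // no stack objects: cheap early out
    return false;
  for (bool Spill : IsSpillSlot)
    if (!Spill)
      return true;
  return false;              // all stack objects are spill slots
}

int main() {
  assert(!hasNonSpillStackObjects({true, true}));
  assert(hasNonSpillStackObjects({true, false}));
}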
@@ -300,26 +319,92 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, return true; } -void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) { - for (auto &R : SGPRToVGPRSpills) - MFI.RemoveStackObject(R.first); +/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI. +/// Either AGPR is spilled to VGPR to vice versa. +/// Returns true if a \p FI can be eliminated completely. +bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF, + int FI, + bool isAGPRtoVGPR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + + assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI)); + + auto &Spill = VGPRToAGPRSpills[FI]; + + // This has already been allocated. + if (!Spill.Lanes.empty()) + return Spill.FullyAllocated; + + unsigned Size = FrameInfo.getObjectSize(FI); + unsigned NumLanes = Size / 4; + Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister); + + const TargetRegisterClass &RC = + isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass; + auto Regs = RC.getRegisters(); + + auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + Spill.FullyAllocated = true; + + // FIXME: Move allocation logic out of MachineFunctionInfo and initialize + // once. + BitVector OtherUsedRegs; + OtherUsedRegs.resize(TRI->getNumRegs()); + + const uint32_t *CSRMask = + TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv()); + if (CSRMask) + OtherUsedRegs.setBitsInMask(CSRMask); + + // TODO: Should include register tuples, but doesn't matter with current + // usage. + for (MCPhysReg Reg : SpillAGPR) + OtherUsedRegs.set(Reg); + for (MCPhysReg Reg : SpillVGPR) + OtherUsedRegs.set(Reg); + + SmallVectorImpl::const_iterator NextSpillReg = Regs.begin(); + for (unsigned I = 0; I < NumLanes; ++I) { + NextSpillReg = std::find_if( + NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) { + return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) && + !OtherUsedRegs[Reg]; + }); + + if (NextSpillReg == Regs.end()) { // Registers exhausted + Spill.FullyAllocated = false; + break; + } + + OtherUsedRegs.set(*NextSpillReg); + SpillRegs.push_back(*NextSpillReg); + Spill.Lanes[I] = *NextSpillReg++; + } + + return Spill.FullyAllocated; } +void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { + // The FP spill hasn't been inserted yet, so keep it around. + for (auto &R : SGPRToVGPRSpills) { + if (R.first != FramePointerSaveIndex) + MFI.RemoveStackObject(R.first); + } -/// \returns VGPR used for \p Dim' work item ID. -unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const { - switch (Dim) { - case 0: - assert(hasWorkItemIDX()); - return AMDGPU::VGPR0; - case 1: - assert(hasWorkItemIDY()); - return AMDGPU::VGPR1; - case 2: - assert(hasWorkItemIDZ()); - return AMDGPU::VGPR2; + // All other SPGRs must be allocated on the default stack, so reset the stack + // ID. 
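Two spill-bookkeeping helpers from this hunk, modeled with standard containers: haveFreeLanesForSGPRSpill is a pure capacity check, and allocateVGPRSpillToAGPR's register scan is a single find_if pass that skips callee-saved and already-used registers. Names and the 0-means-NoRegister sentinel are stand-ins:

#include <algorithm>
#include <cassert>
#include <set>
#include <vector>

// haveFreeLanesForSGPRSpill: spare capacity is WaveSize lanes per reserved
// spill VGPR minus the lanes handed out so far.
bool haveFreeLanes(unsigned UsedLanes, unsigned SpillVGPRs, unsigned WaveSize,
                   unsigned Needed) {
  return UsedLanes + Needed <= WaveSize * SpillVGPRs;
}

// allocateVGPRSpillToAGPR's scan: one register per 32-bit lane of the spill,
// skipping registers already claimed; 0 stands for NoRegister.
std::vector<unsigned> allocateSpillRegs(const std::vector<unsigned> &Candidates,
                                        std::set<unsigned> &Used,
                                        unsigned NumLanes,
                                        bool &FullyAllocated) {
  std::vector<unsigned> Lanes(NumLanes, 0);
  auto Next = Candidates.begin();
  FullyAllocated = true;
  for (unsigned I = 0; I < NumLanes; ++I) {
    Next = std::find_if(Next, Candidates.end(),
                        [&](unsigned R) { return !Used.count(R); });
    if (Next == Candidates.end()) { // registers exhausted
      FullyAllocated = false;
      break;
    }
    Used.insert(*Next);
    Lanes[I] = *Next++;
  }
  return Lanes;
}

int main() {
  assert(haveFreeLanes(60, 1, 64, 4) && !haveFreeLanes(60, 1, 64, 5));
  std::set<unsigned> Used{1};
  bool Full;
  auto Lanes = allocateSpillRegs({1, 2, 3}, Used, 2, Full);
  assert(Full && Lanes[0] == 2 && Lanes[1] == 3);
}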
+ for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; + ++i) + if (i != FramePointerSaveIndex) + MFI.setStackID(i, TargetStackID::Default); + + for (auto &R : VGPRToAGPRSpills) { + if (R.second.FullyAllocated) + MFI.RemoveStackObject(R.first); } - llvm_unreachable("unexpected dimension"); } MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const { @@ -330,3 +415,97 @@ MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const { MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const { return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; } + +static yaml::StringValue regToString(unsigned Reg, + const TargetRegisterInfo &TRI) { + yaml::StringValue Dest; + { + raw_string_ostream OS(Dest.Value); + OS << printReg(Reg, &TRI); + } + return Dest; +} + +static Optional +convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, + const TargetRegisterInfo &TRI) { + yaml::SIArgumentInfo AI; + + auto convertArg = [&](Optional &A, + const ArgDescriptor &Arg) { + if (!Arg) + return false; + + // Create a register or stack argument. + yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister()); + if (Arg.isRegister()) { + raw_string_ostream OS(SA.RegisterName.Value); + OS << printReg(Arg.getRegister(), &TRI); + } else + SA.StackOffset = Arg.getStackOffset(); + // Check and update the optional mask. + if (Arg.isMasked()) + SA.Mask = Arg.getMask(); + + A = SA; + return true; + }; + + bool Any = false; + Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer); + Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr); + Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr); + Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr); + Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID); + Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit); + Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize); + Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX); + Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY); + Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ); + Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo); + Any |= convertArg(AI.PrivateSegmentWaveByteOffset, + ArgInfo.PrivateSegmentWaveByteOffset); + Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr); + Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr); + Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX); + Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY); + Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ); + + if (Any) + return AI; + + return None; +} + +yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( + const llvm::SIMachineFunctionInfo& MFI, + const TargetRegisterInfo &TRI) + : ExplicitKernArgSize(MFI.getExplicitKernArgSize()), + MaxKernArgAlign(MFI.getMaxKernArgAlign()), + LDSSize(MFI.getLDSSize()), + IsEntryFunction(MFI.isEntryFunction()), + NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), + MemoryBound(MFI.isMemoryBound()), + WaveLimiter(MFI.needsWaveLimiter()), + ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), + ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)), + FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), + StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)), + ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), + Mode(MFI.getMode()) {} + +void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { + MappingTraits::mapping(YamlIO, *this); +} + +bool SIMachineFunctionInfo::initializeBaseYamlFields( + const 
yaml::SIMachineFunctionInfo &YamlMFI) { + ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize; + MaxKernArgAlign = YamlMFI.MaxKernArgAlign; + LDSSize = YamlMFI.LDSSize; + IsEntryFunction = YamlMFI.IsEntryFunction; + NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath; + MemoryBound = YamlMFI.MemoryBound; + WaveLimiter = YamlMFI.WaveLimiter; + return false; +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index ef91d1e43075..f19b20ceb5da 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -1,9 +1,8 @@ //==- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface --*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,13 +15,16 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUMachineFunction.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -38,12 +40,19 @@ class MachineFrameInfo; class MachineFunction; class TargetRegisterClass; -class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { +class AMDGPUPseudoSourceValue : public PseudoSourceValue { public: - // TODO: Is the img rsrc useful? - explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) : - PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {} + enum AMDGPUPSVKind : unsigned { + PSVBuffer = PseudoSourceValue::TargetCustom, + PSVImage, + GWSResource + }; + +protected: + AMDGPUPseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII) + : PseudoSourceValue(Kind, TII) {} +public: bool isConstant(const MachineFrameInfo *) const override { // This should probably be true for most images, but we will start by being // conservative. @@ -59,29 +68,250 @@ public: } }; -class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue { +class AMDGPUBufferPseudoSourceValue final : public AMDGPUPseudoSourceValue { public: - explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) : - PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { } + explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) + : AMDGPUPseudoSourceValue(PSVBuffer, TII) {} - bool isConstant(const MachineFrameInfo *) const override { - // This should probably be true for most images, but we will start by being - // conservative. - return false; + static bool classof(const PseudoSourceValue *V) { + return V->kind() == PSVBuffer; } +}; +class AMDGPUImagePseudoSourceValue final : public AMDGPUPseudoSourceValue { +public: + // TODO: Is the img rsrc useful? 
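The AMDGPUPseudoSourceValue refactor above replaces three unrelated TargetCustom values with a small kind hierarchy so isa<>/dyn_cast<> work without C++ RTTI. The pattern in isolation, with simplified stand-in classes:

#include <cassert>

// LLVM-style RTTI: a kind tag set by the protected base constructor plus a
// static classof() per subclass.
class PSV {
public:
  enum Kind { Buffer, Image, GWSResource };
  Kind kind() const { return K; }
protected:
  explicit PSV(Kind K) : K(K) {}
private:
  Kind K;
};

class BufferPSV final : public PSV {
public:
  BufferPSV() : PSV(Buffer) {}
  static bool classof(const PSV *V) { return V->kind() == Buffer; }
};

int main() {
  BufferPSV B;
  const PSV *P = &B;
  assert(BufferPSV::classof(P)); // what isa<BufferPSV>(P) expands to
}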
+ explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) + : AMDGPUPseudoSourceValue(PSVImage, TII) {} + + static bool classof(const PseudoSourceValue *V) { + return V->kind() == PSVImage; + } +}; + +class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue { +public: + explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII) + : AMDGPUPseudoSourceValue(GWSResource, TII) {} + + static bool classof(const PseudoSourceValue *V) { + return V->kind() == GWSResource; + } + + // These are inaccessible memory from IR. bool isAliased(const MachineFrameInfo *) const override { - return true; + return false; } + // These are inaccessible memory from IR. bool mayAlias(const MachineFrameInfo *) const override { - return true; + return false; + } + + void printCustom(raw_ostream &OS) const override { + OS << "GWSResource"; + } +}; + +namespace yaml { + +struct SIArgument { + bool IsRegister; + union { + StringValue RegisterName; + unsigned StackOffset; + }; + Optional Mask; + + // Default constructor, which creates a stack argument. + SIArgument() : IsRegister(false), StackOffset(0) {} + SIArgument(const SIArgument &Other) { + IsRegister = Other.IsRegister; + if (IsRegister) { + ::new ((void *)std::addressof(RegisterName)) + StringValue(Other.RegisterName); + } else + StackOffset = Other.StackOffset; + Mask = Other.Mask; + } + SIArgument &operator=(const SIArgument &Other) { + IsRegister = Other.IsRegister; + if (IsRegister) { + ::new ((void *)std::addressof(RegisterName)) + StringValue(Other.RegisterName); + } else + StackOffset = Other.StackOffset; + Mask = Other.Mask; + return *this; + } + ~SIArgument() { + if (IsRegister) + RegisterName.~StringValue(); + } + + // Helper to create a register or stack argument. + static inline SIArgument createArgument(bool IsReg) { + if (IsReg) + return SIArgument(IsReg); + return SIArgument(); + } + +private: + // Construct a register argument. 
+ SIArgument(bool) : IsRegister(true), RegisterName() {} +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, SIArgument &A) { + if (YamlIO.outputting()) { + if (A.IsRegister) + YamlIO.mapRequired("reg", A.RegisterName); + else + YamlIO.mapRequired("offset", A.StackOffset); + } else { + auto Keys = YamlIO.keys(); + if (is_contained(Keys, "reg")) { + A = SIArgument::createArgument(true); + YamlIO.mapRequired("reg", A.RegisterName); + } else if (is_contained(Keys, "offset")) + YamlIO.mapRequired("offset", A.StackOffset); + else + YamlIO.setError("missing required key 'reg' or 'offset'"); + } + YamlIO.mapOptional("mask", A.Mask); + } + static const bool flow = true; +}; + +struct SIArgumentInfo { + Optional PrivateSegmentBuffer; + Optional DispatchPtr; + Optional QueuePtr; + Optional KernargSegmentPtr; + Optional DispatchID; + Optional FlatScratchInit; + Optional PrivateSegmentSize; + + Optional WorkGroupIDX; + Optional WorkGroupIDY; + Optional WorkGroupIDZ; + Optional WorkGroupInfo; + Optional PrivateSegmentWaveByteOffset; + + Optional ImplicitArgPtr; + Optional ImplicitBufferPtr; + + Optional WorkItemIDX; + Optional WorkItemIDY; + Optional WorkItemIDZ; +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, SIArgumentInfo &AI) { + YamlIO.mapOptional("privateSegmentBuffer", AI.PrivateSegmentBuffer); + YamlIO.mapOptional("dispatchPtr", AI.DispatchPtr); + YamlIO.mapOptional("queuePtr", AI.QueuePtr); + YamlIO.mapOptional("kernargSegmentPtr", AI.KernargSegmentPtr); + YamlIO.mapOptional("dispatchID", AI.DispatchID); + YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit); + YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize); + + YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX); + YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY); + YamlIO.mapOptional("workGroupIDZ", AI.WorkGroupIDZ); + YamlIO.mapOptional("workGroupInfo", AI.WorkGroupInfo); + YamlIO.mapOptional("privateSegmentWaveByteOffset", + AI.PrivateSegmentWaveByteOffset); + + YamlIO.mapOptional("implicitArgPtr", AI.ImplicitArgPtr); + YamlIO.mapOptional("implicitBufferPtr", AI.ImplicitBufferPtr); + + YamlIO.mapOptional("workItemIDX", AI.WorkItemIDX); + YamlIO.mapOptional("workItemIDY", AI.WorkItemIDY); + YamlIO.mapOptional("workItemIDZ", AI.WorkItemIDZ); + } +}; + +// Default to default mode for default calling convention. 
+struct SIMode { + bool IEEE = true; + bool DX10Clamp = true; + + SIMode() = default; + + + SIMode(const AMDGPU::SIModeRegisterDefaults &Mode) { + IEEE = Mode.IEEE; + DX10Clamp = Mode.DX10Clamp; } + + bool operator ==(const SIMode Other) const { + return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp; + } +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, SIMode &Mode) { + YamlIO.mapOptional("ieee", Mode.IEEE, true); + YamlIO.mapOptional("dx10-clamp", Mode.DX10Clamp, true); + } +}; + +struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { + uint64_t ExplicitKernArgSize = 0; + unsigned MaxKernArgAlign = 0; + unsigned LDSSize = 0; + bool IsEntryFunction = false; + bool NoSignedZerosFPMath = false; + bool MemoryBound = false; + bool WaveLimiter = false; + + StringValue ScratchRSrcReg = "$private_rsrc_reg"; + StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg"; + StringValue FrameOffsetReg = "$fp_reg"; + StringValue StackPtrOffsetReg = "$sp_reg"; + + Optional ArgInfo; + SIMode Mode; + + SIMachineFunctionInfo() = default; + SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, + const TargetRegisterInfo &TRI); + + void mappingImpl(yaml::IO &YamlIO) override; + ~SIMachineFunctionInfo() = default; }; +template <> struct MappingTraits { + static void mapping(IO &YamlIO, SIMachineFunctionInfo &MFI) { + YamlIO.mapOptional("explicitKernArgSize", MFI.ExplicitKernArgSize, + UINT64_C(0)); + YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u); + YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u); + YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false); + YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false); + YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false); + YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); + YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, + StringValue("$private_rsrc_reg")); + YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg, + StringValue("$scratch_wave_offset_reg")); + YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, + StringValue("$fp_reg")); + YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg, + StringValue("$sp_reg")); + YamlIO.mapOptional("argumentInfo", MFI.ArgInfo); + YamlIO.mapOptional("mode", MFI.Mode, SIMode()); + } +}; + +} // end namespace yaml + /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo final : public AMDGPUMachineFunction { + friend class GCNTargetMachine; + unsigned TIDReg = AMDGPU::NoRegister; // Registers that may be reserved for spilling purposes. These may be the same @@ -99,6 +329,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { AMDGPUFunctionArgInfo ArgInfo; + // State of MODE register, assumed FP mode. + AMDGPU::SIModeRegisterDefaults Mode; + // Graphics info. unsigned PSInputAddr = 0; unsigned PSInputEnable = 0; @@ -124,16 +357,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // unit. Minimum - first, maximum - second. std::pair WavesPerEU = {0, 0}; - // Stack object indices for work group IDs. - std::array DebuggerWorkGroupIDStackObjectIndices = {{0, 0, 0}}; - - // Stack object indices for work item IDs. 
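The yaml::SIArgument defined earlier is a tagged union with one non-trivial member, which is why it spells out its own copy constructor (placement new) and destructor (an explicit ~StringValue call). The same pattern with std::string, self-contained:

#include <cassert>
#include <memory>
#include <new>
#include <string>

struct Argument {
  bool IsRegister;
  union {
    std::string RegisterName; // non-trivial: lifetime managed by hand
    unsigned StackOffset;
  };

  Argument() : IsRegister(false), StackOffset(0) {}
  Argument(const Argument &O) : IsRegister(O.IsRegister) {
    if (IsRegister)
      ::new (static_cast<void *>(std::addressof(RegisterName)))
          std::string(O.RegisterName); // activate the string member
    else
      StackOffset = O.StackOffset;
  }
  ~Argument() {
    if (IsRegister)
      RegisterName.~basic_string();    // destroy only if active
  }
};

int main() {
  Argument A;
  A.StackOffset = 16;
  Argument B(A);
  assert(!B.IsRegister && B.StackOffset == 16);
}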
- std::array DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}}; - DenseMap> BufferPSVs; DenseMap> ImagePSVs; + std::unique_ptr GWSResourcePSV; private: unsigned LDSWaveSpillSize = 0; @@ -182,6 +410,7 @@ private: unsigned GITPtrHigh; unsigned HighBitsOf32BitAddress; + unsigned GDSSize; // Current recorded maximum possible occupancy. unsigned Occupancy; @@ -213,6 +442,15 @@ public: SGPRSpillVGPRCSR(unsigned V, Optional F) : VGPR(V), FI(F) {} }; + struct VGPRSpillToAGPR { + SmallVector Lanes; + bool FullyAllocated = false; + }; + + SparseBitVector<> WWMReservedRegs; + + void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); } + private: // SGPR->VGPR spilling support. using SpillRegMask = std::pair; @@ -223,9 +461,25 @@ private: unsigned NumVGPRSpillLanes = 0; SmallVector SpillVGPRs; + DenseMap VGPRToAGPRSpills; + + // AGPRs used for VGPR spills. + SmallVector SpillAGPR; + + // VGPRs used for AGPR spills. + SmallVector SpillVGPR; + +public: // FIXME + /// If this is set, an SGPR used for save/restore of the register used for the + /// frame pointer. + unsigned SGPRForFPSaveRestoreCopy = 0; + Optional FramePointerSaveIndex; + public: SIMachineFunctionInfo(const MachineFunction &MF); + bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI); + ArrayRef getSGPRToVGPRSpills(int FrameIndex) const { auto I = SGPRToVGPRSpills.find(FrameIndex); return (I == SGPRToVGPRSpills.end()) ? @@ -236,8 +490,29 @@ public: return SpillVGPRs; } + ArrayRef getAGPRSpillVGPRs() const { + return SpillAGPR; + } + + ArrayRef getVGPRSpillAGPRs() const { + return SpillVGPR; + } + + MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const { + auto I = VGPRToAGPRSpills.find(FrameIndex); + return (I == VGPRToAGPRSpills.end()) ? (MCPhysReg)AMDGPU::NoRegister + : I->second.Lanes[Lane]; + } + + AMDGPU::SIModeRegisterDefaults getMode() const { + return Mode; + } + + bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, + unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); - void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); + bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); + void removeDeadFrameIndices(MachineFrameInfo &MFI); bool hasCalculatedTID() const { return TIDReg != 0; }; unsigned getTIDReg() const { return TIDReg; }; @@ -386,8 +661,9 @@ public: return ArgInfo.getPreloadedValue(Value); } - unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { - return ArgInfo.getPreloadedValue(Value).first->getRegister(); + Register getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { + auto Arg = ArgInfo.getPreloadedValue(Value).first; + return Arg ? 
Arg->getRegister() : Register(); } unsigned getGITPtrHigh() const { @@ -398,6 +674,10 @@ public: return HighBitsOf32BitAddress; } + unsigned getGDSSize() const { + return GDSSize; + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -429,6 +709,11 @@ public: return FrameOffsetReg; } + void setFrameOffsetReg(unsigned Reg) { + assert(Reg != 0 && "Should never be unset"); + FrameOffsetReg = Reg; + } + void setStackPtrOffsetReg(unsigned Reg) { assert(Reg != 0 && "Should never be unset"); StackPtrOffsetReg = Reg; @@ -445,8 +730,6 @@ public: void setScratchWaveOffsetReg(unsigned Reg) { assert(Reg != 0 && "Should never be unset"); ScratchWaveOffsetReg = Reg; - if (isEntryFunction()) - FrameOffsetReg = ScratchWaveOffsetReg; } unsigned getQueuePtrUserSGPR() const { @@ -565,30 +848,6 @@ public: return WavesPerEU.second; } - /// \returns Stack object index for \p Dim's work group ID. - int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const { - assert(Dim < 3); - return DebuggerWorkGroupIDStackObjectIndices[Dim]; - } - - /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx. - void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) { - assert(Dim < 3); - DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx; - } - - /// \returns Stack object index for \p Dim's work item ID. - int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const { - assert(Dim < 3); - return DebuggerWorkItemIDStackObjectIndices[Dim]; - } - - /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx. - void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) { - assert(Dim < 3); - DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx; - } - /// \returns SGPR used for \p Dim's work group ID. unsigned getWorkGroupIDSGPR(unsigned Dim) const { switch (Dim) { @@ -605,9 +864,6 @@ public: llvm_unreachable("unexpected dimension"); } - /// \returns VGPR used for \p Dim' work item ID. - unsigned getWorkItemIDVGPR(unsigned Dim) const; - unsigned getLDSWaveSpillSize() const { return LDSWaveSpillSize; } @@ -630,6 +886,15 @@ public: return PSV.first->second.get(); } + const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) { + if (!GWSResourcePSV) { + GWSResourcePSV = + llvm::make_unique(TII); + } + + return GWSResourcePSV.get(); + } + unsigned getOccupancy() const { return Occupancy; } diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index fb7e670068fe..ebbdf80f9567 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -1,9 +1,8 @@ //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
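getGWSPSV above is the usual lazy-singleton accessor around a unique_ptr; the diff uses the pre-C++14 llvm::make_unique helper, for which std::make_unique below is the standard equivalent:

#include <cassert>
#include <memory>

struct GWSResource { /* stand-in for the pseudo source value */ };

class FuncInfo {
  std::unique_ptr<GWSResource> GWSResourcePSV;
public:
  // Create the singleton on first use, hand out the raw pointer afterwards.
  const GWSResource *getGWSPSV() {
    if (!GWSResourcePSV)
      GWSResourcePSV = std::make_unique<GWSResource>();
    return GWSResourcePSV.get();
  }
};

int main() {
  FuncInfo FI;
  assert(FI.getGWSPSV() == FI.getGWSPSV()); // same object on every call
}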
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -1875,6 +1874,8 @@ void SIScheduleDAGMI::moveLowLatencies() { bool CopyForLowLat = false; for (SDep& SuccDep : SU->Succs) { SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; if (SITII->isLowLatencyInstruction(*Succ->getInstr())) { CopyForLowLat = true; } @@ -1955,7 +1956,7 @@ void SIScheduleDAGMI::schedule() for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { SUnit *SU = &SUnits[i]; - MachineOperand *BaseLatOp; + const MachineOperand *BaseLatOp; int64_t OffLatReg; if (SITII->isLowLatencyInstruction(*SU->getInstr())) { IsLowLatencySU[i] = 1; diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h index 0ce68ac6a897..c28a7be4d03a 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/lib/Target/AMDGPU/SIMachineScheduler.h @@ -1,9 +1,8 @@ //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index b4a4e9e33133..4320e6c957a0 100644 --- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1,9 +1,8 @@ //===- SIMemoryLegalizer.cpp ----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -146,7 +145,7 @@ private: // only contains a single address space. if ((OrderingAddrSpace == InstrAddrSpace) && isPowerOf2_32(uint32_t(InstrAddrSpace))) - IsCrossAddressSpaceOrdering = false; + this->IsCrossAddressSpaceOrdering = false; } public: @@ -353,6 +352,40 @@ public: }; +class SIGfx10CacheControl : public SIGfx7CacheControl { +protected: + bool CuMode = false; + + /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. 
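The guard added to moveLowLatencies above skips weak DAG edges and boundary nodes before inspecting successors. The filtering logic on a stand-in dependence record:

#include <vector>
#include <cassert>

struct Dep { unsigned SuccNodeNum; bool IsWeak; bool SuccIsLowLatency; };

// Ignore weak edges and nodes outside the DAG (NodeNum >= DAGSize) when
// deciding whether a copy feeds a low-latency instruction.
bool hasLowLatencySucc(const std::vector<Dep> &Succs, unsigned DAGSize) {
  for (const Dep &D : Succs) {
    if (D.IsWeak || D.SuccNodeNum >= DAGSize)
      continue;                 // not a real scheduling edge
    if (D.SuccIsLowLatency)
      return true;
  }
  return false;
}

int main() {
  assert(!hasLowLatencySucc({{5, true, true}}, 10)); // weak edge ignored
  assert(hasLowLatencySucc({{5, false, true}}, 10));
}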
+ bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI); + } + +public: + + SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) : + SIGfx7CacheControl(ST), CuMode(CuMode) {}; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; + + bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + + bool insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + class SIMemoryLegalizer final : public MachineFunctionPass { private: @@ -418,35 +451,46 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, Optional> SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const { - /// TODO: For now assume OpenCL memory model which treats each - /// address space as having a separate happens-before relation, and - /// so an instruction only has ordering with respect to the address - /// space it accesses, and if it accesses multiple address spaces it - /// does not require ordering of operations in different address - /// spaces. - if (SSID == SyncScope::System) + if (SSID == SyncScope::System) + return std::make_tuple(SIAtomicScope::SYSTEM, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getAgentSSID()) + return std::make_tuple(SIAtomicScope::AGENT, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getWorkgroupSSID()) + return std::make_tuple(SIAtomicScope::WORKGROUP, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getWavefrontSSID()) + return std::make_tuple(SIAtomicScope::WAVEFRONT, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == SyncScope::SingleThread) + return std::make_tuple(SIAtomicScope::SINGLETHREAD, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getSystemOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getAgentSSID()) + if (SSID == MMI->getAgentOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getWorkgroupSSID()) + if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getWavefrontSSID()) + if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == SyncScope::SingleThread) + if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - /// TODO: To support HSA Memory Model need to add additional memory - /// scopes that specify that do require cross address space - /// ordering. 
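The rewritten toSIAtomicScope below distinguishes plain sync scopes, which order across all address spaces, from the new one-address-space scopes, which only order the address spaces the instruction itself touches. A reduced model of the mapping; string keys and bitmask address spaces are stand-ins for SyncScope IDs and SIAtomicAddrSpace:

#include <cassert>
#include <optional>
#include <string>
#include <tuple>

enum class Scope { System, Agent, Workgroup, Wavefront, SingleThread };

std::optional<std::tuple<Scope, unsigned, bool>>
toAtomicScope(const std::string &SSID, unsigned InstrAddrSpace,
              unsigned AtomicAddrSpaces) {
  auto Make = [&](Scope S, bool OneAS) {
    // One-address-space scopes restrict ordering to the spaces the
    // instruction accesses and never require cross-address-space ordering.
    unsigned AS = OneAS ? (AtomicAddrSpaces & InstrAddrSpace)
                        : AtomicAddrSpaces;
    return std::make_tuple(S, AS, /*IsCrossAddrSpaceOrdering=*/!OneAS);
  };
  if (SSID == "agent")        return Make(Scope::Agent, false);
  if (SSID == "agent-one-as") return Make(Scope::Agent, true);
  // ... the remaining scopes follow the same pattern ...
  return std::nullopt;
}

int main() {
  auto R = toAtomicScope("agent-one-as", 0b01, 0b11);
  assert(std::get<1>(*R) == 0b01 && !std::get<2>(*R));
}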
return None; } @@ -613,7 +657,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return make_unique<SIGfx6CacheControl>(ST); - return make_unique<SIGfx7CacheControl>(ST); + if (Generation < AMDGPUSubtarget::GFX10) + return make_unique<SIGfx7CacheControl>(ST); + return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled()); } bool SIGfx6CacheControl::enableLoadCacheBypass( @@ -722,13 +768,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, bool VMCnt = false; bool LGKMCnt = false; - bool EXPCnt = false; if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - VMCnt = true; + VMCnt |= true; break; case SIAtomicScope::WORKGROUP: case SIAtomicScope::WAVEFRONT: @@ -752,7 +797,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, // also synchronizing with global/GDS memory as LDS operations // could be reordered with respect to later global/GDS memory // operations of the same wave. - LGKMCnt = IsCrossAddrSpaceOrdering; + LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: @@ -774,7 +819,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, // also synchronizing with global/LDS memory as GDS operations // could be reordered with respect to later global/LDS memory // operations of the same wave. - EXPCnt = IsCrossAddrSpaceOrdering; + LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WORKGROUP: case SIAtomicScope::WAVEFRONT: @@ -787,11 +832,11 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, } } - if (VMCnt || LGKMCnt || EXPCnt) { + if (VMCnt || LGKMCnt) { unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(IV, VMCnt ? 0 : getVmcntBitMask(IV), - EXPCnt ? 0 : getExpcntBitMask(IV), + getExpcntBitMask(IV), LGKMCnt ? 0 : getLgkmcntBitMask(IV)); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); Changed = true; @@ -851,6 +896,231 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, return Changed; } +bool SIGfx10CacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + /// TODO Do not set glc for rmw atomic operations as they + /// implicitly bypass the L0/L1 caches. + + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + Changed |= enableGLCBit(MI); + Changed |= enableDLCBit(MI); + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in + // CU mode all waves of a work-group are on the same CU, and so the + // L0 does not need to be bypassed. + if (!CuMode) Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache.
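With EXPCnt folded away, the flags above are accumulated with |= across the global, LDS and GDS legs, and one S_WAITCNT immediate is emitted at the end. A self-contained sketch of that encoding idea, with invented field widths (the real layout comes from AMDGPU::encodeWaitcnt and varies by generation): a counter field left at its all-ones mask means do not wait, zero means wait until the counter drains.

#include <cassert>
#include <cstdint>

constexpr unsigned VmcntMask = 0xF, ExpcntMask = 0x7, LgkmcntMask = 0xF;

uint32_t encodeWaitcnt(unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
  return (Vmcnt & VmcntMask) | ((Expcnt & ExpcntMask) << 4) |
         ((Lgkmcnt & LgkmcntMask) << 8);
}

int main() {
  bool VMCnt = false, LGKMCnt = false;
  LGKMCnt |= true; // e.g. LDS ordering plus cross-address-space ordering
  uint32_t Imm = encodeWaitcnt(VMCnt ? 0 : VmcntMask,
                               ExpcntMask, // expcnt is never waited on now
                               LGKMCnt ? 0 : LgkmcntMask);
  assert(Imm == (0xFu | (0x7u << 4))); // wait on lgkmcnt only
  return 0;
}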
+ + return Changed; +} + +bool SIGfx10CacheControl::enableNonTemporal( + const MachineBasicBlock::iterator &MI) const { + assert(MI->mayLoad() ^ MI->mayStore()); + bool Changed = false; + + Changed |= enableSLCBit(MI); + /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI) + + return Changed; +} + +bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise + // in CU mode all waves of a work-group are on the same CU, and so the + // L0 does not need to be invalidated. + if (!CuMode) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); + Changed = true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + +bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + bool VMCnt = false; + bool VSCnt = false; + bool LGKMCnt = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + VMCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + VSCnt |= true; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to wait for operations to complete to ensure + // they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode all waves of a work-group are on the same CU, + // which shares the same L0. + if (!CuMode) { + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + VMCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + VSCnt |= true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The L0 cache keeps all memory operations in order for + // work-items in the same wavefront.
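Stepping back from the switch bodies for a moment: every WORKGROUP case in this GFX10 code (bypass, invalidate, wait) hinges on the same WGP-versus-CU-mode question, since the L0 is per CU. A toy policy object (not an LLVM type) makes that symmetry explicit:

#include <cassert>

struct Gfx10WorkgroupPolicy {
  bool CuMode; // all waves of a work-group execute on one CU
  bool bypassL0() const { return !CuMode; }
  bool invalidateL0() const { return !CuMode; }
  bool waitForL0Visibility() const { return !CuMode; }
};

int main() {
  Gfx10WorkgroupPolicy Wgp{/*CuMode=*/false}, Cu{/*CuMode=*/true};
  assert(Wgp.bypassL0() && Wgp.invalidateL0() && Wgp.waitForL0Visibility());
  assert(!Cu.bypassL0() && !Cu.invalidateL0() && !Cu.waitForL0Visibility());
  return 0;
}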
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + // If no cross address space ordering then an LDS waitcnt is not + // needed as LDS operations for all waves are executed in a + // total global ordering as observed by all waves. Required if + // also synchronizing with global/GDS memory as LDS operations + // could be reordered with respect to later global/GDS memory + // operations of the same wave. + LGKMCnt |= IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The LDS keeps all memory operations in order for + // the same wavefront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + // If no cross address space ordering then a GDS waitcnt is not + // needed as GDS operations for all waves are executed in a + // total global ordering as observed by all waves. Required if + // also synchronizing with global/LDS memory as GDS operations + // could be reordered with respect to later global/LDS memory + // operations of the same wave. + LGKMCnt |= IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The GDS keeps all memory operations in order for + // the same work-group. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (VMCnt || LGKMCnt) { + unsigned WaitCntImmediate = + AMDGPU::encodeWaitcnt(IV, + VMCnt ? 0 : getVmcntBitMask(IV), + getExpcntBitMask(IV), + LGKMCnt ? 0 : getLgkmcntBitMask(IV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); + Changed = true; + } + + if (VSCnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + Changed = true; + } + + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp index 883fd308f2f4..a5edd7b3554a 100644 --- a/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/lib/Target/AMDGPU/SIModeRegister.cpp @@ -1,9 +1,8 @@ //===-- SIModeRegister.cpp - Mode Register --------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -45,7 +44,7 @@ struct Status { Status() : Mask(0), Mode(0){}; - Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) { + Status(unsigned NewMask, unsigned NewMode) : Mask(NewMask), Mode(NewMode) { Mode &= Mask; }; diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index ebcad30a1866..3227bff20513 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -1,9 +1,8 @@ //===-- SIOptimizeExecMasking.cpp -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -57,13 +56,16 @@ char SIOptimizeExecMasking::ID = 0; char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID; /// If \p MI is a copy from exec, return the register copied to. -static unsigned isCopyFromExec(const MachineInstr &MI) { +static unsigned isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: - case AMDGPU::S_MOV_B64_term: { + case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B32_term: { const MachineOperand &Src = MI.getOperand(1); - if (Src.isReg() && Src.getReg() == AMDGPU::EXEC) + if (Src.isReg() && + Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)) return MI.getOperand(0).getReg(); } } @@ -72,16 +74,20 @@ static unsigned isCopyFromExec(const MachineInstr &MI) { } /// If \p MI is a copy to exec, return the register copied from. -static unsigned isCopyToExec(const MachineInstr &MI) { +static unsigned isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) { switch (MI.getOpcode()) { case AMDGPU::COPY: - case AMDGPU::S_MOV_B64: { + case AMDGPU::S_MOV_B64: + case AMDGPU::S_MOV_B32: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg()) + if (Dst.isReg() && + Dst.getReg() == (ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC) && + MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; } case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_MOV_B32_term: llvm_unreachable("should have been replaced"); } @@ -106,6 +112,23 @@ static unsigned isLogicalOpOnExec(const MachineInstr &MI) { const MachineOperand &Src2 = MI.getOperand(2); if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC) return MI.getOperand(0).getReg(); + break; + } + case AMDGPU::S_AND_B32: + case AMDGPU::S_OR_B32: + case AMDGPU::S_XOR_B32: + case AMDGPU::S_ANDN2_B32: + case AMDGPU::S_ORN2_B32: + case AMDGPU::S_NAND_B32: + case AMDGPU::S_NOR_B32: + case AMDGPU::S_XNOR_B32: { + const MachineOperand &Src1 = MI.getOperand(1); + if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO) + return MI.getOperand(0).getReg(); + const MachineOperand &Src2 = MI.getOperand(2); + if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO) + return MI.getOperand(0).getReg(); + break; } } @@ -130,6 +153,22 @@ static unsigned getSaveExecOp(unsigned Opc) { return AMDGPU::S_NOR_SAVEEXEC_B64; case AMDGPU::S_XNOR_B64: return AMDGPU::S_XNOR_SAVEEXEC_B64; + case AMDGPU::S_AND_B32: + return AMDGPU::S_AND_SAVEEXEC_B32; + case AMDGPU::S_OR_B32: + return AMDGPU::S_OR_SAVEEXEC_B32; + case AMDGPU::S_XOR_B32: + return AMDGPU::S_XOR_SAVEEXEC_B32; + case AMDGPU::S_ANDN2_B32: + return AMDGPU::S_ANDN2_SAVEEXEC_B32; + case AMDGPU::S_ORN2_B32: + return AMDGPU::S_ORN2_SAVEEXEC_B32; + case AMDGPU::S_NAND_B32: + return AMDGPU::S_NAND_SAVEEXEC_B32; + case AMDGPU::S_NOR_B32: + return AMDGPU::S_NOR_SAVEEXEC_B32; + case AMDGPU::S_XNOR_B32: + return AMDGPU::S_XNOR_SAVEEXEC_B32; default: return AMDGPU::INSTRUCTION_LIST_END; } @@ -140,7 +179,8 @@ static unsigned getSaveExecOp(unsigned Opc) { // these is expected per block. static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { switch (MI.getOpcode()) { - case AMDGPU::S_MOV_B64_term: { + case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_MOV_B32_term: { MI.setDesc(TII.get(AMDGPU::COPY)); return true; } @@ -150,12 +190,30 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { MI.setDesc(TII.get(AMDGPU::S_XOR_B64)); return true; } + case AMDGPU::S_XOR_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_XOR_B32)); + return true; + } + case AMDGPU::S_OR_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_OR_B32)); + return true; + } case AMDGPU::S_ANDN2_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64)); return true; } + case AMDGPU::S_ANDN2_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. 
+ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); + return true; + } default: return false; } @@ -178,6 +236,7 @@ static MachineBasicBlock::reverse_iterator fixTerminators( static MachineBasicBlock::reverse_iterator findExecCopy( const SIInstrInfo &TII, + const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I, unsigned CopyToExec) { @@ -185,7 +244,7 @@ static MachineBasicBlock::reverse_iterator findExecCopy( auto E = MBB.rend(); for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) { - unsigned CopyFromExec = isCopyFromExec(*I); + unsigned CopyFromExec = isCopyFromExec(*I, ST); if (CopyFromExec != AMDGPU::NoRegister) return I; } @@ -194,8 +253,8 @@ static MachineBasicBlock::reverse_iterator findExecCopy( } // XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly -// repor tthe register as unavailable because a super-register with a lane mask -// as unavailable. +// report the register as unavailable because a super-register with a lane mask +// is unavailable. static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) { for (MachineBasicBlock *Succ : MBB.successors()) { if (Succ->isLiveIn(Reg)) @@ -212,6 +271,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; // Optimize sequences emitted for control flow lowering. They are originally // emitted as the separate operations because spill code may need to be @@ -230,13 +290,13 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (I == E) continue; - unsigned CopyToExec = isCopyToExec(*I); + unsigned CopyToExec = isCopyToExec(*I, ST); if (CopyToExec == AMDGPU::NoRegister) continue; // Scan backwards to find the def. auto CopyToExecInst = &*I; - auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec); + auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec); if (CopyFromExecInst == E) { auto PrepareExecInst = std::next(I); if (PrepareExecInst == E) @@ -246,7 +306,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) { LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); - PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC); + PrepareExecInst->getOperand(0).setReg(Exec); LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); @@ -269,7 +329,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::iterator J = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator(); J != JE; ++J) { - if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) { + if (SaveExecInst && J->readsRegister(Exec, TRI)) { LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); // Make sure this is inserted after any VALU ops that may have been // scheduled in between. 
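From this point the pass is parameterized on wave size: wave32 targets track the exec mask in EXEC_LO and use the *_B32 opcode family, wave64 keeps EXEC and *_B64. A hypothetical miniature of that dispatch pattern, with string stand-ins for the real opcode and register enums:

#include <cassert>
#include <string>

struct WaveConfig {
  bool IsWave32;
  std::string execReg() const { return IsWave32 ? "exec_lo" : "exec"; }
  std::string andSaveexecOpc() const {
    return IsWave32 ? "s_and_saveexec_b32" : "s_and_saveexec_b64";
  }
};

int main() {
  assert(WaveConfig{true}.execReg() == "exec_lo");
  assert(WaveConfig{false}.andSaveexecOpc() == "s_and_saveexec_b64");
  return 0;
}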
@@ -353,7 +413,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { CopyToExecInst->eraseFromParent(); for (MachineInstr *OtherInst : OtherUseInsts) { - OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC, + OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, *TRI); } } diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index c671fed34bdf..7e10316eab92 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -1,9 +1,8 @@ //===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -34,10 +33,22 @@ using namespace llvm; namespace { class SIOptimizeExecMaskingPreRA : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + MachineRegisterInfo *MRI; + public: - static char ID; + MachineBasicBlock::iterator skipIgnoreExecInsts( + MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const; + + MachineBasicBlock::iterator skipIgnoreExecInstsTrivialSucc( + MachineBasicBlock *&MBB, + MachineBasicBlock::iterator It) const; public: + static char ID; + SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) { initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry()); } @@ -71,38 +82,93 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() { return new SIOptimizeExecMaskingPreRA(); } -static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) { +static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI, + const GCNSubtarget &ST) { + if (ST.isWave32()) { + return MI.getOpcode() == AMDGPU::S_OR_B32 && + MI.modifiesRegister(AMDGPU::EXEC_LO, TRI); + } + return MI.getOpcode() == AMDGPU::S_OR_B64 && MI.modifiesRegister(AMDGPU::EXEC, TRI); } -static bool isFullExecCopy(const MachineInstr& MI) { - return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC; +static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) { + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + + if (MI.isCopy() && MI.getOperand(1).getReg() == Exec) { + assert(MI.isFullCopy()); + return true; + } + + return false; } static unsigned getOrNonExecReg(const MachineInstr &MI, - const SIInstrInfo &TII) { + const SIInstrInfo &TII, + const GCNSubtarget& ST) { + unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1); - if (Op->isReg() && Op->getReg() != AMDGPU::EXEC) + if (Op->isReg() && Op->getReg() != Exec) return Op->getReg(); Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0); - if (Op->isReg() && Op->getReg() != AMDGPU::EXEC) + if (Op->isReg() && Op->getReg() != Exec) return Op->getReg(); return AMDGPU::NoRegister; } static MachineInstr* getOrExecSource(const MachineInstr &MI, const SIInstrInfo &TII, - const MachineRegisterInfo &MRI) { - auto SavedExec = getOrNonExecReg(MI, TII); + const MachineRegisterInfo &MRI, + const GCNSubtarget& ST) { + auto SavedExec = getOrNonExecReg(MI, TII, ST); if (SavedExec == AMDGPU::NoRegister) return nullptr; auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec); - if (!SaveExecInst || !isFullExecCopy(*SaveExecInst)) + if (!SaveExecInst || !isFullExecCopy(*SaveExecInst, ST)) return nullptr; return SaveExecInst; } +/// Skip over instructions that don't care about the exec mask. +MachineBasicBlock::iterator SIOptimizeExecMaskingPreRA::skipIgnoreExecInsts( + MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const { + for ( ; I != E; ++I) { + if (TII->mayReadEXEC(*MRI, *I)) + break; + } + + return I; +} + +// Skip to the next instruction, ignoring debug instructions, and trivial block +// boundaries (blocks that have one (typically fallthrough) successor, and the +// successor has one predecessor. +MachineBasicBlock::iterator +SIOptimizeExecMaskingPreRA::skipIgnoreExecInstsTrivialSucc( + MachineBasicBlock *&MBB, + MachineBasicBlock::iterator It) const { + + do { + It = skipIgnoreExecInsts(It, MBB->end()); + if (It != MBB->end() || MBB->succ_size() != 1) + break; + + // If there is one trivial successor, advance to the next block. + MachineBasicBlock *Succ = *MBB->succ_begin(); + + // TODO: Is this really necessary? + if (!MBB->isLayoutSuccessor(Succ)) + break; + + It = Succ->begin(); + MBB = Succ; + } while (true); + + return It; +} + + // Optimize sequence // %sel = V_CNDMASK_B32_e64 0, 1, %cc // %cmp = V_CMP_NE_U32 1, %1 @@ -125,10 +191,11 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, LiveIntervals *LIS) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); - const unsigned AndOpc = AMDGPU::S_AND_B64; - const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64; - const unsigned CondReg = AMDGPU::VCC; - const unsigned ExecReg = AMDGPU::EXEC; + bool Wave32 = ST.isWave32(); + const unsigned AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const unsigned Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; + const unsigned CondReg = Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; + const unsigned ExecReg = Wave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); @@ -172,6 +239,10 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) return AMDGPU::NoRegister; + if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) || + TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers)) + return AMDGPU::NoRegister; + Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0); Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1); MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2); @@ -187,7 +258,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), And->getOperand(0).getReg()) .addReg(ExecReg) - .addReg(CCReg, CC->getSubReg()); + .addReg(CCReg, 0, CC->getSubReg()); And->eraseFromParent(); LIS->InsertMachineInstrInMaps(*Andn2); @@ -224,11 +295,14 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { return false; const GCNSubtarget &ST = MF.getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); LiveIntervals *LIS = &getAnalysis(); DenseSet RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI}); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; bool Changed = false; for (MachineBasicBlock &MBB : MF) { @@ -248,9 +322,10 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { // Skip this if the endpgm has any implicit uses, otherwise we would need // to be careful to update / remove them. + // S_ENDPGM always has a single imm operand that is not used other than to + // end up in the encoding MachineInstr &Term = MBB.back(); - if (Term.getOpcode() != AMDGPU::S_ENDPGM || - Term.getNumOperands() != 0) + if (Term.getOpcode() != AMDGPU::S_ENDPGM || Term.getNumOperands() != 1) continue; SmallVector Blocks({&MBB}); @@ -304,32 +379,21 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { } // Try to collapse adjacent endifs. 
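The optimizeVcndVcmpPair rewrite above rests on a small boolean identity: selecting 0/1 with %cc, comparing the result against 1, and masking with exec is the same as exec AND NOT cc, which is why the S_AND can be replaced by an S_ANDN2 on %cc directly. A standalone exhaustive check over a two-lane toy exec mask:

#include <cassert>

int main() {
  for (unsigned exec = 0; exec < 4; ++exec)
    for (unsigned cc = 0; cc < 4; ++cc) {
      unsigned cmp = 0;
      for (unsigned lane = 0; lane < 2; ++lane) {
        unsigned sel = ((cc >> lane) & 1) ? 1u : 0u; // V_CNDMASK_B32 0, 1, cc
        if (sel != 1)                                // V_CMP_NE_U32 1, sel
          cmp |= 1u << lane;
      }
      assert((exec & cmp) == (exec & ~cc & 0x3)); // S_AND == S_ANDN2
    }
  return 0;
}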
- auto Lead = MBB.begin(), E = MBB.end(); - if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI)) - continue; - - const MachineBasicBlock* Succ = *MBB.succ_begin(); - if (!MBB.isLayoutSuccessor(Succ)) - continue; - - auto I = std::next(Lead); - - for ( ; I != E; ++I) - if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI)) - break; - - if (I != E) + auto E = MBB.end(); + auto Lead = skipDebugInstructionsForward(MBB.begin(), E); + if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI, ST)) continue; - const auto NextLead = Succ->begin(); - if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) || - !getOrExecSource(*NextLead, *TII, MRI)) + MachineBasicBlock *TmpMBB = &MBB; + auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead)); + if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, TRI, ST) || + !getOrExecSource(*NextLead, *TII, MRI, ST)) continue; LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n'); - auto SaveExec = getOrExecSource(*Lead, *TII, MRI); - unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII); + auto SaveExec = getOrExecSource(*Lead, *TII, MRI, ST); + unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII, ST); for (auto &Op : Lead->operands()) { if (Op.isReg()) RecalcRegs.insert(Op.getReg()); @@ -363,7 +427,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (SafeToReplace) { LIS->RemoveMachineInstrFromMaps(*SaveExec); SaveExec->eraseFromParent(); - MRI.replaceRegWith(SavedExec, AMDGPU::EXEC); + MRI.replaceRegWith(SavedExec, Exec); LIS->removeInterval(SavedExec); } } @@ -375,8 +439,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (!MRI.reg_empty(Reg)) LIS->createAndComputeVirtRegInterval(Reg); } else { - for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U) - LIS->removeRegUnit(*U); + LIS->removeAllRegUnitsForPhysReg(Reg); } } } diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 2d43d5d05ef6..2d71abc0612a 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1,9 +1,8 @@ //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -348,8 +347,8 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, if (Abs || Neg) { assert(!Sext && "Float and integer src modifiers can't be set simulteniously"); - Mods |= Abs ? SISrcMods::ABS : 0; - Mods ^= Neg ? SISrcMods::NEG : 0; + Mods |= Abs ? SISrcMods::ABS : 0u; + Mods ^= Neg ? 
SISrcMods::NEG : 0u; } else if (Sext) { Mods |= SISrcMods::SEXT; } @@ -419,7 +418,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { } assert(Src && Src->isReg()); - if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && !isSameReg(*Src, *getReplacedOperand())) { // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to @@ -461,7 +462,9 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused - if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && getDstSel() != AMDGPU::SDWA::DWORD) { // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD @@ -951,7 +954,8 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, if (TII->isVOPC(Opc)) { if (!ST.hasSDWASdst()) { const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - if (SDst && SDst->getReg() != AMDGPU::VCC) + if (SDst && (SDst->getReg() != AMDGPU::VCC && + SDst->getReg() != AMDGPU::VCC_LO)) return false; } @@ -965,10 +969,16 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, return false; } - if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 || + if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || + Opc == AMDGPU::V_FMAC_F32_e32 || + Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F32_e32)) return false; + // Check if target supports this SDWA opcode + if (TII->pseudoToMCOpcode(Opc) == -1) + return false; + // FIXME: has SDWA but require handling of implicit VCC use if (Opc == AMDGPU::V_CNDMASK_B32_e32) return false; @@ -1010,7 +1020,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, SDWAInst.add(*Dst); } else { assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); - SDWAInst.addReg(AMDGPU::VCC, RegState::Define); + SDWAInst.addReg(TRI->getVCC(), RegState::Define); } // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and @@ -1039,7 +1049,9 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, SDWAInst.add(*Src1); } - if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || + if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa || + SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa || + SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { // v_mac_f16/32 has additional src2 operand tied to vdst MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); diff --git a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp new file mode 100644 index 000000000000..f9bfe96f65cb --- /dev/null +++ b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -0,0 +1,221 @@ +//===- SIPreAllocateWWMRegs.cpp - WWM Register Pre-allocation -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Pass to pre-allocate WWM registers +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/RegisterClassInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-pre-allocate-wwm-regs" + +namespace { + +class SIPreAllocateWWMRegs : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + LiveRegMatrix *Matrix; + VirtRegMap *VRM; + RegisterClassInfo RegClassInfo; + + std::vector<unsigned> RegsToRewrite; + +public: + static char ID; + + SIPreAllocateWWMRegs() : MachineFunctionPass(ID) { + initializeSIPreAllocateWWMRegsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); + AU.addRequired<VirtRegMap>(); + AU.addRequired<LiveRegMatrix>(); + AU.addPreserved<SlotIndexes>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool processDef(MachineOperand &MO); + void rewriteRegs(MachineFunction &MF); +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIPreAllocateWWMRegs, DEBUG_TYPE, + "SI Pre-allocate WWM Registers", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(SIPreAllocateWWMRegs, DEBUG_TYPE, + "SI Pre-allocate WWM Registers", false, false) + +char SIPreAllocateWWMRegs::ID = 0; + +char &llvm::SIPreAllocateWWMRegsID = SIPreAllocateWWMRegs::ID; + +FunctionPass *llvm::createSIPreAllocateWWMRegsPass() { + return new SIPreAllocateWWMRegs(); +} + +bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + + if (!TRI->isVGPR(*MRI, Reg)) + return false; + + if (TRI->isPhysicalRegister(Reg)) + return false; + + if (VRM->hasPhys(Reg)) + return false; + + LiveInterval &LI = LIS->getInterval(Reg); + + for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) { + if (!MRI->isPhysRegUsed(PhysReg) && + Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) { + Matrix->assign(LI, PhysReg); + assert(PhysReg != 0); + RegsToRewrite.push_back(Reg); + return true; + } + } + + llvm_unreachable("physreg not found for WWM expression"); + return false; +} + +void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + + const unsigned VirtReg = MO.getReg(); + if (TRI->isPhysicalRegister(VirtReg)) + continue; + + if (!VRM->hasPhys(VirtReg)) + continue; + + unsigned PhysReg = VRM->getPhys(VirtReg); + const unsigned SubReg = MO.getSubReg(); + if (SubReg != 0) { + PhysReg = TRI->getSubReg(PhysReg, SubReg); + MO.setSubReg(0); + } + + 
MO.setReg(PhysReg); + MO.setIsRenamable(false); + } + } + } + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + for (unsigned Reg : RegsToRewrite) { + LIS->removeInterval(Reg); + + const unsigned PhysReg = VRM->getPhys(Reg); + assert(PhysReg != 0); + MFI->ReserveWWMRegister(PhysReg); + } + + RegsToRewrite.clear(); + + // Update the set of reserved registers to include WWM ones. + MRI->freezeReservedRegs(MF); +} + +bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n"); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + LIS = &getAnalysis<LiveIntervals>(); + Matrix = &getAnalysis<LiveRegMatrix>(); + VRM = &getAnalysis<VirtRegMap>(); + + RegClassInfo.runOnMachineFunction(MF); + + bool RegsAssigned = false; + + // We use a reverse post-order traversal of the control-flow graph to + // guarantee that we visit definitions in dominance order. Since WWM + // expressions are guaranteed to never involve phi nodes, and we can only + // escape WWM through the special WWM instruction, this means that this is a + // perfect elimination order, so we can never do any better. + ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); + + for (MachineBasicBlock *MBB : RPOT) { + bool InWWM = false; + for (MachineInstr &MI : *MBB) { + if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || + MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64) + RegsAssigned |= processDef(MI.getOperand(0)); + + if (MI.getOpcode() == AMDGPU::ENTER_WWM) { + LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n"); + InWWM = true; + continue; + } + + if (MI.getOpcode() == AMDGPU::EXIT_WWM) { + LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n"); + InWWM = false; + } + + if (!InWWM) + continue; + + LLVM_DEBUG(dbgs() << "processing " << MI << "\n"); + + for (MachineOperand &DefOpnd : MI.defs()) { + RegsAssigned |= processDef(DefOpnd); + } + } + } + + if (!RegsAssigned) + return false; + + rewriteRegs(MF); + return true; +} diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h index 383f6b575808..168f05f8fdd6 100644 --- a/lib/Target/AMDGPU/SIProgramInfo.h +++ b/lib/Target/AMDGPU/SIProgramInfo.h @@ -1,9 +1,8 @@ //===--- SIProgramInfo.h ----------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -29,6 +28,8 @@ struct SIProgramInfo { uint32_t DX10Clamp = 0; uint32_t DebugMode = 0; uint32_t IEEEMode = 0; + uint32_t WgpMode = 0; // GFX10+ + uint32_t MemOrdered = 0; // GFX10+ uint64_t ScratchSize = 0; uint64_t ComputePGMRSrc1 = 0; @@ -50,18 +51,6 @@ struct SIProgramInfo { // Number of VGPRs that meets number of waves per execution unit request. uint32_t NumVGPRsForWavesPerEU = 0; - // Fixed SGPR number used to hold wave scratch offset for entire kernel - // execution, or std::numeric_limits<uint16_t>::max() if the register is not - // used or not known.
- uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR = - std::numeric_limits::max(); - - // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire - // kernel execution, or std::numeric_limits::max() if the register - // is not used or not known. - uint16_t DebuggerPrivateSegmentBufferSGPR = - std::numeric_limits::max(); - // Whether there is recursion, dynamic allocas, indirect calls or some other // reason there may be statically unknown stack usage. bool DynamicCallStack = false; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 97cfde2b2354..f152deb28004 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" @@ -63,8 +63,10 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPURegisterInfo(), SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), + AGPRPressureSets(getNumRegPressureSets()), SpillSGPRToVGPR(false), - SpillSGPRToSMEM(false) { + SpillSGPRToSMEM(false), + isWave32(ST.isWave32()) { if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) SpillSGPRToSMEM = true; else if (EnableSpillSGPRToVGPR) @@ -74,10 +76,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : SGPRSetID = NumRegPressureSets; VGPRSetID = NumRegPressureSets; + AGPRSetID = NumRegPressureSets; for (unsigned i = 0; i < NumRegPressureSets; ++i) { classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); + classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets); } // Determine the number of reg units for each pressure set. 
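The constructor above classifies every pressure set against a representative register from each of the three banks (SGPR0, VGPR0, and now AGPR0); the hunk that follows then keeps the largest matching set as the bank's canonical set ID. A toy model of that selection (invented types, not the LLVM API):

#include <cassert>
#include <cstdint>
#include <vector>

struct Bank {
  std::vector<uint32_t> RegUnits; // register units per pressure set
  std::vector<bool> Member;       // does set i contain this bank's rep reg?

  // Largest member set wins, mirroring the VGPRMax/SGPRMax/AGPRMax scan.
  unsigned canonicalSetID() const {
    unsigned Best = RegUnits.size(), Max = 0;
    for (unsigned i = 0; i < RegUnits.size(); ++i)
      if (Member[i] && RegUnits[i] > Max) {
        Best = i;
        Max = RegUnits[i];
      }
    return Best; // == size() means no set found, which the assert rejects
  }
};

int main() {
  Bank AGPR{{4, 16, 8}, {false, true, true}};
  assert(AGPR.canonicalSetID() == 1);
  return 0;
}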
@@ -89,7 +93,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : } } - unsigned VGPRMax = 0, SGPRMax = 0; + unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0; for (unsigned i = 0; i < NumRegPressureSets; ++i) { if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) { VGPRSetID = i; @@ -100,10 +104,16 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : SGPRSetID = i; SGPRMax = PressureSetRegUnits[i]; } + if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) { + AGPRSetID = i; + AGPRMax = PressureSetRegUnits[i]; + continue; + } } assert(SGPRSetID < NumRegPressureSets && - VGPRSetID < NumRegPressureSets); + VGPRSetID < NumRegPressureSets && + AGPRSetID < NumRegPressureSets); } unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( @@ -139,11 +149,6 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( return AMDGPU::SGPR_32RegClass.getRegister(Reg); } -unsigned SIRegisterInfo::reservedStackPtrOffsetReg( - const MachineFunction &MF) const { - return AMDGPU::SGPR32; -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); @@ -155,15 +160,26 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // M0 has to be reserved so that llvm accepts it as a live-in into a block. reserveRegisterTuples(Reserved, AMDGPU::M0); + // Reserve src_vccz, src_execz, src_scc. + reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); + reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); + reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); + // Reserve the memory aperture registers. reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); + // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); + // Reserve xnack_mask registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); + // Reserve lds_direct register - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); + // Reserve Trap Handler registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::TBA); reserveRegisterTuples(Reserved, AMDGPU::TMA); @@ -176,6 +192,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); + // Reserve null register - it shall never be allocated + reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); + + // Disallow vcc_hi allocation in wave32. It may be allocated but most likely + // will result in bugs. 
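Before the wave32 guard that follows, a toy restatement of how getReservedRegs accumulates reservations into a single bit vector (the register indices here are invented for illustration):

#include <bitset>
#include <cassert>

std::bitset<1024> computeReserved(bool IsWave32) {
  std::bitset<1024> Reserved;
  constexpr unsigned SGPR_NULL = 0, LDS_DIRECT = 1, VCC = 2, VCC_HI = 3;
  Reserved.set(SGPR_NULL);  // shall never be allocated
  Reserved.set(LDS_DIRECT); // no codegen support yet
  if (IsWave32) {
    Reserved.set(VCC);      // only the vcc_lo half is meaningful
    Reserved.set(VCC_HI);   // allocating vcc_hi would likely miscompile
  }
  return Reserved;
}

int main() {
  assert(computeReserved(true).count() == 4);
  assert(computeReserved(false).count() == 2);
  return 0;
}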
+ if (isWave32) { + Reserved.set(AMDGPU::VCC); + Reserved.set(AMDGPU::VCC_HI); + } + const GCNSubtarget &ST = MF.getSubtarget(); unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); @@ -190,6 +216,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); reserveRegisterTuples(Reserved, Reg); + Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); } const SIMachineFunctionInfo *MFI = MF.getInfo(); @@ -225,9 +253,33 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, FrameReg)); } + for (unsigned Reg : MFI->WWMReservedRegs) { + reserveRegisterTuples(Reserved, Reg); + } + + // FIXME: Stop using reserved registers for this. + for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) + reserveRegisterTuples(Reserved, Reg); + + for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) + reserveRegisterTuples(Reserved, Reg); + return Reserved; } +bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const { + const SIMachineFunctionInfo *Info = MF.getInfo(); + // On entry, the base address is 0, so it can't possibly need any more + // alignment. + + // FIXME: Should be able to specify the entry frame alignment per calling + // convention instead. + if (Info->isEntryFunction()) + return false; + + return TargetRegisterInfo::canRealignStack(MF); +} + bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { const SIMachineFunctionInfo *Info = Fn.getInfo(); if (Info->isEntryFunction()) { @@ -252,11 +304,20 @@ bool SIRegisterInfo::requiresFrameIndexScavenging( bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( const MachineFunction &MF) const { - // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't - // create a virtual register for it during frame index elimination, so the - // scavenger is directly needed. - return MF.getFrameInfo().hasStackObjects() && - MF.getSubtarget().hasScalarStores() && + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.hasStackObjects()) + return false; + + // The scavenger is used for large frames which may require finding a free + // register for large offsets. + if (!isUInt<12>(MFI.getStackSize())) + return true; + + // If using scalar stores, for spills, m0 is needed for the scalar store + // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual + // register for it during frame index elimination, so the scavenger is + // directly needed. 
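The predicate concludes with the return just below; restated end to end as a standalone sketch (parameter names invented):

#include <cassert>
#include <cstdint>

bool needsFrameIndexReplacementScavenging(bool HasStackObjects,
                                          uint64_t StackSize,
                                          bool HasScalarStores,
                                          bool HasSpilledSGPRs) {
  if (!HasStackObjects)
    return false;
  // Large frames may need a scavenged register to materialize big offsets.
  if (StackSize >= (1u << 12)) // i.e. !isUInt<12>(StackSize)
    return true;
  // Scalar-store spills need m0, which cannot be a virtual register.
  return HasScalarStores && HasSpilledSGPRs;
}

int main() {
  assert(!needsFrameIndexReplacementScavenging(false, 0, true, true));
  assert(needsFrameIndexReplacementScavenging(true, 1u << 12, false, false));
  assert(!needsFrameIndexReplacementScavenging(true, 16, true, false));
  return 0;
}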
+ return MF.getSubtarget().hasScalarStores() && MF.getInfo()->hasSpilledSGPRs(); } @@ -332,7 +393,8 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) .addReg(OffsetReg, RegState::Kill) - .addReg(FIReg); + .addReg(FIReg) + .addImm(0); // clamp bit } void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, @@ -394,21 +456,39 @@ const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( static unsigned getNumSubRegsForSpillOp(unsigned Op) { switch (Op) { + case AMDGPU::SI_SPILL_S1024_SAVE: + case AMDGPU::SI_SPILL_S1024_RESTORE: + case AMDGPU::SI_SPILL_V1024_SAVE: + case AMDGPU::SI_SPILL_V1024_RESTORE: + case AMDGPU::SI_SPILL_A1024_SAVE: + case AMDGPU::SI_SPILL_A1024_RESTORE: + return 32; case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_V512_SAVE: case AMDGPU::SI_SPILL_V512_RESTORE: + case AMDGPU::SI_SPILL_A512_SAVE: + case AMDGPU::SI_SPILL_A512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_V256_SAVE: case AMDGPU::SI_SPILL_V256_RESTORE: return 8; + case AMDGPU::SI_SPILL_S160_SAVE: + case AMDGPU::SI_SPILL_S160_RESTORE: + case AMDGPU::SI_SPILL_V160_SAVE: + case AMDGPU::SI_SPILL_V160_RESTORE: + return 5; case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S128_RESTORE: case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_A128_SAVE: + case AMDGPU::SI_SPILL_A128_RESTORE: return 4; + case AMDGPU::SI_SPILL_S96_SAVE: + case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_V96_SAVE: case AMDGPU::SI_SPILL_V96_RESTORE: return 3; @@ -416,11 +496,15 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_V64_SAVE: case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_A64_SAVE: + case AMDGPU::SI_SPILL_A64_RESTORE: return 2; case AMDGPU::SI_SPILL_S32_SAVE: case AMDGPU::SI_SPILL_S32_RESTORE: case AMDGPU::SI_SPILL_V32_SAVE: case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_A32_SAVE: + case AMDGPU::SI_SPILL_A32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -480,6 +564,35 @@ static int getOffsetMUBUFLoad(unsigned Opc) { } } +static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, + int Index, + unsigned Lane, + unsigned ValueReg, + bool IsKill) { + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MI->getParent()->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + const GCNSubtarget &ST = MF->getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); + + if (Reg == AMDGPU::NoRegister) + return MachineInstrBuilder(); + + bool IsStore = MI->mayStore(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + auto *TRI = static_cast(MRI.getTargetRegisterInfo()); + + unsigned Dst = IsStore ? Reg : ValueReg; + unsigned Src = IsStore ? ValueReg : Reg; + unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32 + : AMDGPU::V_ACCVGPR_READ_B32; + + return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) + .addReg(Src, getKillRegState(IsKill)); +} + // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not // need to handle the case where an SGPR may need to be spilled while spilling. 
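In spillVGPRtoAGPR above, a single XOR picks the copy direction: Reg is the spill-slot register (an AGPR when spilling VGPRs, a VGPR when spilling AGPRs), so a store into an AGPR slot and a restore out of it select opposite v_accvgpr opcodes. A truth-table check of that expression:

#include <cassert>

enum class CopyOp { AccVgprWrite, AccVgprRead };

CopyOp pickCopyOp(bool IsStore, bool SpillRegIsVGPR) {
  return (IsStore ^ SpillRegIsVGPR) ? CopyOp::AccVgprWrite
                                    : CopyOp::AccVgprRead;
}

int main() {
  // VGPR spilled to an AGPR slot: write on store, read on restore.
  assert(pickCopyOp(true, false) == CopyOp::AccVgprWrite);
  assert(pickCopyOp(false, false) == CopyOp::AccVgprRead);
  // AGPR spilled through a VGPR slot: directions flip.
  assert(pickCopyOp(true, true) == CopyOp::AccVgprRead);
  assert(pickCopyOp(false, true) == CopyOp::AccVgprWrite);
  return 0;
}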
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, @@ -498,6 +611,9 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, return false; const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr()) + return true; + MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .add(*Reg) @@ -507,6 +623,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, .addImm(0) // glc .addImm(0) // slc .addImm(0) // tfe + .addImm(0) // dlc .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, @@ -549,6 +666,10 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned Align = MFI.getObjectAlignment(Index); const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); + Register TmpReg = + hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg() + : Register(); + assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); if (!isUInt<12>(Offset + Size - EltSize)) { @@ -562,7 +683,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs(). if (RS) - SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass); + SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); if (SOffset == AMDGPU::NoRegister) { // There are no free SGPRs, and since we are in the process of spilling @@ -597,20 +718,38 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, SrcDstRegState |= getKillRegState(IsKill); } - MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); - MachineMemOperand *NewMMO - = MF->getMachineMemOperand(PInfo, MMO->getFlags(), - EltSize, MinAlign(Align, EltSize * i)); - - auto MIB = BuildMI(*MBB, MI, DL, Desc) - .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) - .addReg(ScratchRsrcReg) - .addReg(SOffset, SOffsetRegState) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addMemOperand(NewMMO); + auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill); + + if (!MIB.getInstr()) { + unsigned FinalReg = SubReg; + if (TmpReg != AMDGPU::NoRegister) { + if (IsStore) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg) + .addReg(SubReg, getKillRegState(IsKill)); + SubReg = TmpReg; + } + + MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); + MachineMemOperand *NewMMO + = MF->getMachineMemOperand(PInfo, MMO->getFlags(), + EltSize, MinAlign(Align, EltSize * i)); + + MIB = BuildMI(*MBB, MI, DL, Desc) + .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) + .addReg(ScratchRsrcReg) + .addReg(SOffset, SOffsetRegState) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(NewMMO); + + if (!IsStore && TmpReg != AMDGPU::NoRegister) + MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), + FinalReg) + .addReg(TmpReg, RegState::Kill); + } if (NumSubRegs > 1) MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); @@ -669,6 +808,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && OnlyToVGPR) return false; + Register FrameReg = getFrameRegister(*MF); + assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && SuperReg != MFI->getFrameOffsetReg() && SuperReg != MFI->getScratchWaveOffsetReg())); @@ -728,11 +869,11 @@ bool 
SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()); + .addReg(FrameReg); } BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) @@ -740,6 +881,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, .addReg(MFI->getScratchRSrcReg()) // sbase .addReg(OffsetReg, RegState::Kill) // soff .addImm(0) // glc + .addImm(0) // dlc .addMemOperand(MMO); continue; @@ -799,11 +941,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpReg, RegState::Kill) // src - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srrsrc - .addReg(MFI->getFrameOffsetReg()) // soffset - .addImm(i * 4) // offset + .addReg(TmpReg, RegState::Kill) // src + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srrsrc + .addReg(MFI->getStackPtrOffsetReg()) // soffset + .addImm(i * 4) // offset .addMemOperand(MMO); } } @@ -859,6 +1001,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, unsigned EltSize = 4; unsigned ScalarLoadOp; + Register FrameReg = getFrameRegister(*MF); + const TargetRegisterClass *RC = getPhysRegClass(SuperReg); if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be @@ -890,18 +1034,19 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()); + .addReg(FrameReg); } auto MIB = BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg) - .addReg(MFI->getScratchRSrcReg()) // sbase - .addReg(OffsetReg, RegState::Kill) // soff - .addImm(0) // glc + .addReg(MFI->getScratchRSrcReg()) // sbase + .addReg(OffsetReg, RegState::Kill) // soff + .addImm(0) // glc + .addImm(0) // dlc .addMemOperand(MMO); if (NumSubRegs > 1 && i == 0) @@ -937,10 +1082,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srsrc - .addReg(MFI->getFrameOffsetReg()) // soffset - .addImm(i * 4) // offset + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srsrc + .addReg(MFI->getStackPtrOffsetReg()) // soffset + .addImm(i * 4) // offset .addMemOperand(MMO); auto MIB = @@ -969,15 +1114,21 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( int FI, RegScavenger *RS) const { switch (MI->getOpcode()) { + case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: return spillSGPR(MI, FI, RS, true); + case AMDGPU::SI_SPILL_S1024_RESTORE: case 
AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: return restoreSGPR(MI, FI, RS, true); @@ -998,14 +1149,21 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); + assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); + MachineOperand &FIOp = MI->getOperand(FIOperandNum); int Index = MI->getOperand(FIOperandNum).getIndex(); + Register FrameReg = getFrameRegister(*MF); + switch (MI->getOpcode()) { // SGPR register spill + case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: { spillSGPR(MI, Index, RS); @@ -1013,9 +1171,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } // SGPR register restore + case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: { restoreSGPR(MI, Index, RS); @@ -1023,19 +1184,29 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } // VGPR register spill + case AMDGPU::SI_SPILL_V1024_SAVE: case AMDGPU::SI_SPILL_V512_SAVE: case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V160_SAVE: case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V96_SAVE: case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: { + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_A1024_SAVE: + case AMDGPU::SI_SPILL_A512_SAVE: + case AMDGPU::SI_SPILL_A128_SAVE: + case AMDGPU::SI_SPILL_A64_SAVE: + case AMDGPU::SI_SPILL_A32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), + FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); @@ -1047,16 +1218,25 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V64_RESTORE: case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V160_RESTORE: case AMDGPU::SI_SPILL_V256_RESTORE: - case AMDGPU::SI_SPILL_V512_RESTORE: { + case AMDGPU::SI_SPILL_V512_RESTORE: + case AMDGPU::SI_SPILL_V1024_RESTORE: + case AMDGPU::SI_SPILL_A32_RESTORE: + case AMDGPU::SI_SPILL_A64_RESTORE: + case AMDGPU::SI_SPILL_A128_RESTORE: + case AMDGPU::SI_SPILL_A512_RESTORE: + case AMDGPU::SI_SPILL_A1024_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), - TII->getNamedOperand(*MI, 
AMDGPU::OpName::soffset)->getReg(), + FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); @@ -1068,24 +1248,23 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, const DebugLoc &DL = MI->getDebugLoc(); bool IsMUBUF = TII->isMUBUF(*MI); - if (!IsMUBUF && - MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) { + if (!IsMUBUF && !MFI->isEntryFunction()) { // Convert to an absolute stack address by finding the offset from the // scratch wave base and scaling by the wave size. // - // In an entry function/kernel the stack address is already the - // absolute address relative to the scratch wave offset. + // In an entry function/kernel the offset is already the absolute + // address relative to the frame register. unsigned DiffReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; - unsigned ResultReg = IsCopy ? + Register ResultReg = IsCopy ? MI->getOperand(0).getReg() : MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addReg(MFI->getScratchWaveOffsetReg()); int64_t Offset = FrameInfo.getObjectOffset(Index); @@ -1106,7 +1285,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { TII->getAddNoCarry(*MBB, MI, DL, ResultReg) .addImm(Offset) - .addReg(ScaledReg, RegState::Kill); + .addReg(ScaledReg, RegState::Kill) + .addImm(0); // clamp bit } else { unsigned ConstOffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -1115,7 +1295,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addImm(Offset); TII->getAddNoCarry(*MBB, MI, DL, ResultReg) .addReg(ConstOffsetReg, RegState::Kill) - .addReg(ScaledReg, RegState::Kill); + .addReg(ScaledReg, RegState::Kill) + .addImm(0); // clamp bit } } @@ -1133,8 +1314,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); - assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() - == MFI->getFrameOffsetReg()); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); int64_t Offset = FrameInfo.getObjectOffset(Index); int64_t OldImm @@ -1164,63 +1347,21 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { - #define AMDGPU_REG_ASM_NAMES - #include "AMDGPURegAsmNames.inc.cpp" - - #define REG_RANGE(BeginReg, EndReg, RegTable) \ - if (Reg >= BeginReg && Reg <= EndReg) { \ - unsigned Index = Reg - BeginReg; \ - assert(Index < array_lengthof(RegTable)); \ - return RegTable[Index]; \ - } + const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg); + unsigned Size = getRegSizeInBits(*RC); + unsigned AltName = AMDGPU::NoRegAltName; - REG_RANGE(AMDGPU::VGPR0, AMDGPU::VGPR255, VGPR32RegNames); - REG_RANGE(AMDGPU::SGPR0, AMDGPU::SGPR103, SGPR32RegNames); - REG_RANGE(AMDGPU::VGPR0_VGPR1, AMDGPU::VGPR254_VGPR255, VGPR64RegNames); - REG_RANGE(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR102_SGPR103, SGPR64RegNames); - REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2, AMDGPU::VGPR253_VGPR254_VGPR255, - VGPR96RegNames); - - REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3, - 
AMDGPU::VGPR252_VGPR253_VGPR254_VGPR255, - VGPR128RegNames); - REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, - AMDGPU::SGPR100_SGPR101_SGPR102_SGPR103, - SGPR128RegNames); - - REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7, - AMDGPU::VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255, - VGPR256RegNames); - - REG_RANGE( - AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15, - AMDGPU::VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247_VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255, - VGPR512RegNames); - - REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7, - AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103, - SGPR256RegNames); - - REG_RANGE( - AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7_SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15, - AMDGPU::SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95_SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103, - SGPR512RegNames - ); - -#undef REG_RANGE - - // FIXME: Rename flat_scr so we don't need to special case this. - switch (Reg) { - case AMDGPU::FLAT_SCR: - return "flat_scratch"; - case AMDGPU::FLAT_SCR_LO: - return "flat_scratch_lo"; - case AMDGPU::FLAT_SCR_HI: - return "flat_scratch_hi"; - default: - // For the special named registers the default is fine. - return TargetRegisterInfo::getRegAsmName(Reg); + switch (Size) { + case 32: AltName = AMDGPU::Reg32; break; + case 64: AltName = AMDGPU::Reg64; break; + case 96: AltName = AMDGPU::Reg96; break; + case 128: AltName = AMDGPU::Reg128; break; + case 160: AltName = AMDGPU::Reg160; break; + case 256: AltName = AMDGPU::Reg256; break; + case 512: AltName = AMDGPU::Reg512; break; + case 1024: AltName = AMDGPU::Reg1024; break; } + return AMDGPUInstPrinter::getRegisterName(Reg, AltName); } // FIXME: This is very slow. 
It might be worth creating a map from physreg to @@ -1231,15 +1372,25 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, + &AMDGPU::AGPR_32RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, + &AMDGPU::AReg_64RegClass, &AMDGPU::VReg_96RegClass, + &AMDGPU::SReg_96RegClass, &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, + &AMDGPU::AReg_128RegClass, + &AMDGPU::VReg_160RegClass, + &AMDGPU::SReg_160RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, + &AMDGPU::AReg_512RegClass, + &AMDGPU::SReg_1024RegClass, + &AMDGPU::VReg_1024RegClass, + &AMDGPU::AReg_1024RegClass, &AMDGPU::SCC_CLASSRegClass, &AMDGPU::Pseudo_SReg_32RegClass, &AMDGPU::Pseudo_SReg_128RegClass, @@ -1268,10 +1419,39 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; case 128: return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; + case 160: + return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr; case 256: return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; case 512: return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; + case 1024: + return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; + default: + llvm_unreachable("Invalid register class size"); + } +} + +bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { + unsigned Size = getRegSizeInBits(*RC); + if (Size < 32) + return false; + switch (Size) { + case 32: + return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr; + case 64: + return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr; + case 96: + return false; + case 128: + return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr; + case 160: + case 256: + return false; + case 512: + return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr; + case 1024: + return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr; default: llvm_unreachable("Invalid register class size"); } @@ -1288,10 +1468,32 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( return &AMDGPU::VReg_96RegClass; case 128: return &AMDGPU::VReg_128RegClass; + case 160: + return &AMDGPU::VReg_160RegClass; case 256: return &AMDGPU::VReg_256RegClass; case 512: return &AMDGPU::VReg_512RegClass; + case 1024: + return &AMDGPU::VReg_1024RegClass; + default: + llvm_unreachable("Invalid register class size"); + } +} + +const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass( + const TargetRegisterClass *SRC) const { + switch (getRegSizeInBits(*SRC)) { + case 32: + return &AMDGPU::AGPR_32RegClass; + case 64: + return &AMDGPU::AReg_64RegClass; + case 128: + return &AMDGPU::AReg_128RegClass; + case 512: + return &AMDGPU::AReg_512RegClass; + case 1024: + return &AMDGPU::AReg_1024RegClass; default: llvm_unreachable("Invalid register class size"); } @@ -1304,12 +1506,18 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( return &AMDGPU::SGPR_32RegClass; case 64: return &AMDGPU::SReg_64RegClass; + case 96: + return &AMDGPU::SReg_96RegClass; case 128: return &AMDGPU::SReg_128RegClass; + case 160: + return &AMDGPU::SReg_160RegClass; case 256: return &AMDGPU::SReg_256RegClass; case 512: return &AMDGPU::SReg_512RegClass; + case 1024: + return &AMDGPU::SReg_1024RegClass; default: 
llvm_unreachable("Invalid register class size"); } @@ -1328,11 +1536,31 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return &AMDGPU::SGPR_32RegClass; case 2: return &AMDGPU::SReg_64RegClass; + case 3: + return &AMDGPU::SReg_96RegClass; case 4: return &AMDGPU::SReg_128RegClass; + case 5: + return &AMDGPU::SReg_160RegClass; case 8: return &AMDGPU::SReg_256RegClass; - case 16: /* fall-through */ + case 16: + return &AMDGPU::SReg_512RegClass; + case 32: /* fall-through */ + default: + llvm_unreachable("Invalid sub-register class size"); + } + } else if (hasAGPRs(RC)) { + switch (Count) { + case 1: + return &AMDGPU::AGPR_32RegClass; + case 2: + return &AMDGPU::AReg_64RegClass; + case 4: + return &AMDGPU::AReg_128RegClass; + case 16: + return &AMDGPU::AReg_512RegClass; + case 32: /* fall-through */ default: llvm_unreachable("Invalid sub-register class size"); } @@ -1346,9 +1574,13 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return &AMDGPU::VReg_96RegClass; case 4: return &AMDGPU::VReg_128RegClass; + case 5: + return &AMDGPU::VReg_160RegClass; case 8: return &AMDGPU::VReg_256RegClass; - case 16: /* fall-through */ + case 16: + return &AMDGPU::VReg_512RegClass; + case 32: /* fall-through */ default: llvm_unreachable("Invalid sub-register class size"); } @@ -1396,6 +1628,17 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, ArrayRef SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const { if (EltSize == 4) { + static const int16_t Sub0_31[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, + AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, + AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31, + }; + static const int16_t Sub0_15[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, @@ -1408,6 +1651,10 @@ ArrayRef SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, }; + static const int16_t Sub0_4[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, + }; + static const int16_t Sub0_3[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, }; @@ -1429,16 +1676,31 @@ ArrayRef SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC return makeArrayRef(Sub0_2); case 128: return makeArrayRef(Sub0_3); + case 160: + return makeArrayRef(Sub0_4); case 256: return makeArrayRef(Sub0_7); case 512: return makeArrayRef(Sub0_15); + case 1024: + return makeArrayRef(Sub0_31); default: llvm_unreachable("unhandled register size"); } } if (EltSize == 8) { + static const int16_t Sub0_31_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, + AMDGPU::sub16_sub17, AMDGPU::sub18_sub19, + AMDGPU::sub20_sub21, AMDGPU::sub22_sub23, + AMDGPU::sub24_sub25, AMDGPU::sub26_sub27, + AMDGPU::sub28_sub29, AMDGPU::sub30_sub31 + }; + static const int16_t Sub0_15_64[] = { AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, @@ -1465,32 +1727,73 @@ ArrayRef SIRegisterInfo::getRegSplitParts(const TargetRegisterClass 
*RC return makeArrayRef(Sub0_7_64); case 512: return makeArrayRef(Sub0_15_64); + case 1024: + return makeArrayRef(Sub0_31_64); default: llvm_unreachable("unhandled register size"); } } - assert(EltSize == 16 && "unhandled register spill split size"); + if (EltSize == 16) { + + static const int16_t Sub0_31_128[] = { + AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11, + AMDGPU::sub12_sub13_sub14_sub15, + AMDGPU::sub16_sub17_sub18_sub19, + AMDGPU::sub20_sub21_sub22_sub23, + AMDGPU::sub24_sub25_sub26_sub27, + AMDGPU::sub28_sub29_sub30_sub31 + }; + + static const int16_t Sub0_15_128[] = { + AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11, + AMDGPU::sub12_sub13_sub14_sub15 + }; + + static const int16_t Sub0_7_128[] = { + AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub4_sub5_sub6_sub7 + }; - static const int16_t Sub0_15_128[] = { - AMDGPU::sub0_sub1_sub2_sub3, - AMDGPU::sub4_sub5_sub6_sub7, - AMDGPU::sub8_sub9_sub10_sub11, - AMDGPU::sub12_sub13_sub14_sub15 + switch (AMDGPU::getRegBitWidth(*RC->MC)) { + case 128: + return {}; + case 256: + return makeArrayRef(Sub0_7_128); + case 512: + return makeArrayRef(Sub0_15_128); + case 1024: + return makeArrayRef(Sub0_31_128); + default: + llvm_unreachable("unhandled register size"); + } + } + + assert(EltSize == 32 && "unhandled elt size"); + + static const int16_t Sub0_31_256[] = { + AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, + AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23, + AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 }; - static const int16_t Sub0_7_128[] = { - AMDGPU::sub0_sub1_sub2_sub3, - AMDGPU::sub4_sub5_sub6_sub7 + static const int16_t Sub0_15_256[] = { + AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 }; switch (AMDGPU::getRegBitWidth(*RC->MC)) { - case 128: - return {}; case 256: - return makeArrayRef(Sub0_7_128); + return {}; case 512: - return makeArrayRef(Sub0_15_128); + return makeArrayRef(Sub0_15_256); + case 1024: + return makeArrayRef(Sub0_31_256); default: llvm_unreachable("unhandled register size"); } @@ -1512,6 +1815,13 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, return hasVGPRs(RC); } +bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, + unsigned Reg) const { + const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); + assert(RC && "Register class for the reg not found"); + return hasAGPRs(RC); +} + bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, @@ -1553,7 +1863,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - if (Idx == getVGPRPressureSet()) + if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet()) return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, const_cast(MF)); @@ -1578,28 +1888,80 @@ unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { } const TargetRegisterClass * -SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, +SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, + const RegisterBank &RB, const MachineRegisterInfo &MRI) const { - unsigned Size = getRegSizeInBits(MO.getReg(), MRI); - const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()); - if (!RB) - return nullptr; - switch (Size) { + case 1: { + switch (RB.getID()) { 
+ case AMDGPU::VGPRRegBankID: + return &AMDGPU::VGPR_32RegClass; + case AMDGPU::VCCRegBankID: + return isWave32 ? + &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; + case AMDGPU::SGPRRegBankID: + return &AMDGPU::SReg_32_XM0RegClass; + case AMDGPU::SCCRegBankID: + // This needs to return an allocatable class, so don't bother returning + // the dummy SCC class. + return &AMDGPU::SReg_32_XM0RegClass; + default: + llvm_unreachable("unknown register bank"); + } + } case 32: - return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32_XM0RegClass; + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : + &AMDGPU::SReg_32_XM0RegClass; case 64: - return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : - &AMDGPU::SReg_64_XEXECRegClass; + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : + &AMDGPU::SReg_64_XEXECRegClass; case 96: - return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : - nullptr; + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : + &AMDGPU::SReg_96RegClass; case 128: - return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : - &AMDGPU::SReg_128RegClass; + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : + &AMDGPU::SReg_128RegClass; + case 160: + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass : + &AMDGPU::SReg_160RegClass; + case 256: + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass : + &AMDGPU::SReg_256RegClass; + case 512: + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass : + &AMDGPU::SReg_512RegClass; + default: + if (Size < 32) + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : + &AMDGPU::SReg_32_XM0RegClass; + return nullptr; + } +} + +const TargetRegisterClass * +SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, + const MachineRegisterInfo &MRI) const { + if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg())) + return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); + return nullptr; +} + +unsigned SIRegisterInfo::getVCC() const { + return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; +} + +const TargetRegisterClass * +SIRegisterInfo::getRegClass(unsigned RCID) const { + switch ((int)RCID) { + case AMDGPU::SReg_1RegClassID: + return getBoolRC(); + case AMDGPU::SReg_1_XEXECRegClassID: + return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass + : &AMDGPU::SReg_64_XEXECRegClass; + case -1: + return nullptr; default: - llvm_unreachable("not implemented"); + return AMDGPURegisterInfo::getRegClass(RCID); } } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index b82fefde47e1..34487c96e72e 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -1,9 +1,8 @@ //===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -30,10 +29,13 @@ class SIRegisterInfo final : public AMDGPURegisterInfo { private: unsigned SGPRSetID; unsigned VGPRSetID; + unsigned AGPRSetID; BitVector SGPRPressureSets; BitVector VGPRPressureSets; + BitVector AGPRPressureSets; bool SpillSGPRToVGPR; bool SpillSGPRToSMEM; + bool isWave32; void classifyPressureSet(unsigned PSetID, unsigned Reg, BitVector &PressureSets) const; @@ -57,8 +59,6 @@ public: unsigned reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const; - unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; @@ -72,8 +72,9 @@ public: return 100; } - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; @@ -130,7 +131,7 @@ public: /// \returns true if this class contains only SGPR registers bool isSGPRClass(const TargetRegisterClass *RC) const { - return !hasVGPRs(RC); + return !hasVGPRs(RC) && !hasAGPRs(RC); } /// \returns true if this class ID contains only SGPR registers @@ -150,10 +151,22 @@ public: /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; + /// \returns true if this class contains AGPR registers. + bool hasAGPRs(const TargetRegisterClass *RC) const; + + /// \returns true if this class contains any vector registers. 
+ bool hasVectorRegisters(const TargetRegisterClass *RC) const { + return hasVGPRs(RC) || hasAGPRs(RC); + } + /// \returns A VGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; + /// \returns An AGPR reg class with the same width as \p SRC + const TargetRegisterClass *getEquivalentAGPRClass( + const TargetRegisterClass *SRC) const; + /// \returns A SGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentSGPRClass( const TargetRegisterClass *VRC) const; @@ -191,16 +204,32 @@ public: unsigned getSGPRPressureSet() const { return SGPRSetID; }; unsigned getVGPRPressureSet() const { return VGPRSetID; }; + unsigned getAGPRPressureSet() const { return AGPRSetID; }; const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI, unsigned Reg) const; bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + bool isAGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + bool isVectorRegister(const MachineRegisterInfo &MRI, unsigned Reg) const { + return isVGPR(MRI, Reg) || isAGPR(MRI, Reg); + } + + virtual bool + isDivergentRegClass(const TargetRegisterClass *RC) const override { + return !isSGPRClass(RC); + } bool isSGPRPressureSet(unsigned SetID) const { - return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID); + return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID) && + !AGPRPressureSets.test(SetID); } bool isVGPRPressureSet(unsigned SetID) const { - return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID); + return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) && + !AGPRPressureSets.test(SetID); + } + bool isAGPRPressureSet(unsigned SetID) const { + return AGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) && + !VGPRPressureSets.test(SetID); } ArrayRef getRegSplitParts(const TargetRegisterClass *RC, @@ -224,16 +253,45 @@ public: unsigned getReturnAddressReg(const MachineFunction &MF) const; + const TargetRegisterClass * + getRegClassForSizeOnBank(unsigned Size, + const RegisterBank &Bank, + const MachineRegisterInfo &MRI) const; + + const TargetRegisterClass * + getRegClassForTypeOnBank(LLT Ty, + const RegisterBank &Bank, + const MachineRegisterInfo &MRI) const { + return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank, MRI); + } + const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override; + const TargetRegisterClass *getBoolRC() const { + return isWave32 ? &AMDGPU::SReg_32_XM0RegClass + : &AMDGPU::SReg_64RegClass; + } + + const TargetRegisterClass *getWaveMaskRegClass() const { + return isWave32 ? 
&AMDGPU::SReg_32_XM0_XEXECRegClass + : &AMDGPU::SReg_64_XEXECRegClass; + } + + unsigned getVCC() const; + + const TargetRegisterClass *getRegClass(unsigned RCID) const; + // Find reaching register definition MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const; + const uint32_t *getAllVGPRRegMask() const; + const uint32_t *getAllAllocatableSRegMask() const; + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index c625ecc9b750..d5948a7862cc 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1,9 +1,8 @@ //===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -15,43 +14,86 @@ class getSubRegs { list ret2 = [sub0, sub1]; list ret3 = [sub0, sub1, sub2]; list ret4 = [sub0, sub1, sub2, sub3]; + list ret5 = [sub0, sub1, sub2, sub3, sub4]; list ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; list ret16 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15]; + list ret32 = [sub0, sub1, sub2, sub3, + sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, + sub12, sub13, sub14, sub15, + sub16, sub17, sub18, sub19, + sub20, sub21, sub22, sub23, + sub24, sub25, sub26, sub27, + sub28, sub29, sub30, sub31]; list ret = !if(!eq(size, 2), ret2, !if(!eq(size, 3), ret3, !if(!eq(size, 4), ret4, - !if(!eq(size, 8), ret8, ret16)))); + !if(!eq(size, 5), ret5, + !if(!eq(size, 8), ret8, + !if(!eq(size, 16), ret16, ret32)))))); +} + +let Namespace = "AMDGPU" in { +defset list AllRegAltNameIndices = { + def Reg32 : RegAltNameIndex; + def Reg64 : RegAltNameIndex; + def Reg96 : RegAltNameIndex; + def Reg128 : RegAltNameIndex; + def Reg160 : RegAltNameIndex; + def Reg256 : RegAltNameIndex; + def Reg512 : RegAltNameIndex; + def Reg1024 : RegAltNameIndex; +} } //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// -class SIReg regIdx = 0> : Register, +class SIReg regIdx = 0, string prefix = "", + int regNo = !cast(regIdx)> : + Register, DwarfRegNum<[!cast(HWEncoding)]> { let Namespace = "AMDGPU"; + let RegAltNameIndices = AllRegAltNameIndices; // This is the not yet the complete register encoding. An additional // bit is set for VGPRs. let HWEncoding = regIdx; } +class SIRegisterWithSubRegs subregs> : + RegisterWithSubRegs { + let RegAltNameIndices = AllRegAltNameIndices; + let AltNames = [ n, n, n, n, n, n, n, n ]; +} + // Special Registers def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; // Pseudo-registers: Used as placeholders during isel and immediately // replaced, never seeing the verifier. 
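
The RegAltNameIndices machinery above is what backs the getRegAsmName rewrite earlier in this patch: every register now carries one alternate name per tuple width, and the printer selects a name by a size-derived index. A rough sketch of the lookup for a 128-bit VGPR tuple; the printed string is my assumption of the usual gfx register syntax, not something the patch states:

    // Illustrative only; TRI is assumed to be an in-scope SIRegisterInfo*.
    StringRef Name = TRI->getRegAsmName(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3);
    // getMinimalPhysRegClass(Reg) -> VReg_128, getRegSizeInBits -> 128,
    // so AltName = AMDGPU::Reg128 and the printer returns the 128-bit
    // alias (presumably "v[0:3]") instead of the long default tuple name.
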
-def PRIVATE_RSRC_REG : SIReg<"", 0>; -def FP_REG : SIReg<"", 0>; -def SP_REG : SIReg<"", 0>; -def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>; +def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>; +def FP_REG : SIReg<"fp", 0>; +def SP_REG : SIReg<"sp", 0>; +def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>; // VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, +def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -61,25 +103,38 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, def EXEC_LO : SIReg<"exec_lo", 126>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, +def EXEC : SIRegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 126; } -def SCC : SIReg<"scc", 253>; +// 32-bit real registers, for MC only. +// May be used with both 32-bit and 64-bit operands. +def SRC_VCCZ : SIReg<"src_vccz", 251>; +def SRC_EXECZ : SIReg<"src_execz", 252>; +def SRC_SCC : SIReg<"src_scc", 253>; + +// 1-bit pseudo register, for codegen only. +// Should never be emitted. +def SCC : SIReg<"scc">; + def M0 : SIReg <"m0", 124>; +def SGPR_NULL : SIReg<"null", 125>; def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>; def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>; def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; +def SRC_POPS_EXITING_WAVE_ID : SIReg<"src_pops_exiting_wave_id", 239>; + +def LDS_DIRECT : SIReg <"src_lds_direct", 254>; def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>; def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>; -def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, +def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -90,7 +145,7 @@ def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; -def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, +def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -100,7 +155,7 @@ def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, def TMA_LO : SIReg<"tma_lo", 110>; def TMA_HI : SIReg<"tma_hi", 111>; -def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, +def TMA : SIRegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -108,19 +163,19 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, } foreach Index = 0-15 in { - def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; - def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>; - def TTMP#Index : SIReg<"", 0>; + def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; + def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>; + def TTMP#Index : SIReg<"ttmp"#Index, 0>; } multiclass FLAT_SCR_LOHI_m ci_e, bits<16> vi_e> { def _ci : SIReg; def _vi : SIReg; - def "" : SIReg<"", 0>; + def "" : SIReg; } class FlatReg encoding> : - RegisterWithSubRegs<"flat_scratch", [lo, hi]>, + SIRegisterWithSubRegs<"flat_scratch", [lo, hi]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -135,13 +190,20 @@ def FLAT_SCR_vi : FlatReg; def FLAT_SCR : 
FlatReg; // SGPR registers -foreach Index = 0-103 in { - def SGPR#Index : SIReg <"SGPR"#Index, Index>; +foreach Index = 0-105 in { + def SGPR#Index : SIReg <"SGPR"#Index, Index, "s">; } // VGPR registers foreach Index = 0-255 in { - def VGPR#Index : SIReg <"VGPR"#Index, Index> { + def VGPR#Index : SIReg <"VGPR"#Index, Index, "v"> { + let HWEncoding{8} = 1; + } +} + +// AccVGPR registers +foreach Index = 0-255 in { + def AGPR#Index : SIReg <"AGPR"#Index, Index, "a"> { let HWEncoding{8} = 1; } } @@ -164,10 +226,10 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "SGPR%u", 0, 103))> { + (add (sequence "SGPR%u", 0, 105)), Reg32> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. - let AllocationPriority = 7; + let AllocationPriority = 9; } // SGPR 64-bit registers @@ -175,6 +237,12 @@ def SGPR_64Regs : RegisterTuples.ret, [(add (decimate SGPR_32, 2)), (add (decimate (shl SGPR_32, 1), 2))]>; +// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs. +def SGPR_96Regs : RegisterTuples.ret, + [(add (decimate SGPR_32, 3)), + (add (decimate (shl SGPR_32, 1), 3)), + (add (decimate (shl SGPR_32, 2), 3))]>; + // SGPR 128-bit registers def SGPR_128Regs : RegisterTuples.ret, [(add (decimate SGPR_32, 4)), @@ -182,6 +250,14 @@ def SGPR_128Regs : RegisterTuples.ret, (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4))]>; +// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs. +def SGPR_160Regs : RegisterTuples.ret, + [(add (decimate SGPR_32, 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4))]>; + // SGPR 256-bit registers def SGPR_256Regs : RegisterTuples.ret, [(add (decimate SGPR_32, 4)), @@ -212,6 +288,41 @@ def SGPR_512Regs : RegisterTuples.ret, (add (decimate (shl SGPR_32, 14), 4)), (add (decimate (shl SGPR_32, 15), 4))]>; +// SGPR 1024-bit registers +def SGPR_1024Regs : RegisterTuples.ret, + [(add (decimate SGPR_32, 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4)), + (add (decimate (shl SGPR_32, 8), 4)), + (add (decimate (shl SGPR_32, 9), 4)), + (add (decimate (shl SGPR_32, 10), 4)), + (add (decimate (shl SGPR_32, 11), 4)), + (add (decimate (shl SGPR_32, 12), 4)), + (add (decimate (shl SGPR_32, 13), 4)), + (add (decimate (shl SGPR_32, 14), 4)), + (add (decimate (shl SGPR_32, 15), 4)), + (add (decimate (shl SGPR_32, 16), 4)), + (add (decimate (shl SGPR_32, 17), 4)), + (add (decimate (shl SGPR_32, 18), 4)), + (add (decimate (shl SGPR_32, 19), 4)), + (add (decimate (shl SGPR_32, 20), 4)), + (add (decimate (shl SGPR_32, 21), 4)), + (add (decimate (shl SGPR_32, 22), 4)), + (add (decimate (shl SGPR_32, 23), 4)), + (add (decimate (shl SGPR_32, 24), 4)), + (add (decimate (shl SGPR_32, 25), 4)), + (add (decimate (shl SGPR_32, 26), 4)), + (add (decimate (shl SGPR_32, 27), 4)), + (add (decimate (shl SGPR_32, 28), 4)), + (add (decimate (shl SGPR_32, 29), 4)), + (add (decimate (shl SGPR_32, 30), 4)), + (add (decimate (shl SGPR_32, 31), 4))]>; + // Trap handler TMP 32-bit registers def TTMP_32 : 
RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, (add (sequence "TTMP%u", 0, 15))> { @@ -263,7 +374,7 @@ class TmpRegTuplesBase indices = getSubRegs.ret, int index1 = !add(index, !add(size, -1)), string name = "ttmp["#index#":"#index1#"]"> : - RegisterWithSubRegs { + SIRegisterWithSubRegs { let HWEncoding = subRegs[0].HWEncoding; let SubRegIndices = indices; } @@ -293,8 +404,8 @@ class TmpRegTuples.ret>; foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in { - def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>; - def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 2, Index>; } foreach Index = {0, 4, 8, 12} in { @@ -303,7 +414,7 @@ foreach Index = {0, 4, 8, 12} in { _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>; def TTMP#Index#_TTMP#!add(Index,1)# _TTMP#!add(Index,2)# - _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>; + _TTMP#!add(Index,3)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 4, Index>; } foreach Index = {0, 4, 8} in { @@ -320,7 +431,7 @@ foreach Index = {0, 4, 8} in { _TTMP#!add(Index,4)# _TTMP#!add(Index,5)# _TTMP#!add(Index,6)# - _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>; + _TTMP#!add(Index,7)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 8, Index>; } def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi : @@ -330,18 +441,17 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi, TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>; -def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 : +def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9_gfx10 : TmpRegTuplesBase<0, 16, - [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9, - TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9, - TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9, - TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>; - + [TTMP0_gfx9_gfx10, TTMP1_gfx9_gfx10, TTMP2_gfx9_gfx10, TTMP3_gfx9_gfx10, + TTMP4_gfx9_gfx10, TTMP5_gfx9_gfx10, TTMP6_gfx9_gfx10, TTMP7_gfx9_gfx10, + TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10, + TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>; // VGPR 32-bit registers // i16/f16 only on VI+ def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "VGPR%u", 0, 255))> { + (add (sequence "VGPR%u", 0, 255)), Reg32> { let AllocationPriority = 1; let Size = 32; } @@ -364,6 +474,14 @@ def VGPR_128 : RegisterTuples.ret, (add (shl VGPR_32, 2)), (add (shl VGPR_32, 3))]>; +// VGPR 160-bit registers +def VGPR_160 : RegisterTuples.ret, + [(add (trunc VGPR_32, 252)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4))]>; + // VGPR 256-bit registers def VGPR_256 : RegisterTuples.ret, [(add (trunc VGPR_32, 249)), @@ -394,88 +512,257 @@ def VGPR_512 : RegisterTuples.ret, (add (shl VGPR_32, 14)), (add (shl VGPR_32, 15))]>; +// VGPR 1024-bit registers +def VGPR_1024 : RegisterTuples.ret, + [(add (trunc VGPR_32, 225)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7)), + (add (shl VGPR_32, 8)), + 
(add (shl VGPR_32, 9)), + (add (shl VGPR_32, 10)), + (add (shl VGPR_32, 11)), + (add (shl VGPR_32, 12)), + (add (shl VGPR_32, 13)), + (add (shl VGPR_32, 14)), + (add (shl VGPR_32, 15)), + (add (shl VGPR_32, 16)), + (add (shl VGPR_32, 17)), + (add (shl VGPR_32, 18)), + (add (shl VGPR_32, 19)), + (add (shl VGPR_32, 20)), + (add (shl VGPR_32, 21)), + (add (shl VGPR_32, 22)), + (add (shl VGPR_32, 23)), + (add (shl VGPR_32, 24)), + (add (shl VGPR_32, 25)), + (add (shl VGPR_32, 26)), + (add (shl VGPR_32, 27)), + (add (shl VGPR_32, 28)), + (add (shl VGPR_32, 29)), + (add (shl VGPR_32, 30)), + (add (shl VGPR_32, 31))]>; + +// AccVGPR 32-bit registers +def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add (sequence "AGPR%u", 0, 255)), Reg32> { + let AllocationPriority = 1; + let Size = 32; +} + +// AGPR 64-bit registers +def AGPR_64 : RegisterTuples.ret, + [(add (trunc AGPR_32, 255)), + (add (shl AGPR_32, 1))]>; + +// AGPR 128-bit registers +def AGPR_128 : RegisterTuples.ret, + [(add (trunc AGPR_32, 253)), + (add (shl AGPR_32, 1)), + (add (shl AGPR_32, 2)), + (add (shl AGPR_32, 3))]>; + +// AGPR 512-bit registers +def AGPR_512 : RegisterTuples.ret, + [(add (trunc AGPR_32, 241)), + (add (shl AGPR_32, 1)), + (add (shl AGPR_32, 2)), + (add (shl AGPR_32, 3)), + (add (shl AGPR_32, 4)), + (add (shl AGPR_32, 5)), + (add (shl AGPR_32, 6)), + (add (shl AGPR_32, 7)), + (add (shl AGPR_32, 8)), + (add (shl AGPR_32, 9)), + (add (shl AGPR_32, 10)), + (add (shl AGPR_32, 11)), + (add (shl AGPR_32, 12)), + (add (shl AGPR_32, 13)), + (add (shl AGPR_32, 14)), + (add (shl AGPR_32, 15))]>; + +// AGPR 1024-bit registers +def AGPR_1024 : RegisterTuples.ret, + [(add (trunc AGPR_32, 225)), + (add (shl AGPR_32, 1)), + (add (shl AGPR_32, 2)), + (add (shl AGPR_32, 3)), + (add (shl AGPR_32, 4)), + (add (shl AGPR_32, 5)), + (add (shl AGPR_32, 6)), + (add (shl AGPR_32, 7)), + (add (shl AGPR_32, 8)), + (add (shl AGPR_32, 9)), + (add (shl AGPR_32, 10)), + (add (shl AGPR_32, 11)), + (add (shl AGPR_32, 12)), + (add (shl AGPR_32, 13)), + (add (shl AGPR_32, 14)), + (add (shl AGPR_32, 15)), + (add (shl AGPR_32, 16)), + (add (shl AGPR_32, 17)), + (add (shl AGPR_32, 18)), + (add (shl AGPR_32, 19)), + (add (shl AGPR_32, 20)), + (add (shl AGPR_32, 21)), + (add (shl AGPR_32, 22)), + (add (shl AGPR_32, 23)), + (add (shl AGPR_32, 24)), + (add (shl AGPR_32, 25)), + (add (shl AGPR_32, 26)), + (add (shl AGPR_32, 27)), + (add (shl AGPR_32, 28)), + (add (shl AGPR_32, 29)), + (add (shl AGPR_32, 30)), + (add (shl AGPR_32, 31))]>; + //===----------------------------------------------------------------------===// // Register classes used as source and destination //===----------------------------------------------------------------------===// def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { + (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG), Reg32> { let isAllocatable = 0; let CopyCost = -1; } def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, - (add PRIVATE_RSRC_REG)> { + (add PRIVATE_RSRC_REG), Reg128> { + let isAllocatable = 0; + let CopyCost = -1; +} + +def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add LDS_DIRECT), Reg32> { let isAllocatable = 0; let CopyCost = -1; } // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
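
A practical consequence of the AGPR tuple classes defined above, already visible in the buildSpillLoadStore changes at the top of this section and in the AReg_* CopyCost values further down: there is no direct AGPR-to-AGPR move, so every 32-bit lane of a copy has to bounce through a scratch VGPR. A hand-written sketch of what a 128-bit a[0:3] -> a[4:7] copy expands to (register numbers are hypothetical; the mnemonics are the ones the patch uses):

    // v_accvgpr_read_b32  v0, a0   // AGPR lane -> scratch VGPR
    // v_accvgpr_write_b32 a4, v0   // scratch VGPR -> AGPR lane
    // v_accvgpr_read_b32  v0, a1
    // v_accvgpr_write_b32 a5, v0
    // ...two more read/write pairs for a2 -> a6 and a3 -> a7...
    // 4 reads + 4 writes + one burned VGPR, matching AReg_128's CopyCost of 9.
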
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, - TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, - SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { - let AllocationPriority = 7; + SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, + SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, + SRC_VCCZ, SRC_EXECZ, SRC_SCC), Reg32> { + let AllocationPriority = 10; } -def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { - let AllocationPriority = 7; +def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, + (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS), Reg32> { + let AllocationPriority = 10; } -def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { - let AllocationPriority = 7; +def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, + (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI), Reg32> { + let AllocationPriority = 10; } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { - let AllocationPriority = 7; +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI), Reg32> { + let AllocationPriority = 10; +} + +def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS), + Reg32> { + let isAllocatable = 0; } -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, + (add SGPR_64Regs), Reg64> { let CopyCost = 1; - let AllocationPriority = 8; + let AllocationPriority = 11; +} + +// CCR (call clobbered registers) SGPR 64-bit registers +def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, + (add (trunc SGPR_64, 16)), Reg64> { + let CopyCost = SGPR_64.CopyCost; + let AllocationPriority = SGPR_64.AllocationPriority; } -def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { +def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, + (add TTMP_64Regs)> { let isAllocatable = 0; } def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA), Reg64> { let CopyCost = 1; - let AllocationPriority = 8; + let AllocationPriority = 13; } def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SReg_64_XEXEC, EXEC)> { + (add SReg_64_XEXEC, EXEC), Reg64> { let CopyCost = 1; - let AllocationPriority = 8; + let AllocationPriority = 13; +} + +def SReg_1_XEXEC : RegisterClass<"AMDGPU", [i1], 32, + (add SReg_64_XEXEC, SReg_32_XM0_XEXEC)> { + let CopyCost = 1; + let isAllocatable = 0; +} + +def SReg_1 : RegisterClass<"AMDGPU", [i1], 32, + (add SReg_1_XEXEC, EXEC, EXEC_LO)> { + let CopyCost 
= 1; + let isAllocatable = 0; } // Requires 2 s_mov_b64 to copy let CopyCost = 2 in { -def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> { - let AllocationPriority = 10; +// There are no 3-component scalar instructions, but this is needed +// for symmetry with VGPRs. +def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, + (add SGPR_96Regs), Reg96> { + let AllocationPriority = 14; } -def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add TTMP_128Regs)> { +def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, + (add SGPR_96), Reg96> { + let AllocationPriority = 14; +} + +def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, + (add SGPR_128Regs), Reg128> { + let AllocationPriority = 15; +} + +def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, + (add TTMP_128Regs)> { let isAllocatable = 0; } def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add SGPR_128, TTMP_128)> { - let AllocationPriority = 10; + (add SGPR_128, TTMP_128), Reg128> { + let AllocationPriority = 15; } } // End CopyCost = 2 -def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> { - let AllocationPriority = 11; +// There are no 5-component scalar instructions, but this is needed +// for symmetry with VGPRs. +def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, + (add SGPR_160Regs), Reg160> { + let AllocationPriority = 16; +} + +def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, + (add SGPR_160), Reg160> { + let AllocationPriority = 16; +} + +def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs), + Reg256> { + let AllocationPriority = 17; } def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> { @@ -483,29 +770,48 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> { } def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, - (add SGPR_256, TTMP_256)> { + (add SGPR_256, TTMP_256), Reg256> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; - let AllocationPriority = 11; + let AllocationPriority = 17; } -def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> { - let AllocationPriority = 12; +def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, + (add SGPR_512Regs), Reg512> { + let AllocationPriority = 18; } -def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> { +def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, + (add TTMP_512Regs)> { let isAllocatable = 0; } def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add SGPR_512, TTMP_512)> { + (add SGPR_512, TTMP_512), Reg512> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; - let AllocationPriority = 12; + let AllocationPriority = 18; +} + +def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add VGPR_32, LDS_DIRECT_CLASS), Reg32> { + let isAllocatable = 0; +} + +def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, + (add SGPR_1024Regs), Reg1024> { + let AllocationPriority = 19; +} + +def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, + (add SGPR_1024), Reg1024> { + let CopyCost = 16; + let AllocationPriority = 19; } // Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> { +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, + (add VGPR_64), Reg64> { let Size = 64; // 
Requires 2 v_mov_b32 to copy @@ -513,7 +819,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32 let AllocationPriority = 2; } -def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { +def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> { let Size = 96; // Requires 3 v_mov_b32 to copy @@ -521,7 +827,8 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { let AllocationPriority = 3; } -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, + (add VGPR_128), Reg128> { let Size = 128; // Requires 4 v_mov_b32 to copy @@ -529,28 +836,88 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VG let AllocationPriority = 4; } -def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> { +def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, + (add VGPR_160), Reg160> { + let Size = 160; + + // Requires 5 v_mov_b32 to copy + let CopyCost = 5; + let AllocationPriority = 5; +} + +def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, + (add VGPR_256), Reg256> { let Size = 256; let CopyCost = 8; - let AllocationPriority = 5; + let AllocationPriority = 6; } -def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, + (add VGPR_512), Reg512> { let Size = 512; let CopyCost = 16; - let AllocationPriority = 6; + let AllocationPriority = 7; +} + +def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, + (add VGPR_1024), Reg1024> { + let Size = 1024; + let CopyCost = 32; + let AllocationPriority = 8; } -def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { +def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, + (add AGPR_64), Reg64> { + let Size = 64; + + let CopyCost = 5; + let AllocationPriority = 2; +} + +def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, + (add AGPR_128), Reg128> { + let Size = 128; + + // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr + let CopyCost = 9; + let AllocationPriority = 4; +} + +def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, + (add AGPR_512), Reg512> { + let Size = 512; + let CopyCost = 33; + let AllocationPriority = 7; +} + +def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, + (add AGPR_1024), Reg1024> { + let Size = 1024; + let CopyCost = 65; + let AllocationPriority = 8; +} + +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32), Reg32> { let Size = 32; } def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add VGPR_32, SReg_32)> { + (add VGPR_32, SReg_32, LDS_DIRECT_CLASS), Reg32> { + let isAllocatable = 0; +} + +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64), + Reg64> { let isAllocatable = 0; } -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { +def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add AGPR_32, VGPR_32), Reg32> { + let isAllocatable = 0; +} + +def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32, + (add AReg_64, VReg_64), Reg64> { let isAllocatable = 0; } @@ -563,47 +930,40 @@ class RegImmMatcher : AsmOperandClass { let RenderMethod = "addRegOrImmOperands"; } -multiclass SIRegOperand { +multiclass SIRegOperand32 { let OperandNamespace = "AMDGPU" in { - def _b16 : RegisterOperand(rc#"_32")> { + 
def _b16 : RegisterOperand(rc#rc_suffix)> { let OperandType = opType#"_INT16"; let ParserMatchClass = RegImmMatcher; let DecoderMethod = "decodeOperand_VSrc16"; } - def _f16 : RegisterOperand(rc#"_32")> { + def _f16 : RegisterOperand(rc#rc_suffix)> { let OperandType = opType#"_FP16"; let ParserMatchClass = RegImmMatcher; - let DecoderMethod = "decodeOperand_VSrc16"; + let DecoderMethod = "decodeOperand_" # rc # "_16"; } - def _b32 : RegisterOperand(rc#"_32")> { + def _b32 : RegisterOperand(rc#rc_suffix)> { let OperandType = opType#"_INT32"; let ParserMatchClass = RegImmMatcher; + let DecoderMethod = "decodeOperand_" # rc # rc_suffix; } - def _f32 : RegisterOperand(rc#"_32")> { + def _f32 : RegisterOperand(rc#rc_suffix)> { let OperandType = opType#"_FP32"; let ParserMatchClass = RegImmMatcher; + let DecoderMethod = "decodeOperand_" # rc # rc_suffix; } - def _b64 : RegisterOperand(rc#"_64")> { - let OperandType = opType#"_INT64"; - let ParserMatchClass = RegImmMatcher; - } - - def _f64 : RegisterOperand(rc#"_64")> { - let OperandType = opType#"_FP64"; - let ParserMatchClass = RegImmMatcher; - } - - def _v2b16 : RegisterOperand(rc#"_32")> { + def _v2b16 : RegisterOperand(rc#rc_suffix)> { let OperandType = opType#"_V2INT16"; let ParserMatchClass = RegImmMatcher; let DecoderMethod = "decodeOperand_VSrcV216"; } - def _v2f16 : RegisterOperand(rc#"_32")> { + def _v2f16 : RegisterOperand(rc#rc_suffix)> { let OperandType = opType#"_V2FP16"; let ParserMatchClass = RegImmMatcher; let DecoderMethod = "decodeOperand_VSrcV216"; @@ -611,6 +971,21 @@ multiclass SIRegOperand { } } +multiclass SIRegOperand : + SIRegOperand32 { + let OperandNamespace = "AMDGPU" in { + def _b64 : RegisterOperand(rc#"_64")> { + let OperandType = opType#"_INT64"; + let ParserMatchClass = RegImmMatcher; + } + + def _f64 : RegisterOperand(rc#"_64")> { + let OperandType = opType#"_FP64"; + let ParserMatchClass = RegImmMatcher; + } + } +} + // FIXME: 64-bit sources can sometimes use 32-bit constants. 
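
To make the 32/64-bit split concrete: SIRegOperand32 above stamps out the 16-bit, 32-bit, and packed operand flavors, and SIRegOperand layers the 64-bit ones on top, so a single defm below produces the whole operand family. An illustrative expansion (my paraphrase, not generated output) using the suffixes defined above:

    // defm VCSrc : RegInlineOperand<"VS", "VCSrc"> yields:
    //   VCSrc_b16, VCSrc_f16, VCSrc_b32, VCSrc_f32,
    //   VCSrc_v2b16, VCSrc_v2f16            (from SIRegOperand32)
    //   VCSrc_b64, VCSrc_f64                (added by SIRegOperand)
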
multiclass RegImmOperand : SIRegOperand; @@ -618,20 +993,32 @@ multiclass RegImmOperand multiclass RegInlineOperand : SIRegOperand; +multiclass RegInlineOperand32 + : SIRegOperand32; + +multiclass RegInlineOperandAC + : SIRegOperand32; + //===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// defm SSrc : RegImmOperand<"SReg", "SSrc">; +def SSrcOrLds_b32 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM_INT32"; + let ParserMatchClass = RegImmMatcher<"SSrcOrLdsB32">; +} + //===----------------------------------------------------------------------===// // SCSrc_* Operands with an SGPR or an inline constant //===----------------------------------------------------------------------===// defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; -def SCSrc_i1 : RegisterOperand; - //===----------------------------------------------------------------------===// // VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate //===----------------------------------------------------------------------===// @@ -653,8 +1040,46 @@ def VRegSrc_32 : RegisterOperand { let DecoderMethod = "DecodeVS_32RegisterClass"; } +//===----------------------------------------------------------------------===// +// ASrc_* Operands with an AccVGPR +//===----------------------------------------------------------------------===// + +def ARegSrc_32 : RegisterOperand { + let DecoderMethod = "DecodeAGPR_32RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + //===----------------------------------------------------------------------===// // VCSrc_* Operands with an SGPR, VGPR or an inline constant //===----------------------------------------------------------------------===// defm VCSrc : RegInlineOperand<"VS", "VCSrc">; + +//===----------------------------------------------------------------------===// +// VISrc_* Operands with a VGPR or an inline constant +//===----------------------------------------------------------------------===// + +defm VISrc : RegInlineOperand32<"VGPR", "VISrc">; + +//===----------------------------------------------------------------------===// +// AVSrc_* Operands with an AGPR or VGPR +//===----------------------------------------------------------------------===// + +def AVSrc_32 : RegisterOperand { + let DecoderMethod = "DecodeAV_32RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVSrc_64 : RegisterOperand { + let DecoderMethod = "DecodeAV_64RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +//===----------------------------------------------------------------------===// +// ACSrc_* Operands with an AGPR or an inline constant +//===----------------------------------------------------------------------===// + +defm AISrc : RegInlineOperandAC<"AGPR", "AISrc">; +defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">; +defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">; +defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">; diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td index 7af69cb6a46d..824d1aeb0df9 100644 --- a/lib/Target/AMDGPU/SISchedule.td +++ b/lib/Target/AMDGPU/SISchedule.td @@ -1,9 +1,8 @@ //===-- SISchedule.td - SI Scheduling definitions ------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open
Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,6 +24,9 @@ def WriteSMEM : SchedWrite; def WriteVMEM : SchedWrite; def WriteBarrier : SchedWrite; +def MIVGPRRead : SchedRead; +def MIMFMARead : SchedRead; + // Vector ALU instructions def Write32Bit : SchedWrite; def WriteQuarterRate32 : SchedWrite; @@ -38,9 +40,17 @@ def WriteDouble : SchedWrite; // half rate f64 instruction (same as v_add_f64) def WriteDoubleAdd : SchedWrite; +// Conversion to or from f64 instruction +def WriteDoubleCvt : SchedWrite; + // Half rate 64-bit instructions. def Write64Bit : SchedWrite; +// mAI multipass instructions. +def Write2PassMAI : SchedWrite; +def Write8PassMAI : SchedWrite; +def Write16PassMAI : SchedWrite; + // FIXME: Should there be a class for instructions which are VALU // instructions and have VALU rates, but write to the SALU (i.e. VOPC // instructions) @@ -62,6 +72,7 @@ class SISchedMachineModel : SchedMachineModel { def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; +def GFX10SpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? def HWBranch : ProcResource<1> { @@ -82,6 +93,9 @@ def HWVMEM : ProcResource<1> { def HWVALU : ProcResource<1> { let BufferSize = 1; } +def HWRC : ProcResource<1> { // Register destination cache + let BufferSize = 1; +} class HWWriteRes resources, int latency> : WriteRes { @@ -91,6 +105,11 @@ class HWWriteRes resources, class HWVALUWriteRes : HWWriteRes; +def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>; + +def MIReadVGPR : SchedReadVariant<[ + SchedVar, + SchedVar]>; // The latency numbers are taken from AMD Accelerated Parallel Processing // guide. They may not be accurate. @@ -109,6 +128,24 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; + def : HWVALUWriteRes; + def : HWVALUWriteRes; + def : HWVALUWriteRes; + + def : ReadAdvance; + def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>; + + // Technically, mfma reads can take from 0 to 4 cycles, but that does not + // make sense to model because the register setup is huge. In particular, if + // we properly modeled the read advance as -2 for a vgpr read, it would + // result in bad scheduling of acc writes before that mfma. To avoid that we + // would need 2 or 4 more vgprs to be initialized before the acc write + // sequence. Just assume the worst case here.
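+ // (For reference: a ReadAdvance value is the number of cycles after issue at + // which an operand is actually read, so a positive value shortens the + // effective latency of the producer, while a negative value such as the -2 + // discussed above would require the vgpr to be ready before issue and + // lengthen every vgpr-to-mfma dependence.)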
+ def : ReadAdvance; + + def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>; + def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>; + def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>; } def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>; @@ -125,6 +162,7 @@ defm : SICommonWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; +def : HWVALUWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; @@ -137,7 +175,32 @@ defm : SICommonWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; +def : HWVALUWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; } // End SchedModel = SIQuarterSpeedModel + +let SchedModel = GFX10SpeedModel in { + +// The latency values are 1 / (operations / cycle). +// Add 1 stall cycle for VGPR read. +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; + +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; + +def : InstRW<[WriteCopy], (instrs COPY)>; + +} // End SchedModel = GFX10SpeedModel diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 6ad7dd0e3a7c..7ee178149c7a 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -1,9 +1,8 @@ //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// The pass tries to use the 32-bit encoding for instructions when possible. //===----------------------------------------------------------------------===// @@ -39,6 +38,8 @@ class SIShrinkInstructions : public MachineFunctionPass { public: static char ID; + void shrinkMIMG(MachineInstr &MI); + public: SIShrinkInstructions() : MachineFunctionPass(ID) { } @@ -94,6 +95,10 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, Src0.setSubReg(0); Src0.ChangeToFrameIndex(MovSrc.getIndex()); ConstantFolded = true; + } else if (MovSrc.isGlobal()) { + Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(), + MovSrc.getTargetFlags()); + ConstantFolded = true; } if (ConstantFolded) { @@ -212,6 +217,96 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { } } +// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. 
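+// (NSA is GFX10's "non-sequential address" MIMG encoding: each address +// component may live in an arbitrary VGPR at the cost of a longer instruction +// word. When the components already sit in consecutive VGPRs, the classic +// encoding with a single base register is equivalent and smaller, which is +// the rewrite performed below.)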
+void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); + if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + return; + + MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); + unsigned NewAddrDwords = Info->VAddrDwords; + const TargetRegisterClass *RC; + + if (Info->VAddrDwords == 2) { + RC = &AMDGPU::VReg_64RegClass; + } else if (Info->VAddrDwords == 3) { + RC = &AMDGPU::VReg_96RegClass; + } else if (Info->VAddrDwords == 4) { + RC = &AMDGPU::VReg_128RegClass; + } else if (Info->VAddrDwords <= 8) { + RC = &AMDGPU::VReg_256RegClass; + NewAddrDwords = 8; + } else { + RC = &AMDGPU::VReg_512RegClass; + NewAddrDwords = 16; + } + + unsigned VgprBase = 0; + bool IsUndef = true; + bool IsKill = NewAddrDwords == Info->VAddrDwords; + for (unsigned i = 0; i < Info->VAddrDwords; ++i) { + const MachineOperand &Op = MI.getOperand(VAddr0Idx + i); + unsigned Vgpr = TRI.getHWRegIndex(Op.getReg()); + + if (i == 0) { + VgprBase = Vgpr; + } else if (VgprBase + i != Vgpr) + return; + + if (!Op.isUndef()) + IsUndef = false; + if (!Op.isKill()) + IsKill = false; + } + + if (VgprBase + NewAddrDwords > 256) + return; + + // Further check for implicit tied operands - this may be present if TFE is + // enabled + int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); + int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe); + unsigned TFEVal = MI.getOperand(TFEIdx).getImm(); + unsigned LWEVal = MI.getOperand(LWEIdx).getImm(); + int ToUntie = -1; + if (TFEVal || LWEVal) { + // TFE/LWE is enabled so we need to deal with an implicit tied operand + for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) { + if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() && + MI.getOperand(i).isImplicit()) { + // This is the tied operand + assert( + ToUntie == -1 && + "found more than one tied implicit operand when expecting only 1"); + ToUntie = i; + MI.untieRegOperand(ToUntie); + } + } + } + + unsigned NewOpcode = + AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default, + Info->VDataDwords, NewAddrDwords); + MI.setDesc(TII->get(NewOpcode)); + MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase)); + MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); + MI.getOperand(VAddr0Idx).setIsKill(IsKill); + + for (unsigned i = 1; i < Info->VAddrDwords; ++i) + MI.RemoveOperand(VAddr0Idx + 1); + + if (ToUntie >= 0) { + MI.tieOperands( + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), + ToUntie - (Info->VAddrDwords - 1)); + } +} + /// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals. /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. /// If the inverse of the immediate is legal, use ANDN2, ORN2 or @@ -277,7 +372,9 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) { Src0->ChangeToImmediate(NewImm); - MI.RemoveOperand(2); + // Remove the immediate and add the tied input.
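+ // (S_BITSET0/1 overwrite a single bit of $sdst and leave the rest intact, + // so the instruction also reads its destination; operand 2 is repointed at + // the destination register and tied to it below to express that.)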
+ MI.getOperand(2).ChangeToRegister(Dest->getReg(), false); + MI.tieOperands(0, 2); } else { SrcImm->setImm(NewImm); } @@ -458,6 +555,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); + unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; std::vector I1Defs; @@ -596,6 +694,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + if (TII->isMIMG(MI.getOpcode()) && + ST.getGeneration() >= AMDGPUSubtarget::GFX10 && + MF.getProperties().hasProperty( + MachineFunctionProperties::Property::NoVRegs)) { + shrinkMIMG(MI); + continue; + } + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) continue; @@ -625,10 +731,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // So, instead of forcing the instruction to write to VCC, we provide // a hint to the register allocator to use VCC and then we will run // this pass again after RA and shrink it if it outputs to VCC. - MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); + MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg); continue; } - if (DstReg != AMDGPU::VCC) + if (DstReg != VCCReg) continue; } @@ -641,10 +747,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; unsigned SReg = Src2->getReg(); if (TargetRegisterInfo::isVirtualRegister(SReg)) { - MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC); + MRI.setRegAllocationHint(SReg, 0, VCCReg); continue; } - if (SReg != AMDGPU::VCC) + if (SReg != VCCReg) continue; } @@ -657,20 +763,24 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { AMDGPU::OpName::src2); if (SDst) { - if (SDst->getReg() != AMDGPU::VCC) { + bool Next = false; + + if (SDst->getReg() != VCCReg) { if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) - MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); - continue; + MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); + Next = true; } // All of the instructions with carry outs also have an SGPR input in // src2. - if (Src2 && Src2->getReg() != AMDGPU::VCC) { + if (Src2 && Src2->getReg() != VCCReg) { if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) - MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC); + MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); + Next = true; + } + if (Next) continue; - } } // We can shrink this instruction diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 879726b1528c..4e07efff55d8 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1,9 +1,8 @@ //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -149,6 +148,7 @@ private: CallingConv::ID CallingConv; const SIInstrInfo *TII; const SIRegisterInfo *TRI; + const GCNSubtarget *ST; MachineRegisterInfo *MRI; LiveIntervals *LIS; @@ -201,6 +201,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -277,7 +279,7 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, // for VCC, which can appear as the (implicit) input of a uniform branch, // e.g. when a loop counter is stored in a VGPR. if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - if (Reg == AMDGPU::EXEC) + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) continue; for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) { @@ -386,7 +388,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, unsigned Reg = MO.getReg(); if (!TRI->isVirtualRegister(Reg) && - TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) { + TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) { Flags = StateWQM; break; } @@ -619,13 +621,16 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineInstr *MI; if (SaveWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? + AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64), SaveWQM) .addReg(LiveMaskReg); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? + AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64), + Exec) + .addReg(Exec) .addReg(LiveMaskReg); } @@ -637,13 +642,15 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, unsigned SavedWQM) { MachineInstr *MI; + unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; if (SavedWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec) .addReg(SavedWQM); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? + AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); } LIS->InsertMachineInstrInMaps(*MI); @@ -655,8 +662,7 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, MachineInstr *MI; assert(SaveOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64), - SaveOrig) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); } @@ -667,7 +673,8 @@ void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, MachineInstr *MI; assert(SavedOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), + ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); } @@ -693,6 +700,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool WQMFromExec = isEntry; char State = (isEntry || !(BI.InNeeds & StateWQM)) ? 
StateExact : StateWQM; char NonWWMState = 0; + const TargetRegisterClass *BoolRC = TRI->getBoolRC(); auto II = MBB.getFirstNonPHI(), IE = MBB.end(); if (isEntry) @@ -780,13 +788,13 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (Needs == StateWWM) { NonWWMState = State; - SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + SavedNonWWMReg = MRI->createVirtualRegister(BoolRC); toWWM(MBB, Before, SavedNonWWMReg); State = StateWWM; } else { if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { if (!WQMFromExec && (OutNeeds & StateWQM)) - SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + SavedWQMReg = MRI->createVirtualRegister(BoolRC); toExact(MBB, Before, SavedWQMReg, LiveMaskReg); State = StateExact; @@ -838,7 +846,23 @@ void SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToCopyInstrs) { for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--) MI->RemoveOperand(i); - MI->setDesc(TII->get(AMDGPU::COPY)); + + const unsigned Reg = MI->getOperand(0).getReg(); + + if (TRI->isVGPR(*MRI, Reg)) { + const TargetRegisterClass *regClass = + TargetRegisterInfo::isVirtualRegister(Reg) + ? MRI->getRegClass(Reg) + : TRI->getPhysRegClass(Reg); + + const unsigned MovOp = TII->getMovOpcode(regClass); + MI->setDesc(TII->get(MovOp)); + + // And make it implicitly depend on exec (like all VALU movs should do). + MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + } else { + MI->setDesc(TII->get(AMDGPU::COPY)); + } } } @@ -849,17 +873,18 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToCopyInstrs.clear(); CallingConv = MF.getFunction().getCallingConv(); - const GCNSubtarget &ST = MF.getSubtarget(); + ST = &MF.getSubtarget(); - TII = ST.getInstrInfo(); + TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis(); char GlobalFlags = analyzeFunction(MF); unsigned LiveMaskReg = 0; + unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; if (!(GlobalFlags & StateWQM)) { - lowerLiveMaskQueries(AMDGPU::EXEC); + lowerLiveMaskQueries(Exec); if (!(GlobalFlags & StateWWM)) return !LiveMaskQueries.empty(); } else { @@ -868,10 +893,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { - LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(AMDGPU::EXEC); + .addReg(Exec); LIS->InsertMachineInstrInMaps(*MI); } @@ -879,9 +904,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { if (GlobalFlags == StateWQM) { // For a shader that needs only WQM, we can just set it once. - BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ? 
+ AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); lowerCopyInstrs(); // EntryMI may become invalid here diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 8a063e1a4867..1b410b6b5912 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -1,9 +1,8 @@ //===---- SMInstructions.td - Scalar Memory Instruction Definitions -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -34,7 +33,6 @@ class SM_Pseudo patt let hasSideEffects = 0; let UseNamedOperandTable = 1; let SchedRW = [WriteSMEM]; - let SubtargetPredicate = isGCN; string Mnemonic = opName; string AsmOperands = asmOps; @@ -42,6 +40,7 @@ class SM_Pseudo patt bits<1> has_sbase = 1; bits<1> has_sdst = 1; bit has_glc = 0; + bit has_dlc = 0; bits<1> has_offset = 1; bits<1> offset_is_imm = 0; } @@ -81,6 +80,7 @@ class SM_Load_Pseudo let mayLoad = 1; let mayStore = 0; let has_glc = 1; + let has_dlc = 1; } class SM_Store_Pseudo pattern = []> @@ -90,6 +90,7 @@ class SM_Store_Pseudo pattern let mayLoad = 0; let mayStore = 1; let has_glc = 1; + let has_dlc = 1; let ScalarStore = 1; } @@ -110,21 +111,23 @@ multiclass SM_Pseudo_Loads { def _IMM : SM_Load_Pseudo { + (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc), + " $sdst, $sbase, $offset$glc$dlc", []> { let offset_is_imm = 1; let BaseClass = baseClass; let PseudoInstr = opName # "_IMM"; let has_glc = 1; + let has_dlc = 1; } def _SGPR : SM_Load_Pseudo { + (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc), + " $sdst, $sbase, $offset$glc$dlc", []> { let BaseClass = baseClass; let PseudoInstr = opName # "_SGPR"; let has_glc = 1; + let has_dlc = 1; } } @@ -132,8 +135,8 @@ multiclass SM_Pseudo_Stores { def _IMM : SM_Store_Pseudo { + (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc), + " $sdata, $sbase, $offset$glc$dlc", []> { let offset_is_imm = 1; let BaseClass = baseClass; let SrcClass = srcClass; @@ -141,8 +144,8 @@ multiclass SM_Pseudo_Stores { + (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc), + " $sdata, $sbase, $offset$glc$dlc", []> { let BaseClass = baseClass; let SrcClass = srcClass; let PseudoInstr = opName # "_SGPR"; @@ -154,17 +157,25 @@ multiclass SM_Pseudo_Discards { def _SGPR : SM_Discard_Pseudo ; } -class SM_Time_Pseudo : SM_Pseudo< +class SM_Time_Pseudo : SM_Pseudo< opName, (outs SReg_64_XEXEC:$sdst), (ins), " $sdst", [(set i64:$sdst, (node))]> { let hasSideEffects = 1; - let mayStore = 0; + + // FIXME: This should be definitively mayStore = 0. TableGen + // brokenly tries to infer these based on the intrinsic properties + // corresponding to the IR attributes. The target intrinsics are + // considered as writing to memory for IR dependency purposes, but + // those can be modeled with hasSideEffects here. These also end up + // inferring differently for llvm.readcyclecounter and the amdgcn + // intrinsics.
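+ // (In TableGen a '?' leaves the field unset, so the value inferred from the + // pattern is used rather than an explicit one it could conflict with.)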
+ let mayStore = ?; let mayLoad = 1; let has_sbase = 0; let has_offset = 0; } -class SM_Inval_Pseudo : SM_Pseudo< +class SM_Inval_Pseudo : SM_Pseudo< opName, (outs), (ins), "", [(node)]> { let hasSideEffects = 1; let mayStore = 1; @@ -178,6 +189,16 @@ multiclass SM_Pseudo_Probe { def _SGPR : SM_Probe_Pseudo ; } +class SM_WaveId_Pseudo : SM_Pseudo< + opName, (outs SReg_32_XM0_XEXEC:$sdst), (ins), + " $sdst", [(set i32:$sdst, (node))]> { + let hasSideEffects = 1; + let mayStore = 0; + let mayLoad = 1; + let has_sbase = 0; + let has_offset = 0; +} + //===----------------------------------------------------------------------===// // Scalar Atomic Memory Classes //===----------------------------------------------------------------------===// @@ -191,6 +212,7 @@ class SM_Atomic_Pseudo { let offset_is_imm = isImm; let PseudoInstr = opName # !if(isImm, @@ -266,6 +288,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads < "s_buffer_load_dwordx16", SReg_128, SReg_512 >; +let SubtargetPredicate = HasScalarStores in { defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>; defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>; defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>; @@ -281,25 +304,32 @@ defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores < defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores < "s_buffer_store_dwordx4", SReg_128, SReg_128 >; - +} // End SubtargetPredicate = HasScalarStores def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>; def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>; -let SubtargetPredicate = isCIVI in { +let SubtargetPredicate = isGFX7GFX8GFX9 in { def S_DCACHE_INV_VOL : SM_Inval_Pseudo <"s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; -} // let SubtargetPredicate = isCIVI +} // let SubtargetPredicate = isGFX7GFX8GFX9 -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8Plus in { +let OtherPredicates = [HasScalarStores] in { def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>; def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; +} // End OtherPredicates = [HasScalarStores] def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>; defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>; defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>; -} // SubtargetPredicate = isVI +} // SubtargetPredicate = isGFX8Plus + +let SubtargetPredicate = isGFX10Plus in { +def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">; +def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", int_amdgcn_s_get_waveid_in_workgroup>; +} // End SubtargetPredicate = isGFX10Plus -let SubtargetPredicate = HasFlatScratchInsts, Uses = [FLAT_SCR] in { +let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in { defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>; defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>; defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>; @@ -307,7 +337,7 @@ defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_6 defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>; defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>; defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores 
<"s_scratch_store_dwordx4", SReg_64, SReg_128>; -} // SubtargetPredicate = HasFlatScratchInsts +} // SubtargetPredicate = HasScalarFlatScratchInsts let SubtargetPredicate = HasScalarAtomics in { @@ -369,7 +399,7 @@ defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_6 } // let SubtargetPredicate = HasScalarAtomics -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = HasScalarAtomics in { defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">; defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">; } @@ -387,8 +417,8 @@ class SMRD_Real_si op, SM_Pseudo ps> , SIMCInstr , Enc32 { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; + let AssemblerPredicates = [isGFX6GFX7]; + let DecoderNamespace = "GFX6GFX7"; let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); let Inst{8} = imm; @@ -405,13 +435,13 @@ multiclass SM_Real_Loads_si op, string ps, SM_Load_Pseudo sgprPs = !cast(ps#_SGPR)> { def _IMM_si : SMRD_Real_si { - let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc); + let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc, DLC:$dlc); } // FIXME: The operand name $offset is inconsistent with $soff used // in the pseudo def _SGPR_si : SMRD_Real_si { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); } } @@ -441,8 +471,8 @@ class SMEM_Real_vi op, SM_Pseudo ps> , Enc64 { bit glc; - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; + let AssemblerPredicates = [isGFX8GFX9]; + let DecoderNamespace = "GFX8"; let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); @@ -458,10 +488,10 @@ multiclass SM_Real_Loads_vi op, string ps, SM_Load_Pseudo immPs = !cast(ps#_IMM), SM_Load_Pseudo sgprPs = !cast(ps#_SGPR)> { def _IMM_vi : SMEM_Real_vi { - let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc); + let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); } def _SGPR_vi : SMEM_Real_vi { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); } } @@ -479,11 +509,11 @@ multiclass SM_Real_Stores_vi op, string ps, // FIXME: The operand name $offset is inconsistent with $soff used // in the pseudo def _IMM_vi : SMEM_Real_Store_vi { - let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc); + let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); } def _SGPR_vi : SMEM_Real_Store_vi { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); } } @@ -630,9 +660,9 @@ class SMRD_Real_Load_IMM_ci op, SM_Load_Pseudo ps> : SM_Real, Enc64 { - let AssemblerPredicates = [isCIOnly]; - let DecoderNamespace = "CI"; - let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc); + let AssemblerPredicates = [isGFX7Only]; + let DecoderNamespace = "GFX7"; + let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc); let LGKM_CNT = ps.LGKM_CNT; let SMRD = ps.SMRD; @@ -667,8 +697,8 @@ class SMRD_Real_ci op, SM_Pseudo ps> , SIMCInstr 
, Enc32 { - let AssemblerPredicates = [isCIOnly]; - let DecoderNamespace = "CI"; + let AssemblerPredicates = [isGFX7Only]; + let DecoderNamespace = "GFX7"; let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); let Inst{8} = imm; @@ -684,7 +714,22 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>; // Scalar Memory Patterns //===----------------------------------------------------------------------===// -def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]>; +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]> { + let GISelPredicateCode = [{ + if (!MI.hasOneMemOperand()) + return false; + if (!isInstrUniform(MI)) + return false; + + // FIXME: We should probably be caching this. + SmallVector AddrInfo; + getAddrModeInfo(MI, MRI, AddrInfo); + + if (hasVgprParts(AddrInfo)) + return false; + return true; + }]; +} def SMRDImm : ComplexPattern; def SMRDImm32 : ComplexPattern; @@ -697,41 +742,49 @@ multiclass SMRD_Pattern { // 1. IMM offset def : GCNPat < (smrd_load (SMRDImm i64:$sbase, i32:$offset)), - (vt (!cast(Instr#"_IMM") $sbase, $offset, 0)) + (vt (!cast(Instr#"_IMM") $sbase, $offset, 0, 0)) >; // 2. 32-bit IMM offset on CI def : GCNPat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), - (vt (!cast(Instr#"_IMM_ci") $sbase, $offset, 0))> { - let OtherPredicates = [isCIOnly]; + (vt (!cast(Instr#"_IMM_ci") $sbase, $offset, 0, 0))> { + let OtherPredicates = [isGFX7Only]; } // 3. SGPR offset def : GCNPat < (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), - (vt (!cast(Instr#"_SGPR") $sbase, $offset, 0)) + (vt (!cast(Instr#"_SGPR") $sbase, $offset, 0, 0)) + >; + + // 4. No offset + def : GCNPat < + (vt (smrd_load (i64 SReg_64:$sbase))), + (vt (!cast(Instr#"_IMM") i64:$sbase, 0, 0, 0)) >; } multiclass SMLoad_Pattern { // 1. Offset as an immediate def : GCNPat < - (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc), - (vt (!cast(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc))) + (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc, i1:$dlc), + (vt (!cast(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc), + (as_i1imm $dlc))) >; // 2. 32-bit IMM offset on CI def : GCNPat < - (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)), - (!cast(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc))> { - let OtherPredicates = [isCIOnly]; + (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc, i1:$dlc)), + (!cast(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc), (as_i1imm $dlc))> { + let OtherPredicates = [isGFX7Only]; } // 3. Offset loaded in a 32-bit SGPR def : GCNPat < - (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc), - (vt (!cast(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc))) + (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc, i1:$dlc), + (vt (!cast(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc), + (as_i1imm $dlc))) >; } @@ -759,18 +812,202 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; } // End let AddedComplexity = 100 -let OtherPredicates = [isSICI] in { def : GCNPat < (i64 (readcyclecounter)), (S_MEMTIME) >; + +//===----------------------------------------------------------------------===// +// GFX10.
+//===----------------------------------------------------------------------===// + +class SMEM_Real_gfx10 op, SM_Pseudo ps> : + SM_Real, SIMCInstr, Enc64 { + bit glc; + bit dlc; + + let AssemblerPredicates = [isGFX10Plus]; + let DecoderNamespace = "GFX10"; + + let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); + let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); + let Inst{14} = !if(ps.has_dlc, dlc, ?); + let Inst{16} = !if(ps.has_glc, glc, ?); + let Inst{25-18} = op; + let Inst{31-26} = 0x3d; + let Inst{51-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{19-0}, ?), ?); + let Inst{63-57} = !if(ps.offset_is_imm, !cast(SGPR_NULL.HWEncoding), + !if(ps.has_offset, offset{6-0}, ?)); } -let OtherPredicates = [isVI] in { +multiclass SM_Real_Loads_gfx10 op, string ps, + SM_Load_Pseudo immPs = !cast(ps#_IMM), + SM_Load_Pseudo sgprPs = !cast(ps#_SGPR)> { + def _IMM_gfx10 : SMEM_Real_gfx10 { + let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); + } + def _SGPR_gfx10 : SMEM_Real_gfx10 { + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); + } +} -def : GCNPat < - (i64 (readcyclecounter)), - (S_MEMREALTIME) ->; +class SMEM_Real_Store_gfx10 op, SM_Pseudo ps> : SMEM_Real_gfx10 { + bits<7> sdata; + + let sdst = ?; + let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); +} + +multiclass SM_Real_Stores_gfx10 op, string ps, + SM_Store_Pseudo immPs = !cast(ps#_IMM), + SM_Store_Pseudo sgprPs = !cast(ps#_SGPR)> { + // FIXME: The operand name $offset is inconsistent with $soff used + // in the pseudo + def _IMM_gfx10 : SMEM_Real_Store_gfx10 { + let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); + } + + def _SGPR_gfx10 : SMEM_Real_Store_gfx10 { + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); + } +} + +defm S_LOAD_DWORD : SM_Real_Loads_gfx10<0x000, "S_LOAD_DWORD">; +defm S_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x001, "S_LOAD_DWORDX2">; +defm S_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x002, "S_LOAD_DWORDX4">; +defm S_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x003, "S_LOAD_DWORDX8">; +defm S_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x004, "S_LOAD_DWORDX16">; + +let SubtargetPredicate = HasScalarFlatScratchInsts in { +defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_gfx10<0x005, "S_SCRATCH_LOAD_DWORD">; +defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x006, "S_SCRATCH_LOAD_DWORDX2">; +defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x007, "S_SCRATCH_LOAD_DWORDX4">; +} // End SubtargetPredicate = HasScalarFlatScratchInsts + +defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_gfx10<0x008, "S_BUFFER_LOAD_DWORD">; +defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x009, "S_BUFFER_LOAD_DWORDX2">; +defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x00a, "S_BUFFER_LOAD_DWORDX4">; +defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x00b, "S_BUFFER_LOAD_DWORDX8">; +defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x00c, "S_BUFFER_LOAD_DWORDX16">; + +let SubtargetPredicate = HasScalarStores in { +defm S_STORE_DWORD : SM_Real_Stores_gfx10<0x010, "S_STORE_DWORD">; +defm S_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x011, "S_STORE_DWORDX2">; +defm S_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x012, "S_STORE_DWORDX4">; +let OtherPredicates = [HasScalarFlatScratchInsts] in { +defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_gfx10<0x015, "S_SCRATCH_STORE_DWORD">; +defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x016, "S_SCRATCH_STORE_DWORDX2">; +defm 
S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x017, "S_SCRATCH_STORE_DWORDX4">; +} // End OtherPredicates = [HasScalarFlatScratchInsts] +defm S_BUFFER_STORE_DWORD : SM_Real_Stores_gfx10<0x018, "S_BUFFER_STORE_DWORD">; +defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x019, "S_BUFFER_STORE_DWORDX2">; +defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x01a, "S_BUFFER_STORE_DWORDX4">; +} // End SubtargetPredicate = HasScalarStores + +def S_MEMREALTIME_gfx10 : SMEM_Real_gfx10<0x025, S_MEMREALTIME>; +def S_MEMTIME_gfx10 : SMEM_Real_gfx10<0x024, S_MEMTIME>; +def S_GL1_INV_gfx10 : SMEM_Real_gfx10<0x01f, S_GL1_INV>; +def S_GET_WAVEID_IN_WORKGROUP_gfx10 : SMEM_Real_gfx10<0x02a, S_GET_WAVEID_IN_WORKGROUP>; +def S_DCACHE_INV_gfx10 : SMEM_Real_gfx10<0x020, S_DCACHE_INV>; + +let SubtargetPredicate = HasScalarStores in { +def S_DCACHE_WB_gfx10 : SMEM_Real_gfx10<0x021, S_DCACHE_WB>; +} // End SubtargetPredicate = HasScalarStores + +multiclass SM_Real_Probe_gfx10 op, string ps> { + def _IMM_gfx10 : SMEM_Real_Store_gfx10 (ps#_IMM)>; + def _SGPR_gfx10 : SMEM_Real_Store_gfx10 (ps#_SGPR)>; +} + +defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">; +defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27, "S_ATC_PROBE_BUFFER">; + +class SMEM_Atomic_Real_gfx10 op, SM_Atomic_Pseudo ps> + : SMEM_Real_gfx10 { + + bits<7> sdata; + bit dlc; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + let glc = ps.glc; + + let Inst{14} = !if(ps.has_dlc, dlc, 0); + let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0}); +} + +multiclass SM_Real_Atomics_gfx10 op, string ps> { + def _IMM_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_IMM)>; + def _SGPR_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_SGPR)>; + def _IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_IMM_RTN)>; + def _SGPR_RTN_gfx10 : SMEM_Atomic_Real_gfx10 (ps#_SGPR_RTN)>; +} + +let SubtargetPredicate = HasScalarAtomics in { -} // let OtherPredicates = [isVI] +defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x40, "S_BUFFER_ATOMIC_SWAP">; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x41, "S_BUFFER_ATOMIC_CMPSWAP">; +defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x42, "S_BUFFER_ATOMIC_ADD">; +defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x43, "S_BUFFER_ATOMIC_SUB">; +defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x44, "S_BUFFER_ATOMIC_SMIN">; +defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x45, "S_BUFFER_ATOMIC_UMIN">; +defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x46, "S_BUFFER_ATOMIC_SMAX">; +defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x47, "S_BUFFER_ATOMIC_UMAX">; +defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x48, "S_BUFFER_ATOMIC_AND">; +defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x49, "S_BUFFER_ATOMIC_OR">; +defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x4a, "S_BUFFER_ATOMIC_XOR">; +defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x4b, "S_BUFFER_ATOMIC_INC">; +defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x4c, "S_BUFFER_ATOMIC_DEC">; + +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0x60, "S_BUFFER_ATOMIC_SWAP_X2">; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0x62, "S_BUFFER_ATOMIC_ADD_X2">; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0x63, "S_BUFFER_ATOMIC_SUB_X2">; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0x64, "S_BUFFER_ATOMIC_SMIN_X2">; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0x65, "S_BUFFER_ATOMIC_UMIN_X2">; +defm 
S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0x66, "S_BUFFER_ATOMIC_SMAX_X2">; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0x67, "S_BUFFER_ATOMIC_UMAX_X2">; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0x68, "S_BUFFER_ATOMIC_AND_X2">; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0x69, "S_BUFFER_ATOMIC_OR_X2">; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0x6a, "S_BUFFER_ATOMIC_XOR_X2">; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0x6b, "S_BUFFER_ATOMIC_INC_X2">; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0x6c, "S_BUFFER_ATOMIC_DEC_X2">; + +defm S_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x80, "S_ATOMIC_SWAP">; +defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x81, "S_ATOMIC_CMPSWAP">; +defm S_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x82, "S_ATOMIC_ADD">; +defm S_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x83, "S_ATOMIC_SUB">; +defm S_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x84, "S_ATOMIC_SMIN">; +defm S_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x85, "S_ATOMIC_UMIN">; +defm S_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x86, "S_ATOMIC_SMAX">; +defm S_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x87, "S_ATOMIC_UMAX">; +defm S_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x88, "S_ATOMIC_AND">; +defm S_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x89, "S_ATOMIC_OR">; +defm S_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x8a, "S_ATOMIC_XOR">; +defm S_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x8b, "S_ATOMIC_INC">; +defm S_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x8c, "S_ATOMIC_DEC">; + +defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0xa0, "S_ATOMIC_SWAP_X2">; +defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0xa1, "S_ATOMIC_CMPSWAP_X2">; +defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0xa2, "S_ATOMIC_ADD_X2">; +defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0xa3, "S_ATOMIC_SUB_X2">; +defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0xa4, "S_ATOMIC_SMIN_X2">; +defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0xa5, "S_ATOMIC_UMIN_X2">; +defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0xa6, "S_ATOMIC_SMAX_X2">; +defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0xa7, "S_ATOMIC_UMAX_X2">; +defm S_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0xa8, "S_ATOMIC_AND_X2">; +defm S_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0xa9, "S_ATOMIC_OR_X2">; +defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0xaa, "S_ATOMIC_XOR_X2">; +defm S_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0xab, "S_ATOMIC_INC_X2">; +defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac, "S_ATOMIC_DEC_X2">; + +multiclass SM_Real_Discard_gfx10 op, string ps> { + def _IMM_gfx10 : SMEM_Real_gfx10 (ps#_IMM)>; + def _SGPR_gfx10 : SMEM_Real_gfx10 (ps#_SGPR)>; +} + +defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">; +defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29, "S_DCACHE_DISCARD_X2">; + +} // End SubtargetPredicate = HasScalarAtomics diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index ca5e981ac5c2..dfafdccc05a3 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -1,15 +1,15 @@ //===-- SOPInstructions.td - SOP Instruction Definitions ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// def GPRIdxModeMatchClass : AsmOperandClass { let Name = "GPRIdxMode"; let PredicateMethod = "isGPRIdxMode"; + let ParserMethod = "parseGPRIdxMode"; let RenderMethod = "addImmOperands"; } @@ -26,7 +26,6 @@ class SOP_Pseudo op, SOP1_Pseudo ps> : let Inst{31-23} = 0x17d; //encoding; } -class SOP1_32 pattern=[]> : SOP1_Pseudo < - opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0), - "$sdst, $src0", pattern ->; +class SOP1_32 pattern=[], bit tied_in = 0> : SOP1_Pseudo < + opName, (outs SReg_32:$sdst), + !if(tied_in, (ins SSrc_b32:$src0, SReg_32:$sdst_in), + (ins SSrc_b32:$src0)), + "$sdst, $src0", pattern> { + let Constraints = !if(tied_in, "$sdst = $sdst_in", ""); +} // 32-bit input, no output. class SOP1_0_32 pattern = []> : SOP1_Pseudo < @@ -108,10 +110,13 @@ class SOP1_32_64 pattern=[]> : SOP1_Pseudo < >; // 32-bit input, 64-bit output. -class SOP1_64_32 pattern=[]> : SOP1_Pseudo < - opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0), - "$sdst, $src0", pattern ->; +class SOP1_64_32 pattern=[], bit tied_in = 0> : SOP1_Pseudo < + opName, (outs SReg_64:$sdst), + !if(tied_in, (ins SSrc_b32:$src0, SReg_64:$sdst_in), + (ins SSrc_b32:$src0)), + "$sdst, $src0", pattern> { + let Constraints = !if(tied_in, "$sdst = $sdst_in", ""); +} // no input, 64-bit output. class SOP1_64_0 pattern=[]> : SOP1_Pseudo < @@ -120,8 +125,8 @@ class SOP1_64_0 pattern=[]> : SOP1_Pseudo < } // 64-bit input, no output -class SOP1_1 pattern=[]> : SOP1_Pseudo < - opName, (outs), (ins SReg_64:$src0), "$src0", pattern> { +class SOP1_1 pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins rc:$src0), "$src0", pattern> { let has_sdst = 0; } @@ -147,12 +152,24 @@ let Defs = [SCC] in { [(set i64:$sdst, (not i64:$src0))] >; def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; - def S_WQM_B64 : SOP1_64 <"s_wqm_b64", - [(set i1:$sdst, (int_amdgcn_wqm_vote i1:$src0))] - >; + def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; } // End Defs = [SCC] +let WaveSizePredicate = isWave32 in { +def : GCNPat < + (int_amdgcn_wqm_vote i1:$src0), + (S_WQM_B32 $src0) +>; +} + +let WaveSizePredicate = isWave64 in { +def : GCNPat < + (int_amdgcn_wqm_vote i1:$src0), + (S_WQM_B64 $src0) +>; +} + def S_BREV_B32 : SOP1_32 <"s_brev_b32", [(set i32:$sdst, (bitreverse i32:$src0))] >; @@ -191,10 +208,10 @@ def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16", [(set i32:$sdst, (sext_inreg i32:$src0, i16))] >; -def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">; -def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">; -def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">; -def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; +def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32", [], 1>; +def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>; +def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>; +def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>; def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", [(set i64:$sdst, (int_amdgcn_s_getpc))] >; @@ -207,7 +224,7 @@ def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; let isReturn = 1 in { // Define variant marked as return rather than branch. 
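// (The change below additionally constrains the return variant's source to // CCR_SGPR_64, matching the register-class parameter added to SOP1_1 above.)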
-def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>; +def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>; } } // End isTerminator = 1, isBarrier = 1 @@ -241,8 +258,11 @@ def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">; def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">; } // End Uses = [M0] +let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in { def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">; def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">; +} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9 + let Defs = [SCC] in { def S_ABS_I32 : SOP1_32 <"s_abs_i32">; } // End Defs = [SCC] @@ -255,7 +275,7 @@ def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> { } } -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = isGFX9Plus in { let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in { def S_ANDN1_SAVEEXEC_B64 : SOP1_64<"s_andn1_saveexec_b64">; def S_ORN1_SAVEEXEC_B64 : SOP1_64<"s_orn1_saveexec_b64">; @@ -264,7 +284,28 @@ let SubtargetPredicate = isGFX9 in { } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">; -} // End SubtargetPredicate = isGFX9 +} // End SubtargetPredicate = isGFX9Plus + +let SubtargetPredicate = isGFX10Plus in { + let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in { + def S_AND_SAVEEXEC_B32 : SOP1_32<"s_and_saveexec_b32">; + def S_OR_SAVEEXEC_B32 : SOP1_32<"s_or_saveexec_b32">; + def S_XOR_SAVEEXEC_B32 : SOP1_32<"s_xor_saveexec_b32">; + def S_ANDN2_SAVEEXEC_B32 : SOP1_32<"s_andn2_saveexec_b32">; + def S_ORN2_SAVEEXEC_B32 : SOP1_32<"s_orn2_saveexec_b32">; + def S_NAND_SAVEEXEC_B32 : SOP1_32<"s_nand_saveexec_b32">; + def S_NOR_SAVEEXEC_B32 : SOP1_32<"s_nor_saveexec_b32">; + def S_XNOR_SAVEEXEC_B32 : SOP1_32<"s_xnor_saveexec_b32">; + def S_ANDN1_SAVEEXEC_B32 : SOP1_32<"s_andn1_saveexec_b32">; + def S_ORN1_SAVEEXEC_B32 : SOP1_32<"s_orn1_saveexec_b32">; + def S_ANDN1_WREXEC_B32 : SOP1_32<"s_andn1_wrexec_b32">; + def S_ANDN2_WREXEC_B32 : SOP1_32<"s_andn2_wrexec_b32">; + } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] + + let Uses = [M0] in { + def S_MOVRELSD_2_B32 : SOP1_32<"s_movrelsd_2_b32">; + } // End Uses = [M0] +} // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// // SOP2 Instructions @@ -302,6 +343,8 @@ class SOP2_Real op, SOP_Pseudo ps> : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let TSFlags = ps.TSFlags; // encoding bits<7> sdst; @@ -468,22 +511,22 @@ let AddedComplexity = 1 in { let Defs = [SCC] in { // TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3 def S_LSHL_B32 : SOP2_32 <"s_lshl_b32", - [(set i32:$sdst, (UniformBinFrag i32:$src0, i32:$src1))] + [(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] >; def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64", - [(set i64:$sdst, (UniformBinFrag i64:$src0, i32:$src1))] + [(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] >; def S_LSHR_B32 : SOP2_32 <"s_lshr_b32", - [(set i32:$sdst, (UniformBinFrag i32:$src0, i32:$src1))] + [(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] >; def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64", - [(set i64:$sdst, (UniformBinFrag i64:$src0, i32:$src1))] + [(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] >; def S_ASHR_I32 : SOP2_32 <"s_ashr_i32", - [(set 
i32:$sdst, (UniformBinFrag i32:$src0, i32:$src1))] + [(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] >; def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64", - [(set i64:$sdst, (UniformBinFrag i64:$src0, i32:$src1))] + [(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] >; } // End Defs = [SCC] @@ -512,13 +555,14 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo < "$src0, $src1" > { let has_sdst = 0; + let SubtargetPredicate = isGFX6GFX7GFX8GFX9; } let Defs = [SCC] in { def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; } // End Defs = [SCC] -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8GFX9 in { def S_RFE_RESTORE_B64 : SOP2_Pseudo < "s_rfe_restore_b64", (outs), (ins SSrc_b64:$src0, SSrc_b32:$src1), @@ -529,7 +573,7 @@ let SubtargetPredicate = isVI in { } } -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = isGFX9Plus in { def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">; def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">; def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; @@ -543,7 +587,7 @@ let SubtargetPredicate = isGFX9 in { def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">; def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">; -} +} // End SubtargetPredicate = isGFX9Plus //===----------------------------------------------------------------------===// // SOPK Instructions @@ -555,7 +599,6 @@ class SOPK_Pseudo { let isPseudo = 1; let isCodeGenOnly = 1; - let SubtargetPredicate = isGCN; let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -618,6 +661,19 @@ class SOPK_32 pattern=[]> : SOPK_Pseudo < "$sdst, $simm16", pattern>; +class SOPK_32_BR pattern=[]> : SOPK_Pseudo < + opName, + (outs), + (ins sopp_brtarget:$simm16, SReg_32:$sdst), + "$sdst, $simm16", + pattern> { + let Defs = [EXEC]; + let Uses = [EXEC]; + let isBranch = 1; + let isTerminator = 1; + let SchedRW = [WriteBranch]; +} + class SOPK_SCC : SOPK_Pseudo < opName, (outs), @@ -684,9 +740,10 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">; } +let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in def S_CBRANCH_I_FORK : SOPK_Pseudo < "s_cbranch_i_fork", - (outs), (ins SReg_64:$sdst, s16imm:$simm16), + (outs), (ins SReg_64:$sdst, sopp_brtarget:$simm16), "$sdst, $simm16" >; @@ -720,15 +777,46 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo < } // End hasSideEffects = 1 -let SubtargetPredicate = isGFX9 in { +class SOPK_WAITCNT pat=[]> : + SOPK_Pseudo< + opName, + (outs), + (ins SReg_32:$sdst, s16imm:$simm16), + "$sdst, $simm16", + pat> { + let hasSideEffects = 1; + let mayLoad = 1; + let mayStore = 1; + let has_sdst = 1; // First source takes place of sdst in encoding +} + +let SubtargetPredicate = isGFX9Plus in { def S_CALL_B64 : SOPK_Pseudo< "s_call_b64", (outs SReg_64:$sdst), - (ins s16imm:$simm16), + (ins sopp_brtarget:$simm16), "$sdst, $simm16"> { let isCall = 1; } -} +} // End SubtargetPredicate = isGFX9Plus + +let SubtargetPredicate = isGFX10Plus in { + def S_VERSION : SOPK_Pseudo< + "s_version", + (outs), + (ins s16imm:$simm16), + "$simm16"> { + let has_sdst = 0; + } + + def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">; + def S_SUBVECTOR_LOOP_END : SOPK_32_BR<"s_subvector_loop_end">; + + def S_WAITCNT_VSCNT : SOPK_WAITCNT<"s_waitcnt_vscnt">; + def S_WAITCNT_VMCNT : SOPK_WAITCNT<"s_waitcnt_vmcnt">; + def S_WAITCNT_EXPCNT : SOPK_WAITCNT<"s_waitcnt_expcnt">; + def S_WAITCNT_LGKMCNT : SOPK_WAITCNT<"s_waitcnt_lgkmcnt">; +} // End SubtargetPredicate = isGFX10Plus 
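+// (A minimal usage sketch, assuming standard gfx10 assembly syntax: the new +// split counters are waited on with, e.g., "s_waitcnt_vscnt null, 0x0", where +// the register operand occupies the sdst field of the SOPK encoding as noted +// in SOPK_WAITCNT above.)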
//===----------------------------------------------------------------------===// // SOPC Instructions @@ -756,7 +844,6 @@ class SOPC op, dag outs, dag ins, string asm, let Defs = [SCC]; let SchedRW = [WriteSALU]; let UseNamedOperandTable = 1; - let SubtargetPredicate = isGCN; } class SOPC_Base op, RegisterOperand rc0, RegisterOperand rc1, @@ -811,12 +898,13 @@ def S_BITCMP0_B32 : SOPC_32 <0x0c, "s_bitcmp0_b32">; def S_BITCMP1_B32 : SOPC_32 <0x0d, "s_bitcmp1_b32">; def S_BITCMP0_B64 : SOPC_64_32 <0x0e, "s_bitcmp0_b64">; def S_BITCMP1_B64 : SOPC_64_32 <0x0f, "s_bitcmp1_b64">; +let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in def S_SETVSKIP : SOPC_32 <0x10, "s_setvskip">; -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8Plus in { def S_CMP_EQ_U64 : SOPC_CMP_64 <0x12, "s_cmp_eq_u64", COND_EQ>; def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>; -} +} // End SubtargetPredicate = isGFX8Plus let SubtargetPredicate = HasVGPRIndexMode in { def S_SET_GPR_IDX_ON : SOPC <0x11, @@ -834,6 +922,10 @@ def S_SET_GPR_IDX_ON : SOPC <0x11, // SOPP Instructions //===----------------------------------------------------------------------===// +class Base_SOPP { + string AsmString = asm; +} + class SOPPe op> : Enc32 { bits <16> simm16; @@ -843,7 +935,7 @@ class SOPPe op> : Enc32 { } class SOPP op, dag ins, string asm, list pattern = []> : - InstSI <(outs), ins, asm, pattern >, SOPPe { + InstSI <(outs), ins, asm, pattern >, SOPPe , Base_SOPP { let mayLoad = 0; let mayStore = 0; @@ -854,92 +946,124 @@ class SOPP op, dag ins, string asm, list pattern = []> : let SchedRW = [WriteSALU]; let UseNamedOperandTable = 1; - let SubtargetPredicate = isGCN; } - def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; +class SOPP_w_nop_e op> : Enc64 { + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding + let Inst{47-32} = 0x0; + let Inst{54-48} = S_NOP.Inst{22-16}; // opcode + let Inst{63-55} = S_NOP.Inst{31-23}; // encoding +} + +class SOPP_w_nop op, dag ins, string asm, list pattern = []> : + InstSI <(outs), ins, asm, pattern >, SOPP_w_nop_e , Base_SOPP { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPP = 1; + let Size = 8; + let SchedRW = [WriteSALU]; + + let UseNamedOperandTable = 1; +} + +multiclass SOPP_With_Relaxation op, dag ins, string asm, list pattern = []> { + def "" : SOPP ; + def _pad_s_nop : SOPP_w_nop ; +} + let isTerminator = 1 in { -def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", - [(AMDGPUendpgm)]> { - let simm16 = 0; +def S_ENDPGM : SOPP <0x00000001, (ins EndpgmImm:$simm16), "s_endpgm$simm16"> { let isBarrier = 1; let isReturn = 1; } -let SubtargetPredicate = isVI in { def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> { + let SubtargetPredicate = isGFX8Plus; let simm16 = 0; let isBarrier = 1; let isReturn = 1; } -} -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = isGFX9Plus in { let isBarrier = 1, isReturn = 1, simm16 = 0 in { def S_ENDPGM_ORDERED_PS_DONE : SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">; } // End isBarrier = 1, isReturn = 1, simm16 = 0 -} // End SubtargetPredicate = isGFX9 +} // End SubtargetPredicate = isGFX9Plus + +let SubtargetPredicate = isGFX10Plus in { + let isBarrier = 1, isReturn = 1, simm16 = 0 in { + def S_CODE_END : + SOPP<0x01f, (ins), "s_code_end">; + } // End isBarrier = 1, isReturn = 1, simm16 = 0 +} // End SubtargetPredicate = isGFX10Plus let isBranch = 1, SchedRW = [WriteBranch] in { -def S_BRANCH 
: SOPP < +let isBarrier = 1 in { +defm S_BRANCH : SOPP_With_Relaxation < 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", - [(br bb:$simm16)]> { - let isBarrier = 1; + [(br bb:$simm16)]>; } let Uses = [SCC] in { -def S_CBRANCH_SCC0 : SOPP < +defm S_CBRANCH_SCC0 : SOPP_With_Relaxation < 0x00000004, (ins sopp_brtarget:$simm16), "s_cbranch_scc0 $simm16" >; -def S_CBRANCH_SCC1 : SOPP < +defm S_CBRANCH_SCC1 : SOPP_With_Relaxation < 0x00000005, (ins sopp_brtarget:$simm16), "s_cbranch_scc1 $simm16" >; } // End Uses = [SCC] let Uses = [VCC] in { -def S_CBRANCH_VCCZ : SOPP < +defm S_CBRANCH_VCCZ : SOPP_With_Relaxation < 0x00000006, (ins sopp_brtarget:$simm16), "s_cbranch_vccz $simm16" >; -def S_CBRANCH_VCCNZ : SOPP < +defm S_CBRANCH_VCCNZ : SOPP_With_Relaxation < 0x00000007, (ins sopp_brtarget:$simm16), "s_cbranch_vccnz $simm16" >; } // End Uses = [VCC] let Uses = [EXEC] in { -def S_CBRANCH_EXECZ : SOPP < +defm S_CBRANCH_EXECZ : SOPP_With_Relaxation < 0x00000008, (ins sopp_brtarget:$simm16), "s_cbranch_execz $simm16" >; -def S_CBRANCH_EXECNZ : SOPP < +defm S_CBRANCH_EXECNZ : SOPP_With_Relaxation < 0x00000009, (ins sopp_brtarget:$simm16), "s_cbranch_execnz $simm16" >; } // End Uses = [EXEC] -def S_CBRANCH_CDBGSYS : SOPP < +defm S_CBRANCH_CDBGSYS : SOPP_With_Relaxation < 0x00000017, (ins sopp_brtarget:$simm16), "s_cbranch_cdbgsys $simm16" >; -def S_CBRANCH_CDBGSYS_AND_USER : SOPP < +defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_With_Relaxation < 0x0000001A, (ins sopp_brtarget:$simm16), "s_cbranch_cdbgsys_and_user $simm16" >; -def S_CBRANCH_CDBGSYS_OR_USER : SOPP < +defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_With_Relaxation < 0x00000019, (ins sopp_brtarget:$simm16), "s_cbranch_cdbgsys_or_user $simm16" >; -def S_CBRANCH_CDBGUSER : SOPP < +defm S_CBRANCH_CDBGUSER : SOPP_With_Relaxation < 0x00000018, (ins sopp_brtarget:$simm16), "s_cbranch_cdbguser $simm16" >; @@ -957,16 +1081,16 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", let isConvergent = 1; } -let SubtargetPredicate = isVI in { def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> { + let SubtargetPredicate = isGFX8Plus; let simm16 = 0; let mayLoad = 1; let mayStore = 1; } -} let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16", + [(int_amdgcn_s_waitcnt UIMM16bit:$simm16)]>; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">; @@ -994,7 +1118,10 @@ def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $ >; } // End Uses = [EXEC, M0] -def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; +def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> { + let isTrap = 1; +} + def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { let simm16 = 0; } @@ -1028,6 +1155,25 @@ def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16), } } +let SubtargetPredicate = isGFX10Plus in { + def S_INST_PREFETCH : + SOPP<0x020, (ins s16imm:$simm16), "s_inst_prefetch $simm16">; + def S_CLAUSE : + SOPP<0x021, (ins s16imm:$simm16), "s_clause $simm16">; + def S_WAITCNT_IDLE : + SOPP <0x022, (ins), "s_wait_idle"> { + let simm16 = 0; + } + def S_WAITCNT_DEPCTR : + SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">; + def S_ROUND_MODE : + SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">; + def S_DENORM_MODE : + SOPP<0x025, 
(ins s16imm:$simm16), "s_denorm_mode $simm16">;
+  def S_TTRACEDATA_IMM :
+    SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
+} // End SubtargetPredicate = isGFX10Plus
+
//===----------------------------------------------------------------------===//
// S_GETREG_B32 Intrinsic Pattern.
//===----------------------------------------------------------------------===//
@@ -1040,6 +1186,11 @@ def : GCNPat <
// SOP1 Patterns
//===----------------------------------------------------------------------===//

+def : GCNPat <
+  (AMDGPUendpgm),
+  (S_ENDPGM (i16 0))
+>;
+
def : GCNPat <
  (i64 (ctpop i64:$src)),
    (i64 (REG_SEQUENCE SReg_64,
@@ -1097,162 +1248,261 @@ def : GCNPat<
>;

+//===----------------------------------------------------------------------===//
+// Target-specific instruction encodings.
+//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
-// SOPP Patterns
+// SOP1 - GFX10.
//===----------------------------------------------------------------------===//

-def : GCNPat <
-  (int_amdgcn_s_waitcnt i32:$simm16),
-  (S_WAITCNT (as_i16imm $simm16))
->;
+class Select_gfx10<string Mnemonic> : SIMCInstr<Mnemonic, SIEncodingFamily.GFX10> {
+  Predicate AssemblerPredicate = isGFX10Plus;
+  string DecoderNamespace      = "GFX10";
+}
+
+multiclass SOP1_Real_gfx10<bits<8> op> {
+  def _gfx10 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
+               Select_gfx10<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+}

+defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>;
+defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>;
+defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>;
+defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10<0x03a>;
+defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10<0x03b>;
+defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03c>;
+defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03d>;
+defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03e>;
+defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03f>;
+defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x040>;
+defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x041>;
+defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x042>;
+defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x043>;
+defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x044>;
+defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x045>;
+defm S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10<0x046>;
+defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>;
+defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;

//===----------------------------------------------------------------------===//
-// Real target instructions, move this to the appropriate subtarget TD file
+// SOP1 - GFX6, GFX7.
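// The *_Real_* multiclasses follow one pattern throughout the rest of this
// file: a subtarget-agnostic pseudo is paired with one real MCInst per
// encoding family, and SIMCInstr ties each real back to the pseudo's
// mnemonic for MC lowering. A sketch of a single expansion, assuming the
// multiclasses defined in this section:
//
//   defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>;
//   // defines S_MOV_B32_gfx6_gfx7 (DecoderNamespace "GFX6GFX7") and
//   // S_MOV_B32_gfx10 (DecoderNamespace "GFX10"), both selected from the
//   // S_MOV_B32 pseudo by encoding family.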
//===----------------------------------------------------------------------===//

-class Select_si <string Mnemonic> :
-  SIMCInstr <Mnemonic, SIEncodingFamily.SI> {
-  list<Predicate> AssemblerPredicates = [isSICI];
-  string DecoderNamespace = "SICI";
+class Select_gfx6_gfx7<string Mnemonic> : SIMCInstr<Mnemonic, SIEncodingFamily.SI> {
+  Predicate AssemblerPredicate = isGFX6GFX7;
+  string DecoderNamespace      = "GFX6GFX7";
}

-class SOP1_Real_si <bits<8> op, SOP1_Pseudo ps> :
-  SOP1_Real<op, ps>,
-  Select_si<ps.Mnemonic>;
+multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
+  def _gfx6_gfx7 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
+                   Select_gfx6_gfx7<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+}

-class SOP2_Real_si <bits<7> op, SOP2_Pseudo ps> :
-  SOP2_Real<op, ps>,
-  Select_si<ps.Mnemonic>;
+multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
+  SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>;
+
+defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>;
+defm S_MOV_REGRD_B32 : SOP1_Real_gfx6_gfx7<0x033>;
+
+defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>;
+defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x004>;
+defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x005>;
+defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x006>;
+defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x007>;
+defm S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x008>;
+defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x009>;
+defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00a>;
+defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00b>;
+defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00c>;
+defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00d>;
+defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00e>;
+defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00f>;
+defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x010>;
+defm S_FF0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x011>;
+defm S_FF0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x012>;
+defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x013>;
+defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x014>;
+defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x015>;
+defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x016>;
+defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x017>;
+defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10<0x018>;
+defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10<0x019>;
+defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10<0x01a>;
+defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01b>;
+defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01c>;
+defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01d>;
+defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01e>;
+defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01f>;
+defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x020>;
+defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x021>;
+defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x022>;
+defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x024>;
+defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x025>;
+defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x026>;
+defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>;
+defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>;
+defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>;
+defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>;
+defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02b>;
+defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>;
+defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>;
+defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>;
+defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02f>;
+defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x030>;
+defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>;
+defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
+defm S_MOV_FED_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x035>;

-class SOPK_Real_si <bits<5> op, SOPK_Pseudo ps> :
-  SOPK_Real32<op, ps>,
-  Select_si<ps.Mnemonic>;
-
-def S_MOV_B32_si :
SOP1_Real_si <0x03, S_MOV_B32>; -def S_MOV_B64_si : SOP1_Real_si <0x04, S_MOV_B64>; -def S_CMOV_B32_si : SOP1_Real_si <0x05, S_CMOV_B32>; -def S_CMOV_B64_si : SOP1_Real_si <0x06, S_CMOV_B64>; -def S_NOT_B32_si : SOP1_Real_si <0x07, S_NOT_B32>; -def S_NOT_B64_si : SOP1_Real_si <0x08, S_NOT_B64>; -def S_WQM_B32_si : SOP1_Real_si <0x09, S_WQM_B32>; -def S_WQM_B64_si : SOP1_Real_si <0x0a, S_WQM_B64>; -def S_BREV_B32_si : SOP1_Real_si <0x0b, S_BREV_B32>; -def S_BREV_B64_si : SOP1_Real_si <0x0c, S_BREV_B64>; -def S_BCNT0_I32_B32_si : SOP1_Real_si <0x0d, S_BCNT0_I32_B32>; -def S_BCNT0_I32_B64_si : SOP1_Real_si <0x0e, S_BCNT0_I32_B64>; -def S_BCNT1_I32_B32_si : SOP1_Real_si <0x0f, S_BCNT1_I32_B32>; -def S_BCNT1_I32_B64_si : SOP1_Real_si <0x10, S_BCNT1_I32_B64>; -def S_FF0_I32_B32_si : SOP1_Real_si <0x11, S_FF0_I32_B32>; -def S_FF0_I32_B64_si : SOP1_Real_si <0x12, S_FF0_I32_B64>; -def S_FF1_I32_B32_si : SOP1_Real_si <0x13, S_FF1_I32_B32>; -def S_FF1_I32_B64_si : SOP1_Real_si <0x14, S_FF1_I32_B64>; -def S_FLBIT_I32_B32_si : SOP1_Real_si <0x15, S_FLBIT_I32_B32>; -def S_FLBIT_I32_B64_si : SOP1_Real_si <0x16, S_FLBIT_I32_B64>; -def S_FLBIT_I32_si : SOP1_Real_si <0x17, S_FLBIT_I32>; -def S_FLBIT_I32_I64_si : SOP1_Real_si <0x18, S_FLBIT_I32_I64>; -def S_SEXT_I32_I8_si : SOP1_Real_si <0x19, S_SEXT_I32_I8>; -def S_SEXT_I32_I16_si : SOP1_Real_si <0x1a, S_SEXT_I32_I16>; -def S_BITSET0_B32_si : SOP1_Real_si <0x1b, S_BITSET0_B32>; -def S_BITSET0_B64_si : SOP1_Real_si <0x1c, S_BITSET0_B64>; -def S_BITSET1_B32_si : SOP1_Real_si <0x1d, S_BITSET1_B32>; -def S_BITSET1_B64_si : SOP1_Real_si <0x1e, S_BITSET1_B64>; -def S_GETPC_B64_si : SOP1_Real_si <0x1f, S_GETPC_B64>; -def S_SETPC_B64_si : SOP1_Real_si <0x20, S_SETPC_B64>; -def S_SWAPPC_B64_si : SOP1_Real_si <0x21, S_SWAPPC_B64>; -def S_RFE_B64_si : SOP1_Real_si <0x22, S_RFE_B64>; -def S_AND_SAVEEXEC_B64_si : SOP1_Real_si <0x24, S_AND_SAVEEXEC_B64>; -def S_OR_SAVEEXEC_B64_si : SOP1_Real_si <0x25, S_OR_SAVEEXEC_B64>; -def S_XOR_SAVEEXEC_B64_si : SOP1_Real_si <0x26, S_XOR_SAVEEXEC_B64>; -def S_ANDN2_SAVEEXEC_B64_si: SOP1_Real_si <0x27, S_ANDN2_SAVEEXEC_B64>; -def S_ORN2_SAVEEXEC_B64_si : SOP1_Real_si <0x28, S_ORN2_SAVEEXEC_B64>; -def S_NAND_SAVEEXEC_B64_si : SOP1_Real_si <0x29, S_NAND_SAVEEXEC_B64>; -def S_NOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2a, S_NOR_SAVEEXEC_B64>; -def S_XNOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2b, S_XNOR_SAVEEXEC_B64>; -def S_QUADMASK_B32_si : SOP1_Real_si <0x2c, S_QUADMASK_B32>; -def S_QUADMASK_B64_si : SOP1_Real_si <0x2d, S_QUADMASK_B64>; -def S_MOVRELS_B32_si : SOP1_Real_si <0x2e, S_MOVRELS_B32>; -def S_MOVRELS_B64_si : SOP1_Real_si <0x2f, S_MOVRELS_B64>; -def S_MOVRELD_B32_si : SOP1_Real_si <0x30, S_MOVRELD_B32>; -def S_MOVRELD_B64_si : SOP1_Real_si <0x31, S_MOVRELD_B64>; -def S_CBRANCH_JOIN_si : SOP1_Real_si <0x32, S_CBRANCH_JOIN>; -def S_MOV_REGRD_B32_si : SOP1_Real_si <0x33, S_MOV_REGRD_B32>; -def S_ABS_I32_si : SOP1_Real_si <0x34, S_ABS_I32>; -def S_MOV_FED_B32_si : SOP1_Real_si <0x35, S_MOV_FED_B32>; - -def S_ADD_U32_si : SOP2_Real_si <0x00, S_ADD_U32>; -def S_ADD_I32_si : SOP2_Real_si <0x02, S_ADD_I32>; -def S_SUB_U32_si : SOP2_Real_si <0x01, S_SUB_U32>; -def S_SUB_I32_si : SOP2_Real_si <0x03, S_SUB_I32>; -def S_ADDC_U32_si : SOP2_Real_si <0x04, S_ADDC_U32>; -def S_SUBB_U32_si : SOP2_Real_si <0x05, S_SUBB_U32>; -def S_MIN_I32_si : SOP2_Real_si <0x06, S_MIN_I32>; -def S_MIN_U32_si : SOP2_Real_si <0x07, S_MIN_U32>; -def S_MAX_I32_si : SOP2_Real_si <0x08, S_MAX_I32>; -def S_MAX_U32_si : SOP2_Real_si <0x09, S_MAX_U32>; -def S_CSELECT_B32_si : 
SOP2_Real_si <0x0a, S_CSELECT_B32>;
-def S_CSELECT_B64_si : SOP2_Real_si <0x0b, S_CSELECT_B64>;
-def S_AND_B32_si : SOP2_Real_si <0x0e, S_AND_B32>;
-def S_AND_B64_si : SOP2_Real_si <0x0f, S_AND_B64>;
-def S_OR_B32_si : SOP2_Real_si <0x10, S_OR_B32>;
-def S_OR_B64_si : SOP2_Real_si <0x11, S_OR_B64>;
-def S_XOR_B32_si : SOP2_Real_si <0x12, S_XOR_B32>;
-def S_XOR_B64_si : SOP2_Real_si <0x13, S_XOR_B64>;
-def S_ANDN2_B32_si : SOP2_Real_si <0x14, S_ANDN2_B32>;
-def S_ANDN2_B64_si : SOP2_Real_si <0x15, S_ANDN2_B64>;
-def S_ORN2_B32_si : SOP2_Real_si <0x16, S_ORN2_B32>;
-def S_ORN2_B64_si : SOP2_Real_si <0x17, S_ORN2_B64>;
-def S_NAND_B32_si : SOP2_Real_si <0x18, S_NAND_B32>;
-def S_NAND_B64_si : SOP2_Real_si <0x19, S_NAND_B64>;
-def S_NOR_B32_si : SOP2_Real_si <0x1a, S_NOR_B32>;
-def S_NOR_B64_si : SOP2_Real_si <0x1b, S_NOR_B64>;
-def S_XNOR_B32_si : SOP2_Real_si <0x1c, S_XNOR_B32>;
-def S_XNOR_B64_si : SOP2_Real_si <0x1d, S_XNOR_B64>;
-def S_LSHL_B32_si : SOP2_Real_si <0x1e, S_LSHL_B32>;
-def S_LSHL_B64_si : SOP2_Real_si <0x1f, S_LSHL_B64>;
-def S_LSHR_B32_si : SOP2_Real_si <0x20, S_LSHR_B32>;
-def S_LSHR_B64_si : SOP2_Real_si <0x21, S_LSHR_B64>;
-def S_ASHR_I32_si : SOP2_Real_si <0x22, S_ASHR_I32>;
-def S_ASHR_I64_si : SOP2_Real_si <0x23, S_ASHR_I64>;
-def S_BFM_B32_si : SOP2_Real_si <0x24, S_BFM_B32>;
-def S_BFM_B64_si : SOP2_Real_si <0x25, S_BFM_B64>;
-def S_MUL_I32_si : SOP2_Real_si <0x26, S_MUL_I32>;
-def S_BFE_U32_si : SOP2_Real_si <0x27, S_BFE_U32>;
-def S_BFE_I32_si : SOP2_Real_si <0x28, S_BFE_I32>;
-def S_BFE_U64_si : SOP2_Real_si <0x29, S_BFE_U64>;
-def S_BFE_I64_si : SOP2_Real_si <0x2a, S_BFE_I64>;
-def S_CBRANCH_G_FORK_si : SOP2_Real_si <0x2b, S_CBRANCH_G_FORK>;
-def S_ABSDIFF_I32_si : SOP2_Real_si <0x2c, S_ABSDIFF_I32>;
-
-def S_MOVK_I32_si : SOPK_Real_si <0x00, S_MOVK_I32>;
-def S_CMOVK_I32_si : SOPK_Real_si <0x02, S_CMOVK_I32>;
-def S_CMPK_EQ_I32_si : SOPK_Real_si <0x03, S_CMPK_EQ_I32>;
-def S_CMPK_LG_I32_si : SOPK_Real_si <0x04, S_CMPK_LG_I32>;
-def S_CMPK_GT_I32_si : SOPK_Real_si <0x05, S_CMPK_GT_I32>;
-def S_CMPK_GE_I32_si : SOPK_Real_si <0x06, S_CMPK_GE_I32>;
-def S_CMPK_LT_I32_si : SOPK_Real_si <0x07, S_CMPK_LT_I32>;
-def S_CMPK_LE_I32_si : SOPK_Real_si <0x08, S_CMPK_LE_I32>;
-def S_CMPK_EQ_U32_si : SOPK_Real_si <0x09, S_CMPK_EQ_U32>;
-def S_CMPK_LG_U32_si : SOPK_Real_si <0x0a, S_CMPK_LG_U32>;
-def S_CMPK_GT_U32_si : SOPK_Real_si <0x0b, S_CMPK_GT_U32>;
-def S_CMPK_GE_U32_si : SOPK_Real_si <0x0c, S_CMPK_GE_U32>;
-def S_CMPK_LT_U32_si : SOPK_Real_si <0x0d, S_CMPK_LT_U32>;
-def S_CMPK_LE_U32_si : SOPK_Real_si <0x0e, S_CMPK_LE_U32>;
-def S_ADDK_I32_si : SOPK_Real_si <0x0f, S_ADDK_I32>;
-def S_MULK_I32_si : SOPK_Real_si <0x10, S_MULK_I32>;
-def S_CBRANCH_I_FORK_si : SOPK_Real_si <0x11, S_CBRANCH_I_FORK>;
-def S_GETREG_B32_si : SOPK_Real_si <0x12, S_GETREG_B32>;
-def S_SETREG_B32_si : SOPK_Real_si <0x13, S_SETREG_B32>;
-//def S_GETREG_REGRD_B32_si : SOPK_Real_si <0x14, S_GETREG_REGRD_B32>; // see pseudo for comments
-def S_SETREG_IMM32_B32_si : SOPK_Real64<0x15, S_SETREG_IMM32_B32>,
-                            Select_si<S_SETREG_IMM32_B32.Mnemonic>;

+//===----------------------------------------------------------------------===//
+// SOP2 - GFX10.
+//===----------------------------------------------------------------------===//
+
+multiclass SOP2_Real_gfx10<bits<7> op> {
+  def _gfx10 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>,
+               Select_gfx10<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+}
+
+defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>;
+defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>;
+defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>;
+defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>;
+defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10<0x032>;
+defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10<0x033>;
+defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10<0x034>;
+defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>;
+defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX6, GFX7.
+//===----------------------------------------------------------------------===//
+
+multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
+  def _gfx6_gfx7 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>,
+                   Select_gfx6_gfx7<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
+  SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>;
+
+defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>;
+
+defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x000>;
+defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x001>;
+defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x002>;
+defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x003>;
+defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x004>;
+defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x005>;
+defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>;
+defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>;
+defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>;
+defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x009>;
+defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00a>;
+defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00b>;
+defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00e>;
+defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00f>;
+defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x010>;
+defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x011>;
+defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x012>;
+defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x013>;
+defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x014>;
+defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x015>;
+defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x016>;
+defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x017>;
+defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x018>;
+defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x019>;
+defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01a>;
+defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01b>;
+defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01c>;
+defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01d>;
+defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01e>;
+defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01f>;
+defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x020>;
+defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x021>;
+defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x022>;
+defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x023>;
+defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x024>;
+defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x025>;
+defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x026>;
+defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x027>;
+defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x028>;
+defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10<0x029>;
+defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>;
+defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX10.
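// SOPK reals come in two widths, mirroring the pseudo definitions: a plain
// 4-byte encoding (SOPK_Real32) and an 8-byte encoding with a trailing 32-bit
// literal (SOPK_Real64, used only by S_SETREG_IMM32_B32 below). An
// illustrative pair from the definitions that follow:
//
//   defm S_MOVK_I32         : SOPK_Real32_gfx6_gfx7_gfx10<0x000>; // 4 bytes
//   defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>; // 8 bytes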
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx10<bits<5> op> {
+  def _gfx10 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
+               Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real64_gfx10<bits<5> op> {
+  def _gfx10 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
+               Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+defm S_VERSION : SOPK_Real32_gfx10<0x001>;
+defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>;
+defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>;
+defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>;
+defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>;
+defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>;
+defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>;
+defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX6, GFX7.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> {
+  def _gfx6_gfx7 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
+                   Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
+  def _gfx6_gfx7 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
+                   Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
+  SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>;
+
+multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> :
+  SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>;
+
+defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>;
+
+defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x000>;
+defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x002>;
+defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x003>;
+defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x004>;
+defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x005>;
+defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x006>;
+defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x007>;
+defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x008>;
+defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x009>;
+defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00a>;
+defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00b>;
+defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00c>;
+defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00d>;
+defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00e>;
+defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00f>;
+defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x010>;
+defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>;
+defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>;
+defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
+
+//===----------------------------------------------------------------------===//
+// GFX8, GFX9 (VI).
+//===----------------------------------------------------------------------===//

class Select_vi <string Mnemonic> :
  SIMCInstr <Mnemonic, SIEncodingFamily.VI> {
-  list<Predicate> AssemblerPredicates = [isVI];
-  string DecoderNamespace = "VI";
+  list<Predicate> AssemblerPredicates = [isGFX8GFX9];
+  string DecoderNamespace = "GFX8";
}

class SOP1_Real_vi <bits<8> op, SOP1_Pseudo ps> :
diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index e4c442db3016..30cf12337c6e 100644
--- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -1,9 +1,8 @@
 //===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPUTargetMachine.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h new file mode 100644 index 000000000000..1e6dbd90b0c1 --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h @@ -0,0 +1,29 @@ +//===-- TargetInfo/AMDGPUTargetInfo.h - TargetInfo for AMDGPU ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H +#define LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H + +namespace llvm { + +class Target; + +/// The target which supports all AMD GPUs. This will eventually +/// be deprecated and there will be a R600 target and a GCN target. +Target &getTheAMDGPUTarget(); + +/// The target for GCN GPUs +Target &getTheGCNTarget(); + +} + +#endif // LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 9eb4c6513cce..075e08986c0c 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "AMDGPUAsmUtils.h" @@ -23,8 +22,8 @@ const char* const IdSymbolic[] = { nullptr, nullptr, nullptr, - nullptr, - nullptr, + "MSG_GS_ALLOC_REQ", + "MSG_GET_DOORBELL", nullptr, nullptr, nullptr, @@ -69,7 +68,17 @@ const char* const IdSymbolic[] = { nullptr, nullptr, nullptr, - "HW_REG_SH_MEM_BASES" + "HW_REG_SH_MEM_BASES", + "HW_REG_TBA_LO", + "HW_REG_TBA_HI", + "HW_REG_TMA_LO", + "HW_REG_TMA_HI", + "HW_REG_FLAT_SCR_LO", + "HW_REG_FLAT_SCR_HI", + "HW_REG_XNACK_MASK", + nullptr, // HW_ID1, no predictable values + nullptr, // HW_ID2, no predictable values + "HW_REG_POPS_PACKER" }; } // namespace Hwreg @@ -86,5 +95,18 @@ const char* const IdSymbolic[] = { }; } // namespace Swizzle + +namespace VGPRIndexMode { + +// This must be in sync with llvm::AMDGPU::VGPRIndexMode::Id enum members, see SIDefines.h. 
+const char* const IdSymbolic[] = { + "SRC0", + "SRC1", + "SRC2", + "DST", +}; + +} // namespace VGPRIndexMode + } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index ebb2be22b487..cd91c5f6edd5 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -1,9 +1,8 @@ //===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -31,6 +30,13 @@ namespace Swizzle { // Symbolic names for the swizzle(...) syntax. extern const char* const IdSymbolic[]; } // namespace Swizzle + +namespace VGPRIndexMode { // Symbolic names for the gpr_idx(...) syntax. + +extern const char* const IdSymbolic[]; + +} // namespace VGPRIndexMode + } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 54c866bdc63c..e90f40e6abea 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1,9 +1,8 @@ //===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -11,6 +10,7 @@ #include "AMDGPUTargetTransformInfo.h" #include "AMDGPU.h" #include "SIDefines.h" +#include "AMDGPUAsmUtils.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/ELF.h" @@ -85,7 +85,9 @@ unsigned getExpcntBitWidth() { return 3; } unsigned getLgkmcntBitShift() { return 8; } /// \returns Lgkmcnt bit width. -unsigned getLgkmcntBitWidth() { return 4; } +unsigned getLgkmcntBitWidth(unsigned VersionMajor) { + return (VersionMajor >= 10) ? 6 : 4; +} /// \returns Vmcnt bit shift (higher bits). unsigned getVmcntBitShiftHi() { return 14; } @@ -99,18 +101,11 @@ namespace llvm { namespace AMDGPU { -struct MIMGInfo { - uint16_t Opcode; - uint16_t BaseOpcode; - uint8_t MIMGEncoding; - uint8_t VDataDwords; - uint8_t VAddrDwords; -}; - #define GET_MIMGBaseOpcodesTable_IMPL #define GET_MIMGDimInfoTable_IMPL #define GET_MIMGInfoTable_IMPL #define GET_MIMGLZMappingTable_IMPL +#define GET_MIMGMIPMappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -120,6 +115,11 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, return Info ? Info->Opcode : -1; } +const MIMGBaseOpcodeInfo *getMIMGBaseOpcode(unsigned Opc) { + const MIMGInfo *Info = getMIMGInfo(Opc); + return Info ? 
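/* A hedged usage sketch for the MIMG searchable tables (using only names
   declared in AMDGPUBaseInfo.h; MI here is a hypothetical MachineInstr):
   getMIMGInfo maps a real opcode to its MIMGInfo row, and getMIMGBaseOpcode
   then recovers the subtarget-independent base-opcode record, e.g.:

     if (const MIMGBaseOpcodeInfo *Base = getMIMGBaseOpcode(MI.getOpcode()))
       if (Base->Store) { ... }  // illustrative use of one table field
*/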
getMIMGBaseOpcodeInfo(Info->BaseOpcode) : nullptr; +} + int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) { const MIMGInfo *OrigInfo = getMIMGInfo(Opc); const MIMGInfo *NewInfo = @@ -230,7 +230,8 @@ unsigned getEUsPerCU(const MCSubtargetInfo *STI) { unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize) { - if (!STI->getFeatureBits().test(FeatureGCN)) + assert(FlatWorkGroupSize != 0); + if (STI->getTargetTriple().getArch() != Triple::amdgcn) return 8; unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize); if (N == 1) @@ -279,6 +280,8 @@ unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI, unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) { IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return getAddressableNumSGPRs(STI); if (Version.Major >= 8) return 16; return 8; @@ -300,6 +303,8 @@ unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) { return FIXED_NUM_SGPRS_FOR_INIT_BUG; IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return 106; if (Version.Major >= 8) return 102; return 104; @@ -308,6 +313,10 @@ unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) { unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { assert(WavesPerEU != 0); + IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return 0; + if (WavesPerEU >= getMaxWavesPerEU()) return 0; @@ -322,8 +331,10 @@ unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool Addressable) { assert(WavesPerEU != 0); - IsaVersion Version = getIsaVersion(STI->getCPU()); unsigned AddressableNumSGPRs = getAddressableNumSGPRs(STI); + IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return Addressable ? AddressableNumSGPRs : 108; if (Version.Major >= 8 && !Addressable) AddressableNumSGPRs = 112; unsigned MaxNumSGPRs = getTotalNumSGPRs(STI) / WavesPerEU; @@ -340,6 +351,9 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, ExtraSGPRs = 2; IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return ExtraSGPRs; + if (Version.Major < 8) { if (FlatScrUsed) ExtraSGPRs = 4; @@ -366,12 +380,17 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) { return NumSGPRs / getSGPREncodingGranule(STI) - 1; } -unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI) { - return 4; +unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, + Optional EnableWavefrontSize32) { + bool IsWave32 = EnableWavefrontSize32 ? + *EnableWavefrontSize32 : + STI->getFeatureBits().test(FeatureWavefrontSize32); + return IsWave32 ? 
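/* Worked example (illustrative numbers): a wave32 kernel using 40 VGPRs gets
   granule 8, so getNumVGPRBlocks below computes alignTo(40, 8) / 8 - 1 = 4;
   the same kernel in wave64 mode uses granule 4 and yields
   alignTo(40, 4) / 4 - 1 = 9. */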
8 : 4; } -unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI) { - return getVGPRAllocGranule(STI); +unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, + Optional EnableWavefrontSize32) { + return getVGPRAllocGranule(STI, EnableWavefrontSize32); } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { @@ -402,10 +421,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { return std::min(MaxNumVGPRs, AddressableNumVGPRs); } -unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs) { - NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(STI)); +unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, + Optional EnableWavefrontSize32) { + NumVGPRs = alignTo(std::max(1u, NumVGPRs), + getVGPREncodingGranule(STI, EnableWavefrontSize32)); // VGPRBlocks is actual number of VGPR blocks minus 1. - return NumVGPRs / getVGPREncodingGranule(STI) - 1; + return NumVGPRs / getVGPREncodingGranule(STI, EnableWavefrontSize32) - 1; } } // end namespace IsaInfo @@ -423,7 +444,6 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.amd_machine_version_minor = Version.Minor; Header.amd_machine_version_stepping = Version.Stepping; Header.kernel_code_entry_byte_offset = sizeof(Header); - // wavefront_size is specified as a power of 2: 2^6 = 64 threads. Header.wavefront_size = 6; // If the code object does not support indirect functions, then the value must @@ -435,11 +455,25 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.kernarg_segment_alignment = 4; Header.group_segment_alignment = 4; Header.private_segment_alignment = 4; + + if (Version.Major >= 10) { + if (STI->getFeatureBits().test(FeatureWavefrontSize32)) { + Header.wavefront_size = 5; + Header.code_properties |= AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; + } + Header.compute_pgm_resource_registers |= + S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) | + S_00B848_MEM_ORDERED(1); + } } -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( + const MCSubtargetInfo *STI) { + IsaVersion Version = getIsaVersion(STI->getCPU()); + amdhsa::kernel_descriptor_t KD; memset(&KD, 0, sizeof(KD)); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE); @@ -449,6 +483,16 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1); AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1); + if (Version.Major >= 10) { + AMDHSA_BITS_SET(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, + STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1 : 0); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE, + STI->getFeatureBits().test(FeatureCuMode) ? 
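/* Note: amd_kernel_code_t::wavefront_size is log2-encoded, so the value 6 set
   above means 2^6 = 64 lanes and 5 means 2^5 = 32 lanes. The gfx10 block here
   also selects WGP mode (a work-group processor, i.e. a CU pair) unless
   FeatureCuMode is set, matching the kernel-descriptor path below. */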
0 : 1); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1); + } return KD; } @@ -523,13 +567,14 @@ unsigned getExpcntBitMask(const IsaVersion &Version) { } unsigned getLgkmcntBitMask(const IsaVersion &Version) { - return (1 << getLgkmcntBitWidth()) - 1; + return (1 << getLgkmcntBitWidth(Version.Major)) - 1; } unsigned getWaitcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo()); unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); - unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); + unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), + getLgkmcntBitWidth(Version.Major)); unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt; if (Version.Major < 9) return Waitcnt; @@ -555,7 +600,8 @@ unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) { } unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) { - return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); + return unpackBits(Waitcnt, getLgkmcntBitShift(), + getLgkmcntBitWidth(Version.Major)); } void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, @@ -591,7 +637,8 @@ unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Lgkmcnt) { - return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); + return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), + getLgkmcntBitWidth(Version.Major)); } unsigned encodeWaitcnt(const IsaVersion &Version, @@ -607,6 +654,181 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) { return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt); } +//===----------------------------------------------------------------------===// +// hwreg +//===----------------------------------------------------------------------===// + +namespace Hwreg { + +int64_t getHwregId(const StringRef Name) { + for (int Id = ID_SYMBOLIC_FIRST_; Id < ID_SYMBOLIC_LAST_; ++Id) { + if (IdSymbolic[Id] && Name == IdSymbolic[Id]) + return Id; + } + return ID_UNKNOWN_; +} + +static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) { + if (isSI(STI) || isCI(STI) || isVI(STI)) + return ID_SYMBOLIC_FIRST_GFX9_; + else if (isGFX9(STI)) + return ID_SYMBOLIC_FIRST_GFX10_; + else + return ID_SYMBOLIC_LAST_; +} + +bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) { + return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && + IdSymbolic[Id]; +} + +bool isValidHwreg(int64_t Id) { + return 0 <= Id && isUInt(Id); +} + +bool isValidHwregOffset(int64_t Offset) { + return 0 <= Offset && isUInt(Offset); +} + +bool isValidHwregWidth(int64_t Width) { + return 0 <= (Width - 1) && isUInt(Width - 1); +} + +uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { + return (Id << ID_SHIFT_) | + (Offset << OFFSET_SHIFT_) | + ((Width - 1) << WIDTH_M1_SHIFT_); +} + +StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) { + return isValidHwreg(Id, STI) ? 
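/* A worked encodeHwreg example, assuming the SIDefines.h shift values implied
   by decodeHwreg below (ID_SHIFT_ = 0, OFFSET_SHIFT_ = 6,
   WIDTH_M1_SHIFT_ = 11): hwreg(Id, /*Offset=*/4, /*Width=*/8) packs as
   Id | (4 << 6) | ((8 - 1) << 11), and decodeHwreg recovers Id, Offset = 4
   and Width = 8 from that same simm16. */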
IdSymbolic[Id] : ""; +} + +void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) { + Id = (Val & ID_MASK_) >> ID_SHIFT_; + Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_; + Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; +} + +} // namespace Hwreg + +//===----------------------------------------------------------------------===// +// SendMsg +//===----------------------------------------------------------------------===// + +namespace SendMsg { + +int64_t getMsgId(const StringRef Name) { + for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) { + if (IdSymbolic[i] && Name == IdSymbolic[i]) + return i; + } + return ID_UNKNOWN_; +} + +static bool isValidMsgId(int64_t MsgId) { + return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId]; +} + +bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) { + if (Strict) { + if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL) + return isGFX9(STI) || isGFX10(STI); + else + return isValidMsgId(MsgId); + } else { + return 0 <= MsgId && isUInt(MsgId); + } +} + +StringRef getMsgName(int64_t MsgId) { + return isValidMsgId(MsgId)? IdSymbolic[MsgId] : ""; +} + +int64_t getMsgOpId(int64_t MsgId, const StringRef Name) { + const char* const *S = (MsgId == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic; + const int F = (MsgId == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_; + const int L = (MsgId == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_; + for (int i = F; i < L; ++i) { + if (Name == S[i]) { + return i; + } + } + return OP_UNKNOWN_; +} + +bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) { + + if (!Strict) + return 0 <= OpId && isUInt(OpId); + + switch(MsgId) + { + case ID_GS: + return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP; + case ID_GS_DONE: + return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_; + case ID_SYSMSG: + return OP_SYS_FIRST_ <= OpId && OpId < OP_SYS_LAST_; + default: + return OpId == OP_NONE_; + } +} + +StringRef getMsgOpName(int64_t MsgId, int64_t OpId) { + assert(msgRequiresOp(MsgId)); + return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId]; +} + +bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) { + + if (!Strict) + return 0 <= StreamId && isUInt(StreamId); + + switch(MsgId) + { + case ID_GS: + return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_; + case ID_GS_DONE: + return (OpId == OP_GS_NOP)? 
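/* A worked encodeMsg example, assuming the SIDefines.h layout implied by
   decodeMsg below (ID_SHIFT_ = 0, OP_SHIFT_ = 4, STREAM_ID_SHIFT_ = 8):
   sendmsg(ID_GS, OP_GS_EMIT, /*StreamId=*/1) packs as
   MsgId | (OpId << 4) | (1 << 8), and decodeMsg splits the same immediate
   back into its three fields. */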
+ (StreamId == STREAM_ID_NONE_) : + (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_); + default: + return StreamId == STREAM_ID_NONE_; + } +} + +bool msgRequiresOp(int64_t MsgId) { + return MsgId == ID_GS || MsgId == ID_GS_DONE || MsgId == ID_SYSMSG; +} + +bool msgSupportsStream(int64_t MsgId, int64_t OpId) { + return (MsgId == ID_GS || MsgId == ID_GS_DONE) && OpId != OP_GS_NOP; +} + +void decodeMsg(unsigned Val, + uint16_t &MsgId, + uint16_t &OpId, + uint16_t &StreamId) { + MsgId = Val & ID_MASK_; + OpId = (Val & OP_MASK_) >> OP_SHIFT_; + StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; +} + +uint64_t encodeMsg(uint64_t MsgId, + uint64_t OpId, + uint64_t StreamId) { + return (MsgId << ID_SHIFT_) | + (OpId << OP_SHIFT_) | + (StreamId << STREAM_ID_SHIFT_); +} + +} // namespace SendMsg + +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + unsigned getInitialPSInputAddr(const Function &F) { return getIntegerAttribute(F, "InitialPSInputAddr", 0); } @@ -679,6 +901,10 @@ bool isGFX9(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; } +bool isGFX10(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; +} + bool isGCN3Encoding(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; } @@ -704,46 +930,46 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { CASE_CI_VI(FLAT_SCR) \ CASE_CI_VI(FLAT_SCR_LO) \ CASE_CI_VI(FLAT_SCR_HI) \ - CASE_VI_GFX9(TTMP0) \ - CASE_VI_GFX9(TTMP1) \ - CASE_VI_GFX9(TTMP2) \ - CASE_VI_GFX9(TTMP3) \ - CASE_VI_GFX9(TTMP4) \ - CASE_VI_GFX9(TTMP5) \ - CASE_VI_GFX9(TTMP6) \ - CASE_VI_GFX9(TTMP7) \ - CASE_VI_GFX9(TTMP8) \ - CASE_VI_GFX9(TTMP9) \ - CASE_VI_GFX9(TTMP10) \ - CASE_VI_GFX9(TTMP11) \ - CASE_VI_GFX9(TTMP12) \ - CASE_VI_GFX9(TTMP13) \ - CASE_VI_GFX9(TTMP14) \ - CASE_VI_GFX9(TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1) \ - CASE_VI_GFX9(TTMP2_TTMP3) \ - CASE_VI_GFX9(TTMP4_TTMP5) \ - CASE_VI_GFX9(TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP8_TTMP9) \ - CASE_VI_GFX9(TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP12_TTMP13) \ - CASE_VI_GFX9(TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \ - CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0) \ + CASE_VI_GFX9_GFX10(TTMP1) \ + CASE_VI_GFX9_GFX10(TTMP2) \ + CASE_VI_GFX9_GFX10(TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4) \ + CASE_VI_GFX9_GFX10(TTMP5) \ + CASE_VI_GFX9_GFX10(TTMP6) \ + CASE_VI_GFX9_GFX10(TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8) \ + CASE_VI_GFX9_GFX10(TTMP9) \ + CASE_VI_GFX9_GFX10(TTMP10) \ + CASE_VI_GFX9_GFX10(TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12) \ + CASE_VI_GFX9_GFX10(TTMP13) \ + CASE_VI_GFX9_GFX10(TTMP14) \ + CASE_VI_GFX9_GFX10(TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1) \ + CASE_VI_GFX9_GFX10(TTMP2_TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5) \ + CASE_VI_GFX9_GFX10(TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9) \ + CASE_VI_GFX9_GFX10(TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12_TTMP13) \ + CASE_VI_GFX9_GFX10(TTMP14_TTMP15) \ + 
CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ } #define CASE_CI_VI(node) \ assert(!isSI(STI)); \ case node: return isCI(STI) ? node##_ci : node##_vi; -#define CASE_VI_GFX9(node) \ - case node: return isGFX9(STI) ? node##_gfx9 : node##_vi; +#define CASE_VI_GFX9_GFX10(node) \ + case node: return (isGFX9(STI) || isGFX10(STI)) ? node##_gfx9_gfx10 : node##_vi; unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { if (STI.getTargetTriple().getArch() == Triple::r600) @@ -752,17 +978,17 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { } #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10 #define CASE_CI_VI(node) case node##_ci: case node##_vi: return node; -#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node; +#define CASE_VI_GFX9_GFX10(node) case node##_vi: case node##_gfx9_gfx10: return node; unsigned mc2PseudoReg(unsigned Reg) { MAP_REG2REG } #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10 #undef MAP_REG2REG bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { @@ -779,10 +1005,17 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: return true; default: return false; @@ -802,28 +1035,46 @@ unsigned getRegBitWidth(unsigned RCID) { switch (RCID) { case AMDGPU::SGPR_32RegClassID: case AMDGPU::VGPR_32RegClassID: + case AMDGPU::VRegOrLds_32RegClassID: + case AMDGPU::AGPR_32RegClassID: case AMDGPU::VS_32RegClassID: + case AMDGPU::AV_32RegClassID: case AMDGPU::SReg_32RegClassID: case AMDGPU::SReg_32_XM0RegClassID: + case AMDGPU::SRegOrLds_32RegClassID: return 32; case AMDGPU::SGPR_64RegClassID: case AMDGPU::VS_64RegClassID: + case AMDGPU::AV_64RegClassID: case AMDGPU::SReg_64RegClassID: case AMDGPU::VReg_64RegClassID: + case AMDGPU::AReg_64RegClassID: case AMDGPU::SReg_64_XEXECRegClassID: return 64; + case AMDGPU::SGPR_96RegClassID: + case AMDGPU::SReg_96RegClassID: case AMDGPU::VReg_96RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: case AMDGPU::SReg_128RegClassID: case AMDGPU::VReg_128RegClassID: + case AMDGPU::AReg_128RegClassID: return 128; + case AMDGPU::SGPR_160RegClassID: + case AMDGPU::SReg_160RegClassID: + case AMDGPU::VReg_160RegClassID: + return 160; case AMDGPU::SReg_256RegClassID: case AMDGPU::VReg_256RegClassID: return 256; case AMDGPU::SReg_512RegClassID: case AMDGPU::VReg_512RegClassID: + case AMDGPU::AReg_512RegClassID: return 512; + case AMDGPU::SReg_1024RegClassID: + case AMDGPU::VReg_1024RegClassID: + case 
AMDGPU::AReg_1024RegClassID: + return 1024; default: llvm_unreachable("Unexpected register class"); } @@ -905,6 +1156,13 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { assert(HasInv2Pi); + if (isInt<16>(Literal) || isUInt<16>(Literal)) { + int16_t Trunc = static_cast(Literal); + return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi); + } + if (!(Literal & 0xffff)) + return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi); + int16_t Lo16 = static_cast(Literal); int16_t Hi16 = static_cast(Literal >> 16); return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); @@ -936,15 +1194,19 @@ bool isArgPassedInSGPR(const Argument *A) { } } +static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) { + return isGCN3Encoding(ST) || isGFX10(ST); +} + int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { - if (isGCN3Encoding(ST)) + if (hasSMEMByteOffset(ST)) return ByteOffset; return ByteOffset >> 2; } bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset); - return isGCN3Encoding(ST) ? + return (hasSMEMByteOffset(ST)) ? isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset); } @@ -994,6 +1256,19 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, return true; } +SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { + *this = getDefaultForCallingConv(F.getCallingConv()); + + StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); + if (!IEEEAttr.empty()) + IEEE = IEEEAttr == "true"; + + StringRef DX10ClampAttr + = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString(); + if (!DX10ClampAttr.empty()) + DX10Clamp = DX10ClampAttr == "true"; +} + namespace { struct SourceOfDivergence { @@ -1009,5 +1284,6 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr); bool isIntrinsicSourceOfDivergence(unsigned IntrID) { return lookupSourceOfDivergence(IntrID); } + } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 20123ed4ac81..209ef7eef749 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1,9 +1,8 @@ //===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -46,6 +45,7 @@ namespace AMDGPU { #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL #define GET_MIMGLZMapping_DECL +#define GET_MIMGMIPMapping_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -150,10 +150,18 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs); /// \returns VGPR allocation granularity for given subtarget \p STI. -unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI); +/// +/// For subtargets which support it, \p EnableWavefrontSize32 should match +/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field. 
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, + Optional EnableWavefrontSize32 = None); /// \returns VGPR encoding granularity for given subtarget \p STI. -unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI); +/// +/// For subtargets which support it, \p EnableWavefrontSize32 should match +/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field. +unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, + Optional EnableWavefrontSize32 = None); /// \returns Total number of VGPRs for given subtarget \p STI. unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI); @@ -171,13 +179,20 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); /// \returns Number of VGPR blocks needed for given subtarget \p STI when /// \p NumVGPRs are used. -unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs); +/// +/// For subtargets which support it, \p EnableWavefrontSize32 should match the +/// ENABLE_WAVEFRONT_SIZE32 kernel descriptor field. +unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs, + Optional EnableWavefrontSize32 = None); } // end namespace IsaInfo LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); +LLVM_READONLY +int getSOPPWithRelaxation(uint16_t Opcode); + struct MIMGBaseOpcodeInfo { MIMGBaseOpcode BaseOpcode; bool Store; @@ -201,19 +216,35 @@ struct MIMGDimInfo { uint8_t NumCoords; uint8_t NumGradients; bool DA; + uint8_t Encoding; + const char *AsmSuffix; }; LLVM_READONLY -const MIMGDimInfo *getMIMGDimInfo(unsigned Dim); +const MIMGDimInfo *getMIMGDimInfo(unsigned DimEnum); + +LLVM_READONLY +const MIMGDimInfo *getMIMGDimInfoByEncoding(uint8_t DimEnc); + +LLVM_READONLY +const MIMGDimInfo *getMIMGDimInfoByAsmSuffix(StringRef AsmSuffix); struct MIMGLZMappingInfo { MIMGBaseOpcode L; MIMGBaseOpcode LZ; }; +struct MIMGMIPMappingInfo { + MIMGBaseOpcode MIP; + MIMGBaseOpcode NONMIP; +}; + LLVM_READONLY const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L); +LLVM_READONLY +const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L); + LLVM_READONLY int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords); @@ -221,6 +252,17 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, LLVM_READONLY int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels); +struct MIMGInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t MIMGEncoding; + uint8_t VDataDwords; + uint8_t VAddrDwords; +}; + +LLVM_READONLY +const MIMGInfo *getMIMGInfo(unsigned Opc); + LLVM_READONLY int getMUBUFBaseOpcode(unsigned Opc); @@ -245,7 +287,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen); void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI); -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(); +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( + const MCSubtargetInfo *STI); bool isGroupSegment(const GlobalValue *GV); bool isGlobalSegment(const GlobalValue *GV); @@ -285,21 +328,30 @@ struct Waitcnt { unsigned VmCnt = ~0u; unsigned ExpCnt = ~0u; unsigned LgkmCnt = ~0u; + unsigned VsCnt = ~0u; Waitcnt() {} - Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt) - : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {} + Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt) + : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {} + + static Waitcnt allZero(const IsaVersion &Version) { + return Waitcnt(0, 0, 0, Version.Major >= 10 ? 
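/* A worked waitcnt example, based on the field layout documented below: on
   gfx9, encodeWaitcnt(Version, 0, 0, 0) packs vmcnt = 0 into Waitcnt[3:0],
   expcnt = 0 into Waitcnt[6:4] and lgkmcnt = 0 into Waitcnt[11:8]; on gfx10
   the lgkmcnt field widens to Waitcnt[13:8] (getLgkmcntBitWidth returns 6
   rather than 4), which is why the encode/decode helpers take the ISA
   version. */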
0 : ~0u); + } + static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); } - static Waitcnt allZero() { return Waitcnt(0, 0, 0); } + bool hasWait() const { + return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u; + } bool dominates(const Waitcnt &Other) const { return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt && - LgkmCnt <= Other.LgkmCnt; + LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt; } Waitcnt combined(const Waitcnt &Other) const { return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt), - std::min(LgkmCnt, Other.LgkmCnt)); + std::min(LgkmCnt, Other.LgkmCnt), + std::min(VsCnt, Other.VsCnt)); } }; @@ -332,7 +384,8 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt); /// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only) /// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only) /// \p Expcnt = \p Waitcnt[6:4] -/// \p Lgkmcnt = \p Waitcnt[11:8] +/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10 only) +/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10+ only) void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); @@ -357,7 +410,8 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, /// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only) /// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only) /// Waitcnt[6:4] = \p Expcnt -/// Waitcnt[11:8] = \p Lgkmcnt +/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10 only) +/// Waitcnt[13:8] = \p Lgkmcnt (gfx10+ only) /// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only) /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given @@ -367,6 +421,75 @@ unsigned encodeWaitcnt(const IsaVersion &Version, unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded); +namespace Hwreg { + +LLVM_READONLY +int64_t getHwregId(const StringRef Name); + +LLVM_READNONE +bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI); + +LLVM_READNONE +bool isValidHwreg(int64_t Id); + +LLVM_READNONE +bool isValidHwregOffset(int64_t Offset); + +LLVM_READNONE +bool isValidHwregWidth(int64_t Width); + +LLVM_READNONE +uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width); + +LLVM_READNONE +StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI); + +void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width); + +} // namespace Hwreg + +namespace SendMsg { + +LLVM_READONLY +int64_t getMsgId(const StringRef Name); + +LLVM_READONLY +int64_t getMsgOpId(int64_t MsgId, const StringRef Name); + +LLVM_READNONE +StringRef getMsgName(int64_t MsgId); + +LLVM_READNONE +StringRef getMsgOpName(int64_t MsgId, int64_t OpId); + +LLVM_READNONE +bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true); + +LLVM_READNONE +bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true); + +LLVM_READNONE +bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true); + +LLVM_READNONE +bool msgRequiresOp(int64_t MsgId); + +LLVM_READNONE +bool msgSupportsStream(int64_t MsgId, int64_t OpId); + +void decodeMsg(unsigned Val, + uint16_t &MsgId, + uint16_t &OpId, + uint16_t &StreamId); + +LLVM_READNONE +uint64_t encodeMsg(uint64_t MsgId, + uint64_t OpId, + uint64_t StreamId); + +} // namespace SendMsg + + unsigned getInitialPSInputAddr(const Function &F); LLVM_READNONE @@ -399,6 +522,7 @@ bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); +bool isGFX10(const MCSubtargetInfo 
&STI);

 /// \returns true if \p Reg is a scalar register.
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -440,6 +564,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
   case AMDGPU::OPERAND_REG_IMM_FP32:
   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
     return 4;

   case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -454,6 +580,12 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+  case AMDGPU::OPERAND_REG_IMM_V2INT16:
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
     return 2;

   default:
@@ -496,6 +628,45 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,

 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
+
+// Track defaults for fields in the MODE register.
+struct SIModeRegisterDefaults {
+  /// Floating point opcodes that support exception flag gathering quiet and
+  /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
+  /// become IEEE 754-2008 compliant due to signaling NaN propagation and
+  /// quieting.
+  bool IEEE : 1;
+
+  /// Used by the vector ALU to force DX10-style treatment of NaNs: when set,
+  /// clamp NaN to zero; otherwise, pass NaN through.
+  bool DX10Clamp : 1;
+
+  // TODO: FP mode fields
+
+  SIModeRegisterDefaults() :
+    IEEE(true),
+    DX10Clamp(true) {}
+
+  SIModeRegisterDefaults(const Function &F);
+
+  static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
+    SIModeRegisterDefaults Mode;
+    Mode.DX10Clamp = true;
+    Mode.IEEE = AMDGPU::isCompute(CC);
+    return Mode;
+  }
+
+  bool operator ==(const SIModeRegisterDefaults Other) const {
+    return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp;
+  }
+
+  // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode
+  // should be able to override.
+  bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
+    return *this == CalleeMode;
+  }
+};
+
 } // end namespace AMDGPU
 } // end namespace llvm
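// Illustrative sketch, not part of the patch: a caller-side use of the
// SIModeRegisterDefaults struct above. The function name is invented; it
// assumes AMDGPUBaseInfo.h and llvm/IR/Function.h are available.
static bool modesAllowInlining(const llvm::Function &Caller,
                               const llvm::Function &Callee) {
  llvm::AMDGPU::SIModeRegisterDefaults CallerMode(Caller);
  llvm::AMDGPU::SIModeRegisterDefaults CalleeMode(Callee);
  // Identical IEEE and DX10Clamp defaults are required for a safe inline.
  return CallerMode.isInlineCompatible(CalleeMode);
}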
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
new file mode 100644
index 000000000000..db20d5ccf5f9
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -0,0 +1,723 @@
+//===-- AMDGPUPALMetadata.cpp - Accumulate and print AMDGPU PAL metadata -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This class has methods called by AMDGPUAsmPrinter to accumulate and print
+/// the PAL metadata.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUPALMetadata.h"
+#include "AMDGPU.h"
+#include "AMDGPUAsmPrinter.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "SIDefines.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/EndianStream.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+// Read the PAL metadata from IR metadata, where it was put by the frontend.
+void AMDGPUPALMetadata::readFromIR(Module &M) {
+  auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack");
+  if (NamedMD && NamedMD->getNumOperands()) {
+    // This is the new msgpack format for metadata. It is a NamedMD containing
+    // an MDTuple containing an MDString containing the msgpack data.
+    BlobType = ELF::NT_AMDGPU_METADATA;
+    auto MDN = dyn_cast<MDNode>(NamedMD->getOperand(0));
+    if (MDN && MDN->getNumOperands()) {
+      if (auto MDS = dyn_cast<MDString>(MDN->getOperand(0)))
+        setFromMsgPackBlob(MDS->getString());
+    }
+    return;
+  }
+  BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+  NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
+  if (!NamedMD || !NamedMD->getNumOperands())
+    return;
+  // This is the old reg=value pair format for metadata. It is a NamedMD
+  // containing an MDTuple containing a number of MDNodes each of which is an
+  // integer value, and each two integer values forms a key=value pair that we
+  // store as Registers[key]=value in the map.
+  auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
+  if (!Tuple)
+    return;
+  for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
+    auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
+    auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
+    if (!Key || !Val)
+      continue;
+    setRegister(Key->getZExtValue(), Val->getZExtValue());
+  }
+}
+
+// Set PAL metadata from a binary blob from the applicable .note record.
+// Returns false if bad format. Blob must remain valid for the lifetime of the
+// Metadata.
+bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) {
+  BlobType = Type;
+  if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+    return setFromLegacyBlob(Blob);
+  return setFromMsgPackBlob(Blob);
+}
+
+// Set PAL metadata from legacy (array of key=value pairs) blob.
+bool AMDGPUPALMetadata::setFromLegacyBlob(StringRef Blob) {
+  auto Data = reinterpret_cast<const uint32_t *>(Blob.data());
+  for (unsigned I = 0; I != Blob.size() / sizeof(uint32_t) / 2; ++I)
+    setRegister(Data[I * 2], Data[I * 2 + 1]);
+  return true;
+}
+
+// Set PAL metadata from msgpack blob.
+bool AMDGPUPALMetadata::setFromMsgPackBlob(StringRef Blob) {
+  msgpack::Reader Reader(Blob);
+  return MsgPackDoc.readFromBlob(Blob, /*Multi=*/false);
+}
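// Illustrative sketch, not part of the patch: a frontend could build the
// old-style named metadata that readFromIR() above consumes -- a single
// MDTuple of alternating i32 register/value constants. emitLegacyPalMD is an
// invented name, and the usual IR headers (Module.h, Metadata.h, Constants.h)
// are assumed.
static void emitLegacyPalMD(llvm::Module &M,
                            llvm::ArrayRef<std::pair<uint32_t, uint32_t>> RV) {
  llvm::LLVMContext &Ctx = M.getContext();
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  llvm::SmallVector<llvm::Metadata *, 8> Ops;
  for (const auto &P : RV) {
    Ops.push_back(
        llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(I32, P.first)));
    Ops.push_back(
        llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(I32, P.second)));
  }
  // Each reg/value pair becomes Registers[reg] = value in readFromIR().
  M.getOrInsertNamedMetadata("amdgpu.pal.metadata")
      ->addOperand(llvm::MDTuple::get(Ctx, Ops));
}

+
+// Given the calling convention, calculate the register number for rsrc1. In
+// principle the register number could change in future hardware, but we know
+// it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
+// we can use fixed values.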
+static unsigned getRsrc1Reg(CallingConv::ID CC) {
+  switch (CC) {
+  default:
+    return PALMD::R_2E12_COMPUTE_PGM_RSRC1;
+  case CallingConv::AMDGPU_LS:
+    return PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS;
+  case CallingConv::AMDGPU_HS:
+    return PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS;
+  case CallingConv::AMDGPU_ES:
+    return PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES;
+  case CallingConv::AMDGPU_GS:
+    return PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS;
+  case CallingConv::AMDGPU_VS:
+    return PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS;
+  case CallingConv::AMDGPU_PS:
+    return PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS;
+  }
+}
+
+// Calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
+// with a constant offset to access any non-register shader-specific PAL
+// metadata key.
+static unsigned getScratchSizeKey(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::AMDGPU_PS:
+    return PALMD::Key::PS_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_VS:
+    return PALMD::Key::VS_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_GS:
+    return PALMD::Key::GS_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_ES:
+    return PALMD::Key::ES_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_HS:
+    return PALMD::Key::HS_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_LS:
+    return PALMD::Key::LS_SCRATCH_SIZE;
+  default:
+    return PALMD::Key::CS_SCRATCH_SIZE;
+  }
+}
+
+// Set the rsrc1 register in the metadata for a particular shader stage.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, unsigned Val) {
+  setRegister(getRsrc1Reg(CC), Val);
+}
+
+// Set the rsrc2 register in the metadata for a particular shader stage.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, unsigned Val) {
+  setRegister(getRsrc1Reg(CC) + 1, Val);
+}
+
+// Set the SPI_PS_INPUT_ENA register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setSpiPsInputEna(unsigned Val) {
+  setRegister(PALMD::R_A1B3_SPI_PS_INPUT_ENA, Val);
+}
+
+// Set the SPI_PS_INPUT_ADDR register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setSpiPsInputAddr(unsigned Val) {
+  setRegister(PALMD::R_A1B4_SPI_PS_INPUT_ADDR, Val);
+}
+
+// Get a register from the metadata, or 0 if not currently set.
+unsigned AMDGPUPALMetadata::getRegister(unsigned Reg) {
+  auto Regs = getRegisters();
+  auto It = Regs.find(MsgPackDoc.getNode(Reg));
+  if (It == Regs.end())
+    return 0;
+  auto N = It->second;
+  if (N.getKind() != msgpack::Type::UInt)
+    return 0;
+  return N.getUInt();
+}
+
+// Set a register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRegister(unsigned Reg, unsigned Val) {
+  if (!isLegacy()) {
+    // In the new MsgPack format, ignore registers numbered >= 0x10000000:
+    // they are PAL ABI pseudo-registers that exist only in the old
+    // non-MsgPack format.
+    if (Reg >= 0x10000000)
+      return;
+  }
+  auto &N = getRegisters()[MsgPackDoc.getNode(Reg)];
+  if (N.getKind() == msgpack::Type::UInt)
+    Val |= N.getUInt();
+  N = N.getDocument()->getNode(Val);
+}
+
+// Set the entry point name for one shader.
+void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
+  if (isLegacy())
+    return;
+  // Msgpack format.
+  getHwStage(CC)[".entry_point"] = MsgPackDoc.getNode(Name, /*Copy=*/true);
+}
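// Illustrative sketch, not part of the patch: because setRegister() ORs new
// values into old ones, independent contributors can each fill in their own
// bit-fields of one register. The bit values below are made up.
static void exampleAccumulateRsrc1(AMDGPUPALMetadata &MD) {
  MD.setRsrc1(CallingConv::AMDGPU_PS, 0x2);  // one contributor's bits
  MD.setRsrc1(CallingConv::AMDGPU_PS, 0x40); // another contributor's bits
  // The SPI_SHADER_PGM_RSRC1_PS entry now holds 0x2 | 0x40 == 0x42.
}

+
+// Set the number of used vgprs in the metadata.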
This is an optional +// advisory record for logging etc; wave dispatch actually uses the rsrc1 +// register for the shader stage to determine the number of vgprs to +// allocate. +void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) { + if (isLegacy()) { + // Old non-msgpack format. + unsigned NumUsedVgprsKey = getScratchSizeKey(CC) + + PALMD::Key::VS_NUM_USED_VGPRS - + PALMD::Key::VS_SCRATCH_SIZE; + setRegister(NumUsedVgprsKey, Val); + return; + } + // Msgpack format. + getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val); +} + +// Set the number of used sgprs in the metadata. This is an optional advisory +// record for logging etc; wave dispatch actually uses the rsrc1 register for +// the shader stage to determine the number of sgprs to allocate. +void AMDGPUPALMetadata::setNumUsedSgprs(CallingConv::ID CC, unsigned Val) { + if (isLegacy()) { + // Old non-msgpack format. + unsigned NumUsedSgprsKey = getScratchSizeKey(CC) + + PALMD::Key::VS_NUM_USED_SGPRS - + PALMD::Key::VS_SCRATCH_SIZE; + setRegister(NumUsedSgprsKey, Val); + return; + } + // Msgpack format. + getHwStage(CC)[".sgpr_count"] = MsgPackDoc.getNode(Val); +} + +// Set the scratch size in the metadata. +void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) { + if (isLegacy()) { + // Old non-msgpack format. + setRegister(getScratchSizeKey(CC), Val); + return; + } + // Msgpack format. + getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val); +} + +// Set the hardware register bit in PAL metadata to enable wave32 on the +// shader of the given calling convention. +void AMDGPUPALMetadata::setWave32(unsigned CC) { + switch (CC) { + case CallingConv::AMDGPU_HS: + setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_HS_W32_EN(1)); + break; + case CallingConv::AMDGPU_GS: + setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_GS_W32_EN(1)); + break; + case CallingConv::AMDGPU_VS: + setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_VS_W32_EN(1)); + break; + case CallingConv::AMDGPU_PS: + setRegister(PALMD::R_A1B6_SPI_PS_IN_CONTROL, S_0286D8_PS_W32_EN(1)); + break; + case CallingConv::AMDGPU_CS: + setRegister(PALMD::R_2E00_COMPUTE_DISPATCH_INITIATOR, + S_00B800_CS_W32_EN(1)); + break; + } +} + +// Convert a register number to name, for display by toString(). +// Returns nullptr if none. +static const char *getRegisterName(unsigned RegNum) { + // Table of registers. + static const struct RegInfo { + unsigned Num; + const char *Name; + } RegInfoTable[] = { + // Registers that code generation sets/modifies metadata for. 
+ {PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS, "SPI_SHADER_PGM_RSRC1_VS"}, + {PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS + 1, "SPI_SHADER_PGM_RSRC2_VS"}, + {PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS, "SPI_SHADER_PGM_RSRC1_LS"}, + {PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS + 1, "SPI_SHADER_PGM_RSRC2_LS"}, + {PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS, "SPI_SHADER_PGM_RSRC1_HS"}, + {PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS + 1, "SPI_SHADER_PGM_RSRC2_HS"}, + {PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES, "SPI_SHADER_PGM_RSRC1_ES"}, + {PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES + 1, "SPI_SHADER_PGM_RSRC2_ES"}, + {PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS, "SPI_SHADER_PGM_RSRC1_GS"}, + {PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS + 1, "SPI_SHADER_PGM_RSRC2_GS"}, + {PALMD::R_2E00_COMPUTE_DISPATCH_INITIATOR, "COMPUTE_DISPATCH_INITIATOR"}, + {PALMD::R_2E12_COMPUTE_PGM_RSRC1, "COMPUTE_PGM_RSRC1"}, + {PALMD::R_2E12_COMPUTE_PGM_RSRC1 + 1, "COMPUTE_PGM_RSRC2"}, + {PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS, "SPI_SHADER_PGM_RSRC1_PS"}, + {PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS + 1, "SPI_SHADER_PGM_RSRC2_PS"}, + {PALMD::R_A1B3_SPI_PS_INPUT_ENA, "SPI_PS_INPUT_ENA"}, + {PALMD::R_A1B4_SPI_PS_INPUT_ADDR, "SPI_PS_INPUT_ADDR"}, + {PALMD::R_A1B6_SPI_PS_IN_CONTROL, "SPI_PS_IN_CONTROL"}, + {PALMD::R_A2D5_VGT_SHADER_STAGES_EN, "VGT_SHADER_STAGES_EN"}, + + // Registers not known to code generation. + {0x2c07, "SPI_SHADER_PGM_RSRC3_PS"}, + {0x2c46, "SPI_SHADER_PGM_RSRC3_VS"}, + {0x2c87, "SPI_SHADER_PGM_RSRC3_GS"}, + {0x2cc7, "SPI_SHADER_PGM_RSRC3_ES"}, + {0x2d07, "SPI_SHADER_PGM_RSRC3_HS"}, + {0x2d47, "SPI_SHADER_PGM_RSRC3_LS"}, + + {0xa1c3, "SPI_SHADER_POS_FORMAT"}, + {0xa1b1, "SPI_VS_OUT_CONFIG"}, + {0xa207, "PA_CL_VS_OUT_CNTL"}, + {0xa204, "PA_CL_CLIP_CNTL"}, + {0xa206, "PA_CL_VTE_CNTL"}, + {0xa2f9, "PA_SU_VTX_CNTL"}, + {0xa293, "PA_SC_MODE_CNTL_1"}, + {0xa2a1, "VGT_PRIMITIVEID_EN"}, + {0x2c81, "SPI_SHADER_PGM_RSRC4_GS"}, + {0x2e18, "COMPUTE_TMPRING_SIZE"}, + {0xa1b5, "SPI_INTERP_CONTROL_0"}, + {0xa1ba, "SPI_TMPRING_SIZE"}, + {0xa1c4, "SPI_SHADER_Z_FORMAT"}, + {0xa1c5, "SPI_SHADER_COL_FORMAT"}, + {0xa203, "DB_SHADER_CONTROL"}, + {0xa08f, "CB_SHADER_MASK"}, + {0xa191, "SPI_PS_INPUT_CNTL_0"}, + {0xa192, "SPI_PS_INPUT_CNTL_1"}, + {0xa193, "SPI_PS_INPUT_CNTL_2"}, + {0xa194, "SPI_PS_INPUT_CNTL_3"}, + {0xa195, "SPI_PS_INPUT_CNTL_4"}, + {0xa196, "SPI_PS_INPUT_CNTL_5"}, + {0xa197, "SPI_PS_INPUT_CNTL_6"}, + {0xa198, "SPI_PS_INPUT_CNTL_7"}, + {0xa199, "SPI_PS_INPUT_CNTL_8"}, + {0xa19a, "SPI_PS_INPUT_CNTL_9"}, + {0xa19b, "SPI_PS_INPUT_CNTL_10"}, + {0xa19c, "SPI_PS_INPUT_CNTL_11"}, + {0xa19d, "SPI_PS_INPUT_CNTL_12"}, + {0xa19e, "SPI_PS_INPUT_CNTL_13"}, + {0xa19f, "SPI_PS_INPUT_CNTL_14"}, + {0xa1a0, "SPI_PS_INPUT_CNTL_15"}, + {0xa1a1, "SPI_PS_INPUT_CNTL_16"}, + {0xa1a2, "SPI_PS_INPUT_CNTL_17"}, + {0xa1a3, "SPI_PS_INPUT_CNTL_18"}, + {0xa1a4, "SPI_PS_INPUT_CNTL_19"}, + {0xa1a5, "SPI_PS_INPUT_CNTL_20"}, + {0xa1a6, "SPI_PS_INPUT_CNTL_21"}, + {0xa1a7, "SPI_PS_INPUT_CNTL_22"}, + {0xa1a8, "SPI_PS_INPUT_CNTL_23"}, + {0xa1a9, "SPI_PS_INPUT_CNTL_24"}, + {0xa1aa, "SPI_PS_INPUT_CNTL_25"}, + {0xa1ab, "SPI_PS_INPUT_CNTL_26"}, + {0xa1ac, "SPI_PS_INPUT_CNTL_27"}, + {0xa1ad, "SPI_PS_INPUT_CNTL_28"}, + {0xa1ae, "SPI_PS_INPUT_CNTL_29"}, + {0xa1af, "SPI_PS_INPUT_CNTL_30"}, + {0xa1b0, "SPI_PS_INPUT_CNTL_31"}, + + {0xa2ce, "VGT_GS_MAX_VERT_OUT"}, + {0xa2ab, "VGT_ESGS_RING_ITEMSIZE"}, + {0xa290, "VGT_GS_MODE"}, + {0xa291, "VGT_GS_ONCHIP_CNTL"}, + {0xa2d7, "VGT_GS_VERT_ITEMSIZE"}, + {0xa2d8, "VGT_GS_VERT_ITEMSIZE_1"}, + {0xa2d9, "VGT_GS_VERT_ITEMSIZE_2"}, + {0xa2da, "VGT_GS_VERT_ITEMSIZE_3"}, + 
{0xa298, "VGT_GSVS_RING_OFFSET_1"}, + {0xa299, "VGT_GSVS_RING_OFFSET_2"}, + {0xa29a, "VGT_GSVS_RING_OFFSET_3"}, + + {0xa2e4, "VGT_GS_INSTANCE_CNT"}, + {0xa297, "VGT_GS_PER_VS"}, + {0xa29b, "VGT_GS_OUT_PRIM_TYPE"}, + {0xa2ac, "VGT_GSVS_RING_ITEMSIZE"}, + + {0xa2ad, "VGT_REUSE_OFF"}, + {0xa1b8, "SPI_BARYC_CNTL"}, + + {0x2c4c, "SPI_SHADER_USER_DATA_VS_0"}, + {0x2c4d, "SPI_SHADER_USER_DATA_VS_1"}, + {0x2c4e, "SPI_SHADER_USER_DATA_VS_2"}, + {0x2c4f, "SPI_SHADER_USER_DATA_VS_3"}, + {0x2c50, "SPI_SHADER_USER_DATA_VS_4"}, + {0x2c51, "SPI_SHADER_USER_DATA_VS_5"}, + {0x2c52, "SPI_SHADER_USER_DATA_VS_6"}, + {0x2c53, "SPI_SHADER_USER_DATA_VS_7"}, + {0x2c54, "SPI_SHADER_USER_DATA_VS_8"}, + {0x2c55, "SPI_SHADER_USER_DATA_VS_9"}, + {0x2c56, "SPI_SHADER_USER_DATA_VS_10"}, + {0x2c57, "SPI_SHADER_USER_DATA_VS_11"}, + {0x2c58, "SPI_SHADER_USER_DATA_VS_12"}, + {0x2c59, "SPI_SHADER_USER_DATA_VS_13"}, + {0x2c5a, "SPI_SHADER_USER_DATA_VS_14"}, + {0x2c5b, "SPI_SHADER_USER_DATA_VS_15"}, + {0x2c5c, "SPI_SHADER_USER_DATA_VS_16"}, + {0x2c5d, "SPI_SHADER_USER_DATA_VS_17"}, + {0x2c5e, "SPI_SHADER_USER_DATA_VS_18"}, + {0x2c5f, "SPI_SHADER_USER_DATA_VS_19"}, + {0x2c60, "SPI_SHADER_USER_DATA_VS_20"}, + {0x2c61, "SPI_SHADER_USER_DATA_VS_21"}, + {0x2c62, "SPI_SHADER_USER_DATA_VS_22"}, + {0x2c63, "SPI_SHADER_USER_DATA_VS_23"}, + {0x2c64, "SPI_SHADER_USER_DATA_VS_24"}, + {0x2c65, "SPI_SHADER_USER_DATA_VS_25"}, + {0x2c66, "SPI_SHADER_USER_DATA_VS_26"}, + {0x2c67, "SPI_SHADER_USER_DATA_VS_27"}, + {0x2c68, "SPI_SHADER_USER_DATA_VS_28"}, + {0x2c69, "SPI_SHADER_USER_DATA_VS_29"}, + {0x2c6a, "SPI_SHADER_USER_DATA_VS_30"}, + {0x2c6b, "SPI_SHADER_USER_DATA_VS_31"}, + + {0x2ccc, "SPI_SHADER_USER_DATA_ES_0"}, + {0x2ccd, "SPI_SHADER_USER_DATA_ES_1"}, + {0x2cce, "SPI_SHADER_USER_DATA_ES_2"}, + {0x2ccf, "SPI_SHADER_USER_DATA_ES_3"}, + {0x2cd0, "SPI_SHADER_USER_DATA_ES_4"}, + {0x2cd1, "SPI_SHADER_USER_DATA_ES_5"}, + {0x2cd2, "SPI_SHADER_USER_DATA_ES_6"}, + {0x2cd3, "SPI_SHADER_USER_DATA_ES_7"}, + {0x2cd4, "SPI_SHADER_USER_DATA_ES_8"}, + {0x2cd5, "SPI_SHADER_USER_DATA_ES_9"}, + {0x2cd6, "SPI_SHADER_USER_DATA_ES_10"}, + {0x2cd7, "SPI_SHADER_USER_DATA_ES_11"}, + {0x2cd8, "SPI_SHADER_USER_DATA_ES_12"}, + {0x2cd9, "SPI_SHADER_USER_DATA_ES_13"}, + {0x2cda, "SPI_SHADER_USER_DATA_ES_14"}, + {0x2cdb, "SPI_SHADER_USER_DATA_ES_15"}, + {0x2cdc, "SPI_SHADER_USER_DATA_ES_16"}, + {0x2cdd, "SPI_SHADER_USER_DATA_ES_17"}, + {0x2cde, "SPI_SHADER_USER_DATA_ES_18"}, + {0x2cdf, "SPI_SHADER_USER_DATA_ES_19"}, + {0x2ce0, "SPI_SHADER_USER_DATA_ES_20"}, + {0x2ce1, "SPI_SHADER_USER_DATA_ES_21"}, + {0x2ce2, "SPI_SHADER_USER_DATA_ES_22"}, + {0x2ce3, "SPI_SHADER_USER_DATA_ES_23"}, + {0x2ce4, "SPI_SHADER_USER_DATA_ES_24"}, + {0x2ce5, "SPI_SHADER_USER_DATA_ES_25"}, + {0x2ce6, "SPI_SHADER_USER_DATA_ES_26"}, + {0x2ce7, "SPI_SHADER_USER_DATA_ES_27"}, + {0x2ce8, "SPI_SHADER_USER_DATA_ES_28"}, + {0x2ce9, "SPI_SHADER_USER_DATA_ES_29"}, + {0x2cea, "SPI_SHADER_USER_DATA_ES_30"}, + {0x2ceb, "SPI_SHADER_USER_DATA_ES_31"}, + + {0x2c0c, "SPI_SHADER_USER_DATA_PS_0"}, + {0x2c0d, "SPI_SHADER_USER_DATA_PS_1"}, + {0x2c0e, "SPI_SHADER_USER_DATA_PS_2"}, + {0x2c0f, "SPI_SHADER_USER_DATA_PS_3"}, + {0x2c10, "SPI_SHADER_USER_DATA_PS_4"}, + {0x2c11, "SPI_SHADER_USER_DATA_PS_5"}, + {0x2c12, "SPI_SHADER_USER_DATA_PS_6"}, + {0x2c13, "SPI_SHADER_USER_DATA_PS_7"}, + {0x2c14, "SPI_SHADER_USER_DATA_PS_8"}, + {0x2c15, "SPI_SHADER_USER_DATA_PS_9"}, + {0x2c16, "SPI_SHADER_USER_DATA_PS_10"}, + {0x2c17, "SPI_SHADER_USER_DATA_PS_11"}, + {0x2c18, "SPI_SHADER_USER_DATA_PS_12"}, + {0x2c19, 
"SPI_SHADER_USER_DATA_PS_13"}, + {0x2c1a, "SPI_SHADER_USER_DATA_PS_14"}, + {0x2c1b, "SPI_SHADER_USER_DATA_PS_15"}, + {0x2c1c, "SPI_SHADER_USER_DATA_PS_16"}, + {0x2c1d, "SPI_SHADER_USER_DATA_PS_17"}, + {0x2c1e, "SPI_SHADER_USER_DATA_PS_18"}, + {0x2c1f, "SPI_SHADER_USER_DATA_PS_19"}, + {0x2c20, "SPI_SHADER_USER_DATA_PS_20"}, + {0x2c21, "SPI_SHADER_USER_DATA_PS_21"}, + {0x2c22, "SPI_SHADER_USER_DATA_PS_22"}, + {0x2c23, "SPI_SHADER_USER_DATA_PS_23"}, + {0x2c24, "SPI_SHADER_USER_DATA_PS_24"}, + {0x2c25, "SPI_SHADER_USER_DATA_PS_25"}, + {0x2c26, "SPI_SHADER_USER_DATA_PS_26"}, + {0x2c27, "SPI_SHADER_USER_DATA_PS_27"}, + {0x2c28, "SPI_SHADER_USER_DATA_PS_28"}, + {0x2c29, "SPI_SHADER_USER_DATA_PS_29"}, + {0x2c2a, "SPI_SHADER_USER_DATA_PS_30"}, + {0x2c2b, "SPI_SHADER_USER_DATA_PS_31"}, + + {0x2e40, "COMPUTE_USER_DATA_0"}, + {0x2e41, "COMPUTE_USER_DATA_1"}, + {0x2e42, "COMPUTE_USER_DATA_2"}, + {0x2e43, "COMPUTE_USER_DATA_3"}, + {0x2e44, "COMPUTE_USER_DATA_4"}, + {0x2e45, "COMPUTE_USER_DATA_5"}, + {0x2e46, "COMPUTE_USER_DATA_6"}, + {0x2e47, "COMPUTE_USER_DATA_7"}, + {0x2e48, "COMPUTE_USER_DATA_8"}, + {0x2e49, "COMPUTE_USER_DATA_9"}, + {0x2e4a, "COMPUTE_USER_DATA_10"}, + {0x2e4b, "COMPUTE_USER_DATA_11"}, + {0x2e4c, "COMPUTE_USER_DATA_12"}, + {0x2e4d, "COMPUTE_USER_DATA_13"}, + {0x2e4e, "COMPUTE_USER_DATA_14"}, + {0x2e4f, "COMPUTE_USER_DATA_15"}, + + {0x2e07, "COMPUTE_NUM_THREAD_X"}, + {0x2e08, "COMPUTE_NUM_THREAD_Y"}, + {0x2e09, "COMPUTE_NUM_THREAD_Z"}, + {0xa2db, "VGT_TF_PARAM"}, + {0xa2d6, "VGT_LS_HS_CONFIG"}, + {0xa287, "VGT_HOS_MIN_TESS_LEVEL"}, + {0xa286, "VGT_HOS_MAX_TESS_LEVEL"}, + {0xa2f8, "PA_SC_AA_CONFIG"}, + {0xa310, "PA_SC_SHADER_CONTROL"}, + {0xa313, "PA_SC_CONSERVATIVE_RASTERIZATION_CNTL"}, + + {0x2d0c, "SPI_SHADER_USER_DATA_LS_0"}, + {0x2d0d, "SPI_SHADER_USER_DATA_LS_1"}, + {0x2d0e, "SPI_SHADER_USER_DATA_LS_2"}, + {0x2d0f, "SPI_SHADER_USER_DATA_LS_3"}, + {0x2d10, "SPI_SHADER_USER_DATA_LS_4"}, + {0x2d11, "SPI_SHADER_USER_DATA_LS_5"}, + {0x2d12, "SPI_SHADER_USER_DATA_LS_6"}, + {0x2d13, "SPI_SHADER_USER_DATA_LS_7"}, + {0x2d14, "SPI_SHADER_USER_DATA_LS_8"}, + {0x2d15, "SPI_SHADER_USER_DATA_LS_9"}, + {0x2d16, "SPI_SHADER_USER_DATA_LS_10"}, + {0x2d17, "SPI_SHADER_USER_DATA_LS_11"}, + {0x2d18, "SPI_SHADER_USER_DATA_LS_12"}, + {0x2d19, "SPI_SHADER_USER_DATA_LS_13"}, + {0x2d1a, "SPI_SHADER_USER_DATA_LS_14"}, + {0x2d1b, "SPI_SHADER_USER_DATA_LS_15"}, + {0x2d1c, "SPI_SHADER_USER_DATA_LS_16"}, + {0x2d1d, "SPI_SHADER_USER_DATA_LS_17"}, + {0x2d1e, "SPI_SHADER_USER_DATA_LS_18"}, + {0x2d1f, "SPI_SHADER_USER_DATA_LS_19"}, + {0x2d20, "SPI_SHADER_USER_DATA_LS_20"}, + {0x2d21, "SPI_SHADER_USER_DATA_LS_21"}, + {0x2d22, "SPI_SHADER_USER_DATA_LS_22"}, + {0x2d23, "SPI_SHADER_USER_DATA_LS_23"}, + {0x2d24, "SPI_SHADER_USER_DATA_LS_24"}, + {0x2d25, "SPI_SHADER_USER_DATA_LS_25"}, + {0x2d26, "SPI_SHADER_USER_DATA_LS_26"}, + {0x2d27, "SPI_SHADER_USER_DATA_LS_27"}, + {0x2d28, "SPI_SHADER_USER_DATA_LS_28"}, + {0x2d29, "SPI_SHADER_USER_DATA_LS_29"}, + {0x2d2a, "SPI_SHADER_USER_DATA_LS_30"}, + {0x2d2b, "SPI_SHADER_USER_DATA_LS_31"}, + + {0xa2aa, "IA_MULTI_VGT_PARAM"}, + {0xa2a5, "VGT_GS_MAX_PRIMS_PER_SUBGROUP"}, + {0xa2e6, "VGT_STRMOUT_BUFFER_CONFIG"}, + {0xa2e5, "VGT_STRMOUT_CONFIG"}, + {0xa2b5, "VGT_STRMOUT_VTX_STRIDE_0"}, + {0xa2b9, "VGT_STRMOUT_VTX_STRIDE_1"}, + {0xa2bd, "VGT_STRMOUT_VTX_STRIDE_2"}, + {0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"}, + {0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"}, + + {0, nullptr}}; + auto Entry = RegInfoTable; + for (; Entry->Num && Entry->Num != RegNum; ++Entry) + ; + return Entry->Name; +} + +// 
Convert the accumulated PAL metadata into an asm directive.
+void AMDGPUPALMetadata::toString(std::string &String) {
+  String.clear();
+  if (!BlobType)
+    return;
+  raw_string_ostream Stream(String);
+  if (isLegacy()) {
+    if (MsgPackDoc.getRoot().getKind() == msgpack::Type::Nil)
+      return;
+    // Old linear reg=val format.
+    Stream << '\t' << AMDGPU::PALMD::AssemblerDirective << ' ';
+    auto Regs = getRegisters();
+    for (auto I = Regs.begin(), E = Regs.end(); I != E; ++I) {
+      if (I != Regs.begin())
+        Stream << ',';
+      unsigned Reg = I->first.getUInt();
+      unsigned Val = I->second.getUInt();
+      Stream << "0x" << Twine::utohexstr(Reg) << ",0x" << Twine::utohexstr(Val);
+    }
+    Stream << '\n';
+    return;
+  }
+
+  // New msgpack-based format -- output as YAML (with unsigned numbers in hex),
+  // but first change the registers map to use names.
+  MsgPackDoc.setHexMode();
+  auto &RegsObj = refRegisters();
+  auto OrigRegs = RegsObj.getMap();
+  RegsObj = MsgPackDoc.getMapNode();
+  for (auto I : OrigRegs) {
+    auto Key = I.first;
+    if (const char *RegName = getRegisterName(Key.getUInt())) {
+      std::string KeyName = Key.toString();
+      KeyName += " (";
+      KeyName += RegName;
+      KeyName += ')';
+      Key = MsgPackDoc.getNode(KeyName, /*Copy=*/true);
+    }
+    RegsObj.getMap()[Key] = I.second;
+  }
+
+  // Output as YAML.
+  Stream << '\t' << AMDGPU::PALMD::AssemblerDirectiveBegin << '\n';
+  MsgPackDoc.toYAML(Stream);
+  Stream << '\t' << AMDGPU::PALMD::AssemblerDirectiveEnd << '\n';
+
+  // Restore original registers map.
+  RegsObj = OrigRegs;
+}
+
+// Convert the accumulated PAL metadata into a binary blob for writing as
+// a .note record of the specified AMD type. Returns an empty blob if
+// there is no PAL metadata.
+void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
+  if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+    toLegacyBlob(Blob);
+  else if (Type)
+    toMsgPackBlob(Blob);
+}
+
+void AMDGPUPALMetadata::toLegacyBlob(std::string &Blob) {
+  Blob.clear();
+  auto Registers = getRegisters();
+  if (Registers.getMap().empty())
+    return;
+  raw_string_ostream OS(Blob);
+  support::endian::Writer EW(OS, support::endianness::little);
+  for (auto I : Registers.getMap()) {
+    EW.write(uint32_t(I.first.getUInt()));
+    EW.write(uint32_t(I.second.getUInt()));
+  }
+}
+
+void AMDGPUPALMetadata::toMsgPackBlob(std::string &Blob) {
+  Blob.clear();
+  MsgPackDoc.writeToBlob(Blob);
+}
+
+// Set PAL metadata from YAML text. Returns false if failed.
+bool AMDGPUPALMetadata::setFromString(StringRef S) {
+  BlobType = ELF::NT_AMDGPU_METADATA;
+  if (!MsgPackDoc.fromYAML(S))
+    return false;
+
+  // In the registers map, some keys may be of the form "0xa191
+  // (SPI_PS_INPUT_CNTL_0)", in which case the YAML input code made it a
+  // string. We need to turn it into a number.
+  auto &RegsObj = refRegisters();
+  auto OrigRegs = RegsObj;
+  RegsObj = MsgPackDoc.getMapNode();
+  Registers = RegsObj.getMap();
+  bool Ok = true;
+  for (auto I : OrigRegs.getMap()) {
+    auto Key = I.first;
+    if (Key.getKind() == msgpack::Type::String) {
+      StringRef S = Key.getString();
+      uint64_t Val;
+      if (S.consumeInteger(0, Val)) {
+        Ok = false;
+        errs() << "Unrecognized PAL metadata register key '" << S << "'\n";
+        continue;
+      }
+      Key = MsgPackDoc.getNode(uint64_t(Val));
+    }
+    Registers.getMap()[Key] = I.second;
+  }
+  return Ok;
+}
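// Illustrative sketch, not part of the patch: printing whichever textual form
// toString() above produces -- the single legacy reg=value directive, or the
// YAML block bracketed by the begin/end directives.
static void examplePrintPalMD(AMDGPUPALMetadata &MD, raw_ostream &OS) {
  std::string S;
  MD.toString(S);
  OS << S;
}

+
+// Reference (create if necessary) the node for the registers map.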
+msgpack::DocNode &AMDGPUPALMetadata::refRegisters() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".registers")]; + N.getMap(/*Convert=*/true); + return N; +} + +// Get (create if necessary) the registers map. +msgpack::MapDocNode AMDGPUPALMetadata::getRegisters() { + if (Registers.isEmpty()) + Registers = refRegisters(); + return Registers.getMap(); +} + +// Return the PAL metadata hardware shader stage name. +static const char *getStageName(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_PS: + return ".ps"; + case CallingConv::AMDGPU_VS: + return ".vs"; + case CallingConv::AMDGPU_GS: + return ".gs"; + case CallingConv::AMDGPU_ES: + return ".es"; + case CallingConv::AMDGPU_HS: + return ".hs"; + case CallingConv::AMDGPU_LS: + return ".ls"; + default: + return ".cs"; + } +} + +// Get (create if necessary) the .hardware_stages entry for the given calling +// convention. +msgpack::MapDocNode AMDGPUPALMetadata::getHwStage(unsigned CC) { + if (HwStages.isEmpty()) + HwStages = MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)["amdpal.pipelines"] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[".hardware_stages"] + .getMap(/*Convert=*/true); + return HwStages.getMap()[getStageName(CC)].getMap(/*Convert=*/true); +} + +// Get .note record vendor name of metadata blob to be emitted. +const char *AMDGPUPALMetadata::getVendor() const { + return isLegacy() ? ElfNote::NoteNameV2 : ElfNote::NoteNameV3; +} + +// Get .note record type of metadata blob to be emitted: +// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or +// ELF::NT_AMDGPU_METADATA (MsgPack format), or +// 0 (no PAL metadata). +unsigned AMDGPUPALMetadata::getType() const { + return BlobType; +} + +// Return whether the blob type is legacy PAL metadata. +bool AMDGPUPALMetadata::isLegacy() const { + return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA; +} + +// Set legacy PAL metadata format. +void AMDGPUPALMetadata::setLegacy() { + BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA; +} + diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h new file mode 100644 index 000000000000..0f17c157b206 --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -0,0 +1,135 @@ +//===-- AMDGPUPALMetadata.h - PAL metadata handling -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// PAL metadata handling +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" +#include + +namespace llvm { + +class AMDGPUTargetStreamer; +class formatted_raw_ostream; +class MCStreamer; +class Module; + +class AMDGPUPALMetadata { + unsigned BlobType = 0; + msgpack::Document MsgPackDoc; + msgpack::DocNode Registers; + msgpack::DocNode HwStages; + +public: + // Read the amdgpu.pal.metadata supplied by the frontend, ready for + // per-function modification. 
+ void readFromIR(Module &M); + + // Set PAL metadata from a binary blob from the applicable .note record. + // Returns false if bad format. Blob must remain valid for the lifetime of + // the Metadata. + bool setFromBlob(unsigned Type, StringRef Blob); + + // Set the rsrc1 register in the metadata for a particular shader stage. + // In fact this ORs the value into any previous setting of the register. + void setRsrc1(unsigned CC, unsigned Val); + + // Set the rsrc2 register in the metadata for a particular shader stage. + // In fact this ORs the value into any previous setting of the register. + void setRsrc2(unsigned CC, unsigned Val); + + // Set the SPI_PS_INPUT_ENA register in the metadata. + // In fact this ORs the value into any previous setting of the register. + void setSpiPsInputEna(unsigned Val); + + // Set the SPI_PS_INPUT_ADDR register in the metadata. + // In fact this ORs the value into any previous setting of the register. + void setSpiPsInputAddr(unsigned Val); + + // Get a register from the metadata, or 0 if not currently set. + unsigned getRegister(unsigned Reg); + + // Set a register in the metadata. + // In fact this ORs the value into any previous setting of the register. + void setRegister(unsigned Reg, unsigned Val); + + // Set the entry point name for one shader. + void setEntryPoint(unsigned CC, StringRef Name); + + // Set the number of used vgprs in the metadata. This is an optional advisory + // record for logging etc; wave dispatch actually uses the rsrc1 register for + // the shader stage to determine the number of vgprs to allocate. + void setNumUsedVgprs(unsigned CC, unsigned Val); + + // Set the number of used sgprs in the metadata. This is an optional advisory + // record for logging etc; wave dispatch actually uses the rsrc1 register for + // the shader stage to determine the number of sgprs to allocate. + void setNumUsedSgprs(unsigned CC, unsigned Val); + + // Set the scratch size in the metadata. + void setScratchSize(unsigned CC, unsigned Val); + + // Set the hardware register bit in PAL metadata to enable wave32 on the + // shader of the given calling convention. + void setWave32(unsigned CC); + + // Emit the accumulated PAL metadata as asm directives. + // This is called from AMDGPUTargetAsmStreamer::Finish(). + void toString(std::string &S); + + // Set PAL metadata from YAML text. + bool setFromString(StringRef S); + + // Get .note record vendor name of metadata blob to be emitted. + const char *getVendor() const; + + // Get .note record type of metadata blob to be emitted: + // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or + // ELF::NT_AMDGPU_METADATA (MsgPack format), or + // 0 (no PAL metadata). + unsigned getType() const; + + // Emit the accumulated PAL metadata as a binary blob. + // This is called from AMDGPUTargetELFStreamer::Finish(). + void toBlob(unsigned Type, std::string &S); + + // Get the msgpack::Document for the PAL metadata. + msgpack::Document *getMsgPackDoc() { return &MsgPackDoc; } + + // Set legacy PAL metadata format. + void setLegacy(); + +private: + // Return whether the blob type is legacy PAL metadata. + bool isLegacy() const; + + // Reference (create if necessary) the node for the registers map. + msgpack::DocNode &refRegisters(); + + // Get (create if necessary) the registers map. + msgpack::MapDocNode getRegisters(); + + // Get (create if necessary) the .hardware_stages entry for the given calling + // convention. 
+ msgpack::MapDocNode getHwStage(unsigned CC); + + bool setFromLegacyBlob(StringRef Blob); + bool setFromMsgPackBlob(StringRef Blob); + void toLegacyBlob(std::string &Blob); + void toMsgPackBlob(std::string &Blob); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index 82ffdef8e674..95ad3f35d18f 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -1,9 +1,8 @@ //===--------------------- AMDKernelCodeTInfo.h ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -83,6 +82,9 @@ COMPPGM1(priv, compute_pgm_rsrc1_priv, PRIV COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP), COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE), COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE), +COMPPGM1(enable_wgp_mode, compute_pgm_rsrc1_wgp_mode, WGP_MODE), +COMPPGM1(enable_mem_ordered, compute_pgm_rsrc1_mem_ordered, MEM_ORDERED), +COMPPGM1(enable_fwd_progress, compute_pgm_rsrc1_fwd_progress, FWD_PROGRESS), // TODO: bulky // TODO: cdbg_user COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN), @@ -107,6 +109,7 @@ CODEPROP(enable_sgpr_private_segment_size, ENABLE_SGPR_PRIVATE_SEGMENT_SIZE), CODEPROP(enable_sgpr_grid_workgroup_count_x, ENABLE_SGPR_GRID_WORKGROUP_COUNT_X), CODEPROP(enable_sgpr_grid_workgroup_count_y, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y), CODEPROP(enable_sgpr_grid_workgroup_count_z, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z), +CODEPROP(enable_wavefront_size32, ENABLE_WAVEFRONT_SIZE32), CODEPROP(enable_ordered_append_gds, ENABLE_ORDERED_APPEND_GDS), CODEPROP(private_element_size, PRIVATE_ELEMENT_SIZE), CODEPROP(is_ptr64, IS_PTR64), diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp index 20059f4a1ed7..443e2cc45ac0 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp @@ -1,9 +1,8 @@ //===- AMDKernelCodeTUtils.cpp --------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h index ef9f9bdb6bcb..a87325a78df3 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h @@ -1,9 +1,8 @@ //===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td index 1fd1c1e21527..bd65a495fa72 100644 --- a/lib/Target/AMDGPU/VIInstrFormats.td +++ b/lib/Target/AMDGPU/VIInstrFormats.td @@ -1,9 +1,8 @@ //===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td index b45c8fc9c7d5..ec7d8875a746 100644 --- a/lib/Target/AMDGPU/VIInstructions.td +++ b/lib/Target/AMDGPU/VIInstructions.td @@ -1,9 +1,8 @@ //===-- VIInstructions.td - VI Instruction Defintions ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Instruction definitions for VI and newer. diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 68446ab79720..6bc416ed7d4b 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -1,9 +1,8 @@ //===-- VOP1Instructions.td - Vector Instruction Defintions ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -15,7 +14,7 @@ class VOP1e op, VOPProfile P> : Enc32 { bits<8> vdst; bits<9> src0; - let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0); + let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, ?); let Inst{16-9} = op; let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); let Inst{31-25} = 0x3f; //encoding @@ -48,7 +47,6 @@ class VOP1_Pseudo pattern=[], bit VOP1On let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let SubtargetPredicate = isGCN; let VOP1 = 1; let VALU = 1; @@ -144,7 +142,7 @@ defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>; // TODO: Make profile for this, there is VOP3 encoding also def V_READFIRSTLANE_B32 : InstSI <(outs SReg_32:$vdst), - (ins VGPR_32:$src0), + (ins VRegOrLds_32:$src0), "v_readfirstlane_b32 $vdst, $src0", [(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>, Enc32 { @@ -156,7 +154,6 @@ def V_READFIRSTLANE_B32 : let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let SubtargetPredicate = isGCN; let VOP1 = 1; let VALU = 1; @@ -172,9 +169,16 @@ def V_READFIRSTLANE_B32 : let Inst{31-25} = 0x3f; //encoding } -let SchedRW = [WriteQuarterRate32] in { -defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>; +let SchedRW = [WriteDoubleCvt] in { +defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>; defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; +defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; +defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; +defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>; +defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; +} // End SchedRW = [WriteDoubleCvt] + +let SchedRW = [WriteQuarterRate32] in { defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>; defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>; defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; @@ -186,15 +190,12 @@ defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>; -defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; -defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; +} // End SchedRW = [WriteQuarterRate32] + defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>; defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>; defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>; defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>; -defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>; -defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; -} // End SchedRW = [WriteQuarterRate32] defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>; defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>; @@ -271,6 +272,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let InsDPP = (ins DstRC:$vdst, DstRC:$old, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, 
bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, @@ -279,6 +281,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Asm32 = getAsm32<1, 1>.ret; let Asm64 = getAsm64<1, 1, 0, 0, 1>.ret; let AsmDPP = getAsmDPP<1, 1, 0>.ret; + let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret; let AsmSDWA = getAsmSDWA<1, 1>.ret; let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret; @@ -305,41 +308,43 @@ defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT>; defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; -// These instruction only exist on SI and CI -let SubtargetPredicate = isSICI in { - -let SchedRW = [WriteQuarterRate32] in { -defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>; -defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>; -defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>; -defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>; -defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>; -} // End SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteDouble] in { -defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>; -defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>; -} // End SchedRW = [WriteDouble] - -} // End SubtargetPredicate = isSICI - - -let SubtargetPredicate = isCIVI in { - -let SchedRW = [WriteDoubleAdd] in { -defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>; -defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>; -defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>; -defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>; -} // End SchedRW = [WriteDoubleAdd] - -let SchedRW = [WriteQuarterRate32] in { -defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>; -defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; -} // End SchedRW = [WriteQuarterRate32] - -} // End SubtargetPredicate = isCIVI - +let SubtargetPredicate = isGFX6GFX7 in { + let SchedRW = [WriteQuarterRate32] in { + defm V_LOG_CLAMP_F32 : + VOP1Inst<"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>; + defm V_RCP_CLAMP_F32 : + VOP1Inst<"v_rcp_clamp_f32", VOP_F32_F32>; + defm V_RCP_LEGACY_F32 : + VOP1Inst<"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>; + defm V_RSQ_CLAMP_F32 : + VOP1Inst<"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>; + defm V_RSQ_LEGACY_F32 : + VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>; + } // End SchedRW = [WriteQuarterRate32] + + let SchedRW = [WriteDouble] in { + defm V_RCP_CLAMP_F64 : + VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>; + defm V_RSQ_CLAMP_F64 : + VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>; + } // End SchedRW = [WriteDouble] +} // End SubtargetPredicate = isGFX6GFX7 + +let SubtargetPredicate = isGFX7GFX8GFX9 in { + let SchedRW = [WriteQuarterRate32] in { + defm V_LOG_LEGACY_F32 : VOP1Inst<"v_log_legacy_f32", VOP_F32_F32>; + defm V_EXP_LEGACY_F32 : VOP1Inst<"v_exp_legacy_f32", VOP_F32_F32>; + } // End SchedRW = [WriteQuarterRate32] +} // End SubtargetPredicate = isGFX7GFX8GFX9 + +let SubtargetPredicate = isGFX7Plus in { + let SchedRW = [WriteDoubleAdd] in { + defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>; + defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", VOP_F64_F64, fceil>; + defm V_RNDNE_F64 : 
VOP1Inst<"v_rndne_f64", VOP_F64_F64, frint>; + defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>; + } // End SchedRW = [WriteDoubleAdd] +} // End SubtargetPredicate = isGFX7Plus let SubtargetPredicate = Has16BitInsts in { @@ -393,125 +398,279 @@ def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> { let Ins64 = (ins); } -let SubtargetPredicate = isGFX9 in { - let Constraints = "$vdst = $src1, $vdst1 = $src0", - DisableEncoding="$vdst1,$src1", - SchedRW = [Write64Bit, Write64Bit] in { -// Never VOP3. Takes as long as 2 v_mov_b32s -def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>; +let SubtargetPredicate = isGFX9Plus in { + def V_SWAP_B32 : VOP1_Pseudo<"v_swap_b32", VOP_SWAP_I32, [], 1> { + let Constraints = "$vdst = $src1, $vdst1 = $src0"; + let DisableEncoding = "$vdst1,$src1"; + let SchedRW = [Write64Bit, Write64Bit]; + } + + defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>; + defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>; + defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>; +} // End SubtargetPredicate = isGFX9Plus + +let SubtargetPredicate = isGFX9Only in { + defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; +} // End SubtargetPredicate = isGFX9Only + +let SubtargetPredicate = isGFX10Plus in { + defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NONE>; + + let Uses = [M0] in { + // FIXME-GFX10: Should V_MOVRELSD_2_B32 be VOP_NO_EXT? + defm V_MOVRELSD_2_B32 : + VOP1Inst<"v_movrelsd_2_b32", VOP_NO_EXT>; + + def V_SWAPREL_B32 : VOP1_Pseudo<"v_swaprel_b32", VOP_SWAP_I32, [], 1> { + let Constraints = "$vdst = $src1, $vdst1 = $src0"; + let DisableEncoding = "$vdst1,$src1"; + let SchedRW = [Write64Bit, Write64Bit]; + } + } // End Uses = [M0] +} // End SubtargetPredicate = isGFX10Plus + +//===----------------------------------------------------------------------===// +// Target-specific instruction encodings. +//===----------------------------------------------------------------------===// + +class VOP1_DPP op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> : + VOP_DPP { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + + bits<8> vdst; + let Inst{8-0} = 0xfa; + let Inst{16-9} = op; + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; } -defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; +class VOP1_DPP16 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : + VOP1_DPP { + let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); + let SubtargetPredicate = HasDPP16; +} -defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>; -defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>; -defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>; +class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : + VOP_DPP8 { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; -} // End SubtargetPredicate = isGFX9 + bits<8> vdst; + let Inst{8-0} = fi; + let Inst{16-9} = op; + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; + + let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst); + let SubtargetPredicate = HasDPP8; +} //===----------------------------------------------------------------------===// -// Target +// GFX10. 
//===----------------------------------------------------------------------===// +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass VOP1Only_Real_gfx10 op> { + def _gfx10 : + VOP1_Real(NAME), SIEncodingFamily.GFX10>, + VOP1e(NAME).Pfl>; + } + multiclass VOP1_Real_e32_gfx10 op> { + def _e32_gfx10 : + VOP1_Real(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOP1e(NAME#"_e32").Pfl>; + } + multiclass VOP1_Real_e64_gfx10 op> { + def _e64_gfx10 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast(NAME#"_e64").Pfl>; + } + multiclass VOP1_Real_sdwa_gfx10 op> { + def _sdwa_gfx10 : + VOP_SDWA10_Real(NAME#"_sdwa")>, + VOP1_SDWA9Ae(NAME#"_sdwa").Pfl> { + let DecoderNamespace = "SDWA10"; + } + } + multiclass VOP1_Real_dpp_gfx10 op> { + def _dpp_gfx10 : VOP1_DPP16(NAME#"_e32")> { + let DecoderNamespace = "SDWA10"; + } + } + multiclass VOP1_Real_dpp8_gfx10 op> { + def _dpp8_gfx10 : VOP1_DPP8(NAME#"_e32")> { + let DecoderNamespace = "DPP8"; + } + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +multiclass VOP1_Real_gfx10_no_dpp op> : + VOP1_Real_e32_gfx10, VOP1_Real_e64_gfx10, + VOP1_Real_sdwa_gfx10; + +multiclass VOP1_Real_gfx10_no_dpp8 op> : + VOP1_Real_e32_gfx10, VOP1_Real_e64_gfx10, + VOP1_Real_sdwa_gfx10, VOP1_Real_dpp_gfx10; + +multiclass VOP1_Real_gfx10 op> : + VOP1_Real_gfx10_no_dpp8, VOP1_Real_dpp8_gfx10; + +defm V_PIPEFLUSH : VOP1_Real_gfx10<0x01b>; +defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10<0x048>; +defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>; +defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>; +defm V_CVT_U16_F16 : VOP1_Real_gfx10<0x052>; +defm V_CVT_I16_F16 : VOP1_Real_gfx10<0x053>; +defm V_RCP_F16 : VOP1_Real_gfx10<0x054>; +defm V_SQRT_F16 : VOP1_Real_gfx10<0x055>; +defm V_RSQ_F16 : VOP1_Real_gfx10<0x056>; +defm V_LOG_F16 : VOP1_Real_gfx10<0x057>; +defm V_EXP_F16 : VOP1_Real_gfx10<0x058>; +defm V_FREXP_MANT_F16 : VOP1_Real_gfx10<0x059>; +defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10<0x05a>; +defm V_FLOOR_F16 : VOP1_Real_gfx10<0x05b>; +defm V_CEIL_F16 : VOP1_Real_gfx10<0x05c>; +defm V_TRUNC_F16 : VOP1_Real_gfx10<0x05d>; +defm V_RNDNE_F16 : VOP1_Real_gfx10<0x05e>; +defm V_FRACT_F16 : VOP1_Real_gfx10<0x05f>; +defm V_SIN_F16 : VOP1_Real_gfx10<0x060>; +defm V_COS_F16 : VOP1_Real_gfx10<0x061>; +defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>; +defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>; +defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>; + +defm V_SWAP_B32 : VOP1Only_Real_gfx10<0x065>; +defm V_SWAPREL_B32 : VOP1Only_Real_gfx10<0x068>; + //===----------------------------------------------------------------------===// -// SI +// GFX7, GFX10. 
//===----------------------------------------------------------------------===// -multiclass VOP1_Real_si op> { - let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { - def _e32_si : +let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { + multiclass VOP1_Real_e32_gfx7 op> { + def _e32_gfx7 : VOP1_Real(NAME#"_e32"), SIEncodingFamily.SI>, VOP1e(NAME#"_e32").Pfl>; - def _e64_si : + } + multiclass VOP1_Real_e64_gfx7 op> { + def _e64_gfx7 : VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3e_si <{1, 1, op{6-0}}, !cast(NAME#"_e64").Pfl>; + VOP3e_gfx6_gfx7<{1, 1, op{6-0}}, !cast(NAME#"_e64").Pfl>; } -} +} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" -defm V_NOP : VOP1_Real_si <0x0>; -defm V_MOV_B32 : VOP1_Real_si <0x1>; -defm V_CVT_I32_F64 : VOP1_Real_si <0x3>; -defm V_CVT_F64_I32 : VOP1_Real_si <0x4>; -defm V_CVT_F32_I32 : VOP1_Real_si <0x5>; -defm V_CVT_F32_U32 : VOP1_Real_si <0x6>; -defm V_CVT_U32_F32 : VOP1_Real_si <0x7>; -defm V_CVT_I32_F32 : VOP1_Real_si <0x8>; -defm V_MOV_FED_B32 : VOP1_Real_si <0x9>; -defm V_CVT_F16_F32 : VOP1_Real_si <0xa>; -defm V_CVT_F32_F16 : VOP1_Real_si <0xb>; -defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>; -defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>; -defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>; -defm V_CVT_F32_F64 : VOP1_Real_si <0xf>; -defm V_CVT_F64_F32 : VOP1_Real_si <0x10>; -defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>; -defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>; -defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>; -defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>; -defm V_CVT_U32_F64 : VOP1_Real_si <0x15>; -defm V_CVT_F64_U32 : VOP1_Real_si <0x16>; -defm V_FRACT_F32 : VOP1_Real_si <0x20>; -defm V_TRUNC_F32 : VOP1_Real_si <0x21>; -defm V_CEIL_F32 : VOP1_Real_si <0x22>; -defm V_RNDNE_F32 : VOP1_Real_si <0x23>; -defm V_FLOOR_F32 : VOP1_Real_si <0x24>; -defm V_EXP_F32 : VOP1_Real_si <0x25>; -defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>; -defm V_LOG_F32 : VOP1_Real_si <0x27>; -defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>; -defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>; -defm V_RCP_F32 : VOP1_Real_si <0x2a>; -defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>; -defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>; -defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>; -defm V_RSQ_F32 : VOP1_Real_si <0x2e>; -defm V_RCP_F64 : VOP1_Real_si <0x2f>; -defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>; -defm V_RSQ_F64 : VOP1_Real_si <0x31>; -defm V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>; -defm V_SQRT_F32 : VOP1_Real_si <0x33>; -defm V_SQRT_F64 : VOP1_Real_si <0x34>; -defm V_SIN_F32 : VOP1_Real_si <0x35>; -defm V_COS_F32 : VOP1_Real_si <0x36>; -defm V_NOT_B32 : VOP1_Real_si <0x37>; -defm V_BFREV_B32 : VOP1_Real_si <0x38>; -defm V_FFBH_U32 : VOP1_Real_si <0x39>; -defm V_FFBL_B32 : VOP1_Real_si <0x3a>; -defm V_FFBH_I32 : VOP1_Real_si <0x3b>; -defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>; -defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>; -defm V_FRACT_F64 : VOP1_Real_si <0x3e>; -defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>; -defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>; -defm V_CLREXCP : VOP1_Real_si <0x41>; -defm V_MOVRELD_B32 : VOP1_Real_si <0x42>; -defm V_MOVRELS_B32 : VOP1_Real_si <0x43>; -defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>; +multiclass VOP1_Real_gfx7 op> : + VOP1_Real_e32_gfx7, VOP1_Real_e64_gfx7; + +multiclass VOP1_Real_gfx7_gfx10 op> : + VOP1_Real_gfx7, VOP1_Real_gfx10; + +defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>; +defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>; + +defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10<0x017>; +defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10<0x018>; 
+defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10<0x019>; +defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10<0x01a>; //===----------------------------------------------------------------------===// -// CI +// GFX6, GFX7, GFX10. //===----------------------------------------------------------------------===// -multiclass VOP1_Real_ci op> { - let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in { - def _e32_ci : +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass VOP1_Real_e32_gfx6_gfx7 op> { + def _e32_gfx6_gfx7 : VOP1_Real(NAME#"_e32"), SIEncodingFamily.SI>, VOP1e(NAME#"_e32").Pfl>; - def _e64_ci : + } + multiclass VOP1_Real_e64_gfx6_gfx7 op> { + def _e64_gfx6_gfx7 : VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3e_si <{1, 1, op{6-0}}, !cast(NAME#"_e64").Pfl>; + VOP3e_gfx6_gfx7<{1, 1, op{6-0}}, !cast(NAME#"_e64").Pfl>; } -} - -defm V_TRUNC_F64 : VOP1_Real_ci <0x17>; -defm V_CEIL_F64 : VOP1_Real_ci <0x18>; -defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>; -defm V_RNDNE_F64 : VOP1_Real_ci <0x19>; -defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>; -defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>; +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass VOP1_Real_gfx6_gfx7 op> : + VOP1_Real_e32_gfx6_gfx7, VOP1_Real_e64_gfx6_gfx7; + +multiclass VOP1_Real_gfx6_gfx7_gfx10 op> : + VOP1_Real_gfx6_gfx7, VOP1_Real_gfx10; + +multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp8 op> : + VOP1_Real_gfx6_gfx7, VOP1_Real_gfx10_no_dpp8; + +multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp op> : + VOP1_Real_gfx6_gfx7, VOP1_Real_gfx10_no_dpp; + +defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>; +defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>; +defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>; +defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>; +defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>; +defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>; +defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>; + +defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10<0x000>; +defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x001>; +defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x003>; +defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x004>; +defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x005>; +defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x006>; +defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x007>; +defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x008>; +defm V_MOV_FED_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x009>; +defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>; +defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>; +defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>; +defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>; +defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10<0x00e>; +defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x00f>; +defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x010>; +defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10<0x011>; +defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10<0x012>; +defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10<0x013>; +defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10<0x014>; +defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x015>; +defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x016>; +defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x020>; +defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x021>; +defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x022>; +defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x023>; +defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x024>; +defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x025>; +defm V_LOG_F32 : 
VOP1_Real_gfx6_gfx7_gfx10<0x027>; +defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02a>; +defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02b>; +defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02e>; +defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x02f>; +defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x031>; +defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x033>; +defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x034>; +defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x035>; +defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x036>; +defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x037>; +defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x038>; +defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>; +defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>; +defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>; +defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03c>; +defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03d>; +defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03e>; +defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x03f>; +defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x040>; +defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>; +defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp<0x042>; +defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x043>; +defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x044>; //===----------------------------------------------------------------------===// -// VI +// GFX8, GFX9 (VI). //===----------------------------------------------------------------------===// class VOP1_DPPe op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : @@ -524,7 +683,7 @@ class VOP1_DPPe op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : } multiclass VOP1Only_Real_vi op> { - let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in { def _vi : VOP1_Real(NAME), SIEncodingFamily.VI>, VOP1e(NAME).Pfl>; @@ -532,7 +691,7 @@ multiclass VOP1Only_Real_vi op> { } multiclass VOP1_Real_e32e64_vi op> { - let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in { def _e32_vi : VOP1_Real(NAME#"_e32"), SIEncodingFamily.VI>, VOP1e(NAME#"_e32").Pfl>; @@ -649,7 +808,7 @@ def V_MOV_B32_indirect : VPseudoInstSI<(outs), PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT.ret:$vdst, getVOPSrc0ForVT.ret:$src0)> { let VOP1 = 1; - let SubtargetPredicate = isVI; + let SubtargetPredicate = isGFX8GFX9; } // This is a pseudo variant of the v_movreld_b32 instruction in which the @@ -672,7 +831,7 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo; def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo; def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo; -let OtherPredicates = [isVI] in { +let OtherPredicates = [isGFX8GFX9] in { def : GCNPat < (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, @@ -690,6 +849,9 @@ def : GCNPat < (as_i1imm $bound_ctrl)) >; +} // End OtherPredicates = [isGFX8GFX9] + +let OtherPredicates = [isGFX8Plus] in { def : GCNPat< (i32 (anyext i16:$src)), (COPY $src) @@ -712,14 +874,14 @@ def : GCNPat < (EXTRACT_SUBREG $src, sub0) >; -} // End OtherPredicates = [isVI] +} // End OtherPredicates = [isGFX8Plus] //===----------------------------------------------------------------------===// // GFX9 //===----------------------------------------------------------------------===// multiclass VOP1_Real_gfx9 op> { - let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in { + let AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9" in { defm 
NAME : VOP1_Real_e32e64_vi ; } @@ -735,3 +897,30 @@ multiclass VOP1_Real_gfx9 op> { } defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; + +//===----------------------------------------------------------------------===// +// GFX10 +//===----------------------------------------------------------------------===// + +let OtherPredicates = [isGFX10Plus] in { +def : GCNPat < + (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)), + (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0)) +>; + +def : GCNPat < + (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, + imm:$bound_ctrl)), + (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl), (i32 0)) +>; + +def : GCNPat < + (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask, + imm:$bank_mask, imm:$bound_ctrl)), + (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl), (i32 0)) +>; +} // End OtherPredicates = [isGFX10Plus] diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index e3fd7b5f9fad..1b30cd2ed516 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -1,9 +1,8 @@ //===-- VOP2Instructions.td - Vector Instruction Defintions ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -69,7 +68,6 @@ class VOP2_Pseudo pattern=[], string suf let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let SubtargetPredicate = isGCN; let VOP2 = 1; let VALU = 1; @@ -177,7 +175,9 @@ multiclass VOP2bInst .ret>, - Commutable_REV; + Commutable_REV { + let usesCustomInserter = !eq(P.NumSrcArgs, 2); + } def _sdwa : VOP2_SDWA_Pseudo { let AsmMatchConverter = "cvtSdwaVOP2b"; @@ -192,6 +192,23 @@ multiclass VOP2bInst : + InstAlias , + PredicateControl { +} + +multiclass VOP2bInstAliases { + let WaveSizePredicate = isWave32 in { + def : VOP2bInstAlias; + } + let WaveSizePredicate = isWave64 in { + def : VOP2bInstAlias; + } +} + multiclass VOP2eInst : + InstAlias , + PredicateControl { +} + +multiclass VOP2eInstAliases { + let WaveSizePredicate = isWave32 in { + def : VOP2eInstAlias; + } + let WaveSizePredicate = isWave64 in { + def : VOP2eInstAlias; + } +} + class VOP_MADAK : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); @@ -244,15 +277,22 @@ def VOP_MADMK_F32 : VOP_MADMK ; // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory // and processing time but it makes it easier to convert to mad. 
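The MADK profiles above (VOP_MADAK / VOP_MADMK) are the VOP2 forms that carry a trailing 32-bit literal: the hardware encoding is one ordinary VOP2 dword followed immediately by the literal K. As a rough, self-contained illustration of that layout, the C++ sketch below packs such an encoding using the VOP2 field positions visible later in this patch (src0 in Inst{8-0}, vsrc1 in Inst{16-9}, vdst in Inst{24-17}, op in Inst{30-25}); the helper name packVOP2MADK and the framing are invented for the example, not LLVM API.

// Hedged sketch: 64-bit v_madak_f32 encoding (VOP2 word + trailing literal K).
#include <cstdint>
#include <cstdio>
#include <cstring>

uint64_t packVOP2MADK(unsigned op6, unsigned vdst8, unsigned vsrc1_8,
                      unsigned src0_9, float K) {
  uint32_t word0 = 0;
  word0 |= (src0_9  & 0x1ffu);        // Inst{8-0}   = src0 (9-bit operand)
  word0 |= (vsrc1_8 & 0xffu)  << 9;   // Inst{16-9}  = vsrc1 (VGPR)
  word0 |= (vdst8   & 0xffu)  << 17;  // Inst{24-17} = vdst
  word0 |= (op6     & 0x3fu)  << 25;  // Inst{30-25} = opcode
                                      // Inst{31}    = 0 marks VOP2
  uint32_t k;
  std::memcpy(&k, &K, sizeof k);      // trailing 32-bit literal dword
  return uint64_t(word0) | (uint64_t(k) << 32);
}

int main() {
  // v_madak_f32 v1, v2, v3, 1.0  (opcode 0x21 on SI per this patch).
  // VGPRs in the 9-bit src0 field are encoded starting at 0x100.
  uint64_t enc = packVOP2MADK(0x21, 1, 3, 0x100 + 2, 1.0f);
  std::printf("0x%016llx\n", (unsigned long long)enc);
}

Note that src1 is declared as plain VGPR_32 in these profiles because the 8-bit vsrc1 field can only name VGPRs, while the 9-bit src0 field spans SGPRs, inline constants, and VGPRs.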
-class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { +class VOP_MAC : VOPProfile <[vt0, vt1, vt1, vt0]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64, 3, - 0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; + 0, HasModifiers, HasModifiers, HasOMod, + Src0Mod, Src1Mod, Src2Mod>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + + let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, + VGPR_32:$src2, // stub argument + dpp8:$dpp8, FI:$fi); let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, @@ -260,11 +300,13 @@ class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let Asm32 = getAsm32<1, 2, vt>.ret; - let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt>.ret; - let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; - let AsmSDWA = getAsmSDWA<1, 2, vt>.ret; - let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret; + let Asm32 = getAsm32<1, 2, vt0>.ret; + let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt0>.ret; + let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt0>.ret; + let AsmDPP16 = getAsmDPP16<1, 2, HasModifiers, vt0>.ret; + let AsmDPP8 = getAsmDPP8<1, 2, 0, vt0>.ret; + let AsmSDWA = getAsmSDWA<1, 2, vt0>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt0>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; @@ -272,38 +314,51 @@ class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 0; + let TieRegDPP = "$src2"; } def VOP_MAC_F16 : VOP_MAC ; def VOP_MAC_F32 : VOP_MAC ; +class VOP_DOT_ACC : VOP_MAC { + let HasClamp = 0; + let HasExtSDWA = 0; + let HasModifiers = 1; + let HasOpSel = 0; + let IsPacked = 0; +} + +def VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC { + let Src0ModDPP = FPVRegInputMods; + let Src1ModDPP = FPVRegInputMods; +} +def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC; + // Write out to vcc or arbitrary SGPR. -def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { +def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp=*/1> { let Asm32 = "$vdst, vcc, $src0, $src1"; - let Asm64 = "$vdst, $sdst, $src0, $src1"; + let Asm64 = "$vdst, $sdst, $src0, $src1$clamp"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); - let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); } // Write out to vcc or arbitrary SGPR and read in from vcc or // arbitrary SGPR. -def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { - // We use VCSrc_b32 to exclude literal constants, even though the - // encoding normally allows them since the implicit VCC use means - // using one would always violate the constant bus - // restriction. SGPRs are still allowed because it should - // technically be possible to use VCC again as src0. 
- let Src0RC32 = VCSrc_b32; +def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> { let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; - let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; + let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP8 = "$vdst, vcc, $src0, $src1, vcc $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); - let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); // Suppress src2 implied by type since the 32-bit encoding uses an // implicit VCC use. @@ -320,20 +375,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let HasExt = 1; let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; } -// Read in from vcc or arbitrary SGPR -def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { - let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above. - let Asm32 = "$vdst, $src0, $src1, vcc"; - let Asm64 = "$vdst, $src0, $src1, $src2"; +// Read in from vcc or arbitrary SGPR. +def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/1> { + let Asm32 = "$vdst, $src0, $src1"; + let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2"; let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP8 = "$vdst, $src0, $src1, vcc $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst); @@ -349,10 +407,12 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { src0_sel:$src0_sel, src1_sel:$src1_sel); let InsDPP = (ins DstRCDPP:$old, - Src0DPP:$src0, - Src1DPP:$src1, + Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let HasExt = 1; let HasExtDPP = 1; let HasExtSDWA = 1; @@ -362,7 +422,7 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let Outs32 = (outs SReg_32:$vdst); let Outs64 = Outs32; - let Ins32 = (ins VGPR_32:$src0, SCSrc_b32:$src1); + let Ins32 = (ins VRegOrLds_32:$src0, SCSrc_b32:$src1); let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; @@ -393,8 +453,6 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { // VOP2 Instructions //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGCN, Predicates = [isGCN] in { - defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; @@ -414,9 +472,9 @@ defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN, smin>; defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN, smax>; defm V_MIN_U32 : VOP2Inst 
<"v_min_u32", VOP_PAT_GEN, umin>; defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN, umax>; -defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">; -defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">; -defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">; +defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, lshr_rev, "v_lshr_b32">; +defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, ashr_rev, "v_ashr_i32">; +defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, lshl_rev, "v_lshl_b32">; defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN, and>; defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN, or>; defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN, xor>; @@ -442,9 +500,9 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f let SubtargetPredicate = HasAddNoCarryInsts in { -defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32, null_frag, "v_add_u32", 1>; -defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>; -defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>; +defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>; +defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; +defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; } } // End isCommutable = 1 @@ -472,32 +530,20 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT, AMDGPUpk_u16_u32>; defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT, AMDGPUpk_i16_i32>; -} // End SubtargetPredicate = isGCN, Predicates = [isGCN] - -def : GCNPat< - (AMDGPUadde i32:$src0, i32:$src1, i1:$src2), - (V_ADDC_U32_e64 $src0, $src1, $src2) ->; - -def : GCNPat< - (AMDGPUsube i32:$src0, i32:$src1, i1:$src2), - (V_SUBB_U32_e64 $src0, $src1, $src2) ->; - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI, Predicates = [isSICI] in { +let SubtargetPredicate = isGFX6GFX7 in { defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>; defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>; +} // End SubtargetPredicate = isGFX6GFX7 +let SubtargetPredicate = isGFX6GFX7GFX10 in { let isCommutable = 1 in { defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>; -defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN, srl>; -defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN, sra>; -defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN, shl>; +defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; } // End isCommutable = 1 - -} // End let SubtargetPredicate = SICI, Predicates = [isSICI] +} // End SubtargetPredicate = isGFX6GFX7GFX10 class DivergentBinOp : GCNPat< @@ -508,29 +554,29 @@ class DivergentBinOp : ) >; -let AddedComplexity = 1 in { - def : DivergentBinOp; - def : DivergentBinOp; - def : DivergentBinOp; -} +class DivergentClampingBinOp : + GCNPat< + (getDivergentFrag.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1), + !if(!cast(Inst).IsOrig, + (Inst $src0, $src1, 0), + (Inst $src1, $src0, 0) + ) + >; + +def : DivergentBinOp; +def : DivergentBinOp; +def : DivergentBinOp; let SubtargetPredicate = HasAddNoCarryInsts in { - def : 
DivergentBinOp; - def : DivergentBinOp; - def : DivergentBinOp; + def : DivergentClampingBinOp; + def : DivergentClampingBinOp; } +let SubtargetPredicate = isGFX6GFX7GFX8GFX9, Predicates = [isGFX6GFX7GFX8GFX9] in { +def : DivergentClampingBinOp; +def : DivergentClampingBinOp; +} -def : DivergentBinOp; - -def : DivergentBinOp; -def : DivergentBinOp; - -def : DivergentBinOp; - -def : DivergentBinOp; -def : DivergentBinOp; -def : DivergentBinOp; def : DivergentBinOp; def : DivergentBinOp; @@ -604,56 +650,133 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; } // End SubtargetPredicate = HasDLInsts -// Note: 16-bit instructions produce a 0 result in the high 16-bits. -multiclass Arithmetic_i16_Pats { +let Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1 in { + let SubtargetPredicate = HasDot5Insts in + defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>; + let SubtargetPredicate = HasDot6Insts in + defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>; + + let SubtargetPredicate = HasDot4Insts in + defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>; + let SubtargetPredicate = HasDot3Insts in + defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>; +} + +let AddedComplexity = 30 in { + def : GCNPat< + (f32 (AMDGPUfdot2 v2f16:$src0, v2f16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))), + (f32 (V_DOT2C_F32_F16_e32 $src0, $src1, $src2)) + > { + let SubtargetPredicate = HasDot5Insts; + } + def : GCNPat< + (i32 (int_amdgcn_sdot4 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))), + (i32 (V_DOT4C_I32_I8_e32 $src0, $src1, $src2)) + > { + let SubtargetPredicate = HasDot6Insts; + } + def : GCNPat< + (i32 (int_amdgcn_sdot2 v2i16:$src0, v2i16:$src1, i32:$src2, (i1 DSTCLAMP.NONE))), + (i32 (V_DOT2C_I32_I16_e32 $src0, $src1, $src2)) + > { + let SubtargetPredicate = HasDot4Insts; + } + def : GCNPat< + (i32 (int_amdgcn_sdot8 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))), + (i32 (V_DOT8C_I32_I4_e32 $src0, $src1, $src2)) + > { + let SubtargetPredicate = HasDot3Insts; + } +} // End AddedComplexity = 30 + +let SubtargetPredicate = isGFX10Plus in { + +def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">; +let FPDPRounding = 1 in +def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">; + +let isCommutable = 1 in { +def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">; +let FPDPRounding = 1 in +def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">; +} // End isCommutable = 1 + +let Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1 in { +defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>; +} + +} // End SubtargetPredicate = isGFX10Plus + +let SubtargetPredicate = HasPkFmacF16Inst in { +defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; +} // End SubtargetPredicate = HasPkFmacF16Inst + +// Note: 16-bit instructions produce a 0 result in the high 16-bits +// on GFX8 and GFX9 and preserve high 16 bits on GFX10+ +def ClearHI16 : OutPatFrag<(ops node:$op), + (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>; + +multiclass Arithmetic_i16_Pats { def : GCNPat< (op i16:$src0, i16:$src1), - (inst $src0, $src1) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) >; def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - (inst $src0, $src1) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, 
$src1)), (inst $src0, $src1)) >; def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - (inst $src0, $src1), sub0, + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)), + sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; - } -multiclass Bits_OpsRev_i16_Pats { +multiclass Bits_OpsRev_i16_Pats { def : GCNPat< (op i16:$src0, i16:$src1), - (inst $src1, $src0) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) >; def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - (inst $src1, $src0) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) >; def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - (inst $src1, $src0), sub0, + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)), + sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; } class ZExt_i16_i1_Pat : GCNPat < (i16 (ext i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) + (V_CNDMASK_B32_e64 (i32 0/*src0mod*/), (i32 0/*src0*/), + (i32 0/*src1mod*/), (i32 1/*src1*/), + $src) >; let Predicates = [Has16BitInsts] in { +let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; @@ -661,6 +784,17 @@ defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; +} + +let Predicates = [Has16BitInsts, isGFX10Plus] in { +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +} def : GCNPat < (and i16:$src0, i16:$src1), @@ -677,16 +811,25 @@ def : GCNPat < (V_XOR_B32_e64 $src0, $src1) >; +let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { defm : Bits_OpsRev_i16_Pats; defm : Bits_OpsRev_i16_Pats; defm : Bits_OpsRev_i16_Pats; +} + +let Predicates = [Has16BitInsts, isGFX10Plus] in { +defm : Bits_OpsRev_i16_Pats; +defm : Bits_OpsRev_i16_Pats; +defm : Bits_OpsRev_i16_Pats; +} def : ZExt_i16_i1_Pat; def : ZExt_i16_i1_Pat; def : GCNPat < (i16 (sext i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), $src) >; // Undo sub x, c -> add x, -c canonicalization since c is more likely @@ -697,105 +840,334 @@ def : GCNPat< (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) >; -} // End Predicates = [Has16BitInsts] +} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9] + //===----------------------------------------------------------------------===// -// SI +// Target-specific instruction encodings. 
//===----------------------------------------------------------------------===// -let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { +class VOP2_DPP op, VOP2_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl, + bit IsDPP16 = 0> : + VOP_DPP { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; -multiclass VOP2_Real_si op> { - def _si : - VOP2_Real(NAME), SIEncodingFamily.SI>, - VOP2e(NAME).Pfl>; + bits<8> vdst; + bits<8> src1; + let Inst{8-0} = 0xfa; + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; } -multiclass VOP2_Real_MADK_si op> { - def _si : VOP2_Real(NAME), SIEncodingFamily.SI>, - VOP2_MADKe(NAME).Pfl>; +class VOP2_DPP16 op, VOP2_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + VOP2_DPP { + let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); + let SubtargetPredicate = HasDPP16; } -multiclass VOP2_Real_e32_si op> { - def _e32_si : - VOP2_Real(NAME#"_e32"), SIEncodingFamily.SI>, - VOP2e(NAME#"_e32").Pfl>; +class VOP2_DPP8 op, VOP2_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + VOP_DPP8 { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + + bits<8> vdst; + bits<8> src1; + + let Inst{8-0} = fi; + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; + + let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst); + let SubtargetPredicate = HasDPP8; } -multiclass VOP2_Real_e32e64_si op> : VOP2_Real_e32_si { - def _e64_si : - VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3e_si <{1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; -} - -multiclass VOP2be_Real_e32e64_si op> : VOP2_Real_e32_si { - def _e64_si : - VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3be_si <{1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; -} - -} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" - -defm V_CNDMASK_B32 : VOP2_Real_e32e64_si <0x0>; -defm V_ADD_F32 : VOP2_Real_e32e64_si <0x3>; -defm V_SUB_F32 : VOP2_Real_e32e64_si <0x4>; -defm V_SUBREV_F32 : VOP2_Real_e32e64_si <0x5>; -defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_si <0x7>; -defm V_MUL_F32 : VOP2_Real_e32e64_si <0x8>; -defm V_MUL_I32_I24 : VOP2_Real_e32e64_si <0x9>; -defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_si <0xa>; -defm V_MUL_U32_U24 : VOP2_Real_e32e64_si <0xb>; -defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_si <0xc>; -defm V_MIN_F32 : VOP2_Real_e32e64_si <0xf>; -defm V_MAX_F32 : VOP2_Real_e32e64_si <0x10>; -defm V_MIN_I32 : VOP2_Real_e32e64_si <0x11>; -defm V_MAX_I32 : VOP2_Real_e32e64_si <0x12>; -defm V_MIN_U32 : VOP2_Real_e32e64_si <0x13>; -defm V_MAX_U32 : VOP2_Real_e32e64_si <0x14>; -defm V_LSHRREV_B32 : VOP2_Real_e32e64_si <0x16>; -defm V_ASHRREV_I32 : VOP2_Real_e32e64_si <0x18>; -defm V_LSHLREV_B32 : VOP2_Real_e32e64_si <0x1a>; -defm V_AND_B32 : VOP2_Real_e32e64_si <0x1b>; -defm V_OR_B32 : VOP2_Real_e32e64_si <0x1c>; -defm V_XOR_B32 : VOP2_Real_e32e64_si <0x1d>; -defm V_MAC_F32 : VOP2_Real_e32e64_si <0x1f>; -defm V_MADMK_F32 : VOP2_Real_MADK_si <0x20>; -defm V_MADAK_F32 : VOP2_Real_MADK_si <0x21>; -defm V_ADD_I32 : VOP2be_Real_e32e64_si <0x25>; -defm V_SUB_I32 : VOP2be_Real_e32e64_si <0x26>; -defm V_SUBREV_I32 : VOP2be_Real_e32e64_si <0x27>; -defm V_ADDC_U32 : VOP2be_Real_e32e64_si <0x28>; -defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>; -defm V_SUBBREV_U32 : 
VOP2be_Real_e32e64_si <0x2a>; - -defm V_READLANE_B32 : VOP2_Real_si <0x01>; - -let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { -defm V_WRITELANE_B32 : VOP2_Real_si <0x02>; -} - -defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>; -defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>; -defm V_MAX_LEGACY_F32 : VOP2_Real_e32e64_si <0xe>; -defm V_LSHR_B32 : VOP2_Real_e32e64_si <0x15>; -defm V_ASHR_I32 : VOP2_Real_e32e64_si <0x17>; -defm V_LSHL_B32 : VOP2_Real_e32e64_si <0x19>; - -defm V_BFM_B32 : VOP2_Real_e32e64_si <0x1e>; -defm V_BCNT_U32_B32 : VOP2_Real_e32e64_si <0x22>; -defm V_MBCNT_LO_U32_B32 : VOP2_Real_e32e64_si <0x23>; -defm V_MBCNT_HI_U32_B32 : VOP2_Real_e32e64_si <0x24>; -defm V_LDEXP_F32 : VOP2_Real_e32e64_si <0x2b>; -defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>; -defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>; -defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>; -defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e32e64_si <0x2f>; -defm V_CVT_PK_U16_U32 : VOP2_Real_e32e64_si <0x30>; -defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>; +//===----------------------------------------------------------------------===// +// GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + //===------------------------------- VOP2 -------------------------------===// + multiclass VOP2Only_Real_MADK_gfx10 op> { + def _gfx10 : + VOP2_Real(NAME), SIEncodingFamily.GFX10>, + VOP2_MADKe(NAME).Pfl>; + } + multiclass VOP2Only_Real_MADK_gfx10_with_name op, string opName, + string asmName> { + def _gfx10 : + VOP2_Real(opName), SIEncodingFamily.GFX10>, + VOP2_MADKe(opName).Pfl> { + VOP2_Pseudo ps = !cast(opName); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass VOP2_Real_e32_gfx10 op> { + def _e32_gfx10 : + VOP2_Real(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOP2e(NAME#"_e32").Pfl>; + } + multiclass VOP2_Real_e64_gfx10 op> { + def _e64_gfx10 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; + } + multiclass VOP2_Real_sdwa_gfx10 op> { + def _sdwa_gfx10 : + VOP_SDWA10_Real(NAME#"_sdwa")>, + VOP2_SDWA9Ae(NAME#"_sdwa").Pfl> { + let DecoderNamespace = "SDWA10"; + } + } + multiclass VOP2_Real_dpp_gfx10 op> { + def _dpp_gfx10 : VOP2_DPP16(NAME#"_e32")> { + let DecoderNamespace = "SDWA10"; + } + } + multiclass VOP2_Real_dpp8_gfx10 op> { + def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")> { + let DecoderNamespace = "DPP8"; + } + } + + //===------------------------- VOP2 (with name) -------------------------===// + multiclass VOP2_Real_e32_gfx10_with_name op, string opName, + string asmName> { + def _e32_gfx10 : + VOP2_Real(opName#"_e32"), SIEncodingFamily.GFX10>, + VOP2e(opName#"_e32").Pfl> { + VOP2_Pseudo ps = !cast(opName#"_e32"); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass VOP2_Real_e64_gfx10_with_name op, string opName, + string asmName> { + def _e64_gfx10 : + VOP3_Real(opName#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, + !cast(opName#"_e64").Pfl> { + VOP3_Pseudo ps = !cast(opName#"_e64"); + let AsmString = asmName # ps.AsmOperands; + } + } + let DecoderNamespace = "SDWA10" in { + multiclass VOP2_Real_sdwa_gfx10_with_name op, string opName, + string asmName> { + def _sdwa_gfx10 : + VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo ps = !cast(opName#"_sdwa"); + let AsmString = asmName # ps.AsmOperands; + } + } + 
multiclass VOP2_Real_dpp_gfx10_with_name op, string opName, + string asmName> { + def _dpp_gfx10 : VOP2_DPP16(opName#"_e32")> { + VOP2_Pseudo ps = !cast(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP16; + } + } + multiclass VOP2_Real_dpp8_gfx10_with_name op, string opName, + string asmName> { + def _dpp8_gfx10 : VOP2_DPP8(opName#"_e32")> { + VOP2_Pseudo ps = !cast(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8"; + } + } + } // End DecoderNamespace = "SDWA10" + + //===------------------------------ VOP2be ------------------------------===// + multiclass VOP2be_Real_gfx10 op, string opName, string asmName> { + def _e32_gfx10 : + VOP2_Real(opName#"_e32"), SIEncodingFamily.GFX10>, + VOP2e(opName#"_e32").Pfl> { + VOP2_Pseudo Ps = !cast(opName#"_e32"); + let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); + } + def _e64_gfx10 : + VOP3_Real(opName#"_e64"), SIEncodingFamily.GFX10>, + VOP3be_gfx10<{0, 1, 0, 0, op{5-0}}, + !cast(opName#"_e64").Pfl> { + VOP3_Pseudo Ps = !cast(opName#"_e64"); + let AsmString = asmName # Ps.AsmOperands; + } + def _sdwa_gfx10 : + VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); + let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); + let DecoderNamespace = "SDWA10"; + } + def _dpp_gfx10 : + VOP2_DPP16(opName#"_e32"), asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst(", vcc", "", AsmDPP); + let DecoderNamespace = "SDWA10"; + } + def _dpp8_gfx10 : + VOP2_DPP8(opName#"_e32"), asmName> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst(", vcc", "", AsmDPP8); + let DecoderNamespace = "DPP8"; + } + + let WaveSizePredicate = isWave32 in { + def _sdwa_w32_gfx10 : + Base_VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); + let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands); + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + } + def _dpp_w32_gfx10 : + VOP2_DPP16(opName#"_e32"), asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); + let isAsmParserOnly = 1; + } + def _dpp8_w32_gfx10 : + VOP2_DPP8(opName#"_e32"), asmName> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); + let isAsmParserOnly = 1; + } + } // End WaveSizePredicate = isWave32 + + let WaveSizePredicate = isWave64 in { + def _sdwa_w64_gfx10 : + Base_VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); + let AsmString = asmName # Ps.AsmOperands; + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + } + def _dpp_w64_gfx10 : + VOP2_DPP16(opName#"_e32"), asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # AsmDPP; + let isAsmParserOnly = 1; + } + def _dpp8_w64_gfx10 : + VOP2_DPP8(opName#"_e32"), asmName> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # AsmDPP8; + let isAsmParserOnly = 1; + } + } // End WaveSizePredicate = isWave64 + } + //===----------------------------- VOP3Only -----------------------------===// + multiclass VOP3Only_Real_gfx10 op> { + def _e64_gfx10 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10(NAME#"_e64").Pfl>; + } + + //===---------------------------- VOP3beOnly 
----------------------------===// + multiclass VOP3beOnly_Real_gfx10 op, string opName, string asmName> { + def _e64_gfx10 : + VOP3_Real(opName#"_e64"), SIEncodingFamily.GFX10>, + VOP3be_gfx10(opName#"_e64").Pfl> { + VOP3_Pseudo Ps = !cast(opName#"_e64"); + let AsmString = asmName # Ps.AsmOperands; + } + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +multiclass Base_VOP2_Real_gfx10 op> : + VOP2_Real_e32_gfx10, VOP2_Real_e64_gfx10; + +multiclass VOP2_Real_gfx10 op> : + VOP2_Real_e32_gfx10, VOP2_Real_e64_gfx10, + VOP2_Real_sdwa_gfx10, VOP2_Real_dpp_gfx10, VOP2_Real_dpp8_gfx10; + +multiclass VOP2_Real_gfx10_with_name op, string opName, + string asmName> : + VOP2_Real_e32_gfx10_with_name, + VOP2_Real_e64_gfx10_with_name, + VOP2_Real_sdwa_gfx10_with_name, + VOP2_Real_dpp_gfx10_with_name, + VOP2_Real_dpp8_gfx10_with_name; + +defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>; +defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>; +defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>; +defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>; +defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>; +defm V_ADD_F16 : VOP2_Real_gfx10<0x032>; +defm V_SUB_F16 : VOP2_Real_gfx10<0x033>; +defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>; +defm V_MUL_F16 : VOP2_Real_gfx10<0x035>; +defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>; +defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>; +defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>; +defm V_MAX_F16 : VOP2_Real_gfx10<0x039>; +defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; +defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; +defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; + +// VOP2 no carry-in, carry-out. +defm V_ADD_NC_U32 : + VOP2_Real_gfx10_with_name<0x025, "V_ADD_U32", "v_add_nc_u32">; +defm V_SUB_NC_U32 : + VOP2_Real_gfx10_with_name<0x026, "V_SUB_U32", "v_sub_nc_u32">; +defm V_SUBREV_NC_U32 : + VOP2_Real_gfx10_with_name<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">; + +// VOP2 carry-in, carry-out. +defm V_ADD_CO_CI_U32 : + VOP2be_Real_gfx10<0x028, "V_ADDC_U32", "v_add_co_ci_u32">; +defm V_SUB_CO_CI_U32 : + VOP2be_Real_gfx10<0x029, "V_SUBB_U32", "v_sub_co_ci_u32">; +defm V_SUBREV_CO_CI_U32 : + VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; + +// VOP3 only. +defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>; +defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>; +defm V_MBCNT_LO_U32_B32 : VOP3Only_Real_gfx10<0x365>; +defm V_MBCNT_HI_U32_B32 : VOP3Only_Real_gfx10<0x366>; +defm V_LDEXP_F32 : VOP3Only_Real_gfx10<0x362>; +defm V_CVT_PKNORM_I16_F32 : VOP3Only_Real_gfx10<0x368>; +defm V_CVT_PKNORM_U16_F32 : VOP3Only_Real_gfx10<0x369>; +defm V_CVT_PK_U16_U32 : VOP3Only_Real_gfx10<0x36a>; +defm V_CVT_PK_I16_I32 : VOP3Only_Real_gfx10<0x36b>; + +// VOP3 carry-in, carry-out. +defm V_ADD_CO_U32 : + VOP3beOnly_Real_gfx10<0x30f, "V_ADD_I32", "v_add_co_u32">; +defm V_SUB_CO_U32 : + VOP3beOnly_Real_gfx10<0x310, "V_SUB_I32", "v_sub_co_u32">; +defm V_SUBREV_CO_U32 : + VOP3beOnly_Real_gfx10<0x319, "V_SUBREV_I32", "v_subrev_co_u32">; + +let SubtargetPredicate = isGFX10Plus in { + defm : VOP2eInstAliases; + + defm : VOP2bInstAliases< + V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx10, "v_add_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx10, "v_sub_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx10, "v_subrev_co_ci_u32">; +} // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// -// VI +// GFX6, GFX7, GFX10. 
//===----------------------------------------------------------------------===// class VOP2_DPPe op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : @@ -809,7 +1181,111 @@ class VOP2_DPPe op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : let Inst{31} = 0x0; //encoding } -let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass VOP2Only_Real_gfx6_gfx7 op> { + def _gfx6_gfx7 : + VOP2_Real(NAME), SIEncodingFamily.SI>, + VOP2e(NAME).Pfl>; + } + multiclass VOP2Only_Real_MADK_gfx6_gfx7 op> { + def _gfx6_gfx7 : + VOP2_Real(NAME), SIEncodingFamily.SI>, + VOP2_MADKe(NAME).Pfl>; + } + multiclass VOP2_Real_e32_gfx6_gfx7 op> { + def _e32_gfx6_gfx7 : + VOP2_Real(NAME#"_e32"), SIEncodingFamily.SI>, + VOP2e(NAME#"_e32").Pfl>; + } + multiclass VOP2_Real_e64_gfx6_gfx7 op> { + def _e64_gfx6_gfx7 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; + } + multiclass VOP2be_Real_e64_gfx6_gfx7 op> { + def _e64_gfx6_gfx7 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; + } +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass VOP2Only_Real_MADK_gfx6_gfx7_gfx10 op> : + VOP2Only_Real_MADK_gfx6_gfx7, VOP2Only_Real_MADK_gfx10; + +multiclass VOP2_Real_gfx6_gfx7 op> : + VOP2_Real_e32_gfx6_gfx7, VOP2_Real_e64_gfx6_gfx7; + +multiclass VOP2_Real_gfx6_gfx7_gfx10 op> : + VOP2_Real_gfx6_gfx7, VOP2_Real_gfx10; + +multiclass VOP2be_Real_gfx6_gfx7 op> : + VOP2_Real_e32_gfx6_gfx7, VOP2be_Real_e64_gfx6_gfx7; + +defm V_CNDMASK_B32 : VOP2_Real_gfx6_gfx7<0x000>; +defm V_MIN_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00d>; +defm V_MAX_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00e>; +defm V_LSHR_B32 : VOP2_Real_gfx6_gfx7<0x015>; +defm V_ASHR_I32 : VOP2_Real_gfx6_gfx7<0x017>; +defm V_LSHL_B32 : VOP2_Real_gfx6_gfx7<0x019>; +defm V_BFM_B32 : VOP2_Real_gfx6_gfx7<0x01e>; +defm V_BCNT_U32_B32 : VOP2_Real_gfx6_gfx7<0x022>; +defm V_MBCNT_LO_U32_B32 : VOP2_Real_gfx6_gfx7<0x023>; +defm V_MBCNT_HI_U32_B32 : VOP2_Real_gfx6_gfx7<0x024>; +defm V_LDEXP_F32 : VOP2_Real_gfx6_gfx7<0x02b>; +defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_gfx6_gfx7<0x02c>; +defm V_CVT_PKNORM_I16_F32 : VOP2_Real_gfx6_gfx7<0x02d>; +defm V_CVT_PKNORM_U16_F32 : VOP2_Real_gfx6_gfx7<0x02e>; +defm V_CVT_PK_U16_U32 : VOP2_Real_gfx6_gfx7<0x030>; +defm V_CVT_PK_I16_I32 : VOP2_Real_gfx6_gfx7<0x031>; +defm V_ADD_I32 : VOP2be_Real_gfx6_gfx7<0x025>; +defm V_SUB_I32 : VOP2be_Real_gfx6_gfx7<0x026>; +defm V_SUBREV_I32 : VOP2be_Real_gfx6_gfx7<0x027>; +defm V_ADDC_U32 : VOP2be_Real_gfx6_gfx7<0x028>; +defm V_SUBB_U32 : VOP2be_Real_gfx6_gfx7<0x029>; +defm V_SUBBREV_U32 : VOP2be_Real_gfx6_gfx7<0x02a>; + +defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>; + +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { + defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>; +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) + +let SubtargetPredicate = isGFX6GFX7 in { + defm : VOP2eInstAliases; +} // End SubtargetPredicate = isGFX6GFX7 + +defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>; +defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>; +defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>; +defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>; +defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>; +defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>; +defm V_MUL_I32_I24 : 
VOP2_Real_gfx6_gfx7_gfx10<0x009>; +defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x00a>; +defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00b>; +defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00c>; +defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x00f>; +defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x010>; +defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x011>; +defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x012>; +defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x013>; +defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x014>; +defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>; +defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>; +defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>; +defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01b>; +defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01c>; +defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01d>; +defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>; +defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>; +defm V_MADAK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x021>; + +//===----------------------------------------------------------------------===// +// GFX8, GFX9 (VI). +//===----------------------------------------------------------------------===// + +let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in { multiclass VOP2_Real_MADK_vi op> { def _vi : VOP2_Real(NAME), SIEncodingFamily.VI>, @@ -843,7 +1319,7 @@ multiclass Base_VOP2_Real_e32e64_vi op> : VOP2_Real_e32_vi, VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>; -} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" +} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" multiclass VOP2_SDWA_Real op> { def _sdwa_vi : @@ -857,7 +1333,7 @@ multiclass VOP2_SDWA9_Real op> { VOP2_SDWA9Ae (NAME#"_sdwa").Pfl>; } -let AssemblerPredicates = [isVIOnly] in { +let AssemblerPredicates = [isGFX8Only] in { multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName> { def _e32_vi : @@ -865,14 +1341,14 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName VOP2e(OpName#"_e32").Pfl> { VOP2_Pseudo ps = !cast(OpName#"_e32"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "VI"; + let DecoderNamespace = "GFX8"; } def _e64_vi : VOP3_Real(OpName#"_e64"), SIEncodingFamily.VI>, VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast(OpName#"_e64").Pfl> { VOP3_Pseudo ps = !cast(OpName#"_e64"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "VI"; + let DecoderNamespace = "GFX8"; } def _sdwa_vi : VOP_SDWA_Real (OpName#"_sdwa")>, @@ -890,7 +1366,7 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName } } -let AssemblerPredicates = [isGFX9] in { +let AssemblerPredicates = [isGFX9Only] in { multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { def _e32_gfx9 : @@ -946,7 +1422,7 @@ multiclass VOP2_Real_e32e64_gfx9 op> { } } -} // AssemblerPredicates = [isGFX9] +} // AssemblerPredicates = [isGFX9Only] multiclass VOP2_Real_e32e64_vi op> : Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { @@ -1035,7 +1511,7 @@ defm V_MIN_U16 : VOP2_Real_e32e64_vi <0x31>; defm V_MIN_I16 : VOP2_Real_e32e64_vi <0x32>; defm V_LDEXP_F16 : VOP2_Real_e32e64_vi <0x33>; -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8GFX9 in { // Aliases to simplify matching of floating-point instructions that // are VOP2 on SI and VOP3 on VI. 
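For readers decoding the braced template arguments in these real-encoding multiclasses: initializers such as {1, 1, op{6-0}} and {1, 0, 0, op{5-0}} are TableGen bit concatenations that relocate a VOP1 or VOP2 opcode into the shared 9-bit VOP3 opcode field for the _e64 forms on GFX6/GFX7. A small standalone C++ sketch of the same arithmetic (function names are illustrative only):

// Hedged sketch of the e32 -> e64 (VOP3) opcode mapping on GFX6/GFX7.
#include <cassert>
#include <cstdio>

unsigned vop1ToVOP3(unsigned op) {   // {1, 1, op{6-0}}
  return 0x180 | (op & 0x7f);        // 0b11 prefix on a 7-bit VOP1 opcode
}

unsigned vop2ToVOP3(unsigned op) {   // {1, 0, 0, op{5-0}}
  return 0x100 | (op & 0x3f);        // 0b100 prefix on a 6-bit VOP2 opcode
}

int main() {
  assert(vop1ToVOP3(0x1) == 0x181);  // v_mov_b32 (VOP1 0x001 above)
  assert(vop2ToVOP3(0x3) == 0x103);  // v_add_f32 (VOP2 0x003 above)
  std::printf("vop1 e64: 0x%x, vop2 e64: 0x%x\n",
              vop1ToVOP3(0x1), vop2ToVOP3(0x3));
}

So v_mov_b32_e64 assembles at VOP3 opcode 0x181 and v_add_f32_e64 at 0x103 on these targets, which is exactly what the defm opcode lists in this section encode.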
@@ -1055,7 +1531,20 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; -} // End SubtargetPredicate = isVI +defm : VOP2eInstAliases; + +} // End SubtargetPredicate = isGFX8GFX9 + +let SubtargetPredicate = isGFX9Only in { + +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; +defm : VOP2bInstAliases; + +} // End SubtargetPredicate = isGFX9Only let SubtargetPredicate = HasDLInsts in { @@ -1063,3 +1552,35 @@ defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>; defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>; } // End SubtargetPredicate = HasDLInsts + +multiclass VOP2_Real_DOT_ACC_gfx9 op> : VOP2_Real_e32_vi { + def _dpp : VOP2_DPP(NAME#"_e32")>; +} + +multiclass VOP2_Real_DOT_ACC_gfx10 op> : + VOP2_Real_e32_gfx10, + VOP2_Real_dpp_gfx10, + VOP2_Real_dpp8_gfx10; + +let SubtargetPredicate = HasDot5Insts in { + defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx9<0x37>; + // NB: Opcode conflicts with V_DOT8C_I32_I4 + // This opcode exists in gfx 10.1* only + defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx10<0x02>; +} + +let SubtargetPredicate = HasDot6Insts in { + defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx9<0x39>; + defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx10<0x0d>; +} + +let SubtargetPredicate = HasDot4Insts in { + defm V_DOT2C_I32_I16 : VOP2_Real_DOT_ACC_gfx9<0x38>; +} +let SubtargetPredicate = HasDot3Insts in { + defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx9<0x3a>; +} + +let SubtargetPredicate = HasPkFmacF16Inst in { +defm V_PK_FMAC_F16 : VOP2_Real_e32_vi<0x3c>; +} // End SubtargetPredicate = HasPkFmacF16Inst diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 4b8c1f208a0e..21dbef9240e1 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -1,9 +1,8 @@ //===-- VOP3Instructions.td - Vector Instruction Defintions ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -111,6 +110,11 @@ class getVOP3ClampPat { ret1)); } +class getVOP3MAIPat { + list ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, + imm:$cbsz, imm:$abid, imm:$blgp))]; +} + class VOP3Inst : VOP3_Pseudo.ret, !if(P.HasIntClamp, getVOP3ClampPat.ret, - getVOP3Pat.ret))), + !if (P.IsMAI, + getVOP3MAIPat.ret, + getVOP3Pat.ret)))), VOP3Only, 0, P.HasOpSel> { let IntClamp = P.HasIntClamp; @@ -144,33 +150,27 @@ def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> { } } -class getVOP3VCC { - list ret = - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), - (i1 VCC)))]; -} - -class VOP3Features { +class VOP3Features { bit HasClamp = Clamp; bit HasOpSel = OpSel; bit IsPacked = Packed; + bit IsMAI = MAI; } -def VOP3_REGULAR : VOP3Features<0, 0, 0>; -def VOP3_CLAMP : VOP3Features<1, 0, 0>; -def VOP3_OPSEL : VOP3Features<1, 1, 0>; -def VOP3_PACKED : VOP3Features<1, 1, 1>; +def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>; +def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>; +def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>; +def VOP3_PACKED : VOP3Features<1, 1, 1, 0>; +def VOP3_MAI : VOP3Features<0, 0, 0, 1>; class VOP3_Profile : VOPProfile { let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); + let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); - let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers); + let HasModifiers = !if(Features.IsPacked, !if(Features.IsMAI, 0, 1), P.HasModifiers); // FIXME: Hack to stop printing _e64 let Outs64 = (outs DstRC.RegClass:$vdst); @@ -191,8 +191,9 @@ class VOP3_Profile : VOPProf class VOP3b_Profile : VOPProfile<[vt, vt, vt, vt]> { // v_div_scale_{f32|f64} do not support input modifiers. 
let HasModifiers = 0; + let HasClamp = 0; let HasOMod = 0; - let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; } @@ -212,7 +213,7 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { // FIXME: Hack to stop printing _e64 let DstRC = RegisterOperand; - let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp"; } @@ -303,7 +304,7 @@ def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_li } // End SchedRW = [WriteDoubleAdd] let SchedRW = [WriteQuarterRate32] in { -def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile>; +def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile, mul>; def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile, mulhu>; def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile>; def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile, mulhs>; @@ -315,8 +316,7 @@ let Uses = [VCC, EXEC] in { // if (vcc) // result *= 2^32 // -def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, - getVOP3VCC.ret> { +def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, []> { let SchedRW = [WriteFloatFMA]; } // v_div_fmas_f64: @@ -324,8 +324,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, // if (vcc) // result *= 2^64 // -def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, - getVOP3VCC.ret> { +def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []> { let SchedRW = [WriteDouble]; let FPDPRounding = 1; } @@ -386,22 +385,21 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile>, shl>; def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile>, srl>; def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile>, sra>; def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile>; -} // End SubtargetPredicate = isSICI, Predicates = [isSICI] +} // End SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] -let SubtargetPredicate = isVI in { -def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile>; -def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile>; -def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile>; -} // End SubtargetPredicate = isVI +let SubtargetPredicate = isGFX8Plus in { +def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile, lshl_rev>; +def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile, lshr_rev>; +def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile, ashr_rev>; +} // End SubtargetPredicate = isGFX8Plus } // End SchedRW = [Write64Bit] -let Predicates = [isVI] in { +let Predicates = [isGFX8Plus] in { def : GCNPat < (getDivergentFrag.ret i64:$x, i32:$y), (V_LSHLREV_B64 $y, $x) @@ -417,7 +415,13 @@ def : AMDGPUPat < } -let SubtargetPredicate = isCIVI in { +let SchedRW = [Write32Bit] in { +let SubtargetPredicate = isGFX8Plus in { +def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile, AMDGPUperm>; +} // End SubtargetPredicate = isGFX8Plus +} // End SchedRW = [Write32Bit] + +let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; @@ -431,27 +435,27 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End SchedRW = [WriteDouble, WriteSALU] } // End isCommutable = 1 -} // End SubtargetPredicate = isCIVI +} // 
End SubtargetPredicate = isGFX7Plus def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile, AMDGPUdiv_fixup> { - let Predicates = [Has16BitInsts, isVIOnly]; + let Predicates = [Has16BitInsts, isGFX8Only]; let FPDPRounding = 1; } def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile, AMDGPUdiv_fixup> { let renamedInGFX9 = 1; - let Predicates = [Has16BitInsts, isGFX9]; + let Predicates = [Has16BitInsts, isGFX9Plus]; let FPDPRounding = 1; } def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma> { - let Predicates = [Has16BitInsts, isVIOnly]; + let Predicates = [Has16BitInsts, isGFX8Only]; let FPDPRounding = 1; } def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile, fma> { let renamedInGFX9 = 1; - let Predicates = [Has16BitInsts, isGFX9]; + let Predicates = [Has16BitInsts, isGFX9Plus]; let FPDPRounding = 1; } @@ -463,36 +467,58 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile, fmad>; let Uses = [M0, EXEC] in { -def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>; +def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>, + [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan), + (i32 imm:$attr), + (i32 imm:$src0_modifiers), + (f32 VRegSrc_32:$src2), + (i32 imm:$src2_modifiers), + (i1 imm:$high), + (i1 imm:$clamp)))]>; } // End Uses = [M0, EXEC] } // End FPDPRounding = 1 } // End renamedInGFX9 = 1 -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = isGFX9Only in { def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile> { let FPDPRounding = 1; } +} // End SubtargetPredicate = isGFX9Only + +let SubtargetPredicate = isGFX9Plus in { def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile>; def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile>; def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; -} // End SubtargetPredicate = isGFX9 +} // End SubtargetPredicate = isGFX9Plus let Uses = [M0, EXEC], FPDPRounding = 1 in { -def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>; -def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; +def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>, + [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan), + (i32 imm:$attr), + (i32 imm:$src0_modifiers), + (i1 imm:$high), + (i1 imm:$clamp), + (i32 imm:$omod)))]>; +def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>, + [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan), + (i32 imm:$attr), + (i32 imm:$src0_modifiers), + (f32 VRegSrc_32:$src2), + (i32 imm:$src2_modifiers), + (i1 imm:$high), + (i1 imm:$clamp), + (i32 imm:$omod)))]>; } // End Uses = [M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8GFX9 in { def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>; def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>; def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>; +} // End SubtargetPredicate = isGFX8GFX9 -def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile, AMDGPUperm>; -} // End SubtargetPredicate = isVI - -let Predicates = [Has16BitInsts] in { +let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in { multiclass Ternary_i16_Pats { @@ -506,7 
+532,23 @@ def : GCNPat < defm: Ternary_i16_Pats; defm: Ternary_i16_Pats; -} // End Predicates = [Has16BitInsts] +} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] + +let Predicates = [Has16BitInsts, isGFX10Plus] in { + +multiclass Ternary_i16_Pats_gfx9 { +def : GCNPat < + (op2 (op1 i16:$src0, i16:$src1), i16:$src2), + (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) +>; + +} + +defm: Ternary_i16_Pats_gfx9; +defm: Ternary_i16_Pats_gfx9; + +} // End Predicates = [Has16BitInsts, isGFX10Plus] class ThreeOpFrag : PatFrag< (ops node:$x, node:$y, node:$z), @@ -528,7 +570,9 @@ class ThreeOpFrag : PatFrag< if (!Operands[i]->isDivergent() && !isInlineImmediate(Operands[i].getNode())) { ConstantBusUses++; - if (ConstantBusUses >= 2) + // This uses AMDGPU::V_ADD3_U32, but all three operand instructions + // have the same constant bus limit. + if (ConstantBusUses > Subtarget->getConstantBusLimit(AMDGPU::V_ADD3_U32)) return false; } } @@ -539,7 +583,7 @@ class ThreeOpFrag : PatFrag< let PredicateCodeUsesOperands = 1; } -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = isGFX9Plus in { def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile>; def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile>; def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile>; @@ -589,7 +633,38 @@ def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; -} // End SubtargetPredicate = isGFX9 +} // End SubtargetPredicate = isGFX9Plus + +def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { + let Src0RC64 = VRegSrc_32; + let Src1RC64 = SCSrc_b32; + let Src2RC64 = SCSrc_b32; + let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0, + IntOpSelMods:$src1_modifiers, SCSrc_b32:$src1, + IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2, + VGPR_32:$vdst_in, op_sel:$op_sel); + let HasClamp = 0; + let HasOMod = 0; +} + +let SubtargetPredicate = isGFX10Plus in { + def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile>; + def : ThreeOp_i32_Pats; + + let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + def V_PERMLANE16_B32 : VOP3Inst <"v_permlane16_b32", VOP3_PERMLANE_Profile>; + def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>; + } // End $vdst = $vdst_in, DisableEncoding $vdst_in + + def : GCNPat< + (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc), + (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) + >; + def : GCNPat< + (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc), + (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) + >; +} // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// // Integer Clamp Patterns @@ -631,111 +706,239 @@ def : IntClampPat; def : IntClampPat; def : IntClampPat; + //===----------------------------------------------------------------------===// -// Target +// Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// SI +// GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { - -multiclass VOP3_Real_si op> { - def _si : VOP3_Real(NAME), SIEncodingFamily.SI>, - VOP3e_si (NAME).Pfl>; -} - -multiclass VOP3be_Real_si op> { - def _si : VOP3_Real(NAME), SIEncodingFamily.SI>, - VOP3be_si (NAME).Pfl>; -} - -} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" - -defm V_MAD_LEGACY_F32 : VOP3_Real_si <0x140>; -defm V_MAD_F32 : VOP3_Real_si <0x141>; -defm V_MAD_I32_I24 : VOP3_Real_si <0x142>; -defm V_MAD_U32_U24 : VOP3_Real_si <0x143>; -defm V_CUBEID_F32 : VOP3_Real_si <0x144>; -defm V_CUBESC_F32 : VOP3_Real_si <0x145>; -defm V_CUBETC_F32 : VOP3_Real_si <0x146>; -defm V_CUBEMA_F32 : VOP3_Real_si <0x147>; -defm V_BFE_U32 : VOP3_Real_si <0x148>; -defm V_BFE_I32 : VOP3_Real_si <0x149>; -defm V_BFI_B32 : VOP3_Real_si <0x14a>; -defm V_FMA_F32 : VOP3_Real_si <0x14b>; -defm V_FMA_F64 : VOP3_Real_si <0x14c>; -defm V_LERP_U8 : VOP3_Real_si <0x14d>; -defm V_ALIGNBIT_B32 : VOP3_Real_si <0x14e>; -defm V_ALIGNBYTE_B32 : VOP3_Real_si <0x14f>; -defm V_MULLIT_F32 : VOP3_Real_si <0x150>; -defm V_MIN3_F32 : VOP3_Real_si <0x151>; -defm V_MIN3_I32 : VOP3_Real_si <0x152>; -defm V_MIN3_U32 : VOP3_Real_si <0x153>; -defm V_MAX3_F32 : VOP3_Real_si <0x154>; -defm V_MAX3_I32 : VOP3_Real_si <0x155>; -defm V_MAX3_U32 : VOP3_Real_si <0x156>; -defm V_MED3_F32 : VOP3_Real_si <0x157>; -defm V_MED3_I32 : VOP3_Real_si <0x158>; -defm V_MED3_U32 : VOP3_Real_si <0x159>; -defm V_SAD_U8 : VOP3_Real_si <0x15a>; -defm V_SAD_HI_U8 : VOP3_Real_si <0x15b>; -defm V_SAD_U16 : VOP3_Real_si <0x15c>; -defm V_SAD_U32 : VOP3_Real_si <0x15d>; -defm V_CVT_PK_U8_F32 : VOP3_Real_si <0x15e>; -defm V_DIV_FIXUP_F32 : VOP3_Real_si <0x15f>; -defm V_DIV_FIXUP_F64 : VOP3_Real_si <0x160>; -defm V_LSHL_B64 : VOP3_Real_si <0x161>; -defm V_LSHR_B64 : VOP3_Real_si <0x162>; -defm V_ASHR_I64 : VOP3_Real_si <0x163>; -defm V_ADD_F64 : VOP3_Real_si <0x164>; -defm V_MUL_F64 : VOP3_Real_si <0x165>; -defm V_MIN_F64 : VOP3_Real_si <0x166>; -defm V_MAX_F64 : VOP3_Real_si <0x167>; -defm V_LDEXP_F64 : VOP3_Real_si <0x168>; -defm V_MUL_LO_U32 : VOP3_Real_si <0x169>; -defm V_MUL_HI_U32 : VOP3_Real_si <0x16a>; -defm V_MUL_LO_I32 : VOP3_Real_si <0x16b>; -defm V_MUL_HI_I32 : VOP3_Real_si <0x16c>; -defm V_DIV_SCALE_F32 : VOP3be_Real_si <0x16d>; -defm V_DIV_SCALE_F64 : VOP3be_Real_si <0x16e>; -defm V_DIV_FMAS_F32 : VOP3_Real_si <0x16f>; -defm V_DIV_FMAS_F64 : VOP3_Real_si <0x170>; -defm V_MSAD_U8 : VOP3_Real_si <0x171>; -defm V_MQSAD_PK_U16_U8 : VOP3_Real_si <0x173>; -defm V_TRIG_PREOP_F64 : VOP3_Real_si <0x174>; +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass VOP3_Real_gfx10 op> { + def _gfx10 : + VOP3_Real(NAME), SIEncodingFamily.GFX10>, + VOP3e_gfx10(NAME).Pfl>; + } + multiclass VOP3_Real_gfx10_with_name op, string opName, + string asmName> { + def _gfx10 : + VOP3_Real(opName), SIEncodingFamily.GFX10>, + VOP3e_gfx10(opName).Pfl> { + VOP3_Pseudo ps = !cast(opName); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass VOP3be_Real_gfx10 op> { + def _gfx10 : + VOP3_Real(NAME), SIEncodingFamily.GFX10>, + VOP3be_gfx10(NAME).Pfl>; + } + multiclass VOP3Interp_Real_gfx10 op> { + def _gfx10 : + VOP3_Real(NAME), SIEncodingFamily.GFX10>, + VOP3Interp_gfx10(NAME).Pfl>; + } + multiclass VOP3OpSel_Real_gfx10 op> { + def _gfx10 : + VOP3_Real(NAME), SIEncodingFamily.GFX10>, + VOP3OpSel_gfx10(NAME).Pfl>; + } + multiclass VOP3OpSel_Real_gfx10_with_name op, string 
opName, + string asmName> { + def _gfx10 : + VOP3_Real(opName), SIEncodingFamily.GFX10>, + VOP3OpSel_gfx10(opName).Pfl> { + VOP3_Pseudo ps = !cast(opName); + let AsmString = asmName # ps.AsmOperands; + } + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +defm V_READLANE_B32 : VOP3_Real_gfx10<0x360>; + +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { + defm V_WRITELANE_B32 : VOP3_Real_gfx10<0x361>; +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) + +defm V_XOR3_B32 : VOP3_Real_gfx10<0x178>; +defm V_LSHLREV_B64 : VOP3_Real_gfx10<0x2ff>; +defm V_LSHRREV_B64 : VOP3_Real_gfx10<0x300>; +defm V_ASHRREV_I64 : VOP3_Real_gfx10<0x301>; +defm V_PERM_B32 : VOP3_Real_gfx10<0x344>; +defm V_XAD_U32 : VOP3_Real_gfx10<0x345>; +defm V_LSHL_ADD_U32 : VOP3_Real_gfx10<0x346>; +defm V_ADD_LSHL_U32 : VOP3_Real_gfx10<0x347>; +defm V_ADD3_U32 : VOP3_Real_gfx10<0x36d>; +defm V_LSHL_OR_B32 : VOP3_Real_gfx10<0x36f>; +defm V_AND_OR_B32 : VOP3_Real_gfx10<0x371>; +defm V_OR3_B32 : VOP3_Real_gfx10<0x372>; + +// TODO-GFX10: add MC tests for v_add/sub_nc_i16 +defm V_ADD_NC_I16 : + VOP3OpSel_Real_gfx10_with_name<0x30d, "V_ADD_I16", "v_add_nc_i16">; +defm V_SUB_NC_I16 : + VOP3OpSel_Real_gfx10_with_name<0x30e, "V_SUB_I16", "v_sub_nc_i16">; +defm V_SUB_NC_I32 : + VOP3_Real_gfx10_with_name<0x376, "V_SUB_I32_gfx9", "v_sub_nc_i32">; +defm V_ADD_NC_I32 : + VOP3_Real_gfx10_with_name<0x37f, "V_ADD_I32_gfx9", "v_add_nc_i32">; + +defm V_INTERP_P1LL_F16 : VOP3Interp_Real_gfx10<0x342>; +defm V_INTERP_P1LV_F16 : VOP3Interp_Real_gfx10<0x343>; +defm V_INTERP_P2_F16 : VOP3Interp_Real_gfx10<0x35a>; + +defm V_PACK_B32_F16 : VOP3OpSel_Real_gfx10<0x311>; +defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx10<0x312>; +defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx10<0x313>; + +defm V_MIN3_F16 : VOP3OpSel_Real_gfx10<0x351>; +defm V_MIN3_I16 : VOP3OpSel_Real_gfx10<0x352>; +defm V_MIN3_U16 : VOP3OpSel_Real_gfx10<0x353>; +defm V_MAX3_F16 : VOP3OpSel_Real_gfx10<0x354>; +defm V_MAX3_I16 : VOP3OpSel_Real_gfx10<0x355>; +defm V_MAX3_U16 : VOP3OpSel_Real_gfx10<0x356>; +defm V_MED3_F16 : VOP3OpSel_Real_gfx10<0x357>; +defm V_MED3_I16 : VOP3OpSel_Real_gfx10<0x358>; +defm V_MED3_U16 : VOP3OpSel_Real_gfx10<0x359>; +defm V_MAD_U32_U16 : VOP3OpSel_Real_gfx10<0x373>; +defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx10<0x375>; + +defm V_MAD_U16 : + VOP3OpSel_Real_gfx10_with_name<0x340, "V_MAD_U16_gfx9", "v_mad_u16">; +defm V_FMA_F16 : + VOP3OpSel_Real_gfx10_with_name<0x34b, "V_FMA_F16_gfx9", "v_fma_f16">; +defm V_MAD_I16 : + VOP3OpSel_Real_gfx10_with_name<0x35e, "V_MAD_I16_gfx9", "v_mad_i16">; +defm V_DIV_FIXUP_F16 : + VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">; + +// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these +// (they do not support SDWA or DPP). 
+defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16_e64", "v_add_nc_u16">; +defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16_e64", "v_sub_nc_u16">; +defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16_e64", "v_mul_lo_u16">; +defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16_e64", "v_lshrrev_b16">; +defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16_e64", "v_ashrrev_i16">; +defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16_e64", "v_max_u16">; +defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16_e64", "v_max_i16">; +defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16_e64", "v_min_u16">; +defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16_e64", "v_min_i16">; +defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_e64", "v_lshlrev_b16">; +defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>; +defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>; //===----------------------------------------------------------------------===// -// CI +// GFX7, GFX10. //===----------------------------------------------------------------------===// -multiclass VOP3_Real_ci op> { - def _ci : VOP3_Real(NAME), SIEncodingFamily.SI>, - VOP3e_si (NAME).Pfl> { - let AssemblerPredicates = [isCIOnly]; - let DecoderNamespace = "CI"; +let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { + multiclass VOP3_Real_gfx7 op> { + def _gfx7 : + VOP3_Real(NAME), SIEncodingFamily.SI>, + VOP3e_gfx6_gfx7(NAME).Pfl>; } -} - -multiclass VOP3be_Real_ci op> { - def _ci : VOP3_Real(NAME), SIEncodingFamily.SI>, - VOP3be_si (NAME).Pfl> { - let AssemblerPredicates = [isCIOnly]; - let DecoderNamespace = "CI"; + multiclass VOP3be_Real_gfx7 op> { + def _gfx7 : + VOP3_Real(NAME), SIEncodingFamily.SI>, + VOP3be_gfx6_gfx7(NAME).Pfl>; } -} +} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" + +multiclass VOP3_Real_gfx7_gfx10 op> : + VOP3_Real_gfx7, VOP3_Real_gfx10; + +multiclass VOP3be_Real_gfx7_gfx10 op> : + VOP3be_Real_gfx7, VOP3be_Real_gfx10; + +defm V_QSAD_PK_U16_U8 : VOP3_Real_gfx7_gfx10<0x172>; +defm V_MQSAD_U32_U8 : VOP3_Real_gfx7_gfx10<0x175>; +defm V_MAD_U64_U32 : VOP3be_Real_gfx7_gfx10<0x176>; +defm V_MAD_I64_I32 : VOP3be_Real_gfx7_gfx10<0x177>; -defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>; -defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>; -defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>; -defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>; +//===----------------------------------------------------------------------===// +// GFX6, GFX7, GFX10. 
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass VOP3_Real_gfx6_gfx7 op> { + def _gfx6_gfx7 : + VOP3_Real(NAME), SIEncodingFamily.SI>, + VOP3e_gfx6_gfx7(NAME).Pfl>; + } + multiclass VOP3be_Real_gfx6_gfx7 op> { + def _gfx6_gfx7 : + VOP3_Real(NAME), SIEncodingFamily.SI>, + VOP3be_gfx6_gfx7(NAME).Pfl>; + } +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass VOP3_Real_gfx6_gfx7_gfx10 op> : + VOP3_Real_gfx6_gfx7, VOP3_Real_gfx10; + +multiclass VOP3be_Real_gfx6_gfx7_gfx10 op> : + VOP3be_Real_gfx6_gfx7, VOP3be_Real_gfx10; + +defm V_LSHL_B64 : VOP3_Real_gfx6_gfx7<0x161>; +defm V_LSHR_B64 : VOP3_Real_gfx6_gfx7<0x162>; +defm V_ASHR_I64 : VOP3_Real_gfx6_gfx7<0x163>; + +defm V_MAD_LEGACY_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x140>; +defm V_MAD_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x141>; +defm V_MAD_I32_I24 : VOP3_Real_gfx6_gfx7_gfx10<0x142>; +defm V_MAD_U32_U24 : VOP3_Real_gfx6_gfx7_gfx10<0x143>; +defm V_CUBEID_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x144>; +defm V_CUBESC_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x145>; +defm V_CUBETC_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x146>; +defm V_CUBEMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x147>; +defm V_BFE_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x148>; +defm V_BFE_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x149>; +defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>; +defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>; +defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>; +defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>; +defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>; +defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>; +defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>; +defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>; +defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>; +defm V_MIN3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x153>; +defm V_MAX3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x154>; +defm V_MAX3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x155>; +defm V_MAX3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x156>; +defm V_MED3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x157>; +defm V_MED3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x158>; +defm V_MED3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x159>; +defm V_SAD_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x15a>; +defm V_SAD_HI_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x15b>; +defm V_SAD_U16 : VOP3_Real_gfx6_gfx7_gfx10<0x15c>; +defm V_SAD_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x15d>; +defm V_CVT_PK_U8_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x15e>; +defm V_DIV_FIXUP_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x15f>; +defm V_DIV_FIXUP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x160>; +defm V_ADD_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x164>; +defm V_MUL_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x165>; +defm V_MIN_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x166>; +defm V_MAX_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x167>; +defm V_LDEXP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x168>; +defm V_MUL_LO_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x169>; +defm V_MUL_HI_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x16a>; +defm V_MUL_LO_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16b>; +defm V_MUL_HI_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16c>; +defm V_DIV_FMAS_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x16f>; +defm V_DIV_FMAS_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x170>; +defm V_MSAD_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x171>; +defm V_MQSAD_PK_U16_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x173>; +defm V_TRIG_PREOP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x174>; +defm V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>; +defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>; 
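The VOP3_Real_* multiclasses above all follow one scheme: a single pseudo instruction fans out into one encoded ("real") definition per encoding family, each row carrying that generation's opcode. A minimal sketch of the resulting lookup in plain C++ (the enum, table layout, and helper are illustrative stand-ins, with opcodes taken from the defm lines above):

  #include <cstdint>
  #include <cstring>

  // Illustrative encoding families, loosely mirroring SIEncodingFamily.
  enum class EncodingFamily { SI, VI, GFX9, GFX10 };

  struct RealOpcode {
    const char *Pseudo;    // pseudo-instruction name
    EncodingFamily Family; // which encoding this row provides
    uint16_t Opcode;       // VOP3 opcode field for that encoding
  };

  // One pseudo can have rows for several families; the numbers below come
  // from the defm lines above (V_MAD_F32 keeps 0x141 on GFX6/GFX7 and on
  // GFX10, and likewise V_DIV_SCALE_F32 keeps 0x16d).
  static const RealOpcode Table[] = {
      {"V_MAD_F32", EncodingFamily::SI, 0x141},
      {"V_MAD_F32", EncodingFamily::GFX10, 0x141},
      {"V_DIV_SCALE_F32", EncodingFamily::SI, 0x16d},
      {"V_DIV_SCALE_F32", EncodingFamily::GFX10, 0x16d},
  };

  // Resolve a pseudo to its encoded opcode for a target family; returns -1
  // if that pseudo has no encoding there (e.g. V_LSHL_B64, which above is
  // only given a GFX6/GFX7 real definition).
  static int lookupRealOpcode(const char *Pseudo, EncodingFamily F) {
    for (const RealOpcode &R : Table)
      if (R.Family == F && std::strcmp(R.Pseudo, Pseudo) == 0)
        return R.Opcode;
    return -1;
  }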
//===----------------------------------------------------------------------===// -// VI +// GFX8, GFX9 (VI). //===----------------------------------------------------------------------===// -let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { +let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in { multiclass VOP3_Real_vi op> { def _vi : VOP3_Real(NAME), SIEncodingFamily.VI>, @@ -757,9 +960,9 @@ multiclass VOP3Interp_Real_vi op> { VOP3Interp_vi (NAME).Pfl>; } -} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" +} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" -let AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI" in { +let AssemblerPredicates = [isGFX8Only], DecoderNamespace = "GFX8" in { multiclass VOP3_F16_Real_vi op> { def _vi : VOP3_Real(NAME), SIEncodingFamily.VI>, @@ -771,9 +974,9 @@ multiclass VOP3Interp_F16_Real_vi op> { VOP3Interp_vi (NAME).Pfl>; } -} // End AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI" +} // End AssemblerPredicates = [isGFX8Only], DecoderNamespace = "GFX8" -let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in { +let AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9" in { multiclass VOP3_F16_Real_gfx9 op, string OpName, string AsmName> { def _gfx9 : VOP3_Real(OpName), SIEncodingFamily.GFX9>, @@ -807,7 +1010,7 @@ multiclass VOP3_Real_gfx9 op, string AsmName> { } } -} // End AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" +} // End AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9" defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>; diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index 91b45583c848..55ee5f6577cf 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1,9 +1,8 @@ //===-- VOP3PInstructions.td - Vector Instruction Defintions --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -70,6 +69,16 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, ashr_rev>; def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, lshr_rev>; + +// Undo sub x, c -> add x, -c canonicalization since c is more likely +// an inline immediate than -c. +// The constant will be emitted as a mov, and folded later. 
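To make the canonicalization note above concrete: AMDGPU encodes small integers directly in the instruction as "inline" immediates, and for the packed v2i16 operations the range applies per 16-bit component, so a positive c can be inline-encodable while -c is not. A small C++ illustration of that asymmetry (the -16..64 integer inline range is the standard AMDGPU one; the policy helper is a hypothetical name, not a compiler API):

  #include <cstdint>

  // AMDGPU's integer inline-immediate range. Constants outside it must be
  // materialized as a literal or a mov, which is what the pattern above
  // tries to avoid by turning add x, -c back into sub x, c.
  static bool isInlineImm(int64_t v) { return v >= -16 && v <= 64; }

  // Prefer the subtract form when c is inline but -c is not
  // (illustrative policy; e.g. c = 64: 64 is inline, -64 is not).
  static bool preferSubEncoding(int64_t c) {
    return isInlineImm(c) && !isInlineImm(-c);
  }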
+// TODO: We could directly encode the immediate now +def : GCNPat< + (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1), + (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp) +>; + multiclass MadFmaMixPats : GCNPat < (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)), (and i32:$src1, (i32 65535))) ), - (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0)) ->; + (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> { + let SubtargetPredicate = !cast(Inst).SubtargetPredicate; +} class SDot2Pat : GCNPat < (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)), (sra i32:$src1, (i32 16))), i32:$src2), (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16), (sext_inreg i32:$src1, i16))), - (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0)) ->; + (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> { + let SubtargetPredicate = !cast(Inst).SubtargetPredicate; +} -let SubtargetPredicate = HasDotInsts in { +let SubtargetPredicate = HasDot2Insts in { def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile>; def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile>; def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile>; -def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile>; def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile>; -def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile>; def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile>; +} // End SubtargetPredicate = HasDot2Insts + +let SubtargetPredicate = HasDot1Insts in { + +def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile>; +def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile>; + +} // End SubtargetPredicate = HasDot1Insts + multiclass DotPats { + let SubtargetPredicate = dot_inst.SubtargetPredicate in def : GCNPat < (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)), (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)), @@ -281,12 +300,14 @@ def : UDot2Pat; def : SDot2Pat; foreach Type = ["U", "I"] in + let SubtargetPredicate = !cast("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in def : GCNPat < !cast(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y, (add_oneuse lhs, (!cast("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))), (!cast("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; foreach Type = ["U", "I"] in + let SubtargetPredicate = !cast("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in def : GCNPat < !cast(!foldl((add_oneuse i32:$src2, (!cast("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), [1, 2, 3, 4, 5, 6, 7], lhs, y, @@ -296,19 +317,101 @@ foreach Type = ["U", "I"] in // Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase // in the compile time. Directly handle the pattern generated by the FE here. 
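For reference, what the handwritten dot patterns here, and the foreach expansions below, ultimately select is a packed dot product with accumulate. A plain C++ sketch of the intended signed dot4/dot8 math (a reference for the semantics, not the hardware definition):

  #include <cstdint>

  // Reference semantics for v_dot4_i32_i8: treat each 32-bit source as
  // four signed bytes, multiply lane-wise, and accumulate into src2.
  static int32_t dot4_i32_i8(uint32_t a, uint32_t b, int32_t c) {
    int32_t acc = c;
    for (int i = 0; i < 4; ++i) {
      int8_t ai = int8_t(a >> (8 * i));
      int8_t bi = int8_t(b >> (8 * i));
      acc += int32_t(ai) * int32_t(bi);
    }
    return acc;
  }

  // Reference semantics for v_dot8_i32_i4: eight signed 4-bit lanes.
  static int32_t dot8_i32_i4(uint32_t a, uint32_t b, int32_t c) {
    int32_t acc = c;
    for (int i = 0; i < 8; ++i) {
      // Sign-extend each 4-bit field from bit 3.
      int32_t ai = int32_t(a >> (4 * i)) & 0xf; if (ai & 0x8) ai -= 16;
      int32_t bi = int32_t(b >> (4 * i)) & 0xf; if (bi & 0x8) bi -= 16;
      acc += ai * bi;
    }
    return acc;
  }

The unsigned variants are the same loops with zero-extension, which is exactly the and/(i32 65535) vs. sext_inreg distinction the UDot2/SDot2 patterns above draw.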
foreach Type = ["U", "I"] in + let SubtargetPredicate = !cast("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in def : GCNPat < !cast(!foldl((add_oneuse i32:$src2, (!cast("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), [7, 1, 2, 3, 4, 5, 6], lhs, y, (NonACAdd_oneuse lhs, (!cast("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))), (!cast("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; -} // End SubtargetPredicate = HasDotInsts +def ADst_32 : VOPDstOperand; +def ADst_128 : VOPDstOperand; +def ADst_512 : VOPDstOperand; +def ADst_1024 : VOPDstOperand; + +def VOPProfileAccRead : VOP3_Profile { + let Src0RC64 = ARegSrc_32; +} + +def VOPProfileAccWrite : VOP3_Profile { + let DstRC = ADst_32; + let Src0RC64 = VISrc_b32; +} + +class VOPProfileMAI + : VOP3_Profile { + let DstRC = _DstRC; + let Src0RC64 = SrcABRC; + let Src1RC64 = SrcABRC; + let Src2RC64 = _SrcRC; + let HasOpSel = 0; + let HasClamp = 0; + let HasModifiers = 0; + let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp"; + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp); +} + +def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI; +def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI; +def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI; +def VOPProfileMAI_I32_I32_X4 : VOPProfileMAI; +def VOPProfileMAI_I32_I32_X16 : VOPProfileMAI; +def VOPProfileMAI_I32_I32_X32 : VOPProfileMAI; +def VOPProfileMAI_F32_V2I16_X4 : VOPProfileMAI; +def VOPProfileMAI_F32_V2I16_X16 : VOPProfileMAI; +def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI; +def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI; +def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI; +def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI; + +let Predicates = [HasMAIInsts] in { +def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>; +def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> { + let isMoveImm = 1; +} + +let isConvergent = 1 in { +def V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>; +def V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>; +def V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>; +def V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>; +def V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>; +def V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>; +def V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>; +def V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>; +def V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>; +def V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>; +def V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>; +def V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>; +def V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>; +def 
V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>; +def V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>; +def V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>; +def V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>; +def V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>; +def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>; +def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>; +} // End isConvergent = 1 + +} // End SubtargetPredicate = HasMAIInsts + +def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; +def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">; multiclass VOP3P_Real_vi op> { def _vi : VOP3P_Real(NAME), SIEncodingFamily.VI>, VOP3Pe (NAME).Pfl> { let AssemblerPredicates = [HasVOP3PInsts]; - let DecoderNamespace = "VI"; + let DecoderNamespace = "GFX8"; + } +} + +multiclass VOP3P_Real_MAI op> { + def _vi : VOP3P_Real(NAME), SIEncodingFamily.VI>, + VOP3Pe_MAI (NAME).Pfl> { + let AssemblerPredicates = [HasMAIInsts]; + let DecoderNamespace = "GFX8"; } } @@ -352,14 +455,97 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>; } -let SubtargetPredicate = HasDotInsts in { +let SubtargetPredicate = HasDot2Insts in { defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>; defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>; defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>; -defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>; defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>; -defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>; defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>; -} // End SubtargetPredicate = HasDotInsts +} // End SubtargetPredicate = HasDot2Insts + +let SubtargetPredicate = HasDot1Insts in { + +defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>; +defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>; + +} // End SubtargetPredicate = HasDot1Insts + +let SubtargetPredicate = HasMAIInsts in { + +defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x3d8>; +defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x3d9>; +defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MAI <0x3c0>; +defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MAI <0x3c1>; +defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MAI <0x3c2>; +defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MAI <0x3c4>; +defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MAI <0x3c5>; +defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MAI <0x3c8>; +defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MAI <0x3c9>; +defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MAI <0x3ca>; +defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MAI <0x3cc>; +defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MAI <0x3cd>; +defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MAI <0x3d0>; +defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MAI <0x3d1>; +defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MAI <0x3d2>; +defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MAI <0x3d4>; +defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MAI <0x3d5>; +defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MAI <0x3e8>; +defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MAI <0x3e9>; +defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MAI <0x3eb>; +defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MAI <0x3ec>; +defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x3ed>; + +} // End SubtargetPredicate = HasMAIInsts + 
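Each V_MFMA_* definition above is a matrix fused multiply-accumulate over an MxNxK tile, D = A x B + C, with the shape spelled out in the mnemonic (several variants also repeat over multiple independent blocks, which this ignores). A scalar C++ reference of just the math, with the lane-to-register tiling omitted:

  #include <vector>

  // Scalar reference for an MxNxK MFMA:
  //   D[m][n] = C[m][n] + sum_k A[m][k] * B[k][n].
  // Matrices are row-major; float covers the *f32 variants, and the
  // f16/bf16 variants follow the same shape after widening inputs to float.
  static std::vector<float> mfma(int M, int N, int K,
                                 const std::vector<float> &A,   // M*K
                                 const std::vector<float> &B,   // K*N
                                 const std::vector<float> &C) { // M*N
    std::vector<float> D(C);
    for (int m = 0; m < M; ++m)
      for (int n = 0; n < N; ++n)
        for (int k = 0; k < K; ++k)
          D[m * N + n] += A[m * K + k] * B[k * N + n];
    return D;
  }
  // e.g. v_mfma_f32_32x32x2f32 corresponds to mfma(32, 32, 2, ...).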
+//===----------------------------------------------------------------------===// +// GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass VOP3P_Real_gfx10 op> { + def _gfx10 : VOP3P_Real(NAME), SIEncodingFamily.GFX10>, + VOP3Pe_gfx10 (NAME).Pfl>; + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x000>; +defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x001>; +defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x002>; +defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x003>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x004>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x005>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x006>; +defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x007>; +defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x008>; +defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x009>; +defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x00a>; +defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x00b>; +defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x00c>; +defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x00d>; +defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x00e>; +defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x00f>; +defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x010>; +defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x011>; +defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x012>; +defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x020>; +defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x021>; +defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x022>; + +let SubtargetPredicate = HasDot2Insts in { + +defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x013>; +defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x014>; +defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x015>; +defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x017>; +defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x019>; + +} // End SubtargetPredicate = HasDot2Insts + +let SubtargetPredicate = HasDot1Insts in { + +defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x016>; +defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x018>; + +} // End SubtargetPredicate = HasDot1Insts diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index 091cac8cd35c..b3513e383d10 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -1,9 +1,8 @@ //===-- VOPCInstructions.td - Vector Instruction Defintions ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -54,14 +53,29 @@ class VOPC_SDWA9e op, VOPProfile P> : VOP_SDWA9Be
// VOPC instructions are a special case because for the 32-bit
// encoding, we want to display the implicit vcc write as if it were
{ // an explicit $dst. class VOPC_Profile sched, ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { - let Asm32 = "vcc, $src0, $src1"; + let Asm32 = "$src0, $src1"; // The destination for 32-bit encoding is implicit. let HasDst32 = 0; - let Outs64 = (outs VOPDstS64:$sdst); + let Outs64 = (outs VOPDstS64orS32:$sdst); list Schedule = sched; } -class VOPC_Pseudo pattern=[]> : +class VOPC_NoSdst_Profile sched, ValueType vt0, + ValueType vt1 = vt0> : + VOPC_Profile { + let Outs64 = (outs ); + let OutsSDWA = (outs ); + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, + src0_sel:$src0_sel, src1_sel:$src1_sel); + let Asm64 = !if(isFloatType.ret, "$src0_modifiers, $src1_modifiers$clamp", + "$src0, $src1"); + let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; + let EmitDst = 0; +} + +class VOPC_Pseudo pattern=[], + bit DefVcc = 1> : InstSI<(outs), P.Ins32, "", pattern>, VOP , SIMCInstr { @@ -81,9 +95,7 @@ class VOPC_Pseudo pattern=[]> : let VALU = 1; let VOPC = 1; let Uses = [EXEC]; - let Defs = [VCC]; - - let SubtargetPredicate = isGCN; + let Defs = !if(DefVcc, [VCC], []); VOPProfile Pfl = P; } @@ -115,8 +127,9 @@ class VOPC_SDWA_Pseudo pattern=[]> : } // This class is used only with VOPC instructions. Use $sdst for out operand -class VOPCInstAlias : - InstAlias , PredicateControl { +class VOPCInstAlias : + InstAlias , PredicateControl { field bit isCompare; field bit isCommutable; @@ -149,6 +162,27 @@ class VOPCInstAlias : let SubtargetPredicate = AssemblerPredicate; } +multiclass VOPCInstAliases { + def : VOPCInstAlias (OpName#"_e64"), + !cast(OpName#"_e32_"#Arch)>; + let WaveSizePredicate = isWave32 in { + def : VOPCInstAlias (OpName#"_e64"), + !cast(OpName#"_e32_"#Arch), + "vcc_lo, "#!cast(OpName#"_e64").Pfl.Asm32>; + } + let WaveSizePredicate = isWave64 in { + def : VOPCInstAlias (OpName#"_e64"), + !cast(OpName#"_e32_"#Arch), + "vcc, "#!cast(OpName#"_e64").Pfl.Asm32>; + } +} + +multiclass VOPCXInstAliases { + def : VOPCInstAlias (OpName#"_e64"), + !cast(OpName#"_e32_"#Arch)>; +} + + class getVOPCPat64 : LetDummies { list ret = !if(P.HasModifiers, [(set i1:$sdst, @@ -161,6 +195,10 @@ class getVOPCPat64 : LetDummies { [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]); } +class VCMPXNoSDstTable { + bit HasSDst = has_sdst; + string NoSDstOp = Name; +} multiclass VOPC_Pseudos { def _e32 : VOPC_Pseudo , - Commutable_REV { + Commutable_REV, + VCMPXNoSDstTable<1, opName#"_e32"> { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); let SchedRW = P.Schedule; let isConvergent = DefExec; @@ -178,7 +217,8 @@ multiclass VOPC_Pseudos .ret>, - Commutable_REV { + Commutable_REV, + VCMPXNoSDstTable<1, opName#"_e64"> { let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; let isCompare = 1; @@ -193,6 +233,44 @@ multiclass VOPC_Pseudos : + VOPC_Pseudos { + + def _nosdst_e32 : VOPC_Pseudo , + Commutable_REV, + VCMPXNoSDstTable<0, opName#"_e32"> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let isCompare = 1; + let isCommutable = 1; + let SubtargetPredicate = HasNoSdstCMPX; + } + + def _nosdst_e64 : VOP3_Pseudo, + Commutable_REV, + VCMPXNoSDstTable<0, opName#"_e64"> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isCompare = 1; + let isCommutable = 1; + let SubtargetPredicate = HasNoSdstCMPX; + } + + def _nosdst_sdwa : VOPC_SDWA_Pseudo { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let isCompare = 1; + 
let SubtargetPredicate = HasNoSdstCMPX; + } +} +} // End SubtargetPredicate = HasSdstCMPX + def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>; def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>; @@ -200,6 +278,13 @@ def VOPC_I1_I16_I16 : VOPC_Profile<[Write32Bit], i16>; def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>; def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>; +def VOPC_F16_F16 : VOPC_NoSdst_Profile<[Write32Bit], f16>; +def VOPC_F32_F32 : VOPC_NoSdst_Profile<[Write32Bit], f32>; +def VOPC_F64_F64 : VOPC_NoSdst_Profile<[Write64Bit], f64>; +def VOPC_I16_I16 : VOPC_NoSdst_Profile<[Write32Bit], i16>; +def VOPC_I32_I32 : VOPC_NoSdst_Profile<[Write32Bit], i32>; +def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>; + multiclass VOPC_F16 : VOPC_Pseudos ; @@ -219,22 +304,22 @@ multiclass VOPC_I64 ; multiclass VOPCX_F16 : - VOPC_Pseudos ; + VOPCX_Pseudos ; multiclass VOPCX_F32 : - VOPC_Pseudos ; + VOPCX_Pseudos ; multiclass VOPCX_F64 : - VOPC_Pseudos ; + VOPCX_Pseudos ; multiclass VOPCX_I16 : - VOPC_Pseudos ; + VOPCX_Pseudos ; multiclass VOPCX_I32 : - VOPC_Pseudos ; + VOPCX_Pseudos ; multiclass VOPCX_I64 : - VOPC_Pseudos ; + VOPCX_Pseudos ; //===----------------------------------------------------------------------===// @@ -309,7 +394,7 @@ defm V_CMPX_NEQ_F64 : VOPCX_F64 <"v_cmpx_neq_f64">; defm V_CMPX_NLT_F64 : VOPCX_F64 <"v_cmpx_nlt_f64">; defm V_CMPX_TRU_F64 : VOPCX_F64 <"v_cmpx_tru_f64">; -let SubtargetPredicate = isSICI in { +let SubtargetPredicate = isGFX6GFX7 in { defm V_CMPS_F_F32 : VOPC_F32 <"v_cmps_f_f32">; defm V_CMPS_LT_F32 : VOPC_F32 <"v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; @@ -379,7 +464,7 @@ defm V_CMPSX_NEQ_F64 : VOPCX_F64 <"v_cmpsx_neq_f64">; defm V_CMPSX_NLT_F64 : VOPCX_F64 <"v_cmpsx_nlt_f64">; defm V_CMPSX_TRU_F64 : VOPCX_F64 <"v_cmpsx_tru_f64">; -} // End SubtargetPredicate = isSICI +} // End SubtargetPredicate = isGFX6GFX7 let SubtargetPredicate = Has16BitInsts in { @@ -546,6 +631,18 @@ class VOPC_Class_Profile sched, ValueType vt> : let HasOMod = 0; } +class VOPC_Class_NoSdst_Profile sched, ValueType vt> : + VOPC_Class_Profile { + let Outs64 = (outs ); + let OutsSDWA = (outs ); + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, + src0_sel:$src0_sel, src1_sel:$src1_sel); + let Asm64 = "$src0_modifiers, $src1"; + let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; + let EmitDst = 0; +} + class getVOPCClassPat64 { list ret = [(set i1:$sdst, @@ -556,46 +653,85 @@ class getVOPCClassPat64 { // Special case for class instructions which only have modifiers on // the 1st source operand. 
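Stepping back from the _nosdst_ pseudos defined above: they model the GFX10 flavor of V_CMPX, which updates only the EXEC mask and produces no separate VCC/SGPR result (hence Defs = [EXEC] and the empty out list). A small C++ sketch of that effect on a wave's lane mask (the lane loop is illustrative, assuming a 64-lane wave64 mask):

  #include <cstdint>

  // Apply a v_cmpx-style compare across a wave: every active lane
  // evaluates the predicate, and EXEC is narrowed to the lanes where it
  // held. There is no separate scalar result, matching the _nosdst_
  // pseudos above.
  template <typename Pred>
  static uint64_t cmpx(uint64_t exec, const int *src0, const int *src1,
                       Pred pred) {
    uint64_t newExec = 0;
    for (int lane = 0; lane < 64; ++lane)
      if ((exec >> lane) & 1)               // only active lanes participate
        if (pred(src0[lane], src1[lane]))
          newExec |= uint64_t(1) << lane;
    return newExec;                         // becomes the new EXEC mask
  }
  // e.g. exec = cmpx(exec, x, y, [](int a, int b) { return a < b; });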
-multiclass VOPC_Class_Pseudos { - def _e32 : VOPC_Pseudo { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); +multiclass VOPC_Class_Pseudos { + def _e32 : VOPC_Pseudo , + VCMPXNoSDstTable<1, opName#"_e32"> { + let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]), + !if(DefVcc, [VCC], [])); let SchedRW = p.Schedule; let isConvergent = DefExec; } - def _e64 : VOP3_Pseudo.ret> { + def _e64 : VOP3_Pseudo.ret>, + VCMPXNoSDstTable<1, opName#"_e64"> { let Defs = !if(DefExec, [EXEC], []); let SchedRW = p.Schedule; } def _sdwa : VOPC_SDWA_Pseudo { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]), + !if(DefVcc, [VCC], [])); let SchedRW = p.Schedule; let isConvergent = DefExec; } } +let SubtargetPredicate = HasSdstCMPX in { +multiclass VOPCX_Class_Pseudos : + VOPC_Class_Pseudos { + + def _nosdst_e32 : VOPC_Pseudo , + VCMPXNoSDstTable<0, opName#"_e32"> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let SubtargetPredicate = HasNoSdstCMPX; + } + + def _nosdst_e64 : VOP3_Pseudo, + VCMPXNoSDstTable<0, opName#"_e64"> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let SubtargetPredicate = HasNoSdstCMPX; + } + + def _nosdst_sdwa : VOPC_SDWA_Pseudo { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let SubtargetPredicate = HasNoSdstCMPX; + } +} +} // End SubtargetPredicate = HasSdstCMPX + def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>; def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>; +def VOPC_F16_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f16>; +def VOPC_F32_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f32>; +def VOPC_F64_I32 : VOPC_Class_NoSdst_Profile<[Write64Bit], f64>; + multiclass VOPC_CLASS_F16 : VOPC_Class_Pseudos ; multiclass VOPCX_CLASS_F16 : - VOPC_Class_Pseudos ; + VOPCX_Class_Pseudos ; multiclass VOPC_CLASS_F32 : VOPC_Class_Pseudos ; multiclass VOPCX_CLASS_F32 : - VOPC_Class_Pseudos ; + VOPCX_Class_Pseudos ; multiclass VOPC_CLASS_F64 : VOPC_Class_Pseudos ; multiclass VOPCX_CLASS_F64 : - VOPC_Class_Pseudos ; + VOPCX_Class_Pseudos ; defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">; defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">; @@ -608,342 +744,471 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // V_ICMPIntrinsic Pattern. 
//===----------------------------------------------------------------------===// -class ICMP_Pattern : GCNPat < - (AMDGPUsetcc vt:$src0, vt:$src1, cond), - (inst $src0, $src1) ->; - -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; - -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; - -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; -def : ICMP_Pattern ; - -class FCMP_Pattern : GCNPat < - (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), - (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), - (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - DSTCLAMP.NONE) ->; - -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; - -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; - -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; - - -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; - -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; - -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; -def : FCMP_Pattern ; +// We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() +// complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. 
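Before the ICMP_Pattern multiclass below: the reason these patterns are split on wave size is that a VOPC compare yields one bit per lane, so the scalar carrier for an i1 result is i64 under wave64 but i32 under wave32, each needing the matching SReg class, which is what the COPY_TO_REGCLASS workaround described above supplies. A C++ sketch of why the width differs (the type trait and ballot helper are illustrative, not compiler code):

  #include <cstdint>

  // A lane mask holds one bit per lane, so its width tracks the wave
  // size: 32-bit for wave32 (a GFX10 mode), 64-bit for wave64.
  template <unsigned WaveSize> struct LaneMask;
  template <> struct LaneMask<32> { using type = uint32_t; };
  template <> struct LaneMask<64> { using type = uint64_t; };

  // "Ballot": gather a per-lane compare into one scalar mask -- the
  // wave-level value that an i1-producing VOPC compare carries.
  template <unsigned WaveSize, typename Pred>
  typename LaneMask<WaveSize>::type ballot(const int *a, const int *b,
                                           Pred pred) {
    using mask_t = typename LaneMask<WaveSize>::type;
    mask_t mask = 0;
    for (unsigned lane = 0; lane < WaveSize; ++lane)
      if (pred(a[lane], b[lane]))
        mask |= mask_t(1) << lane;
    return mask;
  }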
+multiclass ICMP_Pattern { + let WaveSizePredicate = isWave64 in + def : GCNPat < + (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + (i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64)) + >; + + let WaveSizePredicate = isWave32 in + def : GCNPat < + (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32)) + >; +} + +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; + +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; + +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; + +multiclass FCMP_Pattern { + let WaveSizePredicate = isWave64 in + def : GCNPat < + (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), + (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), + (i64 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1, + DSTCLAMP.NONE), SReg_64)) + >; + + let WaveSizePredicate = isWave32 in + def : GCNPat < + (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), + (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), + (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1, + DSTCLAMP.NONE), SReg_32)) + >; +} + +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; + +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; + +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; + + +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; + +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; + +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; //===----------------------------------------------------------------------===// -// Target +// Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// SI +// GFX10. 
//===----------------------------------------------------------------------===// -multiclass VOPC_Real_si op> { - let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { - def _e32_si : - VOPC_Real(NAME#"_e32"), SIEncodingFamily.SI>, - VOPCe; - - def _e64_si : - VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3a_si (NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 - // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst - bits<8> sdst; - let Inst{7-0} = sdst; - } +let AssemblerPredicate = isGFX10Plus in { + multiclass VOPC_Real_gfx10 op> { + let DecoderNamespace = "GFX10" in { + def _e32_gfx10 : + VOPC_Real(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOPCe; + def _e64_gfx10 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. + bits<8> sdst; + let Inst{7-0} = sdst; + } + } // End DecoderNamespace = "GFX10" + + def _sdwa_gfx10 : + VOP_SDWA10_Real(NAME#"_sdwa")>, + VOPC_SDWA9e(NAME#"_sdwa").Pfl>; + + defm : VOPCInstAliases; } - def : VOPCInstAlias (NAME#"_e64"), - !cast(NAME#"_e32_si")> { - let AssemblerPredicate = isSICI; + + multiclass VOPCX_Real_gfx10 op> { + let DecoderNamespace = "GFX10" in { + def _e32_gfx10 : + VOPC_Real(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, + VOPCe { + let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e32").PseudoInstr) + # " " # !cast(NAME#"_nosdst_e32").AsmOperands; + } + + def _e64_gfx10 : + VOP3_Real(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast(NAME#"_nosdst_e64").Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e64").Mnemonic) + # "{_e64} " # !cast(NAME#"_nosdst_e64").AsmOperands; + } + } // End DecoderNamespace = "GFX10" + + def _sdwa_gfx10 : + VOP_SDWA10_Real(NAME#"_nosdst_sdwa")>, + VOPC_SDWA9e(NAME#"_nosdst_sdwa").Pfl> { + let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_sdwa").Mnemonic) + # "{_sdwa} " # !cast(NAME#"_nosdst_sdwa").AsmOperands9; + } + + defm : VOPCXInstAliases; } -} +} // End AssemblerPredicate = isGFX10Plus + +defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>; +defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>; +defm V_CMP_LE_I16 : VOPC_Real_gfx10<0x08b>; +defm V_CMP_GT_I16 : VOPC_Real_gfx10<0x08c>; +defm V_CMP_NE_I16 : VOPC_Real_gfx10<0x08d>; +defm V_CMP_GE_I16 : VOPC_Real_gfx10<0x08e>; +defm V_CMP_CLASS_F16 : VOPC_Real_gfx10<0x08f>; +defm V_CMPX_LT_I16 : VOPCX_Real_gfx10<0x099>; +defm V_CMPX_EQ_I16 : VOPCX_Real_gfx10<0x09a>; +defm V_CMPX_LE_I16 : VOPCX_Real_gfx10<0x09b>; +defm V_CMPX_GT_I16 : VOPCX_Real_gfx10<0x09c>; +defm V_CMPX_NE_I16 : VOPCX_Real_gfx10<0x09d>; +defm V_CMPX_GE_I16 : VOPCX_Real_gfx10<0x09e>; +defm V_CMPX_CLASS_F16 : VOPCX_Real_gfx10<0x09f>; +defm V_CMP_LT_U16 : VOPC_Real_gfx10<0x0a9>; +defm V_CMP_EQ_U16 : VOPC_Real_gfx10<0x0aa>; +defm V_CMP_LE_U16 : VOPC_Real_gfx10<0x0ab>; +defm V_CMP_GT_U16 : VOPC_Real_gfx10<0x0ac>; +defm V_CMP_NE_U16 : VOPC_Real_gfx10<0x0ad>; +defm V_CMP_GE_U16 : VOPC_Real_gfx10<0x0ae>; +defm V_CMPX_LT_U16 : VOPCX_Real_gfx10<0x0b9>; +defm V_CMPX_EQ_U16 : VOPCX_Real_gfx10<0x0ba>; +defm V_CMPX_LE_U16 : VOPCX_Real_gfx10<0x0bb>; +defm V_CMPX_GT_U16 : VOPCX_Real_gfx10<0x0bc>; +defm V_CMPX_NE_U16 : VOPCX_Real_gfx10<0x0bd>; +defm V_CMPX_GE_U16 : VOPCX_Real_gfx10<0x0be>; +defm V_CMP_F_F16 : VOPC_Real_gfx10<0x0c8>; +defm V_CMP_LT_F16 : VOPC_Real_gfx10<0x0c9>; +defm 
V_CMP_EQ_F16 : VOPC_Real_gfx10<0x0ca>; +defm V_CMP_LE_F16 : VOPC_Real_gfx10<0x0cb>; +defm V_CMP_GT_F16 : VOPC_Real_gfx10<0x0cc>; +defm V_CMP_LG_F16 : VOPC_Real_gfx10<0x0cd>; +defm V_CMP_GE_F16 : VOPC_Real_gfx10<0x0ce>; +defm V_CMP_O_F16 : VOPC_Real_gfx10<0x0cf>; +defm V_CMPX_F_F16 : VOPCX_Real_gfx10<0x0d8>; +defm V_CMPX_LT_F16 : VOPCX_Real_gfx10<0x0d9>; +defm V_CMPX_EQ_F16 : VOPCX_Real_gfx10<0x0da>; +defm V_CMPX_LE_F16 : VOPCX_Real_gfx10<0x0db>; +defm V_CMPX_GT_F16 : VOPCX_Real_gfx10<0x0dc>; +defm V_CMPX_LG_F16 : VOPCX_Real_gfx10<0x0dd>; +defm V_CMPX_GE_F16 : VOPCX_Real_gfx10<0x0de>; +defm V_CMPX_O_F16 : VOPCX_Real_gfx10<0x0df>; +defm V_CMP_U_F16 : VOPC_Real_gfx10<0x0e8>; +defm V_CMP_NGE_F16 : VOPC_Real_gfx10<0x0e9>; +defm V_CMP_NLG_F16 : VOPC_Real_gfx10<0x0ea>; +defm V_CMP_NGT_F16 : VOPC_Real_gfx10<0x0eb>; +defm V_CMP_NLE_F16 : VOPC_Real_gfx10<0x0ec>; +defm V_CMP_NEQ_F16 : VOPC_Real_gfx10<0x0ed>; +defm V_CMP_NLT_F16 : VOPC_Real_gfx10<0x0ee>; +defm V_CMP_TRU_F16 : VOPC_Real_gfx10<0x0ef>; +defm V_CMPX_U_F16 : VOPCX_Real_gfx10<0x0f8>; +defm V_CMPX_NGE_F16 : VOPCX_Real_gfx10<0x0f9>; +defm V_CMPX_NLG_F16 : VOPCX_Real_gfx10<0x0fa>; +defm V_CMPX_NGT_F16 : VOPCX_Real_gfx10<0x0fb>; +defm V_CMPX_NLE_F16 : VOPCX_Real_gfx10<0x0fc>; +defm V_CMPX_NEQ_F16 : VOPCX_Real_gfx10<0x0fd>; +defm V_CMPX_NLT_F16 : VOPCX_Real_gfx10<0x0fe>; +defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>; -defm V_CMP_F_F32 : VOPC_Real_si <0x0>; -defm V_CMP_LT_F32 : VOPC_Real_si <0x1>; -defm V_CMP_EQ_F32 : VOPC_Real_si <0x2>; -defm V_CMP_LE_F32 : VOPC_Real_si <0x3>; -defm V_CMP_GT_F32 : VOPC_Real_si <0x4>; -defm V_CMP_LG_F32 : VOPC_Real_si <0x5>; -defm V_CMP_GE_F32 : VOPC_Real_si <0x6>; -defm V_CMP_O_F32 : VOPC_Real_si <0x7>; -defm V_CMP_U_F32 : VOPC_Real_si <0x8>; -defm V_CMP_NGE_F32 : VOPC_Real_si <0x9>; -defm V_CMP_NLG_F32 : VOPC_Real_si <0xa>; -defm V_CMP_NGT_F32 : VOPC_Real_si <0xb>; -defm V_CMP_NLE_F32 : VOPC_Real_si <0xc>; -defm V_CMP_NEQ_F32 : VOPC_Real_si <0xd>; -defm V_CMP_NLT_F32 : VOPC_Real_si <0xe>; -defm V_CMP_TRU_F32 : VOPC_Real_si <0xf>; - -defm V_CMPX_F_F32 : VOPC_Real_si <0x10>; -defm V_CMPX_LT_F32 : VOPC_Real_si <0x11>; -defm V_CMPX_EQ_F32 : VOPC_Real_si <0x12>; -defm V_CMPX_LE_F32 : VOPC_Real_si <0x13>; -defm V_CMPX_GT_F32 : VOPC_Real_si <0x14>; -defm V_CMPX_LG_F32 : VOPC_Real_si <0x15>; -defm V_CMPX_GE_F32 : VOPC_Real_si <0x16>; -defm V_CMPX_O_F32 : VOPC_Real_si <0x17>; -defm V_CMPX_U_F32 : VOPC_Real_si <0x18>; -defm V_CMPX_NGE_F32 : VOPC_Real_si <0x19>; -defm V_CMPX_NLG_F32 : VOPC_Real_si <0x1a>; -defm V_CMPX_NGT_F32 : VOPC_Real_si <0x1b>; -defm V_CMPX_NLE_F32 : VOPC_Real_si <0x1c>; -defm V_CMPX_NEQ_F32 : VOPC_Real_si <0x1d>; -defm V_CMPX_NLT_F32 : VOPC_Real_si <0x1e>; -defm V_CMPX_TRU_F32 : VOPC_Real_si <0x1f>; - -defm V_CMP_F_F64 : VOPC_Real_si <0x20>; -defm V_CMP_LT_F64 : VOPC_Real_si <0x21>; -defm V_CMP_EQ_F64 : VOPC_Real_si <0x22>; -defm V_CMP_LE_F64 : VOPC_Real_si <0x23>; -defm V_CMP_GT_F64 : VOPC_Real_si <0x24>; -defm V_CMP_LG_F64 : VOPC_Real_si <0x25>; -defm V_CMP_GE_F64 : VOPC_Real_si <0x26>; -defm V_CMP_O_F64 : VOPC_Real_si <0x27>; -defm V_CMP_U_F64 : VOPC_Real_si <0x28>; -defm V_CMP_NGE_F64 : VOPC_Real_si <0x29>; -defm V_CMP_NLG_F64 : VOPC_Real_si <0x2a>; -defm V_CMP_NGT_F64 : VOPC_Real_si <0x2b>; -defm V_CMP_NLE_F64 : VOPC_Real_si <0x2c>; -defm V_CMP_NEQ_F64 : VOPC_Real_si <0x2d>; -defm V_CMP_NLT_F64 : VOPC_Real_si <0x2e>; -defm V_CMP_TRU_F64 : VOPC_Real_si <0x2f>; - -defm V_CMPX_F_F64 : VOPC_Real_si <0x30>; -defm V_CMPX_LT_F64 : VOPC_Real_si <0x31>; -defm V_CMPX_EQ_F64 : VOPC_Real_si <0x32>; -defm 
V_CMPX_LE_F64 : VOPC_Real_si <0x33>; -defm V_CMPX_GT_F64 : VOPC_Real_si <0x34>; -defm V_CMPX_LG_F64 : VOPC_Real_si <0x35>; -defm V_CMPX_GE_F64 : VOPC_Real_si <0x36>; -defm V_CMPX_O_F64 : VOPC_Real_si <0x37>; -defm V_CMPX_U_F64 : VOPC_Real_si <0x38>; -defm V_CMPX_NGE_F64 : VOPC_Real_si <0x39>; -defm V_CMPX_NLG_F64 : VOPC_Real_si <0x3a>; -defm V_CMPX_NGT_F64 : VOPC_Real_si <0x3b>; -defm V_CMPX_NLE_F64 : VOPC_Real_si <0x3c>; -defm V_CMPX_NEQ_F64 : VOPC_Real_si <0x3d>; -defm V_CMPX_NLT_F64 : VOPC_Real_si <0x3e>; -defm V_CMPX_TRU_F64 : VOPC_Real_si <0x3f>; - -defm V_CMPS_F_F32 : VOPC_Real_si <0x40>; -defm V_CMPS_LT_F32 : VOPC_Real_si <0x41>; -defm V_CMPS_EQ_F32 : VOPC_Real_si <0x42>; -defm V_CMPS_LE_F32 : VOPC_Real_si <0x43>; -defm V_CMPS_GT_F32 : VOPC_Real_si <0x44>; -defm V_CMPS_LG_F32 : VOPC_Real_si <0x45>; -defm V_CMPS_GE_F32 : VOPC_Real_si <0x46>; -defm V_CMPS_O_F32 : VOPC_Real_si <0x47>; -defm V_CMPS_U_F32 : VOPC_Real_si <0x48>; -defm V_CMPS_NGE_F32 : VOPC_Real_si <0x49>; -defm V_CMPS_NLG_F32 : VOPC_Real_si <0x4a>; -defm V_CMPS_NGT_F32 : VOPC_Real_si <0x4b>; -defm V_CMPS_NLE_F32 : VOPC_Real_si <0x4c>; -defm V_CMPS_NEQ_F32 : VOPC_Real_si <0x4d>; -defm V_CMPS_NLT_F32 : VOPC_Real_si <0x4e>; -defm V_CMPS_TRU_F32 : VOPC_Real_si <0x4f>; - -defm V_CMPSX_F_F32 : VOPC_Real_si <0x50>; -defm V_CMPSX_LT_F32 : VOPC_Real_si <0x51>; -defm V_CMPSX_EQ_F32 : VOPC_Real_si <0x52>; -defm V_CMPSX_LE_F32 : VOPC_Real_si <0x53>; -defm V_CMPSX_GT_F32 : VOPC_Real_si <0x54>; -defm V_CMPSX_LG_F32 : VOPC_Real_si <0x55>; -defm V_CMPSX_GE_F32 : VOPC_Real_si <0x56>; -defm V_CMPSX_O_F32 : VOPC_Real_si <0x57>; -defm V_CMPSX_U_F32 : VOPC_Real_si <0x58>; -defm V_CMPSX_NGE_F32 : VOPC_Real_si <0x59>; -defm V_CMPSX_NLG_F32 : VOPC_Real_si <0x5a>; -defm V_CMPSX_NGT_F32 : VOPC_Real_si <0x5b>; -defm V_CMPSX_NLE_F32 : VOPC_Real_si <0x5c>; -defm V_CMPSX_NEQ_F32 : VOPC_Real_si <0x5d>; -defm V_CMPSX_NLT_F32 : VOPC_Real_si <0x5e>; -defm V_CMPSX_TRU_F32 : VOPC_Real_si <0x5f>; - -defm V_CMPS_F_F64 : VOPC_Real_si <0x60>; -defm V_CMPS_LT_F64 : VOPC_Real_si <0x61>; -defm V_CMPS_EQ_F64 : VOPC_Real_si <0x62>; -defm V_CMPS_LE_F64 : VOPC_Real_si <0x63>; -defm V_CMPS_GT_F64 : VOPC_Real_si <0x64>; -defm V_CMPS_LG_F64 : VOPC_Real_si <0x65>; -defm V_CMPS_GE_F64 : VOPC_Real_si <0x66>; -defm V_CMPS_O_F64 : VOPC_Real_si <0x67>; -defm V_CMPS_U_F64 : VOPC_Real_si <0x68>; -defm V_CMPS_NGE_F64 : VOPC_Real_si <0x69>; -defm V_CMPS_NLG_F64 : VOPC_Real_si <0x6a>; -defm V_CMPS_NGT_F64 : VOPC_Real_si <0x6b>; -defm V_CMPS_NLE_F64 : VOPC_Real_si <0x6c>; -defm V_CMPS_NEQ_F64 : VOPC_Real_si <0x6d>; -defm V_CMPS_NLT_F64 : VOPC_Real_si <0x6e>; -defm V_CMPS_TRU_F64 : VOPC_Real_si <0x6f>; - -defm V_CMPSX_F_F64 : VOPC_Real_si <0x70>; -defm V_CMPSX_LT_F64 : VOPC_Real_si <0x71>; -defm V_CMPSX_EQ_F64 : VOPC_Real_si <0x72>; -defm V_CMPSX_LE_F64 : VOPC_Real_si <0x73>; -defm V_CMPSX_GT_F64 : VOPC_Real_si <0x74>; -defm V_CMPSX_LG_F64 : VOPC_Real_si <0x75>; -defm V_CMPSX_GE_F64 : VOPC_Real_si <0x76>; -defm V_CMPSX_O_F64 : VOPC_Real_si <0x77>; -defm V_CMPSX_U_F64 : VOPC_Real_si <0x78>; -defm V_CMPSX_NGE_F64 : VOPC_Real_si <0x79>; -defm V_CMPSX_NLG_F64 : VOPC_Real_si <0x7a>; -defm V_CMPSX_NGT_F64 : VOPC_Real_si <0x7b>; -defm V_CMPSX_NLE_F64 : VOPC_Real_si <0x7c>; -defm V_CMPSX_NEQ_F64 : VOPC_Real_si <0x7d>; -defm V_CMPSX_NLT_F64 : VOPC_Real_si <0x7e>; -defm V_CMPSX_TRU_F64 : VOPC_Real_si <0x7f>; - -defm V_CMP_F_I32 : VOPC_Real_si <0x80>; -defm V_CMP_LT_I32 : VOPC_Real_si <0x81>; -defm V_CMP_EQ_I32 : VOPC_Real_si <0x82>; -defm V_CMP_LE_I32 : VOPC_Real_si <0x83>; -defm V_CMP_GT_I32 
: VOPC_Real_si <0x84>;
-defm V_CMP_NE_I32 : VOPC_Real_si <0x85>;
-defm V_CMP_GE_I32 : VOPC_Real_si <0x86>;
-defm V_CMP_T_I32 : VOPC_Real_si <0x87>;
-
-defm V_CMPX_F_I32 : VOPC_Real_si <0x90>;
-defm V_CMPX_LT_I32 : VOPC_Real_si <0x91>;
-defm V_CMPX_EQ_I32 : VOPC_Real_si <0x92>;
-defm V_CMPX_LE_I32 : VOPC_Real_si <0x93>;
-defm V_CMPX_GT_I32 : VOPC_Real_si <0x94>;
-defm V_CMPX_NE_I32 : VOPC_Real_si <0x95>;
-defm V_CMPX_GE_I32 : VOPC_Real_si <0x96>;
-defm V_CMPX_T_I32 : VOPC_Real_si <0x97>;
-
-defm V_CMP_F_I64 : VOPC_Real_si <0xa0>;
-defm V_CMP_LT_I64 : VOPC_Real_si <0xa1>;
-defm V_CMP_EQ_I64 : VOPC_Real_si <0xa2>;
-defm V_CMP_LE_I64 : VOPC_Real_si <0xa3>;
-defm V_CMP_GT_I64 : VOPC_Real_si <0xa4>;
-defm V_CMP_NE_I64 : VOPC_Real_si <0xa5>;
-defm V_CMP_GE_I64 : VOPC_Real_si <0xa6>;
-defm V_CMP_T_I64 : VOPC_Real_si <0xa7>;
-
-defm V_CMPX_F_I64 : VOPC_Real_si <0xb0>;
-defm V_CMPX_LT_I64 : VOPC_Real_si <0xb1>;
-defm V_CMPX_EQ_I64 : VOPC_Real_si <0xb2>;
-defm V_CMPX_LE_I64 : VOPC_Real_si <0xb3>;
-defm V_CMPX_GT_I64 : VOPC_Real_si <0xb4>;
-defm V_CMPX_NE_I64 : VOPC_Real_si <0xb5>;
-defm V_CMPX_GE_I64 : VOPC_Real_si <0xb6>;
-defm V_CMPX_T_I64 : VOPC_Real_si <0xb7>;
-
-defm V_CMP_F_U32 : VOPC_Real_si <0xc0>;
-defm V_CMP_LT_U32 : VOPC_Real_si <0xc1>;
-defm V_CMP_EQ_U32 : VOPC_Real_si <0xc2>;
-defm V_CMP_LE_U32 : VOPC_Real_si <0xc3>;
-defm V_CMP_GT_U32 : VOPC_Real_si <0xc4>;
-defm V_CMP_NE_U32 : VOPC_Real_si <0xc5>;
-defm V_CMP_GE_U32 : VOPC_Real_si <0xc6>;
-defm V_CMP_T_U32 : VOPC_Real_si <0xc7>;
-
-defm V_CMPX_F_U32 : VOPC_Real_si <0xd0>;
-defm V_CMPX_LT_U32 : VOPC_Real_si <0xd1>;
-defm V_CMPX_EQ_U32 : VOPC_Real_si <0xd2>;
-defm V_CMPX_LE_U32 : VOPC_Real_si <0xd3>;
-defm V_CMPX_GT_U32 : VOPC_Real_si <0xd4>;
-defm V_CMPX_NE_U32 : VOPC_Real_si <0xd5>;
-defm V_CMPX_GE_U32 : VOPC_Real_si <0xd6>;
-defm V_CMPX_T_U32 : VOPC_Real_si <0xd7>;
-
-defm V_CMP_F_U64 : VOPC_Real_si <0xe0>;
-defm V_CMP_LT_U64 : VOPC_Real_si <0xe1>;
-defm V_CMP_EQ_U64 : VOPC_Real_si <0xe2>;
-defm V_CMP_LE_U64 : VOPC_Real_si <0xe3>;
-defm V_CMP_GT_U64 : VOPC_Real_si <0xe4>;
-defm V_CMP_NE_U64 : VOPC_Real_si <0xe5>;
-defm V_CMP_GE_U64 : VOPC_Real_si <0xe6>;
-defm V_CMP_T_U64 : VOPC_Real_si <0xe7>;
-
-defm V_CMPX_F_U64 : VOPC_Real_si <0xf0>;
-defm V_CMPX_LT_U64 : VOPC_Real_si <0xf1>;
-defm V_CMPX_EQ_U64 : VOPC_Real_si <0xf2>;
-defm V_CMPX_LE_U64 : VOPC_Real_si <0xf3>;
-defm V_CMPX_GT_U64 : VOPC_Real_si <0xf4>;
-defm V_CMPX_NE_U64 : VOPC_Real_si <0xf5>;
-defm V_CMPX_GE_U64 : VOPC_Real_si <0xf6>;
-defm V_CMPX_T_U64 : VOPC_Real_si <0xf7>;
-
-defm V_CMP_CLASS_F32 : VOPC_Real_si <0x88>;
-defm V_CMPX_CLASS_F32 : VOPC_Real_si <0x98>;
-defm V_CMP_CLASS_F64 : VOPC_Real_si <0xa8>;
-defm V_CMPX_CLASS_F64 : VOPC_Real_si <0xb8>;
+//===----------------------------------------------------------------------===//
+// GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6GFX7 in {
+  multiclass VOPC_Real_gfx6_gfx7<bits<9> op> {
+    let DecoderNamespace = "GFX6GFX7" in {
+      def _e32_gfx6_gfx7 :
+        VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+        VOPCe<op{7-0}>;
+      def _e64_gfx6_gfx7 :
+        VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+        VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+        // Encoding used for VOPC instructions encoded as VOP3 differs from
+        // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+        bits<8> sdst;
+        let Inst{7-0} = sdst;
+      }
+    } // End DecoderNamespace = "GFX6GFX7"
+
+    defm : VOPCInstAliases<NAME, "gfx6_gfx7">;
+  }
+} // End AssemblerPredicate = isGFX6GFX7
+
+multiclass VOPC_Real_gfx6_gfx7_gfx10<bits<9> op> :
+  VOPC_Real_gfx6_gfx7<op>, VOPC_Real_gfx10<op>;
+
+multiclass VOPCX_Real_gfx6_gfx7<bits<9> op> :
+  VOPC_Real_gfx6_gfx7<op>;
+
+multiclass VOPCX_Real_gfx6_gfx7_gfx10<bits<9> op> :
+  VOPC_Real_gfx6_gfx7<op>, VOPCX_Real_gfx10<op>;
+
+defm V_CMP_F_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x000>;
+defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>;
+defm V_CMP_EQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x002>;
+defm V_CMP_LE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x003>;
+defm V_CMP_GT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x004>;
+defm V_CMP_LG_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x005>;
+defm V_CMP_GE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x006>;
+defm V_CMP_O_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x007>;
+defm V_CMP_U_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x008>;
+defm V_CMP_NGE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x009>;
+defm V_CMP_NLG_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00a>;
+defm V_CMP_NGT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00b>;
+defm V_CMP_NLE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00c>;
+defm V_CMP_NEQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00d>;
+defm V_CMP_NLT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00e>;
+defm V_CMP_TRU_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00f>;
+defm V_CMPX_F_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x010>;
+defm V_CMPX_LT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x011>;
+defm V_CMPX_EQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x012>;
+defm V_CMPX_LE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x013>;
+defm V_CMPX_GT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x014>;
+defm V_CMPX_LG_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x015>;
+defm V_CMPX_GE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x016>;
+defm V_CMPX_O_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x017>;
+defm V_CMPX_U_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x018>;
+defm V_CMPX_NGE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x019>;
+defm V_CMPX_NLG_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01a>;
+defm V_CMPX_NGT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01b>;
+defm V_CMPX_NLE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01c>;
+defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01d>;
+defm V_CMPX_NLT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01e>;
+defm V_CMPX_TRU_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01f>;
+defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x020>;
+defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x021>;
+defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x022>;
+defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x023>;
+defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x024>;
+defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x025>;
+defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x026>;
+defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x027>;
+defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x028>;
+defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x029>;
+defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02a>;
+defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02b>;
+defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02c>;
+defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02d>;
+defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02e>;
+defm V_CMP_TRU_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02f>;
+defm V_CMPX_F_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x030>;
+defm V_CMPX_LT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x031>;
+defm V_CMPX_EQ_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x032>;
+defm V_CMPX_LE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x033>;
+defm V_CMPX_GT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x034>;
+defm V_CMPX_LG_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x035>;
+defm V_CMPX_GE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x036>;
+defm V_CMPX_O_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x037>;
+defm V_CMPX_U_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x038>;
+defm V_CMPX_NGE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x039>;
+defm V_CMPX_NLG_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03a>;
+defm V_CMPX_NGT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03b>;
+defm V_CMPX_NLE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03c>;
+defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03d>;
+defm V_CMPX_NLT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03e>;
+defm V_CMPX_TRU_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03f>;
+defm V_CMPS_F_F32 : VOPC_Real_gfx6_gfx7<0x040>;
+defm V_CMPS_LT_F32 : VOPC_Real_gfx6_gfx7<0x041>;
+defm V_CMPS_EQ_F32 : VOPC_Real_gfx6_gfx7<0x042>;
+defm V_CMPS_LE_F32 : VOPC_Real_gfx6_gfx7<0x043>;
+defm V_CMPS_GT_F32 : VOPC_Real_gfx6_gfx7<0x044>;
+defm V_CMPS_LG_F32 : VOPC_Real_gfx6_gfx7<0x045>;
+defm V_CMPS_GE_F32 : VOPC_Real_gfx6_gfx7<0x046>;
+defm V_CMPS_O_F32 : VOPC_Real_gfx6_gfx7<0x047>;
+defm V_CMPS_U_F32 : VOPC_Real_gfx6_gfx7<0x048>;
+defm V_CMPS_NGE_F32 : VOPC_Real_gfx6_gfx7<0x049>;
+defm V_CMPS_NLG_F32 : VOPC_Real_gfx6_gfx7<0x04a>;
+defm V_CMPS_NGT_F32 : VOPC_Real_gfx6_gfx7<0x04b>;
+defm V_CMPS_NLE_F32 : VOPC_Real_gfx6_gfx7<0x04c>;
+defm V_CMPS_NEQ_F32 : VOPC_Real_gfx6_gfx7<0x04d>;
+defm V_CMPS_NLT_F32 : VOPC_Real_gfx6_gfx7<0x04e>;
+defm V_CMPS_TRU_F32 : VOPC_Real_gfx6_gfx7<0x04f>;
+defm V_CMPSX_F_F32 : VOPCX_Real_gfx6_gfx7<0x050>;
+defm V_CMPSX_LT_F32 : VOPCX_Real_gfx6_gfx7<0x051>;
+defm V_CMPSX_EQ_F32 : VOPCX_Real_gfx6_gfx7<0x052>;
+defm V_CMPSX_LE_F32 : VOPCX_Real_gfx6_gfx7<0x053>;
+defm V_CMPSX_GT_F32 : VOPCX_Real_gfx6_gfx7<0x054>;
+defm V_CMPSX_LG_F32 : VOPCX_Real_gfx6_gfx7<0x055>;
+defm V_CMPSX_GE_F32 : VOPCX_Real_gfx6_gfx7<0x056>;
+defm V_CMPSX_O_F32 : VOPCX_Real_gfx6_gfx7<0x057>;
+defm V_CMPSX_U_F32 : VOPCX_Real_gfx6_gfx7<0x058>;
+defm V_CMPSX_NGE_F32 : VOPCX_Real_gfx6_gfx7<0x059>;
+defm V_CMPSX_NLG_F32 : VOPCX_Real_gfx6_gfx7<0x05a>;
+defm V_CMPSX_NGT_F32 : VOPCX_Real_gfx6_gfx7<0x05b>;
+defm V_CMPSX_NLE_F32 : VOPCX_Real_gfx6_gfx7<0x05c>;
+defm V_CMPSX_NEQ_F32 : VOPCX_Real_gfx6_gfx7<0x05d>;
+defm V_CMPSX_NLT_F32 : VOPCX_Real_gfx6_gfx7<0x05e>;
+defm V_CMPSX_TRU_F32 : VOPCX_Real_gfx6_gfx7<0x05f>;
+defm V_CMPS_F_F64 : VOPC_Real_gfx6_gfx7<0x060>;
+defm V_CMPS_LT_F64 : VOPC_Real_gfx6_gfx7<0x061>;
+defm V_CMPS_EQ_F64 : VOPC_Real_gfx6_gfx7<0x062>;
+defm V_CMPS_LE_F64 : VOPC_Real_gfx6_gfx7<0x063>;
+defm V_CMPS_GT_F64 : VOPC_Real_gfx6_gfx7<0x064>;
+defm V_CMPS_LG_F64 : VOPC_Real_gfx6_gfx7<0x065>;
+defm V_CMPS_GE_F64 : VOPC_Real_gfx6_gfx7<0x066>;
+defm V_CMPS_O_F64 : VOPC_Real_gfx6_gfx7<0x067>;
+defm V_CMPS_U_F64 : VOPC_Real_gfx6_gfx7<0x068>;
+defm V_CMPS_NGE_F64 : VOPC_Real_gfx6_gfx7<0x069>;
+defm V_CMPS_NLG_F64 : VOPC_Real_gfx6_gfx7<0x06a>;
+defm V_CMPS_NGT_F64 : VOPC_Real_gfx6_gfx7<0x06b>;
+defm V_CMPS_NLE_F64 : VOPC_Real_gfx6_gfx7<0x06c>;
+defm V_CMPS_NEQ_F64 : VOPC_Real_gfx6_gfx7<0x06d>;
+defm V_CMPS_NLT_F64 : VOPC_Real_gfx6_gfx7<0x06e>;
+defm V_CMPS_TRU_F64 : VOPC_Real_gfx6_gfx7<0x06f>;
+defm V_CMPSX_F_F64 : VOPCX_Real_gfx6_gfx7<0x070>;
+defm V_CMPSX_LT_F64 : VOPCX_Real_gfx6_gfx7<0x071>;
+defm V_CMPSX_EQ_F64 : VOPCX_Real_gfx6_gfx7<0x072>;
+defm V_CMPSX_LE_F64 : VOPCX_Real_gfx6_gfx7<0x073>;
+defm V_CMPSX_GT_F64 : VOPCX_Real_gfx6_gfx7<0x074>;
+defm V_CMPSX_LG_F64 : VOPCX_Real_gfx6_gfx7<0x075>;
+defm V_CMPSX_GE_F64 : VOPCX_Real_gfx6_gfx7<0x076>;
+defm V_CMPSX_O_F64 : VOPCX_Real_gfx6_gfx7<0x077>;
+defm V_CMPSX_U_F64 : VOPCX_Real_gfx6_gfx7<0x078>;
+defm V_CMPSX_NGE_F64 : VOPCX_Real_gfx6_gfx7<0x079>;
+defm V_CMPSX_NLG_F64 : VOPCX_Real_gfx6_gfx7<0x07a>;
+defm V_CMPSX_NGT_F64 : VOPCX_Real_gfx6_gfx7<0x07b>;
+defm V_CMPSX_NLE_F64 : VOPCX_Real_gfx6_gfx7<0x07c>;
+defm V_CMPSX_NEQ_F64 : VOPCX_Real_gfx6_gfx7<0x07d>;
+defm V_CMPSX_NLT_F64 : VOPCX_Real_gfx6_gfx7<0x07e>;
+defm V_CMPSX_TRU_F64 : VOPCX_Real_gfx6_gfx7<0x07f>;
+defm V_CMP_F_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x080>;
+defm V_CMP_LT_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x081>;
+defm V_CMP_EQ_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x082>;
+defm V_CMP_LE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x083>;
+defm V_CMP_GT_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x084>;
+defm V_CMP_NE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x085>;
+defm V_CMP_GE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x086>;
+defm V_CMP_T_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x087>;
+defm V_CMP_CLASS_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x088>;
+defm V_CMPX_F_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x090>;
+defm V_CMPX_LT_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x091>;
+defm V_CMPX_EQ_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x092>;
+defm V_CMPX_LE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x093>;
+defm V_CMPX_GT_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x094>;
+defm V_CMPX_NE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x095>;
+defm V_CMPX_GE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x096>;
+defm V_CMPX_T_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x097>;
+defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x098>;
+defm V_CMP_F_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a0>;
+defm V_CMP_LT_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a1>;
+defm V_CMP_EQ_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a2>;
+defm V_CMP_LE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a3>;
+defm V_CMP_GT_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a4>;
+defm V_CMP_NE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a5>;
+defm V_CMP_GE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a6>;
+defm V_CMP_T_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a7>;
+defm V_CMP_CLASS_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a8>;
+defm V_CMPX_F_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b0>;
+defm V_CMPX_LT_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b1>;
+defm V_CMPX_EQ_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b2>;
+defm V_CMPX_LE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b3>;
+defm V_CMPX_GT_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b4>;
+defm V_CMPX_NE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b5>;
+defm V_CMPX_GE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b6>;
+defm V_CMPX_T_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b7>;
+defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b8>;
+defm V_CMP_F_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c0>;
+defm V_CMP_LT_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c1>;
+defm V_CMP_EQ_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c2>;
+defm V_CMP_LE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c3>;
+defm V_CMP_GT_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c4>;
+defm V_CMP_NE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c5>;
+defm V_CMP_GE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c6>;
+defm V_CMP_T_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c7>;
+defm V_CMPX_F_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d0>;
+defm V_CMPX_LT_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d1>;
+defm V_CMPX_EQ_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d2>;
+defm V_CMPX_LE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d3>;
+defm V_CMPX_GT_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d4>;
+defm V_CMPX_NE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d5>;
+defm V_CMPX_GE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d6>;
+defm V_CMPX_T_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d7>;
+defm V_CMP_F_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e0>;
+defm V_CMP_LT_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e1>;
+defm V_CMP_EQ_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e2>;
+defm V_CMP_LE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e3>;
+defm V_CMP_GT_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e4>;
+defm V_CMP_NE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e5>;
+defm V_CMP_GE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e6>;
+defm V_CMP_T_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e7>;
+defm V_CMPX_F_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f0>;
+defm V_CMPX_LT_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f1>;
+defm V_CMPX_EQ_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f2>;
+defm V_CMPX_LE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f3>;
+defm V_CMPX_GT_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f4>;
+defm V_CMPX_NE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f5>;
+defm V_CMPX_GE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f6>;
+defm V_CMPX_T_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f7>;
 
 //===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
 //===----------------------------------------------------------------------===//
 
 multiclass VOPC_Real_vi <bits<10> op> {
-  let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+  let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
     def _e32_vi :
       VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
       VOPCe<op{7-0}>;
@@ -966,9 +1231,8 @@ multiclass VOPC_Real_vi <bits<10> op> {
     VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
-  def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
-                       !cast<Instruction>(NAME#"_e32_vi")> {
-    let AssemblerPredicate = isVI;
+  let AssemblerPredicate = isGFX8GFX9 in {
+    defm : VOPCInstAliases<NAME, "vi">;
   }
 }
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 7de7d90d27b3..677095a354be 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -1,9 +1,8 @@
 //===-- VOPInstructions.td - Vector Instruction Defintions ----------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -91,6 +90,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
   let VOP3_OPSEL = isVop3OpSel;
   let IsPacked = P.IsPacked;
+  let IsMAI = P.IsMAI;
 
   let AsmOperands = !if(isVop3OpSel,
                         P.AsmVOP3OpSel,
@@ -100,7 +100,6 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let SubtargetPredicate = isGCN;
 
   // Because SGPRs may be allowed if there are multiple operands, we
   // need a post-isel hook to insert copies in order to avoid
@@ -190,9 +189,15 @@ class VOP3a <VOPProfile P> : Enc64 {
   let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
 }
 
-class VOP3a_si <bits<9> op, VOPProfile P> : VOP3a<P> {
+class VOP3a_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a<p> {
+  let Inst{11} = !if(p.HasClamp, clamp{0}, 0);
   let Inst{25-17} = op;
-  let Inst{11} = !if(P.HasClamp, clamp{0}, 0);
+}
+
+class VOP3a_gfx10<bits<10> op, VOPProfile p> : VOP3a<p> {
+  let Inst{15} = !if(p.HasClamp, clamp{0}, 0);
+  let Inst{25-16} = op;
+  let Inst{31-26} = 0x35;
 }
 
 class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
@@ -200,9 +205,14 @@ class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
   let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
 }
 
-class VOP3e_si <bits<9> op, VOPProfile P> : VOP3a_si <op, P> {
+class VOP3e_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a_gfx6_gfx7<op, p> {
   bits<8> vdst;
-  let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0);
+}
+
+class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> {
+  bits<8> vdst;
+  let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0);
 }
 
 class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
@@ -217,6 +227,13 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
   let Inst{14} = !if(P.HasDst, src0_modifiers{3}, 0);
 }
 
+class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+  let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0);
+  let Inst{12} = !if(p.HasSrc1, src1_modifiers{2}, 0);
+  let Inst{13} = !if(p.HasSrc2, src2_modifiers{2}, 0);
+  let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0);
+}
+
 // NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
 class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
   bits<2> attrchan;
@@ -236,6 +253,21 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
   let Inst{49-41} = src0;
 }
 
+class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+  bits<6> attr;
+  bits<2> attrchan;
+  bits<1> high;
+
+  let Inst{8} = 0;
+  let Inst{9} = !if(p.HasSrc0Mods, src0_modifiers{1}, 0);
+  let Inst{37-32} = attr;
+  let Inst{39-38} = attrchan;
+  let Inst{40} = !if(p.HasHigh, high, 0);
+  let Inst{49-41} = src0;
+  let Inst{61} = 0;
+  let Inst{62} = !if(p.HasSrc0Mods, src0_modifiers{0}, 0);
+}
+
 class VOP3be <VOPProfile P> : Enc64 {
   bits<8> vdst;
   bits<2> src0_modifiers;
@@ -295,10 +327,51 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
   let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
 }
 
-class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> {
+class VOP3Pe_MAI <bits<10> op, VOPProfile P> : Enc64 {
+  bits<8> vdst;
+  bits<10> src0;
+  bits<10> src1;
+  bits<9> src2;
+  bits<3> blgp;
+  bits<3> cbsz;
+  bits<4> abid;
+  bits<1> clamp;
+
+  let Inst{7-0} = vdst;
+
+  let Inst{10-8} = !if(P.HasSrc1, cbsz, 0);
+  let Inst{14-11} = !if(P.HasSrc1, abid, 0);
+
+  let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+
+  let Inst{25-16} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0);
+  let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
+  let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+
+  let Inst{59} = !if(P.HasSrc0, src0{9}, 0); // acc(0)
+  let Inst{60} = !if(P.HasSrc1, src1{9}, 0); // acc(1)
+
+  let Inst{63-61} = !if(P.HasSrc1, blgp, 0);
+}
+
+
+class VOP3Pe_gfx10 <bits<10> op, VOPProfile P> : VOP3Pe<op, P> {
+  let Inst{31-26} = 0x33; //encoding
+}
+
+class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> {
   let Inst{25-17} = op;
 }
 
+class VOP3be_gfx10<bits<10> op, VOPProfile p> : VOP3be<p> {
+  bits<1> clamp;
+  let Inst{15} = !if(p.HasClamp, clamp{0}, 0);
+  let Inst{25-16} = op;
+  let Inst{31-26} = 0x35;
+}
+
 class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> {
   bits<1> clamp;
   let Inst{25-16} = op;
@@ -393,7 +466,7 @@ class VOP_SDWA9Ae <VOPProfile P> : VOP_SDWA9e<P> {
 class VOP_SDWA9Be <VOPProfile P> : VOP_SDWA9e<P> {
   bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}}
 
-  let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0);
+  let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, ?);
   let Inst{47} = !if(P.EmitDst, sdst{7}, 0);
 }
 
@@ -456,9 +529,8 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
   let TSFlags = ps.TSFlags;
 }
 
-class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
-  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>,
-  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> {
+class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []> {
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
@@ -485,7 +557,20 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
   let TSFlags = ps.TSFlags;
 }
 
-class VOP_DPPe <VOPProfile P> : Enc64 {
+class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
+  Base_VOP_SDWA9_Real <ps>,
+  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9>;
+
+class Base_VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> : Base_VOP_SDWA9_Real<ps> {
+  let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
+  let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
+  let DecoderNamespace = "SDWA10";
+}
+
+class VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> :
+  Base_VOP_SDWA10_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SDWA10>;
+
+class VOP_DPPe<VOPProfile P, bit IsDPP16 = 0> : Enc64 {
   bits<2> src0_modifiers;
   bits<8> src0;
   bits<2> src1_modifiers;
@@ -493,9 +578,11 @@ class VOP_DPPe <VOPProfile P> : Enc64 {
   bits<1> bound_ctrl;
   bits<4> bank_mask;
   bits<4> row_mask;
+  bit fi;
 
   let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
   let Inst{48-40} = dpp_ctrl;
+  let Inst{50} = !if(IsDPP16, fi, ?);
   let Inst{51} = bound_ctrl;
   let Inst{52} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg
   let Inst{53} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs
@@ -533,8 +620,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
   let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
                                         AMDGPUAsmVariants.Disable);
-  let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
-  let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
+  let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+  let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
   let DecoderNamespace = "DPP";
 
   VOPProfile Pfl = P;
@@ -568,6 +655,67 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
   let TSFlags = ps.TSFlags;
 }
 
+class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
+               dag InsDPP = !if(IsDPP16, P.InsDPP16, P.InsDPP),
+               string AsmDPP = !if(IsDPP16, P.AsmDPP16, P.AsmDPP)> :
+  InstSI <P.OutsDPP, InsDPP, OpName#AsmDPP, []>,
+  VOP_DPPe<P, IsDPP16> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+
+  let VALU = 1;
+  let DPP = 1;
+  let Size = 8;
+
+  let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
+  let SubtargetPredicate = HasDPP;
+  let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+  let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
+                                        AMDGPUAsmVariants.Disable);
+  let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+  let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
+  let DecoderNamespace = "DPP";
+}
+
+class VOP_DPP8e<VOPProfile P> : Enc64 {
+  bits<8> src0;
+  bits<24> dpp8;
+  bits<9> fi;
+
+  let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{63-40} = dpp8{23-0};
+}
+
+class VOP_DPP8<string OpName, VOPProfile P> :
+  InstSI<P.OutsDPP8, P.InsDPP8, OpName#P.AsmDPP8, []>,
+  VOP_DPP8e<P> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+
+  let VALU = 1;
+  let DPP = 1;
+  let Size = 8;
+
+  let AsmMatchConverter = "cvtDPP8";
+  let SubtargetPredicate = HasDPP8;
+  let AssemblerPredicate = !if(P.HasExt, HasDPP8, DisableInst);
+  let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
+                                     AMDGPUAsmVariants.Disable);
+  let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+  let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
+}
+
+def DPP8Mode {
+  int FI_0 = 0xE9;
+  int FI_1 = 0xEA;
+}
+
 class getNumNodeArgs<SDPatternOperator Op> {
   SDNode N = !cast<SDNode>(Op);
   SDTypeProfile TP = N.TypeProfile;
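[Editorial illustration; not part of the upstream patch.] Two of the encodings above are easier to follow with concrete numbers: VOP3a_gfx10 places clamp at bit 15, the 10-bit opcode at Inst{25-16}, and the fixed major opcode 0x35 at Inst{31-26}; VOP_DPP8e packs eight 3-bit lane selectors into dpp8{23-0} at Inst{63-40}. A minimal C++ sketch under those assumptions (helper names invented):

#include <cstdint>

// Low dword of a gfx10 VOP3a word, per the VOP3a_gfx10 field layout above.
static uint32_t vop3aGfx10Low(uint32_t op10, bool clamp) {
  uint32_t w = 0;
  w |= (clamp ? 1u : 0u) << 15; // Inst{15} = clamp
  w |= (op10 & 0x3FFu) << 16;   // Inst{25-16} = op
  w |= 0x35u << 26;             // Inst{31-26} = 0x35
  return w;                     // e.g. op10 = 1, clamp = 1 -> 0xd4018000
}

// 24-bit DPP8 selector: lane i reads from lane sel[i] (3 bits per lane),
// mirroring "let Inst{63-40} = dpp8{23-0}" in VOP_DPP8e.
static uint32_t packDpp8(const uint8_t sel[8]) {
  uint32_t dpp8 = 0;
  for (int i = 0; i < 8; ++i)
    dpp8 |= static_cast<uint32_t>(sel[i] & 7u) << (3 * i);
  return dpp8;
}

Reading DPP8Mode's FI_0/FI_1 (0xE9/0xEA) as the operand encodings that distinguish the plain and fetch-inactive dpp8 forms is an assumption of this note, not something the patch states.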
diff --git a/lib/Target/ARC/ARC.h b/lib/Target/ARC/ARC.h
index 65f6ed67eb5b..cbbf0233706d 100644
--- a/lib/Target/ARC/ARC.h
+++ b/lib/Target/ARC/ARC.h
@@ -1,9 +1,8 @@
 //===- ARC.h - Top-level interface for ARC representation -------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -26,6 +25,7 @@ class ARCTargetMachine;
 FunctionPass *createARCISelDag(ARCTargetMachine &TM,
                                CodeGenOpt::Level OptLevel);
 FunctionPass *createARCExpandPseudosPass();
+FunctionPass *createARCOptAddrMode();
 FunctionPass *createARCBranchFinalizePass();
 
 } // end namespace llvm
diff --git a/lib/Target/ARC/ARC.td b/lib/Target/ARC/ARC.td
index 6635630c62a3..846f1bb6735e 100644
--- a/lib/Target/ARC/ARC.td
+++ b/lib/Target/ARC/ARC.td
@@ -1,9 +1,8 @@
 //===- ARC.td - Describe the ARC Target Machine ------------*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/ARCAsmPrinter.cpp b/lib/Target/ARC/ARCAsmPrinter.cpp
index 8c13da0484fd..5c3e2c9e773c 100644
--- a/lib/Target/ARC/ARCAsmPrinter.cpp
+++ b/lib/Target/ARC/ARCAsmPrinter.cpp
@@ -1,9 +1,8 @@
 //===- ARCAsmPrinter.cpp - ARC LLVM assembly writer -------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -13,28 +12,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARC.h"
-#include "ARCInstrInfo.h"
 #include "ARCMCInstLower.h"
 #include "ARCSubtarget.h"
 #include "ARCTargetMachine.h"
-#include "ARCTargetStreamer.h"
-#include "InstPrinter/ARCInstPrinter.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
+#include "MCTargetDesc/ARCInstPrinter.h"
+#include "TargetInfo/ARCTargetInfo.h"
 #include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include <algorithm>
 
 using namespace llvm;
 
@@ -44,7 +33,6 @@ namespace {
 class ARCAsmPrinter : public AsmPrinter {
   ARCMCInstLower MCInstLowering;
-  ARCTargetStreamer &getTargetStreamer();
 
 public:
   explicit ARCAsmPrinter(TargetMachine &TM,
@@ -58,10 +46,6 @@ public:
 
 } // end anonymous namespace
 
-ARCTargetStreamer &ARCAsmPrinter::getTargetStreamer() {
-  return static_cast<ARCTargetStreamer &>(*OutStreamer->getTargetStreamer());
-}
-
 void ARCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   SmallString<128> Str;
   raw_svector_ostream O(Str);
diff --git a/lib/Target/ARC/ARCExpandPseudos.cpp b/lib/Target/ARC/ARCExpandPseudos.cpp index 3177735c0529..a1646d17605f 100644 --- a/lib/Target/ARC/ARCExpandPseudos.cpp +++ b/lib/Target/ARC/ARCExpandPseudos.cpp @@ -1,9 +1,8 @@ //===- ARCExpandPseudosPass - ARC expand pseudo loads -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/ARCFrameLowering.cpp b/lib/Target/ARC/ARCFrameLowering.cpp index ca59cb2baaa7..d8946d97deff 100644 --- a/lib/Target/ARC/ARCFrameLowering.cpp +++ b/lib/Target/ARC/ARCFrameLowering.cpp @@ -1,9 +1,8 @@ //===- ARCFrameLowering.cpp - ARC Frame Information -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -65,6 +64,8 @@ static void generateStackAdjustment(MachineBasicBlock &MBB, assert((AbsAmount % 4 == 0) && "Stack adjustments must be 4-byte aligned."); if (isUInt<6>(AbsAmount)) AdjOp = Positive ? ARC::ADD_rru6 : ARC::SUB_rru6; + else if (isInt<12>(AbsAmount)) + AdjOp = Positive ? ARC::ADD_rrs12 : ARC::SUB_rrs12; else AdjOp = Positive ? ARC::ADD_rrlimm : ARC::SUB_rrlimm; @@ -134,8 +135,12 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF, // Add in the varargs area here first. LLVM_DEBUG(dbgs() << "Varargs\n"); unsigned VarArgsBytes = MFI.getObjectSize(AFI->getVarArgsFrameIndex()); - BuildMI(MBB, MBBI, dl, TII->get(ARC::SUB_rru6)) - .addReg(ARC::SP) + unsigned Opc = ARC::SUB_rrlimm; + if (isUInt<6>(VarArgsBytes)) + Opc = ARC::SUB_rru6; + else if (isInt<12>(VarArgsBytes)) + Opc = ARC::SUB_rrs12; + BuildMI(MBB, MBBI, dl, TII->get(Opc), ARC::SP) .addReg(ARC::SP) .addImm(VarArgsBytes); } @@ -247,7 +252,10 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF, // Then, replace the frame pointer by (new) [sp,StackSize-4]. // Then, move the stack pointer the rest of the way (sp = sp + StackSize). if (hasFP(MF)) { - BuildMI(MBB, MBBI, DebugLoc(), TII->get(ARC::SUB_rru6), ARC::SP) + unsigned Opc = ARC::SUB_rrlimm; + if (isUInt<6>(StackSize)) + Opc = ARC::SUB_rru6; + BuildMI(MBB, MBBI, DebugLoc(), TII->get(Opc), ARC::SP) .addReg(ARC::FP) .addImm(StackSize); AmountAboveFunclet += 4; @@ -271,19 +279,28 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF, } // Move the stack pointer up to the point of the funclet. 
- if (StackSize - AmountAboveFunclet) { - BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::ADD_rru6)) - .addReg(ARC::SP) + if (unsigned MoveAmount = StackSize - AmountAboveFunclet) { + unsigned Opc = ARC::ADD_rrlimm; + if (isUInt<6>(MoveAmount)) + Opc = ARC::ADD_rru6; + else if (isInt<12>(MoveAmount)) + Opc = ARC::ADD_rrs12; + BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(Opc), ARC::SP) .addReg(ARC::SP) .addImm(StackSize - AmountAboveFunclet); } if (StackSlotsUsedByFunclet) { + // This part of the adjustment will always be < 64 bytes. BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::BL)) .addExternalSymbol(load_funclet_name[Last - ARC::R15]) .addReg(ARC::BLINK, RegState::Implicit | RegState::Kill); - BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::ADD_rru6)) - .addReg(ARC::SP) + unsigned Opc = ARC::ADD_rrlimm; + if (isUInt<6>(4 * StackSlotsUsedByFunclet)) + Opc = ARC::ADD_rru6; + else if (isInt<12>(4 * StackSlotsUsedByFunclet)) + Opc = ARC::ADD_rrs12; + BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(Opc), ARC::SP) .addReg(ARC::SP) .addImm(4 * (StackSlotsUsedByFunclet)); } @@ -294,8 +311,8 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF, // Now, pop fp if necessary. if (hasFP(MF)) { BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::LD_AB_rs9)) - .addReg(ARC::SP, RegState::Define) .addReg(ARC::FP, RegState::Define) + .addReg(ARC::SP, RegState::Define) .addReg(ARC::SP) .addImm(4); } @@ -305,7 +322,12 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF, // Add in the varargs area here first. LLVM_DEBUG(dbgs() << "Varargs\n"); unsigned VarArgsBytes = MFI.getObjectSize(AFI->getVarArgsFrameIndex()); - BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::ADD_rru6)) + unsigned Opc = ARC::ADD_rrlimm; + if (isUInt<6>(VarArgsBytes)) + Opc = ARC::ADD_rru6; + else if (isInt<12>(VarArgsBytes)) + Opc = ARC::ADD_rrs12; + BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(Opc)) .addReg(ARC::SP) .addReg(ARC::SP) .addImm(VarArgsBytes); @@ -431,7 +453,14 @@ static void emitRegUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned Reg, int NumBytes, bool IsAdd, const ARCInstrInfo *TII) { - unsigned Opc = IsAdd ? ARC::ADD_rru6 : ARC::SUB_rru6; + unsigned Opc; + if (isUInt<6>(NumBytes)) + Opc = IsAdd ? ARC::ADD_rru6 : ARC::SUB_rru6; + else if (isInt<12>(NumBytes)) + Opc = IsAdd ? ARC::ADD_rrs12 : ARC::SUB_rrs12; + else + Opc = IsAdd ? ARC::ADD_rrlimm : ARC::SUB_rrlimm; + BuildMI(MBB, MBBI, dl, TII->get(Opc), Reg) .addReg(Reg, RegState::Kill) .addImm(NumBytes); diff --git a/lib/Target/ARC/ARCFrameLowering.h b/lib/Target/ARC/ARCFrameLowering.h index c042bec016ca..41b559d16761 100644 --- a/lib/Target/ARC/ARCFrameLowering.h +++ b/lib/Target/ARC/ARCFrameLowering.h @@ -1,9 +1,8 @@ //===- ARCFrameLowering.h - Define frame lowering for ARC -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/ARCISelDAGToDAG.cpp b/lib/Target/ARC/ARCISelDAGToDAG.cpp index 8dbd3d5bf036..f639c4e6f0ff 100644 --- a/lib/Target/ARC/ARCISelDAGToDAG.cpp +++ b/lib/Target/ARC/ARCISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===- ARCISelDAGToDAG.cpp - ARC dag to dag inst selector -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/ARCISelLowering.cpp b/lib/Target/ARC/ARCISelLowering.cpp index bf98af801406..847d23f0abdb 100644 --- a/lib/Target/ARC/ARCISelLowering.cpp +++ b/lib/Target/ARC/ARCISelLowering.cpp @@ -1,9 +1,8 @@ //===- ARCISelLowering.cpp - ARC DAG Lowering Impl --------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/ARCISelLowering.h b/lib/Target/ARC/ARCISelLowering.h index fec01b13a866..4b72bfdaee9c 100644 --- a/lib/Target/ARC/ARCISelLowering.h +++ b/lib/Target/ARC/ARCISelLowering.h @@ -1,9 +1,8 @@ //===- ARCISelLowering.h - ARC DAG Lowering Interface -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/ARCInstrFormats.td b/lib/Target/ARC/ARCInstrFormats.td index 0a49b83ef16a..e4902a73ed49 100644 --- a/lib/Target/ARC/ARCInstrFormats.td +++ b/lib/Target/ARC/ARCInstrFormats.td @@ -1,9 +1,8 @@ //===- ARCInstrFormats.td - ARC Instruction Formats --------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -56,6 +55,44 @@ def GPR32Reduced : Operand<i32> {
   let DecoderMethod = "DecodeGBR32ShortRegister";
 }
 
+// Helper classes for load/store instructions
+class DataSizeMode<bits<2> mode, string instSfx, string asmSfx> {
+  bits<2> Value = mode;
+  string InstSuffix = instSfx;
+  string AsmSuffix = asmSfx;
+}
+
+class ExtMode<bit mode, string instSfx, string asmSfx> {
+  bit Value = mode;
+  string InstSuffix = instSfx;
+  string AsmSuffix = asmSfx;
+}
+
+class AddrMode<bits<2> mode, string instSfx, string asmSfx> {
+  bits<2> Value = mode;
+  string InstSuffix = instSfx;
+  string AsmSuffix = asmSfx;
+}
+
+class CacheMode<bit mode, string instSfx, string asmSfx> {
+  bit Value = mode;
+  string InstSuffix = instSfx;
+  string AsmSuffix = asmSfx;
+}
+
+def ByteSM : DataSizeMode<0b01, "B", "b">;
+def HalfSM : DataSizeMode<0b10, "H", "h">;
+def WordSM : DataSizeMode<0b00, "", "">;
+
+def NoEM : ExtMode<0, "", "">;
+def SignedEM : ExtMode<1, "_X", ".x">;
+
+def NoAM : AddrMode<0b00, "", "">;
+def PreIncAM : AddrMode<0b01, "_AW", ".aw">;
+def PostIncAM : AddrMode<0b10, "_AB", ".ab">;
+
+def NoCC : CacheMode<0b0, "", "">;
+def UncachedCC : CacheMode<0b1, "_DI", ".di">;
+
 class InstARC<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
     : Instruction, Encoding64 {
@@ -65,6 +102,18 @@ class InstARC<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
   let AsmString = asmstr;
   let Pattern = pattern;
   let Size = sz;
+
+  // Load/Store instruction properties
+  DataSizeMode ZZ = WordSM;
+  ExtMode X = NoEM;
+  AddrMode AA = NoAM;
+  CacheMode DI = NoCC;
+
+  // Field used for relation models
+  string BaseOpcode = "";
+
+  //TSFlags
+  let TSFlags{1-0} = AA.Value;
 }
 
 // ARC pseudo instructions format
@@ -355,6 +404,8 @@ class F32_LD_RS9<bit x, bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
   let Inst{8-7} = zz;
   let Inst{6} = x;
   let Inst{5-0} = A;
+
+  let BaseOpcode = "ld_rs9";
 }
 
 class F32_LD_ADDR<bit x, bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
@@ -364,6 +415,8 @@ class F32_LD_ADDR<bit x, bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
 
   let B = addr{14-9};
   let S9 = addr{8-0};
+
+  let BaseOpcode = "ld_rs9";
 }
 
@@ -388,6 +441,8 @@ class F32_LD_LIMM<bit x, bit di, bits<2> zz, dag outs, dag ins,
   let Inst{6} = x;
   let Inst{5-0} = A;
   let DecoderMethod = "DecodeLdLImmInstruction";
+
+  let BaseOpcode = "ld_limm";
 }
 
 // Register + LImm load. The 32-bit immediate address is in Inst[63-32].
@@ -416,6 +471,8 @@ class F32_LD_RLIMM<bit x, bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
   let Inst{11-6} = LImmReg;
   let Inst{5-0} = A;
   let DecoderMethod = "DecodeLdRLImmInstruction";
+
+  let BaseOpcode = "ld_rlimm";
 }
 
 // Register + S9 Store. (B + S9)
@@ -438,6 +495,8 @@ class F32_ST_RS9<bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
   let Inst{4-3} = aa;
   let Inst{2-1} = zz;
   let Inst{0} = 0;
+
+  let BaseOpcode = "st_rs9";
 }
 
class F32_ST_ADDR<bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
@@ -447,6 +506,8 @@ class F32_ST_ADDR<bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
 
   let B = addr{14-9};
   let S9 = addr{8-0};
+
+  let BaseOpcode = "st_rs9";
 }
 
 // LImm Store.
@@ -470,6 +531,8 @@ class F32_ST_LIMM<bit di, bits<2> zz, dag outs, dag ins,
   let Inst{2-1} = zz;
   let Inst{0} = 0;
   let DecoderMethod = "DecodeStLImmInstruction";
+
+  let BaseOpcode = "st_limm";
 }
 
 // Compact Move/Load.
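[Editorial illustration; not part of the upstream patch.] The mode classes above carry suffix pairs so that one table drives both record names (InstSuffix) and printed mnemonics (AsmSuffix). A C++ stand-in for the composition the TableGen foreach loops perform (the Mode struct is invented):

#include <cstdio>
#include <string>

struct Mode { const char *InstSuffix, *AsmSuffix; };
const Mode ByteSM = {"B", "b"}, SignedEM = {"_X", ".x"}, PostIncAM = {"_AB", ".ab"};

int main() {
  // Mirrors: defm LD # zz.InstSuffix # x.InstSuffix ... { def aa.InstSuffix#_rs9 }
  std::string rec = std::string("LD") + ByteSM.InstSuffix + SignedEM.InstSuffix +
                    PostIncAM.InstSuffix + "_rs9";
  std::string mnem = std::string("ld") + ByteSM.AsmSuffix + SignedEM.AsmSuffix +
                     PostIncAM.AsmSuffix;
  std::printf("%s prints as \"%s\"\n", rec.c_str(), mnem.c_str());
  // -> LDB_X_AB_rs9 prints as "ldb.x.ab"
}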
diff --git a/lib/Target/ARC/ARCInstrInfo.cpp b/lib/Target/ARC/ARCInstrInfo.cpp
index a8084f16893b..2a660e3c4dd1 100644
--- a/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/lib/Target/ARC/ARCInstrInfo.cpp
@@ -1,9 +1,8 @@
 //===- ARCInstrInfo.cpp - ARC Instruction Information -----------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -28,6 +27,19 @@ using namespace llvm;
 #include "ARCGenInstrInfo.inc"
 
 #define DEBUG_TYPE "arc-inst-info"
+
+enum AddrIncType {
+  NoAddInc = 0,
+  PreInc = 1,
+  PostInc = 2,
+  Scaled = 3
+};
+
+enum TSFlagsConstants {
+  TSF_AddrModeOff = 0,
+  TSF_AddModeMask = 3
+};
+
 // Pin the vtable to this file.
 void ARCInstrInfo::anchor() {}
@@ -389,10 +401,42 @@ unsigned ARCInstrInfo::insertBranch(MachineBasicBlock &MBB,
 }
 
 unsigned ARCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
-  if (MI.getOpcode() == TargetOpcode::INLINEASM) {
+  if (MI.isInlineAsm()) {
     const MachineFunction *MF = MI.getParent()->getParent();
     const char *AsmStr = MI.getOperand(0).getSymbolName();
     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
   }
   return MI.getDesc().getSize();
 }
+
+bool ARCInstrInfo::isPostIncrement(const MachineInstr &MI) const {
+  const MCInstrDesc &MID = MI.getDesc();
+  const uint64_t F = MID.TSFlags;
+  return ((F >> TSF_AddrModeOff) & TSF_AddModeMask) == PostInc;
+}
+
+bool ARCInstrInfo::isPreIncrement(const MachineInstr &MI) const {
+  const MCInstrDesc &MID = MI.getDesc();
+  const uint64_t F = MID.TSFlags;
+  return ((F >> TSF_AddrModeOff) & TSF_AddModeMask) == PreInc;
+}
+
+bool ARCInstrInfo::getBaseAndOffsetPosition(const MachineInstr &MI,
+                                            unsigned &BasePos,
+                                            unsigned &OffsetPos) const {
+  if (!MI.mayLoad() && !MI.mayStore())
+    return false;
+
+  BasePos = 1;
+  OffsetPos = 2;
+
+  if (isPostIncrement(MI) || isPreIncrement(MI)) {
+    BasePos++;
+    OffsetPos++;
+  }
+
+  if (!MI.getOperand(BasePos).isReg() || !MI.getOperand(OffsetPos).isImm())
+    return false;
+
+  return true;
+}
diff --git a/lib/Target/ARC/ARCInstrInfo.h b/lib/Target/ARC/ARCInstrInfo.h
index f965dd4ff7f8..1289b37c37b3 100644
--- a/lib/Target/ARC/ARCInstrInfo.h
+++ b/lib/Target/ARC/ARCInstrInfo.h
@@ -1,9 +1,8 @@
 //===- ARCInstrInfo.h - ARC Instruction Information -------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -82,6 +81,16 @@ public:
 
   bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool isPostIncrement(const MachineInstr &MI) const override;
+
+  // ARC-specific
+  bool isPreIncrement(const MachineInstr &MI) const;
+
+  virtual bool getBaseAndOffsetPosition(const MachineInstr &MI,
+                                        unsigned &BasePos,
+                                        unsigned &OffsetPos) const override;
+
   // Emit code before MBBI to load immediate value into physical register Reg.
   // Returns an iterator to the new instruction.
   MachineBasicBlock::iterator loadImmediate(MachineBasicBlock &MBB,
diff --git a/lib/Target/ARC/ARCInstrInfo.td b/lib/Target/ARC/ARCInstrInfo.td
index 525098c4ff66..311d998f3d86 100644
--- a/lib/Target/ARC/ARCInstrInfo.td
+++ b/lib/Target/ARC/ARCInstrInfo.td
@@ -1,9 +1,8 @@
 //===- ARCInstrInfo.td - Target Description for ARC --------*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -788,50 +787,47 @@ let isReturn = 1, isTerminator = 1 in {
 // Load/Store instructions.
 //----------------------------------------------------------------------------
 
+// Filter class for load/store mappings
+class ArcLdStRel;
+
 // Load instruction variants:
 // Control bits: x, aa, di, zz
 // x - sign extend.
 // aa - incrementing mode. (N/A for LIMM).
 // di - uncached.
 // zz - data size.
-multiclass ArcLdInst <bits<2> zz, string asmop> {
-  let mayLoad = 1 in {
-  def _rs9 : F32_LD_ADDR<0, 0b00, 0, zz,
-                         (outs GPR32:$A), (ins MEMrs9:$addr),
-                         !strconcat(asmop, "\t$A, [$addr]"), []>;
-
-  def _limm : F32_LD_LIMM<0, 0, zz,
-                          (outs GPR32:$A), (ins MEMii:$addr),
-                          !strconcat(asmop, "\t$A, [$addr]"), []>;
-
-  def _rlimm : F32_LD_RLIMM<0, 0b00, 0, zz,
-                            (outs GPR32:$A), (ins MEMrlimm:$addr),
-                            !strconcat(asmop, "\t$A, [$addr]"), []>;
-
-  def _X_rs9 : F32_LD_ADDR<1, 0b00, 0, zz,
-                           (outs GPR32:$A), (ins MEMrs9:$addr),
-                           !strconcat(asmop, ".x\t$A, [$addr]"), []>;
-
-  def _X_limm : F32_LD_LIMM<1, 0, zz,
-                            (outs GPR32:$A), (ins MEMii:$addr),
-                            !strconcat(asmop, ".x\t$A, [$addr]"), []>;
-
-  def _X_rlimm : F32_LD_RLIMM<1, 0b00, 0, zz,
-                              (outs GPR32:$A), (ins MEMrlimm:$addr),
-                              !strconcat(asmop, ".x\t$A, [$addr]"), []>;
-
-  def _AB_rs9 : F32_LD_RS9<0, 0b10, 0, zz,
-                           (outs GPR32:$addrout, GPR32:$A),
-                           (ins GPR32:$B, immS<9>:$S9),
-                           !strconcat(asmop, ".ab\t$A, [$B,$S9]"), []>
-  { let Constraints = "$addrout = $B"; }
+multiclass ArcLdInst<DataSizeMode zz, ExtMode x, CacheMode di, string asmop> {
+  let mayLoad = 1, ZZ = zz, X = x, DI = di in {
+    def _rs9: F32_LD_ADDR<x.Value, NoAM.Value, di.Value, zz.Value,
+                          (outs GPR32:$A), (ins MEMrs9:$addr),
+                          asmop#"\t$A, [$addr]", []>, ArcLdStRel;
+
+    def _limm: F32_LD_LIMM<x.Value, di.Value, zz.Value,
+                           (outs GPR32:$A), (ins MEMii:$addr),
+                           asmop#"\t$A, [$addr]", []>, ArcLdStRel;
+
+    def _rlimm: F32_LD_RLIMM<x.Value, NoAM.Value, di.Value, zz.Value,
+                             (outs GPR32:$A), (ins MEMrlimm:$addr),
+                             asmop#"\t$A, [$addr]", []>, ArcLdStRel;
+
+    foreach aa = [PreIncAM, PostIncAM] in {
+      def aa.InstSuffix#_rs9: F32_LD_RS9<x.Value, aa.Value, di.Value, zz.Value,
+                                         (outs GPR32:$addrout, GPR32:$A),
+                                         (ins GPR32:$B, immS<9>:$S9),
+                                         asmop#aa.AsmSuffix#"\t$A, [$B,$S9]", []>,
+                              ArcLdStRel
+      { let Constraints = "$addrout = $B"; let AA = aa; }
+    }
   }
 }
-
-// Load instruction definitions.
-defm LD : ArcLdInst<0b00, "ld">;
-defm LDH : ArcLdInst<0b10, "ldh">;
-defm LDB : ArcLdInst<0b01, "ldb">;
+
+foreach di = [NoCC, UncachedCC] in {
+  defm LD#di.InstSuffix : ArcLdInst<WordSM, NoEM, di, "ld"#di.AsmSuffix>;
+  foreach zz = [ByteSM, HalfSM] in {
+    foreach x = [NoEM, SignedEM] in {
+      defm LD#zz.InstSuffix#x.InstSuffix#di.InstSuffix :
+        ArcLdInst<zz, x, di, "ld"#zz.AsmSuffix#x.AsmSuffix#di.AsmSuffix>;
+    }
+  }
+}
 
 // Load instruction patterns.
 // 32-bit loads.
@@ -873,25 +869,32 @@ def : Pat<(sextloadi8 AddrModeS9:$addr),(LDB_X_rs9 AddrModeS9:$addr)>;
 // aa - incrementing mode. (N/A for LIMM).
 // di - uncached.
 // zz - data size.
-multiclass ArcStInst <bits<2> zz, string asmop> {
-  let mayStore = 1 in {
-  def _rs9 : F32_ST_ADDR<0b00, 0, zz, (outs), (ins GPR32:$C, MEMrs9:$addr),
-                         !strconcat(asmop, "\t$C, [$addr]"), []>;
-
-  def _limm : F32_ST_LIMM<0, zz, (outs), (ins GPR32:$C, MEMii:$addr),
-                          !strconcat(asmop, "\t$C, [$addr]"), []>;
-
-  def _AW_rs9 : F32_ST_RS9<0b01, 0, zz, (outs GPR32:$addrout),
-                           (ins GPR32:$C, GPR32:$B, immS<9>:$S9),
-                           !strconcat(asmop, ".aw\t$C, [$B,$S9]"), []>
-  { let Constraints = "$addrout = $B"; }
+multiclass ArcStInst<DataSizeMode zz, CacheMode di, string asmop> {
+  let mayStore = 1, ZZ = zz, DI = di in {
+    def _rs9: F32_ST_ADDR<NoAM.Value, di.Value, zz.Value,
+                          (outs), (ins GPR32:$C, MEMrs9:$addr),
+                          asmop#"\t$C, [$addr]", []>, ArcLdStRel;
+
+    def _limm: F32_ST_LIMM<di.Value, zz.Value,
+                           (outs), (ins GPR32:$C, MEMii:$addr),
+                           asmop#"\t$C, [$addr]", []>, ArcLdStRel;
+
+
+    foreach aa = [PreIncAM, PostIncAM] in {
+      def aa.InstSuffix#_rs9: F32_ST_RS9<aa.Value, di.Value, zz.Value,
+                                         (outs GPR32:$addrout),
+                                         (ins GPR32:$C, GPR32:$B, immS<9>:$S9),
+                                         asmop#aa.AsmSuffix#"\t$C, [$B,$S9]", []>,
+                              ArcLdStRel
+      { let Constraints = "$addrout = $B"; let AA = aa; }
+    }
   }
 }
 
-// Store instruction definitions.
-defm ST : ArcStInst<0b00, "st">;
-defm STH : ArcStInst<0b10, "sth">;
-defm STB : ArcStInst<0b01, "stb">;
+foreach di = [NoCC, UncachedCC] in {
+  foreach zz = [ByteSM, HalfSM, WordSM] in {
+    defm ST#zz.InstSuffix#di.InstSuffix :
+      ArcStInst<zz, di, "st"#zz.AsmSuffix#di.AsmSuffix>;
+  }
+}
 
 // Store instruction patterns.
 // 32-bit stores
@@ -912,3 +915,10 @@ def : Pat<(truncstorei8 i32:$C, AddrModeS9:$addr),
 def : Pat<(truncstorei8 i32:$C, AddrModeImm:$addr),
           (STB_limm i32:$C, AddrModeImm:$addr)>;
 
+def getPostIncOpcode : InstrMapping {
+  let FilterClass = "ArcLdStRel";
+  let RowFields = [ "BaseOpcode", "ZZ", "DI", "X"];
+  let ColFields = [ "AA" ];
+  let KeyCol = [ "NoAM" ];
+  let ValueCols = [["PostIncAM"]];
+}
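[Editorial note; not part of the upstream patch.] The getPostIncOpcode InstrMapping above makes TableGen emit a relation table keyed on BaseOpcode/ZZ/DI/X, which the ARCOptAddrMode pass further down queries. A minimal usage sketch, assuming the generated entry point has the usual int(uint16_t) shape:

#define GET_INSTRMAP_INFO
#include "ARCInstrInfo.h" // pulls in the generated ARCGenInstrInfo.inc table

// True when a load/store has a post-increment sibling in the relation,
// e.g. LD_rs9 -> LD_AB_rs9 (same row, AA column moves NoAM -> PostIncAM).
static bool hasPostIncForm(unsigned Opcode) {
  return ARC::getPostIncOpcode(Opcode) >= 0;
}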
diff --git a/lib/Target/ARC/ARCMCInstLower.cpp b/lib/Target/ARC/ARCMCInstLower.cpp
index 43b087a57204..62462b77eccf 100644
--- a/lib/Target/ARC/ARCMCInstLower.cpp
+++ b/lib/Target/ARC/ARCMCInstLower.cpp
@@ -1,9 +1,8 @@
 //===- ARCMCInstLower.cpp - ARC MachineInstr to MCInst ----------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
diff --git a/lib/Target/ARC/ARCMCInstLower.h b/lib/Target/ARC/ARCMCInstLower.h
index 9a698f26334a..24a7f68c695d 100644
--- a/lib/Target/ARC/ARCMCInstLower.h
+++ b/lib/Target/ARC/ARCMCInstLower.h
@@ -1,9 +1,8 @@
 //===- ARCMCInstLower.h - Lower MachineInstr to MCInst ----------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/ARCMachineFunctionInfo.cpp b/lib/Target/ARC/ARCMachineFunctionInfo.cpp
index 7672f8d2c6dd..9cd9661ae245 100644
--- a/lib/Target/ARC/ARCMachineFunctionInfo.cpp
+++ b/lib/Target/ARC/ARCMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
 //===- ARCMachineFunctionInfo.cpp - ARC machine func info -------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/ARCMachineFunctionInfo.h b/lib/Target/ARC/ARCMachineFunctionInfo.h
index 95ad294e3668..31aa5b93246c 100644
--- a/lib/Target/ARC/ARCMachineFunctionInfo.h
+++ b/lib/Target/ARC/ARCMachineFunctionInfo.h
@@ -1,9 +1,8 @@
 //===- ARCMachineFunctionInfo.h - ARC machine function info -----*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/ARC/ARCOptAddrMode.cpp b/lib/Target/ARC/ARCOptAddrMode.cpp
new file mode 100644
index 000000000000..c922b99c57b0
--- /dev/null
+++ b/lib/Target/ARC/ARCOptAddrMode.cpp
@@ -0,0 +1,507 @@
+//===- ARCOptAddrMode.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass folds LD/ST + ADD pairs into Pre/Post-increment form of
+/// load/store instructions.
+//===----------------------------------------------------------------------===//
+
+#include "ARC.h"
+#define GET_INSTRMAP_INFO
+#include "ARCInstrInfo.h"
+#include "ARCTargetMachine.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define OPTADDRMODE_DESC "ARC load/store address mode"
+#define OPTADDRMODE_NAME "arc-addr-mode"
+#define DEBUG_TYPE "arc-addr-mode"
+
+namespace llvm {
+FunctionPass *createARCOptAddrMode();
+void initializeARCOptAddrModePass(PassRegistry &);
+} // end namespace llvm
+
+namespace {
+class ARCOptAddrMode : public MachineFunctionPass {
+public:
+  static char ID;
+
+  ARCOptAddrMode() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return OPTADDRMODE_DESC; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  const ARCSubtarget *AST = nullptr;
+  const ARCInstrInfo *AII = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  MachineDominatorTree *MDT = nullptr;
+
+  // Tries to combine \p Ldst with increment of its base register to form
+  // single post-increment instruction.
+  MachineInstr *tryToCombine(MachineInstr &Ldst);
+
+  // Returns true if result of \p Add is not used before \p Ldst
+  bool noUseOfAddBeforeLoadOrStore(const MachineInstr *Add,
+                                   const MachineInstr *Ldst);
+
+  // Returns true if load/store instruction \p Ldst can be hoisted up to
+  // instruction \p To
+  bool canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To);
+
+  // Returns true if load/store instruction \p Ldst can be sunk down
+  // to instruction \p To
+  bool canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To);
+
+  // Check if instructions \p Ldst and \p Add can be moved to become adjacent
+  // If they can return instruction which need not to move.
+  // If \p Uses is not null, fill it with instructions after \p Ldst which use
+  // \p Ldst's base register
+  MachineInstr *canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add,
+                                    SmallVectorImpl<MachineInstr *> *Uses);
+
+  // Returns true if all instruction in \p Uses array can be adjusted
+  // to accomodate increment of register \p BaseReg by \p Incr
+  bool canFixPastUses(const ArrayRef<MachineInstr *> &Uses,
+                      MachineOperand &Incr, unsigned BaseReg);
+
+  // Update all instructions in \p Uses to accomodate increment
+  // of \p BaseReg by \p Offset
+  void fixPastUses(ArrayRef<MachineInstr *> Uses, unsigned BaseReg,
+                   int64_t Offset);
+
+  // Change instruction \p Ldst to postincrement form.
+  // \p NewBase is register to hold update base value
+  // \p NewOffset is instruction's new offset
+  void changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode,
+                        unsigned NewBase, MachineOperand &NewOffset);
+
+  bool processBasicBlock(MachineBasicBlock &MBB);
+};
+
+} // end anonymous namespace
+
+char ARCOptAddrMode::ID = 0;
+INITIALIZE_PASS_BEGIN(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false,
+                    false)
+
+// Return true if \p Off can be used as immediate offset
+// operand of load/store instruction (S9 literal)
+static bool isValidLoadStoreOffset(int64_t Off) { return isInt<9>(Off); }
+
+// Return true if \p Off can be used as immediate operand of
+// ADD/SUB instruction (U6 literal)
+static bool isValidIncrementOffset(int64_t Off) { return isUInt<6>(Off); }
+
+static bool isAddConstantOp(const MachineInstr &MI, int64_t &Amount) {
+  int64_t Sign = 1;
+  switch (MI.getOpcode()) {
+  case ARC::SUB_rru6:
+    Sign = -1;
+    LLVM_FALLTHROUGH;
+  case ARC::ADD_rru6:
+    assert(MI.getOperand(2).isImm() && "Expected immediate operand");
+    Amount = Sign * MI.getOperand(2).getImm();
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Return true if \p MI dominates of uses of virtual register \p VReg
+static bool dominatesAllUsesOf(const MachineInstr *MI, unsigned VReg,
+                               MachineDominatorTree *MDT,
+                               MachineRegisterInfo *MRI) {
+
+  assert(TargetRegisterInfo::isVirtualRegister(VReg) &&
+         "Expected virtual register!");
+
+  for (auto it = MRI->use_nodbg_begin(VReg), end = MRI->use_nodbg_end();
+       it != end; ++it) {
+    MachineInstr *User = it->getParent();
+    if (User->isPHI()) {
+      unsigned BBOperandIdx = User->getOperandNo(&*it) + 1;
+      MachineBasicBlock *MBB = User->getOperand(BBOperandIdx).getMBB();
+      if (MBB->empty()) {
+        const MachineBasicBlock *InstBB = MI->getParent();
+        assert(InstBB != MBB && "Instruction found in empty MBB");
+        if (!MDT->dominates(InstBB, MBB))
+          return false;
+        continue;
+      }
+      User = &*MBB->rbegin();
+    }
+
+    if (!MDT->dominates(MI, User))
+      return false;
+  }
+  return true;
+}
+
+// Return true if \p MI is load/store instruction with immediate offset
+// which can be adjusted by \p Disp
+static bool isLoadStoreThatCanHandleDisplacement(const TargetInstrInfo *TII,
+                                                 const MachineInstr &MI,
+                                                 int64_t Disp) {
+  unsigned BasePos, OffPos;
+  if (!TII->getBaseAndOffsetPosition(MI, BasePos, OffPos))
+    return false;
+  const MachineOperand &MO = MI.getOperand(OffPos);
+  if (!MO.isImm())
+    return false;
+  int64_t Offset = MO.getImm() + Disp;
+  return isValidLoadStoreOffset(Offset);
+}
+
+bool ARCOptAddrMode::noUseOfAddBeforeLoadOrStore(const MachineInstr *Add,
+                                                 const MachineInstr *Ldst) {
+  unsigned R = Add->getOperand(0).getReg();
+  return dominatesAllUsesOf(Ldst, R, MDT, MRI);
+}
+
+MachineInstr *ARCOptAddrMode::tryToCombine(MachineInstr &Ldst) {
+  assert((Ldst.mayLoad() || Ldst.mayStore()) && "LD/ST instruction expected");
+
+  unsigned BasePos, OffsetPos;
+
+  LLVM_DEBUG(dbgs() << "[ABAW] tryToCombine " << Ldst);
+  if (!AII->getBaseAndOffsetPosition(Ldst, BasePos, OffsetPos)) {
+    LLVM_DEBUG(dbgs() << "[ABAW] Not a recognized load/store\n");
+    return nullptr;
+  }
+
+  MachineOperand &Base = Ldst.getOperand(BasePos);
+  MachineOperand &Offset = Ldst.getOperand(OffsetPos);
+
+  assert(Base.isReg() && "Base operand must be register");
+  if (!Offset.isImm()) {
+    LLVM_DEBUG(dbgs() << "[ABAW] Offset is not immediate\n");
+    return nullptr;
+  }
+
+  unsigned B = Base.getReg();
+  if (TargetRegisterInfo::isStackSlot(B) ||
+      !TargetRegisterInfo::isVirtualRegister(B)) {
+    LLVM_DEBUG(dbgs() << "[ABAW] Base is not VReg\n");
+    return nullptr;
+  }
+
+  // TODO: try to generate address preincrement
+  if (Offset.getImm() != 0) {
+    LLVM_DEBUG(dbgs() << "[ABAW] Non-zero offset\n");
+    return nullptr;
+  }
+
+  for (auto &Add : MRI->use_nodbg_instructions(B)) {
+    int64_t Incr;
+    if (!isAddConstantOp(Add, Incr))
+      continue;
+    if (!isValidLoadStoreOffset(Incr))
+      continue;
+
+    SmallVector<MachineInstr *, 8> Uses;
+    MachineInstr *MoveTo = canJoinInstructions(&Ldst, &Add, &Uses);
+
+    if (!MoveTo)
+      continue;
+
+    if (!canFixPastUses(Uses, Add.getOperand(2), B))
+      continue;
+
+    LLVM_DEBUG(MachineInstr *First = &Ldst; MachineInstr *Last = &Add;
+               if (MDT->dominates(Last, First)) std::swap(First, Last);
+               dbgs() << "[ABAW] Instructions " << *First << " and " << *Last
+                      << " combined\n";
+
+    );
+
+    MachineInstr *Result = Ldst.getNextNode();
+    if (MoveTo == &Add) {
+      Ldst.removeFromParent();
+      Add.getParent()->insertAfter(Add.getIterator(), &Ldst);
+    }
+    if (Result == &Add)
+      Result = Result->getNextNode();
+
+    fixPastUses(Uses, B, Incr);
+
+    int NewOpcode = ARC::getPostIncOpcode(Ldst.getOpcode());
+    assert(NewOpcode > 0 && "No postincrement form found");
+    unsigned NewBaseReg = Add.getOperand(0).getReg();
+    changeToAddrMode(Ldst, NewOpcode, NewBaseReg, Add.getOperand(2));
+    Add.eraseFromParent();
+
+    return Result;
+  }
+  return nullptr;
+}
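[Editorial illustration; not part of the upstream patch.] What tryToCombine achieves, on a hypothetical snippet (virtual register names invented; shown as comments in pseudo-MIR):

// Before: zero-offset load plus a separate in-range base increment.
//   %v1 = LDB_rs9 %v0, 0
//   %v2 = ADD_rru6 %v0, 4
// After: one post-increment load; %v2 receives the updated base.
//   %v2, %v1 = LDB_AB_rs9 %v0, 4    ; prints roughly as "ldb.ab %v1, [%v0,4]"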
// and this + // st v0, [v0, 0] + // v1 = add v0, c + if (Ldst->mayStore() && Ldst->getOperand(0).isReg()) { + unsigned StReg = Ldst->getOperand(0).getReg(); + if (Add->getOperand(0).getReg() == StReg || BaseReg == StReg) { + LLVM_DEBUG(dbgs() << "[canJoinInstructions] Store uses result of Add\n"); + return nullptr; + } + } + + SmallVector UsesAfterLdst; + SmallVector UsesAfterAdd; + for (MachineInstr &MI : MRI->use_nodbg_instructions(BaseReg)) { + if (&MI == Ldst || &MI == Add) + continue; + if (&MI != Add && MDT->dominates(Ldst, &MI)) + UsesAfterLdst.push_back(&MI); + else if (!MDT->dominates(&MI, Ldst)) + return nullptr; + if (MDT->dominates(Add, &MI)) + UsesAfterAdd.push_back(&MI); + } + + MachineInstr *Result = nullptr; + + if (First == Add) { + // n = add b, i + // ... + // x = ld [b, o] or x = ld [n, o] + + if (noUseOfAddBeforeLoadOrStore(First, Last)) { + Result = Last; + LLVM_DEBUG(dbgs() << "[canJoinInstructions] Can sink Add down to Ldst\n"); + } else if (canHoistLoadStoreTo(Ldst, Add)) { + Result = First; + LLVM_DEBUG(dbgs() << "[canJoinInstructions] Can hoist Ldst to Add\n"); + } + } else { + // x = ld [b, o] + // ... + // n = add b, i + Result = First; + LLVM_DEBUG(dbgs() << "[canJoinInstructions] Can hoist Add to Ldst\n"); + } + if (Result && Uses) + *Uses = (Result == Ldst) ? UsesAfterLdst : UsesAfterAdd; + return Result; +} + +bool ARCOptAddrMode::canFixPastUses(const ArrayRef &Uses, + MachineOperand &Incr, unsigned BaseReg) { + + assert(Incr.isImm() && "Expected immediate increment"); + int64_t NewOffset = Incr.getImm(); + for (MachineInstr *MI : Uses) { + int64_t Dummy; + if (isAddConstantOp(*MI, Dummy)) { + if (isValidIncrementOffset(Dummy + NewOffset)) + continue; + return false; + } + if (isLoadStoreThatCanHandleDisplacement(AII, *MI, -NewOffset)) + continue; + LLVM_DEBUG(dbgs() << "Instruction cannot handle displacement " << -NewOffset + << ": " << *MI); + return false; + } + return true; +} + +void ARCOptAddrMode::fixPastUses(ArrayRef Uses, + unsigned NewBase, int64_t NewOffset) { + + for (MachineInstr *MI : Uses) { + int64_t Amount; + unsigned BasePos, OffPos; + if (isAddConstantOp(*MI, Amount)) { + NewOffset += Amount; + assert(isValidIncrementOffset(NewOffset) && + "New offset won't fit into ADD instr"); + BasePos = 1; + OffPos = 2; + } else if (AII->getBaseAndOffsetPosition(*MI, BasePos, OffPos)) { + MachineOperand &MO = MI->getOperand(OffPos); + assert(MO.isImm() && "expected immediate operand"); + NewOffset += MO.getImm(); + assert(isValidLoadStoreOffset(NewOffset) && + "New offset won't fit into LD/ST"); + } else + llvm_unreachable("unexpected instruction"); + + MI->getOperand(BasePos).setReg(NewBase); + MI->getOperand(OffPos).setImm(NewOffset); + } +} + +bool ARCOptAddrMode::canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) { + if (Ldst->getParent() != To->getParent()) + return false; + MachineBasicBlock::const_iterator MI(To), ME(Ldst), + End(Ldst->getParent()->end()); + + bool IsStore = Ldst->mayStore(); + for (; MI != ME && MI != End; ++MI) { + if (MI->isDebugValue()) + continue; + if (MI->mayStore() || MI->isCall() || MI->isInlineAsm() || + MI->hasUnmodeledSideEffects()) + return false; + if (IsStore && MI->mayLoad()) + return false; + } + + for (auto &O : Ldst->explicit_operands()) { + if (!O.isReg() || !O.isUse()) + continue; + MachineInstr *OpDef = MRI->getVRegDef(O.getReg()); + if (!OpDef || !MDT->dominates(OpDef, To)) + return false; + } + return true; +} + +bool ARCOptAddrMode::canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) { + // 
Can only sink load/store within same BB + if (Ldst->getParent() != To->getParent()) + return false; + MachineBasicBlock::const_iterator MI(Ldst), ME(To), + End(Ldst->getParent()->end()); + + bool IsStore = Ldst->mayStore(); + bool IsLoad = Ldst->mayLoad(); + + Register ValReg = IsLoad ? Ldst->getOperand(0).getReg() : Register(); + for (; MI != ME && MI != End; ++MI) { + if (MI->isDebugValue()) + continue; + if (MI->mayStore() || MI->isCall() || MI->isInlineAsm() || + MI->hasUnmodeledSideEffects()) + return false; + if (IsStore && MI->mayLoad()) + return false; + if (ValReg && MI->readsVirtualRegister(ValReg)) + return false; + } + return true; +} + +void ARCOptAddrMode::changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode, + unsigned NewBase, + MachineOperand &NewOffset) { + bool IsStore = Ldst.mayStore(); + unsigned BasePos, OffPos; + MachineOperand Src = MachineOperand::CreateImm(0xDEADBEEF); + AII->getBaseAndOffsetPosition(Ldst, BasePos, OffPos); + + unsigned BaseReg = Ldst.getOperand(BasePos).getReg(); + + Ldst.RemoveOperand(OffPos); + Ldst.RemoveOperand(BasePos); + + if (IsStore) { + Src = Ldst.getOperand(BasePos - 1); + Ldst.RemoveOperand(BasePos - 1); + } + + Ldst.setDesc(AST->getInstrInfo()->get(NewOpcode)); + Ldst.addOperand(MachineOperand::CreateReg(NewBase, true)); + if (IsStore) + Ldst.addOperand(Src); + Ldst.addOperand(MachineOperand::CreateReg(BaseReg, false)); + Ldst.addOperand(NewOffset); + LLVM_DEBUG(dbgs() << "[ABAW] New Ldst: " << Ldst); +} + +bool ARCOptAddrMode::processBasicBlock(MachineBasicBlock &MBB) { + bool Changed = false; + for (auto MI = MBB.begin(), ME = MBB.end(); MI != ME; ++MI) { + if (MI->isDebugValue()) + continue; + if (!MI->mayLoad() && !MI->mayStore()) + continue; + if (ARC::getPostIncOpcode(MI->getOpcode()) < 0) + continue; + MachineInstr *Res = tryToCombine(*MI); + if (Res) { + Changed = true; + // Res points to the next instruction. Rewind to process it + MI = std::prev(Res->getIterator()); + } + } + return Changed; +} + +bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + AST = &MF.getSubtarget<ARCSubtarget>(); + AII = AST->getInstrInfo(); + MRI = &MF.getRegInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + + bool Changed = false; + for (auto &MBB : MF) + Changed |= processBasicBlock(MBB); + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createARCOptAddrMode() { return new ARCOptAddrMode(); } diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp index 38ea3c93a2d4..9c8340ac8f81 100644 --- a/lib/Target/ARC/ARCRegisterInfo.cpp +++ b/lib/Target/ARC/ARCRegisterInfo.cpp @@ -1,9 +1,8 @@ //===- ARCRegisterInfo.cpp - ARC Register Information -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -83,9 +82,11 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II, switch (MI.getOpcode()) { case ARC::LD_rs9: assert((Offset % 4 == 0) && "LD needs 4 byte alignment."); + LLVM_FALLTHROUGH; case ARC::LDH_rs9: case ARC::LDH_X_rs9: assert((Offset % 2 == 0) && "LDH needs 2 byte alignment."); + LLVM_FALLTHROUGH; case ARC::LDB_rs9: case ARC::LDB_X_rs9: LLVM_DEBUG(dbgs() << "Building LDFI\n"); @@ -96,8 +97,10 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II, break; case ARC::ST_rs9: assert((Offset % 4 == 0) && "ST needs 4 byte alignment."); + LLVM_FALLTHROUGH; case ARC::STH_rs9: assert((Offset % 2 == 0) && "STH needs 2 byte alignment."); + LLVM_FALLTHROUGH; case ARC::STB_rs9: LLVM_DEBUG(dbgs() << "Building STFI\n"); BuildMI(MBB, II, dl, TII.get(MI.getOpcode())) @@ -187,7 +190,7 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Special handling of DBG_VALUE instructions. if (MI.isDebugValue()) { - unsigned FrameReg = getFrameRegister(MF); + Register FrameReg = getFrameRegister(MF); MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); return; @@ -220,7 +223,7 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, ObjSize, RS, SPAdj); } -unsigned ARCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register ARCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const ARCFrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? ARC::FP : ARC::SP; } diff --git a/lib/Target/ARC/ARCRegisterInfo.h b/lib/Target/ARC/ARCRegisterInfo.h index 53abae3ac7a5..af41234e9dda 100644 --- a/lib/Target/ARC/ARCRegisterInfo.h +++ b/lib/Target/ARC/ARCRegisterInfo.h @@ -1,9 +1,8 @@ //===- ARCRegisterInfo.h - ARC Register Information Impl --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -47,7 +46,7 @@ public: CallingConv::ID CC) const override; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; //! Return whether to emit frame moves static bool needsFrameMoves(const MachineFunction &MF); diff --git a/lib/Target/ARC/ARCRegisterInfo.td b/lib/Target/ARC/ARCRegisterInfo.td index 6d8d1b3dfd25..4b6744ad73da 100644 --- a/lib/Target/ARC/ARCRegisterInfo.td +++ b/lib/Target/ARC/ARCRegisterInfo.td @@ -1,9 +1,8 @@ //===- ARCRegisterInfo.td - ARC Register defs --------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARC/ARCSubtarget.cpp b/lib/Target/ARC/ARCSubtarget.cpp index 2107a27bf786..bce2dbd2eaa6 100644 --- a/lib/Target/ARC/ARCSubtarget.cpp +++ b/lib/Target/ARC/ARCSubtarget.cpp @@ -1,9 +1,8 @@ //===- ARCSubtarget.cpp - ARC Subtarget Information -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/ARCSubtarget.h b/lib/Target/ARC/ARCSubtarget.h index 631d846f3c9c..0be797f753d5 100644 --- a/lib/Target/ARC/ARCSubtarget.h +++ b/lib/Target/ARC/ARCSubtarget.h @@ -1,9 +1,8 @@ //===- ARCSubtarget.h - Define Subtarget for the ARC ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp index 6f5bbd3b4ef3..9fb45d686c26 100644 --- a/lib/Target/ARC/ARCTargetMachine.cpp +++ b/lib/Target/ARC/ARCTargetMachine.cpp @@ -1,9 +1,8 @@ //===- ARCTargetMachine.cpp - Define TargetMachine for ARC ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,6 +12,7 @@ #include "ARCTargetMachine.h" #include "ARC.h" #include "ARCTargetTransformInfo.h" +#include "TargetInfo/ARCTargetInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -75,7 +75,10 @@ bool ARCPassConfig::addInstSelector() { void ARCPassConfig::addPreEmitPass() { addPass(createARCBranchFinalizePass()); } -void ARCPassConfig::addPreRegAlloc() { addPass(createARCExpandPseudosPass()); } +void ARCPassConfig::addPreRegAlloc() { + addPass(createARCExpandPseudosPass()); + addPass(createARCOptAddrMode()); +} // Force static initialization. extern "C" void LLVMInitializeARCTarget() { diff --git a/lib/Target/ARC/ARCTargetMachine.h b/lib/Target/ARC/ARCTargetMachine.h index 18117e3409af..c5e8c3f2936d 100644 --- a/lib/Target/ARC/ARCTargetMachine.h +++ b/lib/Target/ARC/ARCTargetMachine.h @@ -1,9 +1,8 @@ //===- ARCTargetMachine.h - Define TargetMachine for ARC --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/ARCTargetStreamer.h b/lib/Target/ARC/ARCTargetStreamer.h index 29fdfda661a4..abe89673316f 100644 --- a/lib/Target/ARC/ARCTargetStreamer.h +++ b/lib/Target/ARC/ARCTargetStreamer.h @@ -1,9 +1,8 @@ //===- ARCTargetStreamer.h - ARC Target Streamer ----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARC/ARCTargetTransformInfo.h b/lib/Target/ARC/ARCTargetTransformInfo.h index 20a83d5ae4c7..3e34008902b5 100644 --- a/lib/Target/ARC/ARCTargetTransformInfo.h +++ b/lib/Target/ARC/ARCTargetTransformInfo.h @@ -1,9 +1,8 @@ //===- ARCTargetTransformInfo.h - ARC specific TTI --------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // \file diff --git a/lib/Target/ARC/Disassembler/ARCDisassembler.cpp b/lib/Target/ARC/Disassembler/ARCDisassembler.cpp index 3fc5a033dd5d..82da18617b91 100644 --- a/lib/Target/ARC/Disassembler/ARCDisassembler.cpp +++ b/lib/Target/ARC/Disassembler/ARCDisassembler.cpp @@ -1,9 +1,8 @@ //===- ARCDisassembler.cpp - Disassembler for ARC ---------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -15,6 +14,7 @@ #include "ARC.h" #include "ARCRegisterInfo.h" #include "MCTargetDesc/ARCMCTargetDesc.h" +#include "TargetInfo/ARCTargetInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" diff --git a/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp b/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp deleted file mode 100644 index 9c820c2fc595..000000000000 --- a/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp +++ /dev/null @@ -1,180 +0,0 @@ -//===- ARCInstPrinter.cpp - ARC MCInst to assembly syntax -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an ARC MCInst to a .s file. 
-// -//===----------------------------------------------------------------------===// - -#include "ARCInstPrinter.h" -#include "MCTargetDesc/ARCInfo.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#include "ARCGenAsmWriter.inc" - -template <class T> -static const char *BadConditionCode(T cc) { - LLVM_DEBUG(dbgs() << "Unknown condition code passed: " << cc << "\n"); - return "{unknown-cc}"; -} - -static const char *ARCBRCondCodeToString(ARCCC::BRCondCode BRCC) { - switch (BRCC) { - case ARCCC::BREQ: - return "eq"; - case ARCCC::BRNE: - return "ne"; - case ARCCC::BRLT: - return "lt"; - case ARCCC::BRGE: - return "ge"; - case ARCCC::BRLO: - return "lo"; - case ARCCC::BRHS: - return "hs"; - } - return BadConditionCode(BRCC); -} - -static const char *ARCCondCodeToString(ARCCC::CondCode CC) { - switch (CC) { - case ARCCC::EQ: - return "eq"; - case ARCCC::NE: - return "ne"; - case ARCCC::P: - return "p"; - case ARCCC::N: - return "n"; - case ARCCC::HS: - return "hs"; - case ARCCC::LO: - return "lo"; - case ARCCC::GT: - return "gt"; - case ARCCC::GE: - return "ge"; - case ARCCC::VS: - return "vs"; - case ARCCC::VC: - return "vc"; - case ARCCC::LT: - return "lt"; - case ARCCC::LE: - return "le"; - case ARCCC::HI: - return "hi"; - case ARCCC::LS: - return "ls"; - case ARCCC::PNZ: - return "pnz"; - case ARCCC::AL: - return "al"; - case ARCCC::NZ: - return "nz"; - case ARCCC::Z: - return "z"; - } - return BadConditionCode(CC); -} - -void ARCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << StringRef(getRegisterName(RegNo)).lower(); -} - -void ARCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { - printInstruction(MI, O); - printAnnotation(O, Annot); -} - -static void printExpr(const MCExpr *Expr, const MCAsmInfo *MAI, - raw_ostream &OS) { - int Offset = 0; - const MCSymbolRefExpr *SRE; - - if (const auto *CE = dyn_cast<MCConstantExpr>(Expr)) { - OS << "0x"; - OS.write_hex(CE->getValue()); - return; - } - - if (const auto *BE = dyn_cast<MCBinaryExpr>(Expr)) { - SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS()); - const auto *CE = dyn_cast<MCConstantExpr>(BE->getRHS()); - assert(SRE && CE && "Binary expression must be sym+const."); - Offset = CE->getValue(); - } else { - SRE = dyn_cast<MCSymbolRefExpr>(Expr); - assert(SRE && "Unexpected MCExpr type."); - } - assert(SRE->getKind() == MCSymbolRefExpr::VK_None); - - // Symbols are prefixed with '@' - OS << '@'; - SRE->getSymbol().print(OS, MAI); - - if (Offset) { - if (Offset > 0) - OS << '+'; - OS << Offset; - } -} - -void ARCInstPrinter::printOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - if (Op.isReg()) { - printRegName(O, Op.getReg()); - return; - } - - if (Op.isImm()) { - O << Op.getImm(); - return; - } - - assert(Op.isExpr() && "unknown operand kind in printOperand"); - printExpr(Op.getExpr(), &MAI, O); -} - -void ARCInstPrinter::printMemOperandRI(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &base = MI->getOperand(OpNum); - const MCOperand &offset = MI->getOperand(OpNum + 1); - assert(base.isReg() && "Base should be register."); - assert(offset.isImm() && "Offset should be immediate."); - printRegName(O, base.getReg()); - O << "," << offset.getImm(); -} - -void
ARCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - - const MCOperand &Op = MI->getOperand(OpNum); - assert(Op.isImm() && "Predicate operand is immediate."); - O << ARCCondCodeToString((ARCCC::CondCode)Op.getImm()); -} - -void ARCInstPrinter::printBRCCPredicateOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - assert(Op.isImm() && "Predicate operand is immediate."); - O << ARCBRCondCodeToString((ARCCC::BRCondCode)Op.getImm()); -} diff --git a/lib/Target/ARC/InstPrinter/ARCInstPrinter.h b/lib/Target/ARC/InstPrinter/ARCInstPrinter.h deleted file mode 100644 index bb3898a67cef..000000000000 --- a/lib/Target/ARC/InstPrinter/ARCInstPrinter.h +++ /dev/null @@ -1,46 +0,0 @@ -//===- ARCInstPrinter.h - Convert ARC MCInst to assembly syntax -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file contains the declaration of the ARCInstPrinter class, -/// which is used to print ARC MCInst to a .s file. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_ARC_INSTPRINTER_ARCINSTPRINTER_H -#define LLVM_LIB_TARGET_ARC_INSTPRINTER_ARCINSTPRINTER_H - -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { - -class ARCInstPrinter : public MCInstPrinter { -public: - ARCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - // Autogenerated by tblgen. - void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - -private: - void printMemOperandRI(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printBRCCPredicateOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); -}; -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_ARC_INSTPRINTER_ARCINSTPRINTER_H diff --git a/lib/Target/ARC/MCTargetDesc/ARCInfo.h b/lib/Target/ARC/MCTargetDesc/ARCInfo.h index 401b4c5e6613..57a77631a1fb 100644 --- a/lib/Target/ARC/MCTargetDesc/ARCInfo.h +++ b/lib/Target/ARC/MCTargetDesc/ARCInfo.h @@ -1,9 +1,8 @@ //===- ARCInfo.h - Additional ARC Info --------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp b/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp new file mode 100644 index 000000000000..e3e0ea489957 --- /dev/null +++ b/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp @@ -0,0 +1,179 @@ +//===- ARCInstPrinter.cpp - ARC MCInst to assembly syntax -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an ARC MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "ARCInstPrinter.h" +#include "MCTargetDesc/ARCInfo.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#include "ARCGenAsmWriter.inc" + +template <class T> +static const char *BadConditionCode(T cc) { + LLVM_DEBUG(dbgs() << "Unknown condition code passed: " << cc << "\n"); + return "{unknown-cc}"; +} + +static const char *ARCBRCondCodeToString(ARCCC::BRCondCode BRCC) { + switch (BRCC) { + case ARCCC::BREQ: + return "eq"; + case ARCCC::BRNE: + return "ne"; + case ARCCC::BRLT: + return "lt"; + case ARCCC::BRGE: + return "ge"; + case ARCCC::BRLO: + return "lo"; + case ARCCC::BRHS: + return "hs"; + } + return BadConditionCode(BRCC); +} + +static const char *ARCCondCodeToString(ARCCC::CondCode CC) { + switch (CC) { + case ARCCC::EQ: + return "eq"; + case ARCCC::NE: + return "ne"; + case ARCCC::P: + return "p"; + case ARCCC::N: + return "n"; + case ARCCC::HS: + return "hs"; + case ARCCC::LO: + return "lo"; + case ARCCC::GT: + return "gt"; + case ARCCC::GE: + return "ge"; + case ARCCC::VS: + return "vs"; + case ARCCC::VC: + return "vc"; + case ARCCC::LT: + return "lt"; + case ARCCC::LE: + return "le"; + case ARCCC::HI: + return "hi"; + case ARCCC::LS: + return "ls"; + case ARCCC::PNZ: + return "pnz"; + case ARCCC::AL: + return "al"; + case ARCCC::NZ: + return "nz"; + case ARCCC::Z: + return "z"; + } + return BadConditionCode(CC); +} + +void ARCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << StringRef(getRegisterName(RegNo)).lower(); +} + +void ARCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + printInstruction(MI, O); + printAnnotation(O, Annot); +} + +static void printExpr(const MCExpr *Expr, const MCAsmInfo *MAI, + raw_ostream &OS) { + int Offset = 0; + const MCSymbolRefExpr *SRE; + + if (const auto *CE = dyn_cast<MCConstantExpr>(Expr)) { + OS << "0x"; + OS.write_hex(CE->getValue()); + return; + } + + if (const auto *BE = dyn_cast<MCBinaryExpr>(Expr)) { + SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS()); + const auto *CE = dyn_cast<MCConstantExpr>(BE->getRHS()); + assert(SRE && CE && "Binary expression must be sym+const."); + Offset = CE->getValue(); + } else { + SRE = dyn_cast<MCSymbolRefExpr>(Expr); + assert(SRE && "Unexpected MCExpr type."); + } + assert(SRE->getKind() == MCSymbolRefExpr::VK_None); + + // Symbols are prefixed with '@' + OS << '@'; + SRE->getSymbol().print(OS, MAI); + + if (Offset) { + if (Offset > 0)
+ OS << '+'; + OS << Offset; + } +} + +void ARCInstPrinter::printOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + return; + } + + if (Op.isImm()) { + O << Op.getImm(); + return; + } + + assert(Op.isExpr() && "unknown operand kind in printOperand"); + printExpr(Op.getExpr(), &MAI, O); +} + +void ARCInstPrinter::printMemOperandRI(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &base = MI->getOperand(OpNum); + const MCOperand &offset = MI->getOperand(OpNum + 1); + assert(base.isReg() && "Base should be register."); + assert(offset.isImm() && "Offset should be immediate."); + printRegName(O, base.getReg()); + O << "," << offset.getImm(); +} + +void ARCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + + const MCOperand &Op = MI->getOperand(OpNum); + assert(Op.isImm() && "Predicate operand is immediate."); + O << ARCCondCodeToString((ARCCC::CondCode)Op.getImm()); +} + +void ARCInstPrinter::printBRCCPredicateOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + assert(Op.isImm() && "Predicate operand is immediate."); + O << ARCBRCondCodeToString((ARCCC::BRCondCode)Op.getImm()); +} diff --git a/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h b/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h new file mode 100644 index 000000000000..5ea58407f9ed --- /dev/null +++ b/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h @@ -0,0 +1,45 @@ +//===- ARCInstPrinter.h - Convert ARC MCInst to assembly syntax -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the ARCInstPrinter class, +/// which is used to print ARC MCInst to a .s file. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARC_INSTPRINTER_ARCINSTPRINTER_H +#define LLVM_LIB_TARGET_ARC_INSTPRINTER_ARCINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class ARCInstPrinter : public MCInstPrinter { +public: + ARCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + // Autogenerated by tblgen. 
+ void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + +private: + void printMemOperandRI(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printBRCCPredicateOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); +}; +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_ARC_INSTPRINTER_ARCINSTPRINTER_H diff --git a/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp b/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp index 5d3fb52cfb45..10f93e292e9b 100644 --- a/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp +++ b/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===- ARCMCAsmInfo.cpp - ARC asm properties --------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h b/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h index 997a370fee8d..a086bd88d459 100644 --- a/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h +++ b/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h @@ -1,9 +1,8 @@ //===- ARCMCAsmInfo.h - ARC asm properties ----------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp b/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp index 17be15f730de..aa4818cd57ac 100644 --- a/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp +++ b/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===- ARCMCTargetDesc.cpp - ARC Target Descriptions ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,9 +11,11 @@ //===----------------------------------------------------------------------===// #include "ARCMCTargetDesc.h" +#include "ARCInstPrinter.h" #include "ARCMCAsmInfo.h" #include "ARCTargetStreamer.h" -#include "InstPrinter/ARCInstPrinter.h" +#include "TargetInfo/ARCTargetInfo.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" diff --git a/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h b/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h index dd152a6a34f9..ab06ce46d99f 100644 --- a/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h +++ b/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h @@ -1,9 +1,8 @@ //===- ARCMCTargetDesc.h - ARC Target Descriptions --------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,8 +19,6 @@ namespace llvm { class Target; -Target &getTheARCTarget(); - } // end namespace llvm // Defines symbolic names for ARC registers. This defines a mapping from diff --git a/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp b/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp index 460b0a9f3e9b..59b9f806d590 100644 --- a/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp +++ b/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp @@ -1,13 +1,12 @@ //===- ARCTargetInfo.cpp - ARC Target Implementation ----------- *- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "ARC.h" +#include "TargetInfo/ARCTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/ARC/TargetInfo/ARCTargetInfo.h b/lib/Target/ARC/TargetInfo/ARCTargetInfo.h new file mode 100644 index 000000000000..6a9d2685f422 --- /dev/null +++ b/lib/Target/ARC/TargetInfo/ARCTargetInfo.h @@ -0,0 +1,20 @@ +//===- ARCTargetInfo.h - ARC Target Implementation ------------- *- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARC_TARGETINFO_ARCTARGETINFO_H +#define LLVM_LIB_TARGET_ARC_TARGETINFO_ARCTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheARCTarget(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_ARC_TARGETINFO_ARCTARGETINFO_H diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index be88fe4ddb14..fb238bfc9cbc 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -1,9 +1,8 @@ //=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index b5cc45c5cc94..bf8ed6562fe7 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -1,9 +1,8 @@ //===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -36,7 +35,7 @@ class MachineInstr; class MCInst; class PassRegistry; - +FunctionPass *createARMLowOverheadLoopsPass(); Pass *createARMParallelDSPPass(); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); @@ -47,6 +46,7 @@ FunctionPass *createARMCodeGenPreparePass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); +FunctionPass *createMVEVPTBlockPass(); FunctionPass *createARMOptimizeBarriersPass(); FunctionPass *createThumb2SizeReductionPass( std::function<bool(const Function &)> Ftor = nullptr); @@ -57,11 +57,6 @@ createARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, ARMAsmPrinter &AP); -void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB, - BasicBlockInfo &BBI); -std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF); - - void initializeARMParallelDSPPass(PassRegistry &); void initializeARMLoadStoreOptPass(PassRegistry &); void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); @@ -69,6 +64,9 @@ void initializeARMCodeGenPreparePass(PassRegistry &); void initializeARMConstantIslandsPass(PassRegistry &); void initializeARMExpandPseudoPass(PassRegistry &); void initializeThumb2SizeReducePass(PassRegistry &); +void initializeThumb2ITBlockPass(PassRegistry &); +void initializeMVEVPTBlockPass(PassRegistry &); +void initializeARMLowOverheadLoopsPass(PassRegistry &); } // end namespace llvm diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 3db60f1c16d6..b687db12eaf5 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -1,9 +1,8 @@ //===--
ARM.td - Describe the ARM Target Machine -----------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,12 +32,59 @@ def ModeSoftFloat : SubtargetFeature<"soft-float","UseSoftFloat", // // Floating Point, HW Division and Neon Support -def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", - "Enable VFP2 instructions">; -def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", - "Enable VFP3 instructions", - [FeatureVFP2]>; +// FP loads/stores/moves, shared between VFP and MVE (even in the integer-only +// version). +def FeatureFPRegs : SubtargetFeature<"fpregs", "HasFPRegs", "true", + "Enable FP registers">; + +// 16-bit FP loads/stores/moves, shared between VFP (with the v8.2A FP16 +// extension) and MVE (even in the integer-only version). +def FeatureFPRegs16 : SubtargetFeature<"fpregs16", "HasFPRegs16", "true", + "Enable 16-bit FP registers", + [FeatureFPRegs]>; + +def FeatureFPRegs64 : SubtargetFeature<"fpregs64", "HasFPRegs64", "true", + "Enable 64-bit FP registers", + [FeatureFPRegs]>; + +def FeatureFP64 : SubtargetFeature<"fp64", "HasFP64", "true", + "Floating point unit supports " + "double precision", + [FeatureFPRegs64]>; + +def FeatureD32 : SubtargetFeature<"d32", "HasD32", "true", + "Extend FP to 32 double registers">; + +multiclass VFPver<string name, string query, string description, + list<SubtargetFeature> prev = [], + list<SubtargetFeature> otherimplies = []> { + def _D16_SP: SubtargetFeature< + name#"d16sp", query#"D16SP", "true", + description#" with only 16 d-registers and no double precision", + !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) # otherimplies>; + def _SP: SubtargetFeature< + name#"sp", query#"SP", "true", + description#" with no double precision", + !foreach(v, prev, !cast<SubtargetFeature>(v # "_SP")) # + otherimplies # [FeatureD32, !cast<SubtargetFeature>(NAME # "_D16_SP")]>; + def _D16: SubtargetFeature< + name#"d16", query#"D16", "true", + description#" with only 16 d-registers", + !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16")) # + otherimplies # [FeatureFP64, !cast<SubtargetFeature>(NAME # "_D16_SP")]>; + def "": SubtargetFeature< + name, query, "true", description, + prev # otherimplies # [ + !cast<SubtargetFeature>(NAME # "_D16"), + !cast<SubtargetFeature>(NAME # "_SP")]>; +} + +defm FeatureVFP2: VFPver<"vfp2", "HasVFPv2", "Enable VFP2 instructions", + [], [FeatureFPRegs]>; + +defm FeatureVFP3: VFPver<"vfp3", "HasVFPv3", "Enable VFP3 instructions", + [FeatureVFP2]>; def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable NEON instructions", @@ -48,31 +94,22 @@ def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable half-precision " "floating point">; -def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", - "Enable VFP4 instructions", - [FeatureVFP3, FeatureFP16]>; +defm FeatureVFP4: VFPver<"vfp4", "HasVFPv4", "Enable VFP4 instructions", + [FeatureVFP3], [FeatureFP16]>; -def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", - "true", "Enable ARMv8 FP", - [FeatureVFP4]>; +defm FeatureFPARMv8: VFPver<"fp-armv8", "HasFPARMv8", "Enable ARMv8 FP", + [FeatureVFP4]>; def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "Enable full half-precision " "floating point", - [FeatureFPARMv8]>; + [FeatureFPARMv8_D16_SP, FeatureFPRegs16]>; def FeatureFP16FML
: SubtargetFeature<"fp16fml", "HasFP16FML", "true", "Enable full half-precision " "floating point fml instructions", [FeatureFullFP16]>; -def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", - "Floating point unit supports " - "single precision only">; - -def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", - "Restrict FP to 16 double registers">; - def FeatureHWDivThumb : SubtargetFeature<"hwdiv", "HasHardwareDivideInThumb", "true", "Enable divide instructions in Thumb">; @@ -368,6 +405,12 @@ def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", def FeatureSB : SubtargetFeature<"sb", "HasSB", "true", "Enable v8.5a Speculation Barrier" >; +// Armv8.1-M extensions + +def FeatureLOB : SubtargetFeature<"lob", "HasLOB", "true", + "Enable Low Overhead Branch " + "extensions">; + //===----------------------------------------------------------------------===// // ARM architecture class // @@ -461,6 +504,19 @@ def HasV8_5aOps : SubtargetFeature<"v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", [HasV8_4aOps, FeatureSB]>; +def HasV8_1MMainlineOps : SubtargetFeature< + "v8.1m.main", "HasV8_1MMainlineOps", "true", + "Support ARM v8-1M Mainline instructions", + [HasV8MMainlineOps]>; +def HasMVEIntegerOps : SubtargetFeature< + "mve", "HasMVEIntegerOps", "true", + "Support M-Class Vector Extension with integer ops", + [HasV8_1MMainlineOps, FeatureDSP, FeatureFPRegs16, FeatureFPRegs64]>; +def HasMVEFloatOps : SubtargetFeature< + "mve.fp", "HasMVEFloatOps", "true", + "Support M-Class Vector Extension with integer and floating ops", + [HasMVEIntegerOps, FeatureFPARMv8_D16_SP, FeatureFullFP16]>; + //===----------------------------------------------------------------------===// // ARM Processor subtarget features. // @@ -495,6 +551,8 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", "Cortex-A73 ARM processors", []>; def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", "Cortex-A75 ARM processors", []>; +def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", + "Cortex-A76 ARM processors", []>; def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", "Qualcomm Krait processors", []>; @@ -744,6 +802,18 @@ def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline", FeatureAcquireRelease, FeatureMClass]>; +def ARMv81mMainline : Architecture<"armv8.1-m.main", "ARMv81mMainline", + [HasV8_1MMainlineOps, + FeatureNoARM, + ModeThumb, + FeatureDB, + FeatureHWDivThumb, + Feature8MSecExt, + FeatureAcquireRelease, + FeatureMClass, + FeatureRAS, + FeatureLOB]>; + // Aliases def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>; def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>; @@ -757,6 +827,7 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; // ARM schedules. 
//===----------------------------------------------------------------------===// // +include "ARMPredicates.td" include "ARMSchedule.td" //===----------------------------------------------------------------------===// @@ -942,14 +1013,12 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVFP3, - FeatureD16, + FeatureVFP3_D16, FeatureAvoidPartialCPSR]>; def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasRetAddrStack, - FeatureVFP3, - FeatureD16, + FeatureVFP3_D16, FeatureSlowFPBrcc, FeatureHWDivARM, FeatureHasSlowFPVMLx, @@ -957,8 +1026,7 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureHasRetAddrStack, - FeatureVFP3, - FeatureD16, + FeatureVFP3_D16, FeatureFP16, FeatureMP, FeatureSlowFPBrcc, @@ -968,8 +1036,7 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, FeatureHasRetAddrStack, - FeatureVFP3, - FeatureD16, + FeatureVFP3_D16, FeatureFP16, FeatureMP, FeatureSlowFPBrcc, @@ -977,39 +1044,52 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m, +def : ProcessorModel<"cortex-m3", CortexM4Model, [ARMv7m, ProcM3, FeaturePrefLoopAlign32, + FeatureUseMISched, + FeatureUseAA, FeatureHasNoBranchPredictor]>; -def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m, +def : ProcessorModel<"sc300", CortexM4Model, [ARMv7m, ProcM3, + FeatureUseMISched, + FeatureUseAA, FeatureHasNoBranchPredictor]>; -def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em, - FeatureVFP4, - FeatureVFPOnlySP, - FeatureD16, +def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em, + FeatureVFP4_D16_SP, FeaturePrefLoopAlign32, FeatureHasSlowFPVMLx, + FeatureUseMISched, + FeatureUseAA, FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-m7", [ARMv7em, - FeatureFPARMv8, - FeatureD16]>; + FeatureFPARMv8_D16]>; def : ProcNoItin<"cortex-m23", [ARMv8mBaseline, FeatureNoMovt]>; -def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline, +def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline, FeatureDSP, - FeatureFPARMv8, - FeatureD16, - FeatureVFPOnlySP, + FeatureFPARMv8_D16_SP, FeaturePrefLoopAlign32, FeatureHasSlowFPVMLx, + FeatureUseMISched, + FeatureUseAA, FeatureHasNoBranchPredictor]>; +def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline, + FeatureDSP, + FeatureFPARMv8_D16_SP, + FeaturePrefLoopAlign32, + FeatureHasSlowFPVMLx, + FeatureUseMISched, + FeatureUseAA, + FeatureHasNoBranchPredictor]>; + + def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, FeatureHWDivARM, @@ -1060,6 +1140,22 @@ def : ProcNoItin<"cortex-a75", [ARMv82a, ProcA75, FeatureHWDivARM, FeatureDotProd]>; +def : ProcNoItin<"cortex-a76", [ARMv82a, ProcA76, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFullFP16, + FeatureDotProd]>; + +def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFullFP16, + FeatureDotProd]>; + def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, @@ -1081,6 +1177,9 @@ def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynos]>; def : ProcNoItin<"exynos-m4", [ARMv82a, ProcExynos, FeatureFullFP16, FeatureDotProd]>; +def : 
ProcNoItin<"exynos-m5", [ARMv82a, ProcExynos, + FeatureFullFP16, + FeatureDotProd]>; def : ProcNoItin<"kryo", [ARMv8a, ProcKryo, FeatureHWDivThumb, diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index b7cd3a0c2dae..e29077266fcd 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -18,9 +17,10 @@ #include "ARMMachineFunctionInfo.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" -#include "InstPrinter/ARMInstPrinter.h" #include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMInstPrinter.h" #include "MCTargetDesc/ARMMCExpr.h" +#include "TargetInfo/ARMTargetInfo.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/BinaryFormat/COFF.h" @@ -120,13 +120,13 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // Calculate this function's optimization goal. unsigned OptimizationGoal; - if (F.hasFnAttribute(Attribute::OptimizeNone)) + if (F.hasOptNone()) // For best debugging illusion, speed and small size sacrificed OptimizationGoal = 6; - else if (F.optForMinSize()) + else if (F.hasMinSize()) // Aggressively for small size, speed and debug illusion sacrificed OptimizationGoal = 4; - else if (F.optForSize()) + else if (F.hasOptSize()) // For small size, but speed and debugging illusion preserved OptimizationGoal = 3; else if (TM.getOptLevel() == CodeGenOpt::Aggressive) @@ -184,10 +184,21 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } +void ARMAsmPrinter::PrintSymbolOperand(const MachineOperand &MO, + raw_ostream &O) { + assert(MO.isGlobal() && "caller should check MO.isGlobal"); + unsigned TF = MO.getTargetFlags(); + if (TF & ARMII::MO_LO16) + O << ":lower16:"; + else if (TF & ARMII::MO_HI16) + O << ":upper16:"; + GetARMGVSymbol(MO.getGlobal(), TF)->print(O, MAI); + printOffset(MO.getOffset(), O); +} + void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNum); - unsigned TF = MO.getTargetFlags(); switch (MO.getType()) { default: llvm_unreachable(""); @@ -204,27 +215,20 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, break; } case MachineOperand::MO_Immediate: { - int64_t Imm = MO.getImm(); O << '#'; + unsigned TF = MO.getTargetFlags(); if (TF == ARMII::MO_LO16) O << ":lower16:"; else if (TF == ARMII::MO_HI16) O << ":upper16:"; - O << Imm; + O << MO.getImm(); break; } case MachineOperand::MO_MachineBasicBlock: MO.getMBB()->getSymbol()->print(O, MAI); return; case MachineOperand::MO_GlobalAddress: { - const GlobalValue *GV = MO.getGlobal(); - if (TF & ARMII::MO_LO16) - O << ":lower16:"; - else if (TF & ARMII::MO_HI16) - O << ":upper16:"; - GetARMGVSymbol(GV, TF)->print(O, MAI); - - printOffset(MO.getOffset(), O); + PrintSymbolOperand(MO, O); break; } case MachineOperand::MO_ConstantPoolIndex: @@ -256,8 +260,7 @@ GetARMJTIPICJumpTableLabel(unsigned uid) const { } bool 
ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) { + const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) return true; // Unknown modifier. @@ -265,20 +268,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, switch (ExtraCode[0]) { default: // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O); - case 'a': // Print as a memory address. - if (MI->getOperand(OpNum).isReg()) { - O << "[" - << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg()) - << "]"; - return false; - } - LLVM_FALLTHROUGH; - case 'c': // Don't print "#" before an immediate operand. - if (!MI->getOperand(OpNum).isImm()) - return true; - O << MI->getOperand(OpNum).getImm(); - return false; + return AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O); case 'P': // Print a VFP double precision register. case 'q': // Print a NEON quad precision register. printOperand(MI, OpNum, O); @@ -444,8 +434,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, } bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNum, unsigned AsmVariant, - const char *ExtraCode, + unsigned OpNum, const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { @@ -668,7 +657,7 @@ void ARMAsmPrinter::emitAttributes() { ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::IEEEDenormals); else { - if (!STI.hasVFP2()) { + if (!STI.hasVFP2Base()) { // When the target doesn't have an FPU (by design or // intention), the assumptions made on the software support // mirror that of the equivalent hardware support *if it @@ -678,7 +667,7 @@ void ARMAsmPrinter::emitAttributes() { if (STI.hasV7Ops()) ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::PreserveFPSign); - } else if (STI.hasVFP3()) { + } else if (STI.hasVFP3Base()) { // In VFPv4, VFPv4U, VFPv3, or VFPv3U, it is preserved. That is, // the sign bit of the zero matches the sign bit of the input or // result that is being flushed to zero. @@ -773,6 +762,14 @@ void ARMAsmPrinter::emitAttributes() { //===----------------------------------------------------------------------===// +static MCSymbol *getBFLabel(StringRef Prefix, unsigned FunctionNumber, + unsigned LabelId, MCContext &Ctx) { + + MCSymbol *Label = Ctx.getOrCreateSymbol(Twine(Prefix) + + "BF" + Twine(FunctionNumber) + "_" + Twine(LabelId)); + return Label; +} + static MCSymbol *getPICLabel(StringRef Prefix, unsigned FunctionNumber, unsigned LabelId, MCContext &Ctx) { @@ -1074,7 +1071,6 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { const TargetRegisterInfo *TargetRegInfo = MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MachineRegInfo = MF.getRegInfo(); - const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>(); unsigned FramePtr = TargetRegInfo->getFrameRegister(MF); unsigned Opc = MI->getOpcode(); @@ -1138,7 +1134,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { Pad += Width; continue; } - RegList.push_back(MO.getReg()); + // Check for registers that are remapped (for a Thumb1 prologue that + // saves high registers).
+ unsigned Reg = MO.getReg(); + if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(Reg)) + Reg = RemappedReg; + RegList.push_back(Reg); } break; case ARM::STR_PRE_IMM: @@ -1188,7 +1189,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { unsigned CPI = MI->getOperand(1).getIndex(); const MachineConstantPool *MCP = MF.getConstantPool(); if (CPI >= MCP->getConstants().size()) - CPI = AFI.getOriginalCPIdx(CPI); + CPI = AFI->getOriginalCPIdx(CPI); assert(CPI != -1U && "Invalid constpool index"); // Derive the actual offset. @@ -1218,8 +1219,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { } else if (DstReg == ARM::SP) { MI->print(errs()); llvm_unreachable("Unsupported opcode for unwinding information"); - } - else { + } else if (Opc == ARM::tMOVr) { + // If a Thumb1 function spills r8-r11, we copy the values to low + // registers before pushing them. Record the copy so we can emit the + // correct ".save" later. + AFI->EHPrologueRemappedRegs[DstReg] = SrcReg; + } else { MI->print(errs()); llvm_unreachable("Unsupported opcode for unwinding information"); } @@ -1447,6 +1452,66 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } + case ARM::t2BFi: + case ARM::t2BFic: + case ARM::t2BFLi: + case ARM::t2BFr: + case ARM::t2BFLr: { + // This is a Branch Future instruction. + + const MCExpr *BranchLabel = MCSymbolRefExpr::create( + getBFLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(0).getIndex(), OutContext), + OutContext); + + auto MCInst = MCInstBuilder(Opc).addExpr(BranchLabel); + if (MI->getOperand(1).isReg()) { + // For BFr/BFLr + MCInst.addReg(MI->getOperand(1).getReg()); + } else { + // For BFi/BFLi/BFic + const MCExpr *BranchTarget; + if (MI->getOperand(1).isMBB()) + BranchTarget = MCSymbolRefExpr::create( + MI->getOperand(1).getMBB()->getSymbol(), OutContext); + else if (MI->getOperand(1).isGlobal()) { + const GlobalValue *GV = MI->getOperand(1).getGlobal(); + BranchTarget = MCSymbolRefExpr::create( + GetARMGVSymbol(GV, MI->getOperand(1).getTargetFlags()), OutContext); + } else if (MI->getOperand(1).isSymbol()) { + BranchTarget = MCSymbolRefExpr::create( + GetExternalSymbolSymbol(MI->getOperand(1).getSymbolName()), + OutContext); + } else + llvm_unreachable("Unhandled operand kind in Branch Future instruction"); + + MCInst.addExpr(BranchTarget); + } + + if (Opc == ARM::t2BFic) { + const MCExpr *ElseLabel = MCSymbolRefExpr::create( + getBFLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(2).getIndex(), OutContext), + OutContext); + MCInst.addExpr(ElseLabel); + MCInst.addImm(MI->getOperand(3).getImm()); + } else { + MCInst.addImm(MI->getOperand(2).getImm()) + .addReg(MI->getOperand(3).getReg()); + } + + EmitToStreamer(*OutStreamer, MCInst); + return; + } + case ARM::t2BF_LabelPseudo: { + // This is a pseudo op for a label used by a branch future instruction + + // Emit the label. 
+ OutStreamer->EmitLabel(getBFLabel(DL.getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(0).getIndex(), OutContext)); + return; + } case ARM::tPICADD: { // This is a pseudo op for a label + instruction sequence, which looks like: // LPC0: diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index 0ba4bc05d6f7..a4b37fa2331f 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -1,9 +1,8 @@ //===-- ARMAsmPrinter.h - ARM implementation of AsmPrinter ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -76,12 +75,11 @@ public: void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); + void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, const MCSubtargetInfo *EndInfo) const override; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index bbebed59c851..222aa85856a2 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- ARMBaseInstrInfo.cpp - ARM Instruction Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -134,7 +133,7 @@ ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, ScheduleHazardRecognizer *ARMBaseInstrInfo:: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const { - if (Subtarget.isThumb2() || Subtarget.hasVFP2()) + if (Subtarget.isThumb2() || Subtarget.hasVFP2Base()) return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG); return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG); } @@ -707,15 +706,7 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (MCID.getSize()) return MCID.getSize(); - // If this machine instr is an inline asm, measure it. - if (MI.getOpcode() == ARM::INLINEASM) { - unsigned Size = getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); - if (!MF->getInfo<ARMFunctionInfo>()->isThumbFunction()) - Size = alignTo(Size, 4); - return Size; - } - unsigned Opc = MI.getOpcode(); - switch (Opc) { + switch (MI.getOpcode()) { default: // pseudo-instruction sizes are zero.
return 0; @@ -752,6 +743,14 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return 12; case ARM::SPACE: return MI.getOperand(1).getImm(); + case ARM::INLINEASM: + case ARM::INLINEASM_BR: { + // If this machine instr is an inline asm, measure it. + unsigned Size = getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); + if (!MF->getInfo()->isThumbFunction()) + Size = alignTo(Size, 4); + return Size; + } } } @@ -806,6 +805,28 @@ void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB, .addReg(ARM::CPSR, RegState::Implicit | RegState::Define); } +void llvm::addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB) { + MIB.addImm(ARMVCC::None); + MIB.addReg(0); +} + +void llvm::addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, + unsigned DestReg) { + addUnpredicatedMveVpredNOp(MIB); + MIB.addReg(DestReg, RegState::Undef); +} + +void llvm::addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond) { + MIB.addImm(Cond); + MIB.addReg(ARM::VPR, RegState::Implicit); +} + +void llvm::addPredicatedMveVpredROp(MachineInstrBuilder &MIB, + unsigned Cond, unsigned Inactive) { + addPredicatedMveVpredNOp(MIB, Cond); + MIB.addReg(Inactive); +} + void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, @@ -831,17 +852,20 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opc = ARM::VMOVRS; else if (SPRDest && GPRSrc) Opc = ARM::VMOVSR; - else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && !Subtarget.isFPOnlySP()) + else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.hasFP64()) Opc = ARM::VMOVD; else if (ARM::QPRRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VORRq; + Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR; if (Opc) { MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg); MIB.addReg(SrcReg, getKillRegState(KillSrc)); - if (Opc == ARM::VORRq) + if (Opc == ARM::VORRq || Opc == ARM::MVE_VORR) MIB.addReg(SrcReg, getKillRegState(KillSrc)); - MIB.add(predOps(ARMCC::AL)); + if (Opc == ARM::MVE_VORR) + addUnpredicatedMveVpredROp(MIB, DestReg); + else + MIB.add(predOps(ARMCC::AL)); return; } @@ -852,11 +876,11 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Use VORRq when possible. if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) { - Opc = ARM::VORRq; + Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR; BeginIdx = ARM::qsub_0; SubRegs = 2; } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) { - Opc = ARM::VORRq; + Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR; BeginIdx = ARM::qsub_0; SubRegs = 4; // Fall back to VMOVD. 
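For reference, the helpers introduced above are what let copyPhysReg emit MVE copies: MVE has no plain vector move, so a Q-register copy is expressed as a VORR of the source with itself, finished with the vpred_r operands in place of the usual ARM condition operands. A minimal sketch of the resulting pattern, distilled from the copyPhysReg hunks above (illustrative only, not itself a hunk of this patch):

    // Copy Qs to Qd on an MVE-only subtarget: "VORR Qd, Qs, Qs", unpredicated.
    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::MVE_VORR), DestReg);
    MIB.addReg(SrcReg, getKillRegState(KillSrc));
    MIB.addReg(SrcReg, getKillRegState(KillSrc)); // VORR takes two source operands.
    // vpred_r trailing operands: ARMVCC::None plus a zero VPR operand, then the
    // inactive-lanes register (undef here, since an unpredicated copy fully
    // defines DestReg).
    addUnpredicatedMveVpredROp(MIB, DestReg);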
@@ -891,7 +915,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BeginIdx = ARM::dsub_0; SubRegs = 4; Spacing = 2; - } else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.isFPOnlySP()) { + } else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && + !Subtarget.hasFP64()) { Opc = ARM::VMOVS; BeginIdx = ARM::ssub_0; SubRegs = 2; @@ -901,6 +926,30 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (DestReg == ARM::CPSR) { copyToCPSR(MBB, I, SrcReg, KillSrc, Subtarget); return; + } else if (DestReg == ARM::VPR) { + assert(ARM::GPRRegClass.contains(SrcReg)); + BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMSR_P0), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .add(predOps(ARMCC::AL)); + return; + } else if (SrcReg == ARM::VPR) { + assert(ARM::GPRRegClass.contains(DestReg)); + BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMRS_P0), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .add(predOps(ARMCC::AL)); + return; + } else if (DestReg == ARM::FPSCR_NZCV) { + assert(ARM::GPRRegClass.contains(SrcReg)); + BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMSR_FPSCR_NZCVQC), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .add(predOps(ARMCC::AL)); + return; + } else if (SrcReg == ARM::FPSCR_NZCV) { + assert(ARM::GPRRegClass.contains(DestReg)); + BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMRS_FPSCR_NZCVQC), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .add(predOps(ARMCC::AL)); + return; } assert(Opc && "Impossible reg-to-reg copy"); @@ -925,10 +974,15 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, DstRegs.insert(Dst); #endif Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src); - // VORR takes two source operands. - if (Opc == ARM::VORRq) + // VORR (NEON or MVE) takes two source operands. + if (Opc == ARM::VORRq || Opc == ARM::MVE_VORR) { Mov.addReg(Src); - Mov = Mov.add(predOps(ARMCC::AL)); + } + // MVE VORR takes predicate operands in place of an ordinary condition. + if (Opc == ARM::MVE_VORR) + addUnpredicatedMveVpredROp(Mov, Dst); + else + Mov = Mov.add(predOps(ARMCC::AL)); // MOVr can set CC. if (Opc == ARM::MOVr) Mov = Mov.add(condCodeOp()); @@ -1010,6 +1064,13 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(0) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); + } else if (ARM::VCCRRegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DebugLoc(), get(ARM::VSTR_P0_off)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO) + .add(predOps(ARMCC::AL)); } else llvm_unreachable("Unknown reg class!"); break; @@ -1042,7 +1103,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 16: - if (ARM::DPairRegClass.hasSubClassEq(RC)) { + if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) { // Use aligned spills if the stack can be realigned. 
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { BuildMI(MBB, I, DebugLoc(), get(ARM::VST1q64)) @@ -1058,6 +1119,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addMemOperand(MMO) .add(predOps(ARMCC::AL)); } + } else if (ARM::QPRRegClass.hasSubClassEq(RC) && + Subtarget.hasMVEIntegerOps()) { + auto MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::MVE_VSTRWU32)); + MIB.addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); + addUnpredicatedMveVpredNOp(MIB); } else llvm_unreachable("Unknown reg class!"); break; @@ -1155,6 +1224,13 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI, return MI.getOperand(0).getReg(); } break; + case ARM::VSTR_P0_off: + if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() && + MI.getOperand(1).getImm() == 0) { + FrameIndex = MI.getOperand(0).getIndex(); + return ARM::P0; + } + break; case ARM::VST1q64: case ARM::VST1d64TPseudo: case ARM::VST1d64QPseudo: @@ -1177,7 +1253,8 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI, unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const { SmallVector Accesses; - if (MI.mayStore() && hasStoreToStackSlot(MI, Accesses)) { + if (MI.mayStore() && hasStoreToStackSlot(MI, Accesses) && + Accesses.size() == 1) { FrameIndex = cast(Accesses.front()->getPseudoValue()) ->getFrameIndex(); @@ -1224,6 +1301,12 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(0) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); + } else if (ARM::VCCRRegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(ARM::VLDR_P0_off), DestReg) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO) + .add(predOps(ARMCC::AL)); } else llvm_unreachable("Unknown reg class!"); break; @@ -1260,7 +1343,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 16: - if (ARM::DPairRegClass.hasSubClassEq(RC)) { + if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) { if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) .addFrameIndex(FI) @@ -1273,6 +1356,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addMemOperand(MMO) .add(predOps(ARMCC::AL)); } + } else if (ARM::QPRRegClass.hasSubClassEq(RC) && + Subtarget.hasMVEIntegerOps()) { + auto MIB = BuildMI(MBB, I, DL, get(ARM::MVE_VLDRWU32), DestReg); + MIB.addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); + addUnpredicatedMveVpredNOp(MIB); } else llvm_unreachable("Unknown reg class!"); break; @@ -1369,6 +1459,13 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, return MI.getOperand(0).getReg(); } break; + case ARM::VLDR_P0_off: + if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() && + MI.getOperand(1).getImm() == 0) { + FrameIndex = MI.getOperand(0).getIndex(); + return ARM::P0; + } + break; case ARM::VLD1q64: case ARM::VLD1d8TPseudo: case ARM::VLD1d16TPseudo: @@ -1397,7 +1494,8 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const { SmallVector Accesses; - if (MI.mayLoad() && hasLoadFromStackSlot(MI, Accesses)) { + if (MI.mayLoad() && hasLoadFromStackSlot(MI, Accesses) && + Accesses.size() == 1) { FrameIndex = cast(Accesses.front()->getPseudoValue()) ->getFrameIndex(); @@ -1480,7 +1578,7 @@ bool 
ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // copyPhysReg() calls. Look for VMOVS instructions that can legally be // widened to VMOVD. We prefer the VMOVD when possible because it may be // changed into a VORR that can go down the NEON pipeline. - if (!MI.isCopy() || Subtarget.dontWidenVMOVS() || Subtarget.isFPOnlySP()) + if (!MI.isCopy() || Subtarget.dontWidenVMOVS() || !Subtarget.hasFP64()) return false; // Look for a copy between even S-registers. That is where we keep floats @@ -1898,24 +1996,15 @@ isProfitableToIfCvt(MachineBasicBlock &MBB, // If we are optimizing for size, see if the branch in the predecessor can be // lowered to cbn?z by the constant island lowering pass, and return false if // so. This results in a shorter instruction sequence. - if (MBB.getParent()->getFunction().optForSize()) { + if (MBB.getParent()->getFunction().hasOptSize()) { MachineBasicBlock *Pred = *MBB.pred_begin(); if (!Pred->empty()) { MachineInstr *LastMI = &*Pred->rbegin(); if (LastMI->getOpcode() == ARM::t2Bcc) { - MachineBasicBlock::iterator CmpMI = LastMI; - if (CmpMI != Pred->begin()) { - --CmpMI; - if (CmpMI->getOpcode() == ARM::tCMPi8 || - CmpMI->getOpcode() == ARM::t2CMPri) { - unsigned Reg = CmpMI->getOperand(0).getReg(); - unsigned PredReg = 0; - ARMCC::CondCodes P = getInstrPredicate(*CmpMI, PredReg); - if (P == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 && - isARMLowRegister(Reg)) - return false; - } - } + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineInstr *CmpMI = findCMPToFoldIntoCBZ(LastMI, TRI); + if (CmpMI) + return false; } } } @@ -1932,6 +2021,15 @@ isProfitableToIfCvt(MachineBasicBlock &TBB, if (!TCycles) return false; + // In Thumb code we often end up trading one branch for an IT block, and + // if we end up cloning, the extra instructions can increase code size. + // Prevent blocks with multiple predecessors from being if-converted to + // avoid this cloning. + if (Subtarget.isThumb2() && TBB.getParent()->getFunction().hasMinSize()) { + if (TBB.pred_size() != 1 || FBB.pred_size() != 1) + return false; + } + // Attempt to estimate the relative costs of predication versus branching. // Here we scale up each component of UnpredCost to avoid precision issues when // scaling TCycles/FCycles by Probability. @@ -2040,9 +2138,9 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI, /// Identify instructions that can be folded into a MOVCC instruction, and /// return the defining instruction. -static MachineInstr *canFoldIntoMOVCC(unsigned Reg, - const MachineRegisterInfo &MRI, - const TargetInstrInfo *TII) { +MachineInstr * +ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, + const TargetInstrInfo *TII) const { if (!TargetRegisterInfo::isVirtualRegister(Reg)) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) @@ -2050,8 +2148,8 @@ static MachineInstr *canFoldIntoMOVCC(unsigned Reg, MachineInstr *MI = MRI.getVRegDef(Reg); if (!MI) return nullptr; - // MI is folded into the MOVCC by predicating it. - if (!MI->isPredicable()) + // Check if MI can be predicated and folded into the MOVCC. + if (!isPredicable(*MI)) return nullptr; // Check if MI has any non-dead defs or physreg uses. This also detects // predicated instructions which will be reading CPSR. @@ -2266,7 +2364,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, unsigned NumBytes) { // This optimisation potentially adds lots of load and store // micro-operations; it's only really a great benefit to code size.
- if (!MF.getFunction().optForMinSize()) + if (!Subtarget.hasMinSize()) return false; // If only one register is pushed/popped, LLVM can use an LDR/STR @@ -2332,6 +2430,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded; --CurRegEnc) { unsigned CurReg = RegClass->getRegister(CurRegEnc); + if (IsT1PushPop && CurReg > ARM::R7) + continue; if (!IsPop) { // Pushing any register is completely harmless, mark the register involved // as undef since we don't care about its value and must not restore it @@ -2389,7 +2489,7 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, bool isSub = false; // Memory operands in inline assembly always use AddrMode2. - if (Opcode == ARM::INLINEASM) + if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) AddrMode = ARMII::AddrMode2; if (Opcode == ARM::ADDri) { @@ -2473,6 +2573,15 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, NumBits = 8; Scale = 2; break; + case ARMII::AddrModeT2_i7: + case ARMII::AddrModeT2_i7s2: + case ARMII::AddrModeT2_i7s4: + ImmIdx = FrameRegIdx+1; + InstrOffs = MI.getOperand(ImmIdx).getImm(); + NumBits = 7; + Scale = (AddrMode == ARMII::AddrModeT2_i7s2 ? 2 : + AddrMode == ARMII::AddrModeT2_i7s4 ? 4 : 1); + break; default: llvm_unreachable("Unsupported addressing mode!"); } @@ -2543,6 +2652,7 @@ bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, return true; case ARM::CMPrr: case ARM::t2CMPrr: + case ARM::tCMPr: SrcReg = MI.getOperand(0).getReg(); SrcReg2 = MI.getOperand(1).getReg(); CmpMask = ~0; @@ -2619,32 +2729,62 @@ inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) { /// This function can be extended later on. 
inline static bool isRedundantFlagInstr(const MachineInstr *CmpI, unsigned SrcReg, unsigned SrcReg2, - int ImmValue, const MachineInstr *OI) { - if ((CmpI->getOpcode() == ARM::CMPrr || - CmpI->getOpcode() == ARM::t2CMPrr) && - (OI->getOpcode() == ARM::SUBrr || - OI->getOpcode() == ARM::t2SUBrr) && + int ImmValue, const MachineInstr *OI, + bool &IsThumb1) { + if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) && + (OI->getOpcode() == ARM::SUBrr || OI->getOpcode() == ARM::t2SUBrr) && ((OI->getOperand(1).getReg() == SrcReg && OI->getOperand(2).getReg() == SrcReg2) || (OI->getOperand(1).getReg() == SrcReg2 && - OI->getOperand(2).getReg() == SrcReg))) + OI->getOperand(2).getReg() == SrcReg))) { + IsThumb1 = false; return true; + } - if ((CmpI->getOpcode() == ARM::CMPri || - CmpI->getOpcode() == ARM::t2CMPri) && - (OI->getOpcode() == ARM::SUBri || - OI->getOpcode() == ARM::t2SUBri) && + if (CmpI->getOpcode() == ARM::tCMPr && OI->getOpcode() == ARM::tSUBrr && + ((OI->getOperand(2).getReg() == SrcReg && + OI->getOperand(3).getReg() == SrcReg2) || + (OI->getOperand(2).getReg() == SrcReg2 && + OI->getOperand(3).getReg() == SrcReg))) { + IsThumb1 = true; + return true; + } + + if ((CmpI->getOpcode() == ARM::CMPri || CmpI->getOpcode() == ARM::t2CMPri) && + (OI->getOpcode() == ARM::SUBri || OI->getOpcode() == ARM::t2SUBri) && OI->getOperand(1).getReg() == SrcReg && - OI->getOperand(2).getImm() == ImmValue) + OI->getOperand(2).getImm() == ImmValue) { + IsThumb1 = false; + return true; + } + + if (CmpI->getOpcode() == ARM::tCMPi8 && + (OI->getOpcode() == ARM::tSUBi8 || OI->getOpcode() == ARM::tSUBi3) && + OI->getOperand(2).getReg() == SrcReg && + OI->getOperand(3).getImm() == ImmValue) { + IsThumb1 = true; return true; + } if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) && (OI->getOpcode() == ARM::ADDrr || OI->getOpcode() == ARM::t2ADDrr || OI->getOpcode() == ARM::ADDri || OI->getOpcode() == ARM::t2ADDri) && OI->getOperand(0).isReg() && OI->getOperand(1).isReg() && OI->getOperand(0).getReg() == SrcReg && - OI->getOperand(1).getReg() == SrcReg2) + OI->getOperand(1).getReg() == SrcReg2) { + IsThumb1 = false; + return true; + } + + if (CmpI->getOpcode() == ARM::tCMPr && + (OI->getOpcode() == ARM::tADDi3 || OI->getOpcode() == ARM::tADDi8 || + OI->getOpcode() == ARM::tADDrr) && + OI->getOperand(0).getReg() == SrcReg && + OI->getOperand(2).getReg() == SrcReg2) { + IsThumb1 = true; return true; + } + return false; } @@ -2662,6 +2802,17 @@ static bool isOptimizeCompareCandidate(MachineInstr *MI, bool &IsThumb1) { case ARM::tSUBi3: case ARM::tSUBi8: case ARM::tMUL: + case ARM::tADC: + case ARM::tSBC: + case ARM::tRSB: + case ARM::tAND: + case ARM::tORR: + case ARM::tEOR: + case ARM::tBIC: + case ARM::tMVN: + case ARM::tASRri: + case ARM::tASRrr: + case ARM::tROR: IsThumb1 = true; LLVM_FALLTHROUGH; case ARM::RSBrr: @@ -2761,7 +2912,8 @@ bool ARMBaseInstrInfo::optimizeCompareInstr( // For CMPri w/ CmpValue != 0, a SubAdd may still be a candidate. // Thus we cannot return here. if (CmpInstr.getOpcode() == ARM::CMPri || - CmpInstr.getOpcode() == ARM::t2CMPri) + CmpInstr.getOpcode() == ARM::t2CMPri || + CmpInstr.getOpcode() == ARM::tCMPi8) MI = nullptr; else return false; @@ -2783,20 +2935,22 @@ bool ARMBaseInstrInfo::optimizeCompareInstr( // CMP. This peephole works on the vregs, so is still in SSA form. As a // consequence, the movs won't redefine/kill the MUL operands which would // make this reordering illegal. 
+ const TargetRegisterInfo *TRI = &getRegisterInfo(); if (MI && IsThumb1) { --I; - bool CanReorder = true; - const bool HasStmts = I != E; - for (; I != E; --I) { - if (I->getOpcode() != ARM::tMOVi8) { - CanReorder = false; - break; + if (I != E && !MI->readsRegister(ARM::CPSR, TRI)) { + bool CanReorder = true; + for (; I != E; --I) { + if (I->getOpcode() != ARM::tMOVi8) { + CanReorder = false; + break; + } + } + if (CanReorder) { + MI = MI->removeFromParent(); + E = CmpInstr; + CmpInstr.getParent()->insert(E, MI); } - } - if (HasStmts && CanReorder) { - MI = MI->removeFromParent(); - E = CmpInstr; - CmpInstr.getParent()->insert(E, MI); } I = CmpInstr; E = MI; @@ -2804,12 +2958,13 @@ bool ARMBaseInstrInfo::optimizeCompareInstr( // Check that CPSR isn't set between the comparison instruction and the one we // want to change. At the same time, search for SubAdd. - const TargetRegisterInfo *TRI = &getRegisterInfo(); + bool SubAddIsThumb1 = false; do { const MachineInstr &Instr = *--I; // Check whether CmpInstr can be made redundant by the current instruction. - if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &Instr)) { + if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &Instr, + SubAddIsThumb1)) { SubAdd = &*I; break; } @@ -2824,14 +2979,25 @@ bool ARMBaseInstrInfo::optimizeCompareInstr( // change. We can't do this transformation. return false; - } while (I != B); + if (I == B) { + // In some cases, we scan the use-list of an instruction for an AND; + // that AND is in the same BB, but may not be scheduled before the + // corresponding TST. In that case, bail out. + // + // FIXME: We could try to reschedule the AND. + return false; + } + } while (true); // Return false if no candidates exist. if (!MI && !SubAdd) return false; - // The single candidate is called MI. - if (!MI) MI = SubAdd; + // If we found a SubAdd, use it as it will be closer to the CMP + if (SubAdd) { + MI = SubAdd; + IsThumb1 = SubAddIsThumb1; + } // We can't use a predicated instruction - it doesn't always write the flags. if (isPredicated(*MI)) @@ -2899,9 +3065,13 @@ bool ARMBaseInstrInfo::optimizeCompareInstr( // operands will be modified. unsigned Opc = SubAdd->getOpcode(); bool IsSub = Opc == ARM::SUBrr || Opc == ARM::t2SUBrr || - Opc == ARM::SUBri || Opc == ARM::t2SUBri; - if (!IsSub || (SrcReg2 != 0 && SubAdd->getOperand(1).getReg() == SrcReg2 && - SubAdd->getOperand(2).getReg() == SrcReg)) { + Opc == ARM::SUBri || Opc == ARM::t2SUBri || + Opc == ARM::tSUBrr || Opc == ARM::tSUBi3 || + Opc == ARM::tSUBi8; + unsigned OpI = Opc != ARM::tSUBrr ? 1 : 2; + if (!IsSub || + (SrcReg2 != 0 && SubAdd->getOperand(OpI).getReg() == SrcReg2 && + SubAdd->getOperand(OpI + 1).getReg() == SrcReg)) { // VSel doesn't support condition code update. if (IsInstrVSel) return false; @@ -2979,9 +3149,10 @@ bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const { ++Next; unsigned SrcReg, SrcReg2; int CmpMask, CmpValue; + bool IsThumb1; if (Next != MI.getParent()->end() && analyzeCompare(*Next, SrcReg, SrcReg2, CmpMask, CmpValue) && - isRedundantFlagInstr(&*Next, SrcReg, SrcReg2, CmpValue, &MI)) + isRedundantFlagInstr(&*Next, SrcReg, SrcReg2, CmpValue, &MI, IsThumb1)) return false; return true; } @@ -3372,7 +3543,12 @@ unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr &MI) const { I != E; ++I) { Size += (*I)->getSize(); } - return Size / 4; + // FIXME: The scheduler currently can't handle values larger than 16. 
But + // the values can actually go up to 32 for floating-point load/store + // multiple (VLDMIA etc.). Also, the way this code is reasoning about memory + // operations isn't right; we could end up with "extra" memory operands for + // various reasons, like tail merge merging two memory operations. + return std::min(Size / 4, 16U); } static unsigned getNumMicroOpsSingleIssuePlusExtras(unsigned Opc, @@ -4093,7 +4269,7 @@ int ARMBaseInstrInfo::getOperandLatencyImpl( // instructions). if (Latency > 0 && Subtarget.isThumb2()) { const MachineFunction *MF = DefMI.getParent()->getParent(); - // FIXME: Use Function::optForSize(). + // FIXME: Use Function::hasOptSize(). if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize)) --Latency; } @@ -4517,6 +4693,31 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG"; return false; } + if (MI.getOpcode() == ARM::tMOVr && !Subtarget.hasV6Ops()) { + // Make sure we don't generate a lo-lo mov that isn't supported. + if (!ARM::hGPRRegClass.contains(MI.getOperand(0).getReg()) && + !ARM::hGPRRegClass.contains(MI.getOperand(1).getReg())) { + ErrInfo = "Non-flag-setting Thumb1 mov is v6-only"; + return false; + } + } + if (MI.getOpcode() == ARM::tPUSH || + MI.getOpcode() == ARM::tPOP || + MI.getOpcode() == ARM::tPOP_RET) { + for (int i = 2, e = MI.getNumOperands(); i < e; ++i) { + if (MI.getOperand(i).isImplicit() || + !MI.getOperand(i).isReg()) + continue; + unsigned Reg = MI.getOperand(i).getReg(); + if (Reg < ARM::R0 || Reg > ARM::R7) { + if (!(MI.getOpcode() == ARM::tPUSH && Reg == ARM::LR) && + !(MI.getOpcode() == ARM::tPOP_RET && Reg == ARM::PC)) { + ErrInfo = "Unsupported register in Thumb1 push/pop"; + return false; + } + } + } + } return true; } @@ -5107,3 +5308,44 @@ ARMBaseInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { {MO_NONLAZY, "arm-nonlazy"}}; return makeArrayRef(TargetFlags); } + +bool llvm::registerDefinedBetween(unsigned Reg, + MachineBasicBlock::iterator From, + MachineBasicBlock::iterator To, + const TargetRegisterInfo *TRI) { + for (auto I = From; I != To; ++I) + if (I->modifiesRegister(Reg, TRI)) + return true; + return false; +} + +MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br, + const TargetRegisterInfo *TRI) { + // Search backwards to the instruction that defines CPSR. This may or may + // not be a CMP; we check that after this loop. If we find another + // instruction that reads CPSR, we return nullptr. + MachineBasicBlock::iterator CmpMI = Br; + while (CmpMI != Br->getParent()->begin()) { + --CmpMI; + if (CmpMI->modifiesRegister(ARM::CPSR, TRI)) + break; + if (CmpMI->readsRegister(ARM::CPSR, TRI)) + break; + } + + // Check that this inst is a CMP r[0-7], #0 and that the register + // is not redefined between the cmp and the br.
+ if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri) + return nullptr; + unsigned Reg = CmpMI->getOperand(0).getReg(); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg); + if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0) + return nullptr; + if (!isARMLowRegister(Reg)) + return nullptr; + if (registerDefinedBetween(Reg, CmpMI->getNextNode(), Br, TRI)) + return nullptr; + + return &*CmpMI; +} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index de1f307083ba..c28983fcc15c 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -1,9 +1,8 @@ //===-- ARMBaseInstrInfo.h - ARM Base Instruction Information ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -399,6 +398,11 @@ private: void expandMEMCPY(MachineBasicBlock::iterator) const; + /// Identify instructions that can be folded into a MOVCC instruction, and + /// return the defining instruction. + MachineInstr *canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, + const TargetInstrInfo *TII) const; + private: /// Modeling special VFP / NEON fp MLA / MLS hazards. @@ -478,6 +482,21 @@ bool isUncondBranchOpcode(int Opc) { return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B; } +static inline bool isVPTOpcode(int Opc) { + return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 || + Opc == ARM::MVE_VPTv16s8 || Opc == ARM::MVE_VPTv8i16 || + Opc == ARM::MVE_VPTv8u16 || Opc == ARM::MVE_VPTv8s16 || + Opc == ARM::MVE_VPTv4i32 || Opc == ARM::MVE_VPTv4u32 || + Opc == ARM::MVE_VPTv4s32 || Opc == ARM::MVE_VPTv4f32 || + Opc == ARM::MVE_VPTv8f16 || Opc == ARM::MVE_VPTv16i8r || + Opc == ARM::MVE_VPTv16u8r || Opc == ARM::MVE_VPTv16s8r || + Opc == ARM::MVE_VPTv8i16r || Opc == ARM::MVE_VPTv8u16r || + Opc == ARM::MVE_VPTv8s16r || Opc == ARM::MVE_VPTv4i32r || + Opc == ARM::MVE_VPTv4u32r || Opc == ARM::MVE_VPTv4s32r || + Opc == ARM::MVE_VPTv4f32r || Opc == ARM::MVE_VPTv8f16r || + Opc == ARM::MVE_VPST; +} + static inline bool isCondBranchOpcode(int Opc) { return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc; @@ -505,6 +524,28 @@ static inline bool isPushOpcode(int Opc) { Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD; } +/// isValidCoprocessorNumber - decide whether an explicit coprocessor +/// number is legal in generic instructions like CDP. The answer can +/// vary with the subtarget. +static inline bool isValidCoprocessorNumber(unsigned Num, + const FeatureBitset& featureBits) { + // Armv8-A disallows everything *other* than 111x (CP14 and CP15). + if (featureBits[ARM::HasV8Ops] && (Num & 0xE) != 0xE) + return false; + + // Armv7 disallows 101x (CP10 and CP11), which clash with VFP/NEON. + if (featureBits[ARM::HasV7Ops] && (Num & 0xE) == 0xA) + return false; + + // Armv8.1-M also disallows 100x (CP8,CP9) and 111x (CP14,CP15) + // which clash with MVE. 
+ if (featureBits[ARM::HasV8_1MMainlineOps] && + ((Num & 0xE) == 0x8 || (Num & 0xE) == 0xE)) + return false; + + return true; +} + /// getInstrPredicate - If instruction is predicated, returns its predicate /// condition, otherwise returns AL. It also returns the condition code /// register by reference. @@ -512,12 +553,6 @@ ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg); unsigned getMatchingCondBranchOpcode(unsigned Opc); -/// Determine if MI can be folded into an ARM MOVCC instruction, and return the -/// opcode of the SSA instruction representing the conditional MI. -unsigned canFoldARMInstrIntoMOVCC(unsigned Reg, - MachineInstr *&MI, - const MachineRegisterInfo &MRI); - /// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether /// the instruction is encoded with an 'S' bit is determined by the optional /// CPSR def operand. @@ -568,6 +603,23 @@ bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, const ARMBaseInstrInfo &TII); +/// Return true if Reg is defined between From and To +bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From, + MachineBasicBlock::iterator To, + const TargetRegisterInfo *TRI); + +/// Search backwards from a tBcc to find a tCMPi8 against 0, meaning +/// we can convert them to a tCBZ or tCBNZ. Return nullptr if not found. +MachineInstr *findCMPToFoldIntoCBZ(MachineInstr *Br, + const TargetRegisterInfo *TRI); + +void addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB); +void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned DestReg); + +void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond); +void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond, + unsigned Inactive); + } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 02b3daf3c6fd..dc99b37742da 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- ARMBaseRegisterInfo.cpp - ARM Register Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -150,7 +149,7 @@ ARMBaseRegisterInfo::getTLSCallPreservedMask(const MachineFunction &MF) const { const uint32_t * ARMBaseRegisterInfo::getSjLjDispatchPreservedMask(const MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); - if (!STI.useSoftFloat() && STI.hasVFP2() && !STI.isThumb1Only()) + if (!STI.useSoftFloat() && STI.hasVFP2Base() && !STI.isThumb1Only()) return CSR_NoRegs_RegMask; else return CSR_FPRegs_RegMask; @@ -194,7 +193,7 @@ getReservedRegs(const MachineFunction &MF) const { if (STI.isR9Reserved()) markSuperRegs(Reserved, ARM::R9); // Reserve D16-D31 if the subtarget doesn't support them.
- if (!STI.hasVFP3() || STI.hasD16()) { + if (!STI.hasD32()) { static_assert(ARM::D31 == ARM::D16 + 15, "Register list not consecutive!"); for (unsigned R = 0; R < 16; ++R) markSuperRegs(Reserved, ARM::D16 + R); @@ -204,6 +203,8 @@ getReservedRegs(const MachineFunction &MF) const { for (MCSubRegIterator SI(Reg, this); SI.isValid(); ++SI) if (Reserved.test(*SI)) markSuperRegs(Reserved, Reg); + // For v8.1m architecture + markSuperRegs(Reserved, ARM::ZR); assert(checkAllSuperRegsMarked(Reserved)); return Reserved; @@ -369,29 +370,35 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const ARMFunctionInfo *AFI = MF.getInfo(); const ARMFrameLowering *TFI = getFrameLowering(MF); - // When outgoing call frames are so large that we adjust the stack pointer - // around the call, we can no longer use the stack pointer to reach the - // emergency spill slot. + // If we have stack realignment and VLAs, we have no pointer to use to + // access the stack. If we have stack realignment, and a large call frame, + // we have no place to allocate the emergency spill slot. if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF)) return true; // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited // negative range for ldr/str (255), and thumb1 is positive offsets only. + // // It's going to be better to use the SP or Base Pointer instead. When there // are variable sized objects, we can't reference off of the SP, so we // reserve a Base Pointer. - if (AFI->isThumbFunction() && MFI.hasVarSizedObjects()) { - // Conservatively estimate whether the negative offset from the frame - // pointer will be sufficient to reach. If a function has a smallish - // frame, it's less likely to have lots of spills and callee saved - // space, so it's all more likely to be within range of the frame pointer. - // If it's wrong, the scavenger will still enable access to work, it just - // won't be optimal. - if (AFI->isThumb2Function() && MFI.getLocalFrameSize() < 128) - return false; + // + // For Thumb2, estimate whether a negative offset from the frame pointer + // will be sufficient to reach the whole stack frame. If a function has a + // smallish frame, it's less likely to have lots of spills and callee saved + // space, so it's all more likely to be within range of the frame pointer. + // If it's wrong, the scavenger will still enable access to work, it just + // won't be optimal. (We should always be able to reach the emergency + // spill slot from the frame pointer.) + if (AFI->isThumb2Function() && MFI.hasVarSizedObjects() && + MFI.getLocalFrameSize() >= 128) + return true; + // For Thumb1, if sp moves, nothing is in range, so force a base pointer. + // This is necessary for correctness in cases where we need an emergency + // spill slot. (In Thumb1, we can't use a negative offset from the frame + // pointer.) + if (AFI->isThumb1OnlyFunction() && !TFI->hasReservedCallFrame(MF)) return true; - } - return false; } @@ -425,7 +432,7 @@ cannotEliminateFrame(const MachineFunction &MF) const { || needsStackRealignment(MF); } -unsigned +Register ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget(); const ARMFrameLowering *TFI = getFrameLowering(MF); @@ -785,7 +792,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int PIdx = MI.findFirstPredOperandIdx(); ARMCC::CondCodes Pred = (PIdx == -1) ? 
ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm(); - unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg(); + Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg(); if (Offset == 0) // Must be addrmode4/6. MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 45d29ebc0bd3..7e2c72b4d712 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -1,9 +1,8 @@ //===-- ARMBaseRegisterInfo.h - ARM Register Information Impl ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -174,7 +173,7 @@ public: bool cannotEliminateFrame(const MachineFunction &MF) const; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; unsigned getBaseRegister() const { return BasePtr; } bool isLowRegister(unsigned Reg) const; diff --git a/lib/Target/ARM/ARMBasicBlockInfo.cpp b/lib/Target/ARM/ARMBasicBlockInfo.cpp new file mode 100644 index 000000000000..2de90e816b33 --- /dev/null +++ b/lib/Target/ARM/ARMBasicBlockInfo.cpp @@ -0,0 +1,146 @@ +//===--- ARMBasicBlockInfo.cpp - Utilities for block sizes ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBasicBlockInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include + +#define DEBUG_TYPE "arm-bb-utils" + +using namespace llvm; + +namespace llvm { + +// mayOptimizeThumb2Instruction - Returns true if optimizeThumb2Instructions +// below may shrink MI. +static bool +mayOptimizeThumb2Instruction(const MachineInstr *MI) { + switch(MI->getOpcode()) { + // optimizeThumb2Instructions. + case ARM::t2LEApcrel: + case ARM::t2LDRpci: + // optimizeThumb2Branches. + case ARM::t2B: + case ARM::t2Bcc: + case ARM::tBcc: + // optimizeThumb2JumpTables. + case ARM::t2BR_JT: + case ARM::tBR_JTr: + return true; + } + return false; +} + +void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) { + LLVM_DEBUG(dbgs() << "computeBlockSize: " << MBB->getName() << "\n"); + BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; + BBI.Size = 0; + BBI.Unalign = 0; + BBI.PostAlign = 0; + + for (MachineInstr &I : *MBB) { + BBI.Size += TII->getInstSizeInBytes(I); + // For inline asm, getInstSizeInBytes returns a conservative estimate. + // The actual size may be smaller, but still a multiple of the instr size. + if (I.isInlineAsm()) + BBI.Unalign = isThumb ? 1 : 2; + // Also consider instructions that may be shrunk later. 
+ else if (isThumb && mayOptimizeThumb2Instruction(&I)) + BBI.Unalign = 1; + } + + // tBR_JTr contains a .align 2 directive. + if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) { + BBI.PostAlign = 2; + MBB->getParent()->ensureAlignment(2); + } +} + +/// getOffsetOf - Return the current offset of the specified machine instruction +/// from the start of the function. This offset changes as stuff is moved +/// around inside the function. +unsigned ARMBasicBlockUtils::getOffsetOf(MachineInstr *MI) const { + const MachineBasicBlock *MBB = MI->getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BBInfo[MBB->getNumber()].Offset; + + // Sum instructions before MI in MBB. + for (MachineBasicBlock::const_iterator I = MBB->begin(); &*I != MI; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + Offset += TII->getInstSizeInBytes(*I); + } + return Offset; +} + +/// isBBInRange - Returns true if the distance between specific MI and +/// specific BB can fit in MI's displacement field. +bool ARMBasicBlockUtils::isBBInRange(MachineInstr *MI, + MachineBasicBlock *DestBB, + unsigned MaxDisp) const { + unsigned PCAdj = isThumb ? 4 : 8; + unsigned BrOffset = getOffsetOf(MI) + PCAdj; + unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; + + LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB) + << " from " << printMBBReference(*MI->getParent()) + << " max delta=" << MaxDisp << " from " << getOffsetOf(MI) + << " to " << DestOffset << " offset " + << int(DestOffset - BrOffset) << "\t" << *MI); + + if (BrOffset <= DestOffset) { + // Branch before the Dest. + if (DestOffset-BrOffset <= MaxDisp) + return true; + } else { + if (BrOffset-DestOffset <= MaxDisp) + return true; + } + return false; +} + +void ARMBasicBlockUtils::adjustBBOffsetsAfter(MachineBasicBlock *BB) { + assert(BB->getParent() == &MF && + "Basic block is not a child of the current function.\n"); + + unsigned BBNum = BB->getNumber(); + LLVM_DEBUG(dbgs() << "Adjust block:\n" + << " - name: " << BB->getName() << "\n" + << " - number: " << BB->getNumber() << "\n" + << " - function: " << MF.getName() << "\n" + << " - blocks: " << MF.getNumBlockIDs() << "\n"); + + for(unsigned i = BBNum + 1, e = MF.getNumBlockIDs(); i < e; ++i) { + // Get the offset and known bits at the end of the layout predecessor. + // Include the alignment of the current block. + unsigned LogAlign = MF.getBlockNumbered(i)->getAlignment(); + unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); + unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); + + // This is where block i begins. Stop if the offset is already correct, + // and we have updated 2 blocks. This is the maximum number of blocks + // changed before calling this function. + if (i > BBNum + 2 && + BBInfo[i].Offset == Offset && + BBInfo[i].KnownBits == KnownBits) + break; + + BBInfo[i].Offset = Offset; + BBInfo[i].KnownBits = KnownBits; + } +} + +} // end namespace llvm diff --git a/lib/Target/ARM/ARMBasicBlockInfo.h b/lib/Target/ARM/ARMBasicBlockInfo.h index e0cb0aa676a6..400bba351cec 100644 --- a/lib/Target/ARM/ARMBasicBlockInfo.h +++ b/lib/Target/ARM/ARMBasicBlockInfo.h @@ -1,9 +1,8 @@ //===-- ARMBasicBlockInfo.h - Basic Block Information -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,12 +13,16 @@ #ifndef LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H #define LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H +#include "ARMBaseInstrInfo.h" +#include "ARMMachineFunctionInfo.h" #include "llvm/Support/MathExtras.h" #include #include namespace llvm { +using BBInfoVector = SmallVectorImpl; + /// UnknownPadding - Return the worst case padding that could result from /// unknown offset bits. This does not include alignment padding caused by /// known offset bits. @@ -104,6 +107,54 @@ struct BasicBlockInfo { } }; +class ARMBasicBlockUtils { + +private: + MachineFunction &MF; + bool isThumb = false; + const ARMBaseInstrInfo *TII = nullptr; + SmallVector BBInfo; + +public: + ARMBasicBlockUtils(MachineFunction &MF) : MF(MF) { + TII = + static_cast(MF.getSubtarget().getInstrInfo()); + isThumb = MF.getInfo()->isThumbFunction(); + } + + void computeAllBlockSizes() { + BBInfo.resize(MF.getNumBlockIDs()); + for (MachineBasicBlock &MBB : MF) + computeBlockSize(&MBB); + } + + void computeBlockSize(MachineBasicBlock *MBB); + + unsigned getOffsetOf(MachineInstr *MI) const; + + unsigned getOffsetOf(MachineBasicBlock *MBB) const { + return BBInfo[MBB->getNumber()].Offset; + } + + void adjustBBOffsetsAfter(MachineBasicBlock *MBB); + + void adjustBBSize(MachineBasicBlock *MBB, int Size) { + BBInfo[MBB->getNumber()].Size += Size; + } + + bool isBBInRange(MachineInstr *MI, MachineBasicBlock *DestBB, + unsigned MaxDisp) const; + + void insert(unsigned BBNum, BasicBlockInfo BBI) { + BBInfo.insert(BBInfo.begin() + BBNum, BBI); + } + + void clear() { BBInfo.clear(); } + + BBInfoVector &getBBInfo() { return BBInfo; } + +}; + } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 8e80c32bcf89..0cbe6e1871e4 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -1,9 +1,8 @@ //===- llvm/lib/Target/ARM/ARMCallLowering.cpp - Call lowering ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -56,7 +55,7 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI) static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T) { if (T->isArrayTy()) - return true; + return isSupportedType(DL, TLI, T->getArrayElementType()); if (T->isStructTy()) { // For now we only allow homogeneous structs that we can manipulate with @@ -65,7 +64,7 @@ static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, for (unsigned i = 1, e = StructT->getNumElements(); i != e; ++i) if (StructT->getElementType(i) != StructT->getElementType(0)) return false; - return true; + return isSupportedType(DL, TLI, StructT->getElementType(0)); } EVT VT = TLI.getValueType(DL, T, true); @@ -91,27 +90,27 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { MachineInstrBuilder &MIB, CCAssignFn *AssignFn) : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} - unsigned getStackAddress(uint64_t Size, int64_t Offset, + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) && "Unsupported size"); LLT p0 = LLT::pointer(0, 32); LLT s32 = LLT::scalar(32); - unsigned SPReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(SPReg, ARM::SP); + Register SPReg = MRI.createGenericVirtualRegister(p0); + MIRBuilder.buildCopy(SPReg, Register(ARM::SP)); - unsigned OffsetReg = MRI.createGenericVirtualRegister(s32); + Register OffsetReg = MRI.createGenericVirtualRegister(s32); MIRBuilder.buildConstant(OffsetReg, Offset); - unsigned AddrReg = MRI.createGenericVirtualRegister(p0); + Register AddrReg = MRI.createGenericVirtualRegister(p0); MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg); MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); return AddrReg; } - void assignValueToReg(unsigned ValVReg, unsigned PhysReg, + void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { assert(VA.isRegLoc() && "Value should be assigned to reg"); assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?"); assert(VA.getValVT().getSizeInBits() <= 64 && "Unsupported value size"); assert(VA.getLocVT().getSizeInBits() <= 64 && "Unsupported location size"); - unsigned ExtReg = extendRegister(ValVReg, VA); + Register ExtReg = extendRegister(ValVReg, VA); MIRBuilder.buildCopy(PhysReg, ExtReg); MIB.addUse(PhysReg, RegState::Implicit); } - void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) && "Unsupported size"); - unsigned ExtReg = extendRegister(ValVReg, VA); + Register ExtReg = extendRegister(ValVReg, VA); auto MMO = MIRBuilder.getMF().getMachineMemOperand( MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(), - /* Alignment */ 0); + /* Alignment */ 1); MIRBuilder.buildStore(ExtReg, Addr, *MMO); } unsigned assignCustomValue(const CallLowering::ArgInfo &Arg, ArrayRef<CCValAssign> VAs) override { + assert(Arg.Regs.size() == 1 && "Can't handle multiple regs yet"); + CCValAssign VA = VAs[0]; assert(VA.needsCustom() && "Value doesn't need custom handling"); assert(VA.getValVT() == MVT::f64 && "Unsupported
type"); @@ -152,9 +153,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { assert(VA.isRegLoc() && "Value should be in reg"); assert(NextVA.isRegLoc() && "Value should be in reg"); - unsigned NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)), + Register NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)), MRI.createGenericVirtualRegister(LLT::scalar(32))}; - MIRBuilder.buildUnmerge(NewRegs, Arg.Reg); + MIRBuilder.buildUnmerge(NewRegs, Arg.Regs[0]); bool IsLittle = MIRBuilder.getMF().getSubtarget().isLittle(); if (!IsLittle) @@ -183,18 +184,17 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { } // end anonymous namespace -void ARMCallLowering::splitToValueTypes( - const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, - MachineFunction &MF, const SplitArgTy &PerformArgSplit) const { +void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, + SmallVectorImpl &SplitArgs, + MachineFunction &MF) const { const ARMTargetLowering &TLI = *getTLI(); LLVMContext &Ctx = OrigArg.Ty->getContext(); const DataLayout &DL = MF.getDataLayout(); - MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); SmallVector SplitVTs; - SmallVector Offsets; - ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, nullptr, nullptr, 0); + assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch"); if (SplitVTs.size() == 1) { // Even if there is no splitting to do, we still want to replace the @@ -202,12 +202,12 @@ void ARMCallLowering::splitToValueTypes( auto Flags = OrigArg.Flags; unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty); Flags.setOrigAlign(OriginalAlignment); - SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), Flags, - OrigArg.IsFixed); + SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), + Flags, OrigArg.IsFixed); return; } - unsigned FirstRegIdx = SplitArgs.size(); + // Create one ArgInfo for each virtual register. for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) { EVT SplitVT = SplitVTs[i]; Type *SplitTy = SplitVT.getTypeForEVT(Ctx); @@ -225,19 +225,16 @@ void ARMCallLowering::splitToValueTypes( Flags.setInConsecutiveRegsLast(); } - SplitArgs.push_back( - ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)), - SplitTy, Flags, OrigArg.IsFixed}); + // FIXME: We also want to split SplitTy further. + Register PartReg = OrigArg.Regs[i]; + SplitArgs.emplace_back(PartReg, SplitTy, Flags, OrigArg.IsFixed); } - - for (unsigned i = 0; i < Offsets.size(); ++i) - PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8); } /// Lower the return value for the already existing \p Ret. This assumes that /// \p MIRBuilder's insertion point is correct. bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, - const Value *Val, ArrayRef VRegs, + const Value *Val, ArrayRef VRegs, MachineInstrBuilder &Ret) const { if (!Val) // Nothing to do here. 
@@ -251,35 +248,22 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, if (!isSupportedType(DL, TLI, Val->getType())) return false; - SmallVector SplitEVTs; - ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs); - assert(VRegs.size() == SplitEVTs.size() && - "For each split Type there should be exactly one VReg."); - - SmallVector SplitVTs; - LLVMContext &Ctx = Val->getType()->getContext(); - for (unsigned i = 0; i < SplitEVTs.size(); ++i) { - ArgInfo CurArgInfo(VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)); - setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); - - SmallVector Regs; - splitToValueTypes( - CurArgInfo, SplitVTs, MF, - [&](unsigned Reg, uint64_t Offset) { Regs.push_back(Reg); }); - if (Regs.size() > 1) - MIRBuilder.buildUnmerge(Regs, VRegs[i]); - } + ArgInfo OrigRetInfo(VRegs, Val->getType()); + setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F); + + SmallVector SplitRetInfos; + splitToValueTypes(OrigRetInfo, SplitRetInfos, MF); CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg()); OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, AssignFn); - return handleAssignments(MIRBuilder, SplitVTs, RetHandler); + return handleAssignments(MIRBuilder, SplitRetInfos, RetHandler); } bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const { + ArrayRef VRegs) const { assert(!Val == VRegs.empty() && "Return value without a vreg"); auto const &ST = MIRBuilder.getMF().getSubtarget(); @@ -302,7 +286,9 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { CCAssignFn AssignFn) : ValueHandler(MIRBuilder, MRI, AssignFn) {} - unsigned getStackAddress(uint64_t Size, int64_t Offset, + bool isArgumentHandler() const override { return true; } + + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) && "Unsupported size"); @@ -319,7 +305,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { return AddrReg; } - void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) && "Unsupported size"); @@ -332,22 +318,22 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { assert(MRI.getType(ValVReg).isScalar() && "Only scalars supported atm"); auto LoadVReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); - buildLoad(LoadVReg, Addr, Size, /* Alignment */ 0, MPO); + buildLoad(LoadVReg, Addr, Size, /* Alignment */ 1, MPO); MIRBuilder.buildTrunc(ValVReg, LoadVReg); } else { // If the value is not extended, a simple load will suffice. 
- buildLoad(ValVReg, Addr, Size, /* Alignment */ 0, MPO); + buildLoad(ValVReg, Addr, Size, /* Alignment */ 1, MPO); } } - void buildLoad(unsigned Val, unsigned Addr, uint64_t Size, unsigned Alignment, + void buildLoad(Register Val, Register Addr, uint64_t Size, unsigned Alignment, MachinePointerInfo &MPO) { auto MMO = MIRBuilder.getMF().getMachineMemOperand( MPO, MachineMemOperand::MOLoad, Size, Alignment); MIRBuilder.buildLoad(Val, Addr, *MMO); } - void assignValueToReg(unsigned ValVReg, unsigned PhysReg, + void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { assert(VA.isRegLoc() && "Value should be assigned to reg"); assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?"); @@ -376,6 +362,8 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { unsigned assignCustomValue(const ARMCallLowering::ArgInfo &Arg, ArrayRef<CCValAssign> VAs) override { + assert(Arg.Regs.size() == 1 && "Can't handle multiple regs yet"); + CCValAssign VA = VAs[0]; assert(VA.needsCustom() && "Value doesn't need custom handling"); assert(VA.getValVT() == MVT::f64 && "Unsupported type"); @@ -390,7 +378,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { assert(VA.isRegLoc() && "Value should be in reg"); assert(NextVA.isRegLoc() && "Value should be in reg"); - unsigned NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)), + Register NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)), MRI.createGenericVirtualRegister(LLT::scalar(32))}; assignValueToReg(NewRegs[0], VA.getLocReg(), VA); @@ -400,7 +388,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { if (!IsLittle) std::swap(NewRegs[0], NewRegs[1]); - MIRBuilder.buildMerge(Arg.Reg, NewRegs); + MIRBuilder.buildMerge(Arg.Regs[0], NewRegs); return 1; } @@ -423,9 +411,9 @@ struct FormalArgHandler : public IncomingValueHandler { } // end anonymous namespace -bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, - const Function &F, - ArrayRef<unsigned> VRegs) const { +bool ARMCallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<ArrayRef<Register>> VRegs) const { auto &TLI = *getTLI<ARMTargetLowering>(); auto Subtarget = TLI.getSubtarget(); @@ -456,21 +444,13 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(), AssignFn); - SmallVector<ArgInfo, 8> ArgInfos; - SmallVector<unsigned, 4> SplitRegs; + SmallVector<ArgInfo, 8> SplitArgInfos; unsigned Idx = 0; for (auto &Arg : F.args()) { - ArgInfo AInfo(VRegs[Idx], Arg.getType()); - setArgFlags(AInfo, Idx + AttributeList::FirstArgIndex, DL, F); - - SplitRegs.clear(); - - splitToValueTypes(AInfo, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) { - SplitRegs.push_back(Reg); - }); + ArgInfo OrigArgInfo(VRegs[Idx], Arg.getType()); - if (!SplitRegs.empty()) - MIRBuilder.buildMerge(VRegs[Idx], SplitRegs); + setArgFlags(OrigArgInfo, Idx + AttributeList::FirstArgIndex, DL, F); + splitToValueTypes(OrigArgInfo, SplitArgInfos, MF); Idx++; } @@ -478,7 +458,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (!MBB.empty()) MIRBuilder.setInstr(*MBB.begin()); - if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler)) + if (!handleAssignments(MIRBuilder, SplitArgInfos, ArgHandler)) return false; // Move back to the end of the basic block. @@ -540,19 +520,19 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create the call instruction so we can add the implicit uses of arg // registers, but don't insert it yet.
- bool isDirect = !Callee.isReg(); - auto CallOpcode = getCallOpcode(STI, isDirect); + bool IsDirect = !Callee.isReg(); + auto CallOpcode = getCallOpcode(STI, IsDirect); auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode); - bool isThumb = STI.isThumb(); - if (isThumb) + bool IsThumb = STI.isThumb(); + if (IsThumb) MIB.add(predOps(ARMCC::AL)); MIB.add(Callee); - if (!isDirect) { + if (!IsDirect) { auto CalleeReg = Callee.getReg(); if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) { - unsigned CalleeIdx = isThumb ? 2 : 0; + unsigned CalleeIdx = IsThumb ? 2 : 0; MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass( MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(), *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx)); @@ -561,27 +541,22 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv)); + bool IsVarArg = false; SmallVector ArgInfos; for (auto Arg : OrigArgs) { if (!isSupportedType(DL, TLI, Arg.Ty)) return false; if (!Arg.IsFixed) - return false; + IsVarArg = true; if (Arg.Flags.isByVal()) return false; - SmallVector Regs; - splitToValueTypes(Arg, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) { - Regs.push_back(Reg); - }); - - if (Regs.size() > 1) - MIRBuilder.buildUnmerge(Regs, Arg.Reg); + splitToValueTypes(Arg, ArgInfos, MF); } - auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, IsVarArg); OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler)) return false; @@ -594,22 +569,11 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; ArgInfos.clear(); - SmallVector SplitRegs; - splitToValueTypes(OrigRet, ArgInfos, MF, - [&](unsigned Reg, uint64_t Offset) { - SplitRegs.push_back(Reg); - }); - - auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, /*IsVarArg=*/false); + splitToValueTypes(OrigRet, ArgInfos, MF); + auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, IsVarArg); CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler)) return false; - - if (!SplitRegs.empty()) { - // We have split the value and allocated each individual piece, now build - // it up again. - MIRBuilder.buildMerge(OrigRet.Reg, SplitRegs); - } } // We now know the size of the stack - update the ADJCALLSTACKDOWN diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h index 45a988a2f00e..794127b5ebc7 100644 --- a/lib/Target/ARM/ARMCallLowering.h +++ b/lib/Target/ARM/ARMCallLowering.h @@ -1,9 +1,8 @@ //===- llvm/lib/Target/ARM/ARMCallLowering.h - Call lowering ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -34,10 +33,10 @@ public: ARMCallLowering(const ARMTargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const override; + ArrayRef VRegs) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef VRegs) const override; + ArrayRef> VRegs) const override; bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, const MachineOperand &Callee, const ArgInfo &OrigRet, @@ -45,17 +44,14 @@ public: private: bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs, + ArrayRef VRegs, MachineInstrBuilder &Ret) const; - using SplitArgTy = std::function; - /// Split an argument into one or more arguments that the CC lowering can cope - /// with (e.g. replace pointers with integers). + /// with. void splitToValueTypes(const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, - MachineFunction &MF, - const SplitArgTy &PerformArgSplit) const; + MachineFunction &MF) const; }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMCallingConv.cpp b/lib/Target/ARM/ARMCallingConv.cpp new file mode 100644 index 000000000000..5ede7c67f7c2 --- /dev/null +++ b/lib/Target/ARM/ARMCallingConv.cpp @@ -0,0 +1,284 @@ +//=== ARMCallingConv.cpp - ARM Custom CC Routines ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the ARM Calling Convention that +// aren't done by tablegen, and includes the table generated implementations. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMCallingConv.h" +#include "ARMSubtarget.h" +#include "ARMRegisterInfo.h" +using namespace llvm; + +// APCS f64 is in register pairs, possibly split to stack +static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + // Try to get the first register. + if (unsigned Reg = State.AllocateReg(RegList)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else { + // For the 2nd half of a v2f64, do not fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 4), + LocVT, LocInfo)); + return true; + } + + // Try to get the second register. 
+ if (unsigned Reg = State.AllocateReg(RegList)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(4, 4), + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; + return true; // we handled it +} + +// AAPCS f64 is in aligned register pairs +static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 }; + static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 }; + static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 }; + static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList); + if (Reg == 0) { + + // If we had R3 unallocated only, now we still must to waste it. + Reg = State.AllocateReg(GPRArgRegs); + assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64"); + + // For the 2nd half of a v2f64, do not just fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 8), + LocVT, LocInfo)); + return true; + } + + unsigned i; + for (i = 0; i < 2; ++i) + if (HiRegList[i] == Reg) + break; + + unsigned T = State.AllocateReg(LoRegList[i]); + (void)T; + assert(T == LoRegList[i] && "Could not allocate register"); + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; + return true; // we handled it +} + +static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, CCState &State) { + static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 }; + static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 }; + + unsigned Reg = State.AllocateReg(HiRegList, LoRegList); + if (Reg == 0) + return false; // we didn't handle it + + unsigned i; + for (i = 0; i < 2; ++i) + if (HiRegList[i] == Reg) + break; + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; + if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; + return true; // we handled it +} + +static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + return 
RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State); +} + +static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + +static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; +static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; +static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; + + +// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA +// has InConsecutiveRegs set, and that the last member also has +// InConsecutiveRegsLast set. We must process all members of the HA before +// we can allocate it, as we need to know the total number of registers that +// will be needed in order to (attempt to) allocate a contiguous block. +static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + SmallVectorImpl &PendingMembers = State.getPendingLocs(); + + // AAPCS HFAs must have 1-4 elements, all of the same type + if (PendingMembers.size() > 0) + assert(PendingMembers[0].getLocVT() == LocVT); + + // Add the argument to the list to be allocated once we know the size of the + // aggregate. Store the type's required alignmnent as extra info for later: in + // the [N x i64] case all trace has been removed by the time we actually get + // to do allocation. + PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo, + ArgFlags.getOrigAlign())); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + // Try to allocate a contiguous block of registers, each of the correct + // size to hold one member. + auto &DL = State.getMachineFunction().getDataLayout(); + unsigned StackAlign = DL.getStackAlignment(); + unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); + + ArrayRef RegList; + switch (LocVT.SimpleTy) { + case MVT::i32: { + RegList = RRegList; + unsigned RegIdx = State.getFirstUnallocated(RegList); + + // First consume all registers that would give an unaligned object. Whether + // we go on stack or in regs, no-one will be using them in future. + unsigned RegAlign = alignTo(Align, 4) / 4; + while (RegIdx % RegAlign != 0 && RegIdx < RegList.size()) + State.AllocateReg(RegList[RegIdx++]); + + break; + } + case MVT::f16: + case MVT::f32: + RegList = SRegList; + break; + case MVT::v4f16: + case MVT::f64: + RegList = DRegList; + break; + case MVT::v8f16: + case MVT::v2f64: + RegList = QRegList; + break; + default: + llvm_unreachable("Unexpected member type for block aggregate"); + break; + } + + unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); + if (RegResult) { + for (SmallVectorImpl::iterator It = PendingMembers.begin(); + It != PendingMembers.end(); ++It) { + It->convertToReg(RegResult); + State.addLoc(*It); + ++RegResult; + } + PendingMembers.clear(); + return true; + } + + // Register allocation failed, we'll be needing the stack + unsigned Size = LocVT.getSizeInBits() / 8; + if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) { + // If nothing else has used the stack until this point, a non-HFA aggregate + // can be split between regs and stack. 
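For the HFA path in CC_ARM_AAPCS_Custom_Aggregate above, State.AllocateRegBlock must find one contiguous run of registers big enough for every pending member, or the allocation fails for the aggregate as a whole and the stack paths take over. A toy model of that contiguous-block search, with a plain bit vector standing in for CCState's register bookkeeping (illustrative only, not the LLVM API):

    #include <iostream>
    #include <vector>

    // Returns the first index of `Count` consecutive free registers, or -1
    // if no contiguous block exists; marks the block taken on success.
    static int allocateRegBlock(std::vector<bool> &Taken, unsigned Count) {
      for (unsigned I = 0; I + Count <= Taken.size(); ++I) {
        bool Free = true;
        for (unsigned J = 0; J < Count; ++J)
          Free = Free && !Taken[I + J];
        if (!Free)
          continue;
        for (unsigned J = 0; J < Count; ++J)
          Taken[I + J] = true;
        return (int)I;
      }
      return -1;
    }

    int main() {
      std::vector<bool> DRegs(8, false); // D0..D7, the f64 list above
      DRegs[0] = true;                   // D0 consumed by an earlier argument
      // An HFA of three doubles needs D1..D3 as one block.
      std::cout << "HFA starts at D" << allocateRegBlock(DRegs, 3) << "\n";
    }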
+ unsigned RegIdx = State.getFirstUnallocated(RegList); + for (auto &It : PendingMembers) { + if (RegIdx >= RegList.size()) + It.convertToMem(State.AllocateStack(Size, Size)); + else + It.convertToReg(State.AllocateReg(RegList[RegIdx++])); + + State.addLoc(It); + } + PendingMembers.clear(); + return true; + } else if (LocVT != MVT::i32) + RegList = SRegList; + + // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core) + for (auto Reg : RegList) + State.AllocateReg(Reg); + + // After the first item has been allocated, the rest are packed as tightly as + // possible. (E.g. an incoming i64 would have starting Align of 8, but we'll + // be allocating a bunch of i32 slots). + unsigned RestAlign = std::min(Align, Size); + + for (auto &It : PendingMembers) { + It.convertToMem(State.AllocateStack(Size, Align)); + State.addLoc(It); + Align = RestAlign; + } + + // All pending members have now been allocated + PendingMembers.clear(); + + // This will be allocated by the last member of the aggregate + return true; +} + +// Include the table generated calling convention implementations. +#include "ARMGenCallingConv.inc" diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h index 543165de38d0..615634551d90 100644 --- a/lib/Target/ARM/ARMCallingConv.h +++ b/lib/Target/ARM/ARMCallingConv.h @@ -1,292 +1,50 @@ //=== ARMCallingConv.h - ARM Custom Calling Convention Routines -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // -// This file contains the custom routines for the ARM Calling Convention that -// aren't done by tablegen. +// This file declares the entry points for ARM calling convention analysis. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H #define LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H -#include "ARM.h" -#include "ARMBaseInstrInfo.h" -#include "ARMSubtarget.h" #include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/IR/CallingConv.h" namespace llvm { -// APCS f64 is in register pairs, possibly split to stack -static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - CCState &State, bool CanFail) { - static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; - - // Try to get the first register. - if (unsigned Reg = State.AllocateReg(RegList)) - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - else { - // For the 2nd half of a v2f64, do not fail. - if (CanFail) - return false; - - // Put the whole thing on the stack. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(8, 4), - LocVT, LocInfo)); - return true; - } - - // Try to get the second register. 
- if (unsigned Reg = State.AllocateReg(RegList)) - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - else - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(4, 4), - LocVT, LocInfo)); - return true; -} - -static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) - return false; - if (LocVT == MVT::v2f64 && - !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) - return false; - return true; // we handled it -} - -// AAPCS f64 is in aligned register pairs -static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - CCState &State, bool CanFail) { - static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 }; - static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 }; - static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 }; - static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; - - unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList); - if (Reg == 0) { - - // If we had R3 unallocated only, now we still must to waste it. - Reg = State.AllocateReg(GPRArgRegs); - assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64"); - - // For the 2nd half of a v2f64, do not just fail. - if (CanFail) - return false; - - // Put the whole thing on the stack. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(8, 8), - LocVT, LocInfo)); - return true; - } - - unsigned i; - for (i = 0; i < 2; ++i) - if (HiRegList[i] == Reg) - break; - - unsigned T = State.AllocateReg(LoRegList[i]); - (void)T; - assert(T == LoRegList[i] && "Could not allocate register"); - - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], - LocVT, LocInfo)); - return true; -} - -static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) - return false; - if (LocVT == MVT::v2f64 && - !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) - return false; - return true; // we handled it -} - -static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, CCState &State) { - static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 }; - static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 }; - - unsigned Reg = State.AllocateReg(HiRegList, LoRegList); - if (Reg == 0) - return false; // we didn't handle it - - unsigned i; - for (i = 0; i < 2; ++i) - if (HiRegList[i] == Reg) - break; - - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], - LocVT, LocInfo)); - return true; -} - -static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) - return false; - if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) - return false; - return true; // we handled it -} - -static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - return 
RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, - State); -} - -static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; - -static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, - ARM::S4, ARM::S5, ARM::S6, ARM::S7, - ARM::S8, ARM::S9, ARM::S10, ARM::S11, - ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; -static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; -static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; - - -// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA -// has InConsecutiveRegs set, and that the last member also has -// InConsecutiveRegsLast set. We must process all members of the HA before -// we can allocate it, as we need to know the total number of registers that -// will be needed in order to (attempt to) allocate a contiguous block. -static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - SmallVectorImpl &PendingMembers = State.getPendingLocs(); - - // AAPCS HFAs must have 1-4 elements, all of the same type - if (PendingMembers.size() > 0) - assert(PendingMembers[0].getLocVT() == LocVT); - - // Add the argument to the list to be allocated once we know the size of the - // aggregate. Store the type's required alignmnent as extra info for later: in - // the [N x i64] case all trace has been removed by the time we actually get - // to do allocation. - PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo, - ArgFlags.getOrigAlign())); - - if (!ArgFlags.isInConsecutiveRegsLast()) - return true; - - // Try to allocate a contiguous block of registers, each of the correct - // size to hold one member. - auto &DL = State.getMachineFunction().getDataLayout(); - unsigned StackAlign = DL.getStackAlignment(); - unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); - - ArrayRef RegList; - switch (LocVT.SimpleTy) { - case MVT::i32: { - RegList = RRegList; - unsigned RegIdx = State.getFirstUnallocated(RegList); - - // First consume all registers that would give an unaligned object. Whether - // we go on stack or in regs, no-one will be using them in future. - unsigned RegAlign = alignTo(Align, 4) / 4; - while (RegIdx % RegAlign != 0 && RegIdx < RegList.size()) - State.AllocateReg(RegList[RegIdx++]); - - break; - } - case MVT::f16: - case MVT::f32: - RegList = SRegList; - break; - case MVT::v4f16: - case MVT::f64: - RegList = DRegList; - break; - case MVT::v8f16: - case MVT::v2f64: - RegList = QRegList; - break; - default: - llvm_unreachable("Unexpected member type for block aggregate"); - break; - } - - unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); - if (RegResult) { - for (SmallVectorImpl::iterator It = PendingMembers.begin(); - It != PendingMembers.end(); ++It) { - It->convertToReg(RegResult); - State.addLoc(*It); - ++RegResult; - } - PendingMembers.clear(); - return true; - } - - // Register allocation failed, we'll be needing the stack - unsigned Size = LocVT.getSizeInBits() / 8; - if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) { - // If nothing else has used the stack until this point, a non-HFA aggregate - // can be split between regs and stack. 
- unsigned RegIdx = State.getFirstUnallocated(RegList); - for (auto &It : PendingMembers) { - if (RegIdx >= RegList.size()) - It.convertToMem(State.AllocateStack(Size, Size)); - else - It.convertToReg(State.AllocateReg(RegList[RegIdx++])); - - State.addLoc(It); - } - PendingMembers.clear(); - return true; - } else if (LocVT != MVT::i32) - RegList = SRegList; - - // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core) - for (auto Reg : RegList) - State.AllocateReg(Reg); - - // After the first item has been allocated, the rest are packed as tightly as - // possible. (E.g. an incoming i64 would have starting Align of 8, but we'll - // be allocating a bunch of i32 slots). - unsigned RestAlign = std::min(Align, Size); - - for (auto &It : PendingMembers) { - It.convertToMem(State.AllocateStack(Size, Align)); - State.addLoc(It); - Align = RestAlign; - } - - // All pending members have now been allocated - PendingMembers.clear(); - - // This will be allocated by the last member of the aggregate - return true; -} - -} // End llvm namespace +bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); + +} // namespace llvm #endif diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index f173e423f3e4..61d2d83ddc40 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -1,9 +1,8 @@ //===-- ARMCallingConv.td - Calling Conventions for ARM ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This describes the calling conventions for ARM architecture. @@ -16,6 +15,7 @@ class CCIfAlign: //===----------------------------------------------------------------------===// // ARM APCS Calling Convention //===----------------------------------------------------------------------===// +let Entry = 1 in def CC_ARM_APCS : CallingConv<[ // Handles byval parameters. 
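The `let Entry = 1` annotations in this .td file pair with the declarations added to ARMCallingConv.h above: TableGen now emits each assignment function as an externally visible definition in ARMGenCallingConv.inc, which ARMCallingConv.cpp includes exactly once, instead of a static function materialized in every file that included the old header. A minimal single-file illustration of that declaration/definition split, with a hypothetical ccAssign standing in for the generated functions:

    #include <iostream>

    // Header side (like ARMCallingConv.h after the patch): declaration only.
    bool ccAssign(unsigned ValNo);

    // Generated side (like ARMGenCallingConv.inc, included once by
    // ARMCallingConv.cpp): a single externally visible definition rather
    // than a per-translation-unit static copy. The body here is invented.
    bool ccAssign(unsigned ValNo) { return ValNo < 4; }

    int main() { std::cout << ccAssign(2) << "\n"; } // prints 1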
@@ -30,8 +30,8 @@ def CC_ARM_APCS : CallingConv<[ CCIfSwiftError>>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType>, // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>, @@ -44,6 +44,7 @@ def CC_ARM_APCS : CallingConv<[ CCIfType<[v2f64], CCAssignToStack<16, 4>> ]>; +let Entry = 1 in def RetCC_ARM_APCS : CallingConv<[ CCIfType<[i1, i8, i16], CCPromoteToType>, CCIfType<[f32], CCBitConvertToType>, @@ -55,8 +56,8 @@ def RetCC_ARM_APCS : CallingConv<[ CCIfSwiftError>>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType>, CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>, @@ -67,10 +68,11 @@ def RetCC_ARM_APCS : CallingConv<[ //===----------------------------------------------------------------------===// // ARM APCS Calling Convention for FastCC (when VFP2 or later is available) //===----------------------------------------------------------------------===// +let Entry = 1 in def FastCC_ARM_APCS : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType>, CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, @@ -86,10 +88,11 @@ def FastCC_ARM_APCS : CallingConv<[ CCDelegateTo ]>; +let Entry = 1 in def RetFastCC_ARM_APCS : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType>, CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, @@ -102,10 +105,11 @@ def RetFastCC_ARM_APCS : CallingConv<[ // ARM APCS Calling Convention for GHC //===----------------------------------------------------------------------===// +let Entry = 1 in def CC_ARM_APCS_GHC : CallingConv<[ // Handle all vector types as either f64 or v2f64. 
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType>, CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>, CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>, @@ -152,6 +156,7 @@ def RetCC_ARM_AAPCS_Common : CallingConv<[ // ARM AAPCS (EABI) Calling Convention //===----------------------------------------------------------------------===// +let Entry = 1 in def CC_ARM_AAPCS : CallingConv<[ // Handles byval parameters. CCIfByVal>, @@ -160,8 +165,8 @@ def CC_ARM_AAPCS : CallingConv<[ CCIfNest>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf>>, @@ -174,10 +179,11 @@ def CC_ARM_AAPCS : CallingConv<[ CCDelegateTo ]>; +let Entry = 1 in def RetCC_ARM_AAPCS : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType>, - CCIfType<[v2i64, v4i32, v8i16, v8f16,v16i8, v4f32], CCBitConvertToType>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16,v16i8, v4f32], CCBitConvertToType>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf>>, @@ -196,13 +202,14 @@ def RetCC_ARM_AAPCS : CallingConv<[ // Also used for FastCC (when VFP2 or later is available) //===----------------------------------------------------------------------===// +let Entry = 1 in def CC_ARM_AAPCS_VFP : CallingConv<[ // Handles byval parameters. CCIfByVal>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf>>, @@ -220,10 +227,11 @@ def CC_ARM_AAPCS_VFP : CallingConv<[ CCDelegateTo ]>; +let Entry = 1 in def RetCC_ARM_AAPCS_VFP : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf>>, diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp index b631c2bc687b..2fc5f4aaab50 100644 --- a/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -1,9 +1,8 @@ //===----- ARMCodeGenPrepare.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -114,15 +113,20 @@ class IRPromoter { SmallPtrSet Promoted; Module *M = nullptr; LLVMContext &Ctx; + // The type we promote to: always i32 IntegerType *ExtTy = nullptr; + // The type of the value that the search began from, either i8 or i16. + // This defines the max range of the values that we allow in the promoted + // tree. IntegerType *OrigTy = nullptr; - SmallPtrSetImpl *Visited; + SetVector *Visited; SmallPtrSetImpl *Sources; SmallPtrSetImpl *Sinks; SmallPtrSetImpl *SafeToPromote; + SmallPtrSetImpl *SafeWrap; void ReplaceAllUsersOfWith(Value *From, Value *To); - void PrepareConstants(void); + void PrepareWrappingAdds(void); void ExtendSources(void); void ConvertTruncs(void); void PromoteTree(void); @@ -135,10 +139,11 @@ public: void Mutate(Type *OrigTy, - SmallPtrSetImpl &Visited, + SetVector &Visited, SmallPtrSetImpl &Sources, SmallPtrSetImpl &Sinks, - SmallPtrSetImpl &SafeToPromote); + SmallPtrSetImpl &SafeToPromote, + SmallPtrSetImpl &SafeWrap); }; class ARMCodeGenPrepare : public FunctionPass { @@ -146,8 +151,9 @@ class ARMCodeGenPrepare : public FunctionPass { IRPromoter *Promoter = nullptr; std::set AllVisited; SmallPtrSet SafeToPromote; + SmallPtrSet SafeWrap; - bool isSafeOverflow(Instruction *I); + bool isSafeWrap(Instruction *I); bool isSupportedValue(Value *V); bool isLegalToPromote(Value *V); bool TryToPromote(Value *V); @@ -172,13 +178,17 @@ public: } -static bool generateSignBits(Value *V) { +static bool GenerateSignBits(Value *V) { + if (auto *Arg = dyn_cast(V)) + return Arg->hasSExtAttr(); + if (!isa(V)) return false; unsigned Opc = cast(V)->getOpcode(); return Opc == Instruction::AShr || Opc == Instruction::SDiv || - Opc == Instruction::SRem; + Opc == Instruction::SRem || Opc == Instruction::SExt || + Opc == Instruction::SIToFP; } static bool EqualTypeSize(Value *V) { @@ -271,19 +281,14 @@ static bool isSink(Value *V) { return isa(V); } -/// Return whether the instruction can be promoted within any modifications to -/// its operands or result. -bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) { - // FIXME Do we need NSW too? - if (isa(I) && I->hasNoUnsignedWrap()) - return true; - - // We can support a, potentially, overflowing instruction (I) if: +/// Return whether this instruction can safely wrap. +bool ARMCodeGenPrepare::isSafeWrap(Instruction *I) { + // We can support a, potentially, wrapping instruction (I) if: // - It is only used by an unsigned icmp. // - The icmp uses a constant. - // - The overflowing value (I) is decreasing, i.e would underflow - wrapping + // - The wrapping value (I) is decreasing, i.e would underflow - wrapping // around zero to become a larger number than before. - // - The underflowing instruction (I) also uses a constant. + // - The wrapping instruction (I) also uses a constant. // // We can then use the two constants to calculate whether the result would // wrap in respect to itself in the original bitwidth. 
If it doesn't wrap, @@ -327,7 +332,7 @@ bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) { // - (255 >= 254) == (0xFFFFFFFF >= 254) == true // // To demonstrate why we can't handle increasing values: - // + // // %add = add i8 %a, 2 // %cmp = icmp ult i8 %add, 127 // @@ -385,6 +390,7 @@ bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) { return false; LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n"); + SafeWrap.insert(I); return true; } @@ -408,13 +414,16 @@ static bool shouldPromote(Value *V) { /// Return whether we can safely mutate V's type to ExtTy without having to be /// concerned with zero extending or truncation. static bool isPromotedResultSafe(Value *V) { + if (GenerateSignBits(V)) + return false; + if (!isa(V)) return true; - if (generateSignBits(V)) - return false; + if (!isa(V)) + return true; - return !isa(V); + return cast(V)->hasNoUnsignedWrap(); } /// Return the intrinsic for the instruction that can perform the same @@ -462,61 +471,34 @@ void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) { InstsToRemove.insert(I); } -void IRPromoter::PrepareConstants() { +void IRPromoter::PrepareWrappingAdds() { + LLVM_DEBUG(dbgs() << "ARM CGP: Prepare underflowing adds.\n"); IRBuilder<> Builder{Ctx}; - // First step is to prepare the instructions for mutation. Most constants - // just need to be zero extended into their new type, but complications arise - // because: - // - For nuw binary operators, negative immediates would need sign extending; - // however, instead we'll change them to positive and zext them. We can do - // this because: - // > The operators that can wrap are: add, sub, mul and shl. - // > shl interprets its second operand as unsigned and if the first operand - // is an immediate, it will need zext to be nuw. - // > I'm assuming mul has to interpret immediates as unsigned for nuw. - // > Which leaves the nuw add and sub to be handled; as with shl, if an - // immediate is used as operand 0, it will need zext to be nuw. - // - We also allow add and sub to safely overflow in certain circumstances - // and only when the value (operand 0) is being decreased. - // - // For adds and subs, that are either nuw or safely wrap and use a negative - // immediate as operand 1, we create an equivalent instruction using a - // positive immediate. That positive immediate can then be zext along with - // all the other immediates later. - for (auto *V : *Visited) { - if (!isa(V)) - continue; - - auto *I = cast(V); - if (SafeToPromote->count(I)) { - - if (!isa(I)) - continue; - if (auto *Const = dyn_cast(I->getOperand(1))) { - if (!Const->isNegative()) - break; + // For adds that safely wrap and use a negative immediate as operand 1, we + // create an equivalent instruction using a positive immediate. + // That positive immediate can then be zext along with all the other + // immediates later. + for (auto *I : *SafeWrap) { + if (I->getOpcode() != Instruction::Add) + continue; - unsigned Opc = I->getOpcode(); - if (Opc != Instruction::Add && Opc != Instruction::Sub) - continue; + LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n"); + assert((isa(I->getOperand(1)) && + cast(I->getOperand(1))->isNegative()) && + "Wrapping should have a negative immediate as the second operand"); - LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n"); - auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs()); - Builder.SetInsertPoint(I); - Value *NewVal = Opc == Instruction::Sub ? 
- Builder.CreateAdd(I->getOperand(0), NewConst) : - Builder.CreateSub(I->getOperand(0), NewConst); - LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n"); - - if (auto *NewInst = dyn_cast(NewVal)) { - NewInst->copyIRFlags(I); - NewInsts.insert(NewInst); - } - InstsToRemove.insert(I); - I->replaceAllUsesWith(NewVal); - } + auto Const = cast(I->getOperand(1)); + auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs()); + Builder.SetInsertPoint(I); + Value *NewVal = Builder.CreateSub(I->getOperand(0), NewConst); + if (auto *NewInst = dyn_cast(NewVal)) { + NewInst->copyIRFlags(I); + NewInsts.insert(NewInst); } + InstsToRemove.insert(I); + I->replaceAllUsesWith(NewVal); + LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n"); } for (auto *I : NewInsts) Visited->insert(I); @@ -605,7 +587,7 @@ void IRPromoter::PromoteTree() { if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I)) continue; - + assert(EnableDSP && "DSP intrinisc insertion not enabled!"); // Replace unsafe instructions with appropriate intrinsic calls. @@ -683,13 +665,14 @@ void IRPromoter::TruncateSinks() { } void IRPromoter::Cleanup() { + LLVM_DEBUG(dbgs() << "ARM CGP: Cleanup..\n"); // Some zexts will now have become redundant, along with their trunc // operands, so remove them for (auto V : *Visited) { - if (!isa(V)) + if (!isa(V)) continue; - auto ZExt = cast(V); + auto ZExt = cast(V); if (ZExt->getDestTy() != ExtTy) continue; @@ -701,9 +684,11 @@ void IRPromoter::Cleanup() { continue; } - // For any truncs that we insert to handle zexts, we can replace the - // result of the zext with the input to the trunc. - if (NewInsts.count(Src) && isa(V) && isa(Src)) { + // Unless they produce a value that is narrower than ExtTy, we can + // replace the result of the zext with the input of a newly inserted + // trunc. 
+ if (NewInsts.count(Src) && isa(Src) && + Src->getType() == OrigTy) { auto *Trunc = cast(Src); assert(Trunc->getOperand(0)->getType() == ExtTy && "expected inserted trunc to be operating on i32"); @@ -721,9 +706,12 @@ void IRPromoter::Cleanup() { NewInsts.clear(); TruncTysMap.clear(); Promoted.clear(); + SafeToPromote->clear(); + SafeWrap->clear(); } void IRPromoter::ConvertTruncs() { + LLVM_DEBUG(dbgs() << "ARM CGP: Converting truncs..\n"); IRBuilder<> Builder{Ctx}; for (auto *V : *Visited) { @@ -731,12 +719,13 @@ void IRPromoter::ConvertTruncs() { continue; auto *Trunc = cast(V); - assert(LessThanTypeSize(Trunc) && "expected narrow trunc"); - Builder.SetInsertPoint(Trunc); - unsigned NumBits = - cast(Trunc->getType())->getScalarSizeInBits(); - ConstantInt *Mask = ConstantInt::get(Ctx, APInt::getMaxValue(NumBits)); + IntegerType *SrcTy = cast(Trunc->getOperand(0)->getType()); + IntegerType *DestTy = cast(TruncTysMap[Trunc][0]); + + unsigned NumBits = DestTy->getScalarSizeInBits(); + ConstantInt *Mask = + ConstantInt::get(SrcTy, APInt::getMaxValue(NumBits).getZExtValue()); Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask); if (auto *I = dyn_cast(Masked)) @@ -747,10 +736,11 @@ void IRPromoter::ConvertTruncs() { } void IRPromoter::Mutate(Type *OrigTy, - SmallPtrSetImpl &Visited, + SetVector &Visited, SmallPtrSetImpl &Sources, SmallPtrSetImpl &Sinks, - SmallPtrSetImpl &SafeToPromote) { + SmallPtrSetImpl &SafeToPromote, + SmallPtrSetImpl &SafeWrap) { LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << ARMCodeGenPrepare::TypeSize << " to 32-bits\n"); @@ -763,6 +753,7 @@ void IRPromoter::Mutate(Type *OrigTy, this->Sources = &Sources; this->Sinks = &Sinks; this->SafeToPromote = &SafeToPromote; + this->SafeWrap = &SafeWrap; // Cache original types of the values that will likely need truncating for (auto *I : Sinks) { @@ -778,22 +769,28 @@ void IRPromoter::Mutate(Type *OrigTy, TruncTysMap[I].push_back(I->getOperand(i)->getType()); } } + for (auto *V : Visited) { + if (!isa(V) || Sources.count(V)) + continue; + auto *Trunc = cast(V); + TruncTysMap[Trunc].push_back(Trunc->getDestTy()); + } - // Convert adds and subs using negative immediates to equivalent instructions - // that use positive constants. - PrepareConstants(); + // Convert adds using negative immediates to equivalent instructions that use + // positive constants. + PrepareWrappingAdds(); // Insert zext instructions between sources and their users. ExtendSources(); - // Convert any truncs, that aren't sources, into AND masks. - ConvertTruncs(); - // Promote visited instructions, mutating their types in place. Also insert // DSP intrinsics, if enabled, for adds and subs which would be unsafe to // promote. PromoteTree(); + // Convert any truncs, that aren't sources, into AND masks. + ConvertTruncs(); + // Insert trunc instructions for use by calls, stores etc... 
TruncateSinks(); @@ -819,6 +816,11 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) { return EqualTypeSize(I->getOperand(0)); } + if (GenerateSignBits(V)) { + LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n"); + return false; + } + // Memory instructions if (isa(V) || isa(V)) return true; @@ -835,9 +837,6 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) { isa(V)) return isSupportedType(V); - if (isa(V)) - return false; - if (auto *Cast = dyn_cast(V)) return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0)); @@ -854,10 +853,6 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) { if (!isSupportedType(V)) return false; - if (generateSignBits(V)) { - LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n"); - return false; - } return true; } @@ -873,7 +868,7 @@ bool ARMCodeGenPrepare::isLegalToPromote(Value *V) { if (SafeToPromote.count(I)) return true; - if (isPromotedResultSafe(V) || isSafeOverflow(I)) { + if (isPromotedResultSafe(V) || isSafeWrap(I)) { SafeToPromote.insert(I); return true; } @@ -911,6 +906,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { return false; SafeToPromote.clear(); + SafeWrap.clear(); if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V)) return false; @@ -921,7 +917,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { SetVector WorkList; SmallPtrSet Sources; SmallPtrSet Sinks; - SmallPtrSet CurrentVisited; + SetVector CurrentVisited; WorkList.insert(V); // Return true if V was added to the worklist as a supported instruction, @@ -1009,7 +1005,8 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { if (ToPromote < 2) return false; - Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote); + Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote, + SafeWrap); return true; } diff --git a/lib/Target/ARM/ARMComputeBlockSize.cpp b/lib/Target/ARM/ARMComputeBlockSize.cpp deleted file mode 100644 index b263e9d86c42..000000000000 --- a/lib/Target/ARM/ARMComputeBlockSize.cpp +++ /dev/null @@ -1,81 +0,0 @@ -//===--- ARMComputeBlockSize.cpp - Compute machine block sizes ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "ARM.h" -#include "ARMBaseInstrInfo.h" -#include "ARMBasicBlockInfo.h" -#include "ARMMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include - -using namespace llvm; - -namespace llvm { - -// mayOptimizeThumb2Instruction - Returns true if optimizeThumb2Instructions -// below may shrink MI. -static bool -mayOptimizeThumb2Instruction(const MachineInstr *MI) { - switch(MI->getOpcode()) { - // optimizeThumb2Instructions. - case ARM::t2LEApcrel: - case ARM::t2LDRpci: - // optimizeThumb2Branches. - case ARM::t2B: - case ARM::t2Bcc: - case ARM::tBcc: - // optimizeThumb2JumpTables. 
- case ARM::t2BR_JT: - case ARM::tBR_JTr: - return true; - } - return false; -} - -void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB, - BasicBlockInfo &BBI) { - const ARMBaseInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); - bool isThumb = MF->getInfo()->isThumbFunction(); - BBI.Size = 0; - BBI.Unalign = 0; - BBI.PostAlign = 0; - - for (MachineInstr &I : *MBB) { - BBI.Size += TII->getInstSizeInBytes(I); - // For inline asm, getInstSizeInBytes returns a conservative estimate. - // The actual size may be smaller, but still a multiple of the instr size. - if (I.isInlineAsm()) - BBI.Unalign = isThumb ? 1 : 2; - // Also consider instructions that may be shrunk later. - else if (isThumb && mayOptimizeThumb2Instruction(&I)) - BBI.Unalign = 1; - } - - // tBR_JTr contains a .align 2 directive. - if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) { - BBI.PostAlign = 2; - MBB->getParent()->ensureAlignment(2); - } -} - -std::vector computeAllBlockSizes(MachineFunction *MF) { - std::vector BBInfo; - BBInfo.resize(MF->getNumBlockIDs()); - - for (MachineBasicBlock &MBB : *MF) - computeBlockSize(MF, &MBB, BBInfo[MBB.getNumber()]); - - return BBInfo; -} - -} // end namespace llvm diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 5e97c4cb35e3..60e5d7bf6098 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1,9 +1,8 @@ //===- ARMConstantIslandPass.cpp - ARM constant islands -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -98,7 +97,7 @@ namespace { /// CPE - A constant pool entry that has been placed somewhere, which /// tracks a list of users. class ARMConstantIslands : public MachineFunctionPass { - std::vector BBInfo; + std::unique_ptr BBUtils = nullptr; /// WaterList - A sorted list of basic blocks where islands could be placed /// (i.e. 
blocks that don't fall through to the following block, due @@ -244,7 +243,6 @@ namespace { void initializeFunctionInfo(const std::vector &CPEMIs); MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); void updateForInsertedWaterBlock(MachineBasicBlock *NewBB); - void adjustBBOffsetsAfter(MachineBasicBlock *BB); bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI); unsigned getCombinedIndex(const MachineInstr *CPEMI); int findInRangeCPEntry(CPUser& U, unsigned UserOffset); @@ -260,7 +258,6 @@ namespace { bool DoDump = false); bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water, CPUser &U, unsigned &Growth); - bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); bool fixupImmediateBr(ImmBranch &Br); bool fixupConditionalBr(ImmBranch &Br); bool fixupUnconditionalBr(ImmBranch &Br); @@ -275,7 +272,6 @@ namespace { MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB); - unsigned getOffsetOf(MachineInstr *MI) const; unsigned getUserOffset(CPUser&) const; void dumpBBs(); void verify(); @@ -296,9 +292,10 @@ char ARMConstantIslands::ID = 0; /// verify - check BBOffsets, BBSizes, alignment of islands void ARMConstantIslands::verify() { #ifndef NDEBUG + BBInfoVector &BBInfo = BBUtils->getBBInfo(); assert(std::is_sorted(MF->begin(), MF->end(), - [this](const MachineBasicBlock &LHS, - const MachineBasicBlock &RHS) { + [&BBInfo](const MachineBasicBlock &LHS, + const MachineBasicBlock &RHS) { return BBInfo[LHS.getNumber()].postOffset() < BBInfo[RHS.getNumber()].postOffset(); })); @@ -324,6 +321,7 @@ void ARMConstantIslands::verify() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// print block size and offset information - debugging LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() { + BBInfoVector &BBInfo = BBUtils->getBBInfo(); LLVM_DEBUG({ for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) { const BasicBlockInfo &BBI = BBInfo[J]; @@ -340,6 +338,7 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() { bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { MF = &mf; MCP = mf.getConstantPool(); + BBUtils = std::unique_ptr(new ARMBasicBlockUtils(mf)); LLVM_DEBUG(dbgs() << "***** ARMConstantIslands: " << MCP->getConstants().size() << " CP entries, aligned to " @@ -467,7 +466,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { LLVM_DEBUG(dbgs() << '\n'; dumpBBs()); - BBInfo.clear(); + BBUtils->clear(); WaterList.clear(); CPUsers.clear(); CPEntries.clear(); @@ -684,14 +683,14 @@ void ARMConstantIslands::scanFunctionJumpTables() { void ARMConstantIslands:: initializeFunctionInfo(const std::vector &CPEMIs) { - BBInfo = computeAllBlockSizes(MF); - + BBUtils->computeAllBlockSizes(); + BBInfoVector &BBInfo = BBUtils->getBBInfo(); // The known bits of the entry block offset are determined by the function // alignment. BBInfo.front().KnownBits = MF->getAlignment(); // Compute block offsets and known bits. - adjustBBOffsetsAfter(&MF->front()); + BBUtils->adjustBBOffsetsAfter(&MF->front()); // Now go back through the instructions and build up our data structures. for (MachineBasicBlock &MBB : *MF) { @@ -856,25 +855,6 @@ initializeFunctionInfo(const std::vector &CPEMIs) { } } -/// getOffsetOf - Return the current offset of the specified machine instruction -/// from the start of the function. This offset changes as stuff is moved -/// around inside the function. 
-unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const { - MachineBasicBlock *MBB = MI->getParent(); - - // The offset is composed of two things: the sum of the sizes of all MBB's - // before this instruction's block, and the offset from the start of the block - // it is in. - unsigned Offset = BBInfo[MBB->getNumber()].Offset; - - // Sum instructions before MI in MBB. - for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { - assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->getInstSizeInBytes(*I); - } - return Offset; -} - /// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB /// ID. static bool CompareMBBNumbers(const MachineBasicBlock *LHS, @@ -891,13 +871,11 @@ void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) { // Insert an entry into BBInfo to align it properly with the (newly // renumbered) block numbers. - BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + BBUtils->insert(NewBB->getNumber(), BasicBlockInfo()); // Next, update WaterList. Specifically, we need to add NewMBB as having // available water after it. - water_iterator IP = - std::lower_bound(WaterList.begin(), WaterList.end(), NewBB, - CompareMBBNumbers); + water_iterator IP = llvm::lower_bound(WaterList, NewBB, CompareMBBNumbers); WaterList.insert(IP, NewBB); } @@ -942,15 +920,13 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { // Insert an entry into BBInfo to align it properly with the (newly // renumbered) block numbers. - BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + BBUtils->insert(NewBB->getNumber(), BasicBlockInfo()); // Next, update WaterList. Specifically, we need to add OrigMBB as having // available water after it (but not if it's already there, which happens // when splitting before a conditional branch that is followed by an // unconditional branch - in that case we want to insert NewBB). - water_iterator IP = - std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB, - CompareMBBNumbers); + water_iterator IP = llvm::lower_bound(WaterList, OrigBB, CompareMBBNumbers); MachineBasicBlock* WaterBB = *IP; if (WaterBB == OrigBB) WaterList.insert(std::next(IP), NewBB); @@ -963,14 +939,14 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { // the new jump we added. (It should be possible to do this without // recounting everything, but it's very confusing, and this is rarely // executed.) - computeBlockSize(MF, OrigBB, BBInfo[OrigBB->getNumber()]); + BBUtils->computeBlockSize(OrigBB); // Figure out how large the NewMBB is. As the second half of the original // block, it may contain a tablejump. - computeBlockSize(MF, NewBB, BBInfo[NewBB->getNumber()]); + BBUtils->computeBlockSize(NewBB); // All BBOffsets following these blocks must be modified. - adjustBBOffsetsAfter(OrigBB); + BBUtils->adjustBBOffsetsAfter(OrigBB); return NewBB; } @@ -979,7 +955,9 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { /// displacement computation. Update U.KnownAlignment to match its current /// basic block location. 
unsigned ARMConstantIslands::getUserOffset(CPUser &U) const { - unsigned UserOffset = getOffsetOf(U.MI); + unsigned UserOffset = BBUtils->getOffsetOf(U.MI); + + SmallVectorImpl &BBInfo = BBUtils->getBBInfo(); const BasicBlockInfo &BBI = BBInfo[U.MI->getParent()->getNumber()]; unsigned KnownBits = BBI.internalKnownBits(); @@ -1028,6 +1006,7 @@ bool ARMConstantIslands::isOffsetInRange(unsigned UserOffset, bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, MachineBasicBlock* Water, CPUser &U, unsigned &Growth) { + BBInfoVector &BBInfo = BBUtils->getBBInfo(); unsigned CPELogAlign = getCPELogAlign(U.CPEMI); unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); unsigned NextBlockOffset, NextBlockAlignment; @@ -1068,10 +1047,11 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset, MachineInstr *CPEMI, unsigned MaxDisp, bool NegOk, bool DoDump) { - unsigned CPEOffset = getOffsetOf(CPEMI); + unsigned CPEOffset = BBUtils->getOffsetOf(CPEMI); if (DoDump) { LLVM_DEBUG({ + BBInfoVector &BBInfo = BBUtils->getBBInfo(); unsigned Block = MI->getParent()->getNumber(); const BasicBlockInfo &BBI = BBInfo[Block]; dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm() @@ -1104,28 +1084,6 @@ static bool BBIsJumpedOver(MachineBasicBlock *MBB) { } #endif // NDEBUG -void ARMConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) { - unsigned BBNum = BB->getNumber(); - for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) { - // Get the offset and known bits at the end of the layout predecessor. - // Include the alignment of the current block. - unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment(); - unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); - unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); - - // This is where block i begins. Stop if the offset is already correct, - // and we have updated 2 blocks. This is the maximum number of blocks - // changed before calling this function. - if (i > BBNum + 2 && - BBInfo[i].Offset == Offset && - BBInfo[i].KnownBits == KnownBits) - break; - - BBInfo[i].Offset = Offset; - BBInfo[i].KnownBits = KnownBits; - } -} - /// decrementCPEReferenceCount - find the constant pool entry with index CPI /// and instruction CPEMI, and decrement its refcount. If the refcount /// becomes 0 remove the entry and instruction. Returns true if we removed @@ -1241,6 +1199,7 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, // When a CP access is out of range, BB0 may be used as water. However, // inserting islands between BB0 and BB1 makes other accesses out of range. 
 /// decrementCPEReferenceCount - find the constant pool entry with index CPI
 /// and instruction CPEMI, and decrement its refcount.  If the refcount
 /// becomes 0 remove the entry and instruction.  Returns true if we removed
@@ -1241,6 +1199,7 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
   // When a CP access is out of range, BB0 may be used as water. However,
   // inserting islands between BB0 and BB1 makes other accesses out of range.
   MachineBasicBlock *UserBB = U.MI->getParent();
+  BBInfoVector &BBInfo = BBUtils->getBBInfo();
   unsigned MinNoSplitDisp =
       BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI));
   if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2)
@@ -1297,6 +1256,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
   MachineInstr *CPEMI = U.CPEMI;
   unsigned CPELogAlign = getCPELogAlign(CPEMI);
   MachineBasicBlock *UserMBB = UserMI->getParent();
+  BBInfoVector &BBInfo = BBUtils->getBBInfo();
   const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
 
   // If the block does not end in an unconditional branch already, and if the
@@ -1328,8 +1288,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
       unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
       ImmBranches.push_back(ImmBranch(&UserMBB->back(),
                                       MaxDisp, false, UncondBr));
-      computeBlockSize(MF, UserMBB, BBInfo[UserMBB->getNumber()]);
-      adjustBBOffsetsAfter(UserMBB);
+      BBUtils->computeBlockSize(UserMBB);
+      BBUtils->adjustBBOffsetsAfter(UserMBB);
       return;
     }
   }
@@ -1538,8 +1498,8 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
   NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
 
   // Increase the size of the island block to account for the new entry.
-  BBInfo[NewIsland->getNumber()].Size += Size;
-  adjustBBOffsetsAfter(&*--NewIsland->getIterator());
+  BBUtils->adjustBBSize(NewIsland, Size);
+  BBUtils->adjustBBOffsetsAfter(&*--NewIsland->getIterator());
 
   // Finally, change the CPI in the instruction operand to be ID.
   for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
@@ -1550,7 +1510,8 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
 
   LLVM_DEBUG(
       dbgs() << "  Moved CPE to #" << ID << " CPI=" << CPI
-             << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+             << format(" offset=%#x\n",
+                       BBUtils->getBBInfo()[NewIsland->getNumber()].Offset));
 
   return true;
 }
@@ -1561,7 +1522,8 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
   MachineBasicBlock *CPEBB = CPEMI->getParent();
   unsigned Size = CPEMI->getOperand(2).getImm();
   CPEMI->eraseFromParent();
-  BBInfo[CPEBB->getNumber()].Size -= Size;
+  BBInfoVector &BBInfo = BBUtils->getBBInfo();
+  BBUtils->adjustBBSize(CPEBB, -Size);
   // All succeeding offsets have the current size value added in, fix this.
   if (CPEBB->empty()) {
     BBInfo[CPEBB->getNumber()].Size = 0;
@@ -1572,7 +1534,7 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
     // Entries are sorted by descending alignment, so realign from the front.
     CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin()));
 
-  adjustBBOffsetsAfter(CPEBB);
+  BBUtils->adjustBBOffsetsAfter(CPEBB);
   // An island has only one predecessor BB and one successor BB. Check if
   // this BB's predecessor jumps directly to this BB's successor. This
   // shouldn't happen currently.
@@ -1597,30 +1559,6 @@ bool ARMConstantIslands::removeUnusedCPEntries() {
   return MadeChange;
 }
 
-/// isBBInRange - Returns true if the distance between specific MI and
-/// specific BB can fit in MI's displacement field.
-bool ARMConstantIslands::isBBInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
-                                     unsigned MaxDisp) {
-  unsigned PCAdj      = isThumb ? 4 : 8;
-  unsigned BrOffset   = getOffsetOf(MI) + PCAdj;
-  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
-
-  LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
-                    << " from " << printMBBReference(*MI->getParent())
-                    << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
-                    << " to " << DestOffset << " offset "
-                    << int(DestOffset - BrOffset) << "\t" << *MI);
-
-  if (BrOffset <= DestOffset) {
-    // Branch before the Dest.
-    if (DestOffset-BrOffset <= MaxDisp)
-      return true;
-  } else {
-    if (BrOffset-DestOffset <= MaxDisp)
-      return true;
-  }
-  return false;
-}
 
 /// fixupImmediateBr - Fix up an immediate branch whose destination is too far
 /// away to fit in its displacement field.
@@ -1629,7 +1567,7 @@ bool ARMConstantIslands::fixupImmediateBr(ImmBranch &Br) {
   MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
 
   // Check to see if the DestBB is already in-range.
-  if (isBBInRange(MI, DestBB, Br.MaxDisp))
+  if (BBUtils->isBBInRange(MI, DestBB, Br.MaxDisp))
     return false;
 
   if (!Br.isCond)
@@ -1648,11 +1586,15 @@ ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
   if (!isThumb1)
     llvm_unreachable("fixupUnconditionalBr is Thumb1 only!");
 
+  if (!AFI->isLRSpilled())
+    report_fatal_error("underestimated function size");
+
   // Use BL to implement far jump.
   Br.MaxDisp = (1 << 21) * 2;
   MI->setDesc(TII->get(ARM::tBfar));
+  BBInfoVector &BBInfo = BBUtils->getBBInfo();
   BBInfo[MBB->getNumber()].Size += 2;
-  adjustBBOffsetsAfter(MBB);
+  BBUtils->adjustBBOffsetsAfter(MBB);
   HasFarJump = true;
   ++NumUBrFixed;
 
@@ -1699,7 +1641,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
     //    bne L2
     //    b   L1
     MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
-    if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+    if (BBUtils->isBBInRange(MI, NewDest, Br.MaxDisp)) {
       LLVM_DEBUG(
           dbgs() << "  Invert Bcc condition and swap its destination with "
                  << *BMI);
@@ -1716,7 +1658,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
     // No need for the branch to the next block. We're adding an unconditional
     // branch to the destination.
     int delta = TII->getInstSizeInBytes(MBB->back());
-    BBInfo[MBB->getNumber()].Size -= delta;
+    BBUtils->adjustBBSize(MBB, -delta);
     MBB->back().eraseFromParent();
 
     // The conditional successor will be swapped between the BBs after this, so
@@ -1737,21 +1679,21 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
   BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
       .addMBB(NextBB).addImm(CC).addReg(CCReg);
   Br.MI = &MBB->back();
-  BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+  BBUtils->adjustBBSize(MBB, TII->getInstSizeInBytes(MBB->back()));
   if (isThumb)
     BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr))
        .addMBB(DestBB)
        .add(predOps(ARMCC::AL));
   else
     BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
-  BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+  BBUtils->adjustBBSize(MBB, TII->getInstSizeInBytes(MBB->back()));
   unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
   ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
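The deleted isBBInRange (now provided by ARMBasicBlockUtils and called through BBUtils above) measures the branch displacement from the PC, which reads 4 bytes ahead of a Thumb instruction and 8 bytes ahead of an ARM one. The same test as a standalone sketch (branchInRange is a hypothetical name; offsets are bytes from the start of the function, and the limit in main is illustrative only):

#include <cassert>

bool branchInRange(unsigned BrInstOffset, unsigned DestOffset,
                   unsigned MaxDisp, bool IsThumb) {
  unsigned PCAdj = IsThumb ? 4 : 8; // PC reads ahead of the branch itself.
  unsigned BrOffset = BrInstOffset + PCAdj;
  if (BrOffset <= DestOffset)
    return DestOffset - BrOffset <= MaxDisp; // Forward branch.
  return BrOffset - DestOffset <= MaxDisp;   // Backward branch.
}

int main() {
  const unsigned MaxDisp = (1u << 20) - 2; // Roughly a +/-1 MB reach.
  assert(branchInRange(0x1000, 0x1100, MaxDisp, /*IsThumb=*/true));
  assert(!branchInRange(0x1000, 0x1000 + (1u << 20) + 8, MaxDisp, true));
  return 0;
}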
   // Remove the old conditional branch.  It may or may not still be in MBB.
-  BBInfo[MI->getParent()->getNumber()].Size -= TII->getInstSizeInBytes(*MI);
+  BBUtils->adjustBBSize(MI->getParent(), -TII->getInstSizeInBytes(*MI));
   MI->eraseFromParent();
-  adjustBBOffsetsAfter(MBB);
+  BBUtils->adjustBBOffsetsAfter(MBB);
   return true;
 }
 
@@ -1826,8 +1768,8 @@ bool ARMConstantIslands::optimizeThumb2Instructions() {
       LLVM_DEBUG(dbgs() << "Shrink: " << *U.MI);
       U.MI->setDesc(TII->get(NewOpc));
       MachineBasicBlock *MBB = U.MI->getParent();
-      BBInfo[MBB->getNumber()].Size -= 2;
-      adjustBBOffsetsAfter(MBB);
+      BBUtils->adjustBBSize(MBB, -2);
+      BBUtils->adjustBBOffsetsAfter(MBB);
       ++NumT2CPShrunk;
       MadeChange = true;
     }
@@ -1866,12 +1808,12 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
     if (NewOpc) {
       unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
       MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
-      if (isBBInRange(Br.MI, DestBB, MaxOffs)) {
+      if (BBUtils->isBBInRange(Br.MI, DestBB, MaxOffs)) {
        LLVM_DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
         Br.MI->setDesc(TII->get(NewOpc));
         MachineBasicBlock *MBB = Br.MI->getParent();
-        BBInfo[MBB->getNumber()].Size -= 2;
-        adjustBBOffsetsAfter(MBB);
+        BBUtils->adjustBBSize(MBB, -2);
+        BBUtils->adjustBBOffsetsAfter(MBB);
         ++NumT2BrShrunk;
         MadeChange = true;
       }
@@ -1898,34 +1840,47 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
     MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
     // Check if the distance is within 126. Subtract starting offset by 2
     // because the cmp will be eliminated.
-    unsigned BrOffset = getOffsetOf(Br.MI) + 4 - 2;
+    unsigned BrOffset = BBUtils->getOffsetOf(Br.MI) + 4 - 2;
+    BBInfoVector &BBInfo = BBUtils->getBBInfo();
     unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
-    if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) {
-      MachineBasicBlock::iterator CmpMI = Br.MI;
-      if (CmpMI != Br.MI->getParent()->begin()) {
-        --CmpMI;
-        if (CmpMI->getOpcode() == ARM::tCMPi8) {
-          unsigned Reg = CmpMI->getOperand(0).getReg();
-          Pred = getInstrPredicate(*CmpMI, PredReg);
-          if (Pred == ARMCC::AL &&
-              CmpMI->getOperand(1).getImm() == 0 &&
-              isARMLowRegister(Reg)) {
-            MachineBasicBlock *MBB = Br.MI->getParent();
-            LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
-            MachineInstr *NewBR =
-                BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc))
-                    .addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags());
-            CmpMI->eraseFromParent();
-            Br.MI->eraseFromParent();
-            Br.MI = NewBR;
-            BBInfo[MBB->getNumber()].Size -= 2;
-            adjustBBOffsetsAfter(MBB);
-            ++NumCBZ;
-            MadeChange = true;
-          }
-        }
+    if (BrOffset >= DestOffset || (DestOffset - BrOffset) > 126)
+      continue;
+
+    // Search backwards to find a tCMPi8
+    auto *TRI = STI->getRegisterInfo();
+    MachineInstr *CmpMI = findCMPToFoldIntoCBZ(Br.MI, TRI);
+    if (!CmpMI || CmpMI->getOpcode() != ARM::tCMPi8)
+      continue;
+
+    unsigned Reg = CmpMI->getOperand(0).getReg();
+
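findCMPToFoldIntoCBZ (from ARMBaseInstrInfo) hands back the compare that the rewritten code above folds into a CBZ/CBNZ. The fold is only sound under the CB(N)Z encoding limits: the tested register must be low (r0-r7) and the target must lie forward, within 126 bytes. A sketch of just those preconditions (canUseCBZ is a hypothetical helper; the + 4 - 2 mirrors the code above, PC-ahead minus the eliminated cmp):

#include <cassert>

bool canUseCBZ(unsigned RegNo, unsigned BrOffset, unsigned DestOffset) {
  if (RegNo > 7) // Only low registers, as isARMLowRegister checks.
    return false;
  unsigned From = BrOffset + 4 - 2; // PC offset, minus the removed tCMPi8.
  return From < DestOffset && (DestOffset - From) <= 126;
}

int main() {
  assert(canUseCBZ(/*RegNo=*/3, /*BrOffset=*/0x40, /*DestOffset=*/0xB0));
  assert(!canUseCBZ(3, 0x40, 0x40 + 4 - 2 + 128)); // Too far forward.
  assert(!canUseCBZ(8, 0x40, 0x60));               // High register.
  return 0;
}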
+    // Check for Kill flags on Reg. If they are present remove them and set kill
+    // on the new CBZ.
+    MachineBasicBlock::iterator KillMI = Br.MI;
+    bool RegKilled = false;
+    do {
+      --KillMI;
+      if (KillMI->killsRegister(Reg, TRI)) {
+        KillMI->clearRegisterKills(Reg, TRI);
+        RegKilled = true;
+        break;
+      }
-    }
+    } while (KillMI != CmpMI);
+
+    // Create the new CBZ/CBNZ
+    MachineBasicBlock *MBB = Br.MI->getParent();
+    LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+    MachineInstr *NewBR =
+        BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(NewOpc))
+            .addReg(Reg, getKillRegState(RegKilled))
+            .addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags());
+    CmpMI->eraseFromParent();
+    Br.MI->eraseFromParent();
+    Br.MI = NewBR;
+    BBInfo[MBB->getNumber()].Size -= 2;
+    BBUtils->adjustBBOffsetsAfter(MBB);
+    ++NumCBZ;
+    MadeChange = true;
   }
 
   return MadeChange;
@@ -2085,16 +2040,6 @@ static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
   DeadSize += 4;
 }
 
-static bool registerDefinedBetween(unsigned Reg,
-                                   MachineBasicBlock::iterator From,
-                                   MachineBasicBlock::iterator To,
-                                   const TargetRegisterInfo *TRI) {
-  for (auto I = From; I != To; ++I)
-    if (I->modifiesRegister(Reg, TRI))
-      return true;
-  return false;
-}
-
 /// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
 /// jumptables when it's possible.
 bool ARMConstantIslands::optimizeThumb2JumpTables() {
@@ -2117,8 +2062,9 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
     bool ByteOk = true;
     bool HalfWordOk = true;
-    unsigned JTOffset = getOffsetOf(MI) + 4;
+    unsigned JTOffset = BBUtils->getOffsetOf(MI) + 4;
     const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    BBInfoVector &BBInfo = BBUtils->getBBInfo();
     for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
       MachineBasicBlock *MBB = JTBBs[j];
       unsigned DstOffset = BBInfo[MBB->getNumber()].Offset;
@@ -2281,7 +2227,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
 
       int Delta = OrigSize - NewSize + DeadSize;
       BBInfo[MBB->getNumber()].Size -= Delta;
-      adjustBBOffsetsAfter(MBB);
+      BBUtils->adjustBBOffsetsAfter(MBB);
       ++NumTBs;
       MadeChange = true;
 
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 236c4fab2a5c..3bdb0e1ef62d 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -1,9 +1,8 @@
 //===- ARMConstantPoolValue.cpp - ARM constantpool value ------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 55194ed94532..660b7fc88d82 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -1,9 +1,8 @@
 //===- ARMConstantPoolValue.h - ARM constantpool value ----------*- C++ -*-===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index eecd0a10dc7d..b32ba3eeea18 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1,9 +1,8 @@
 //===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -24,6 +23,7 @@
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Debug.h"
 
 using namespace llvm;
 
@@ -423,8 +423,7 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
   }
 #endif
 
-  auto I = std::lower_bound(std::begin(NEONLdStTable),
-                            std::end(NEONLdStTable), Opcode);
+  auto I = llvm::lower_bound(NEONLdStTable, Opcode);
   if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode)
     return I;
   return nullptr;
@@ -470,6 +469,7 @@ static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
 void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
   MachineBasicBlock &MBB = *MI.getParent();
+  LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
 
   const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
   assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
@@ -571,8 +571,8 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
   // Transfer memoperands.
   MIB.cloneMemRefs(MI);
-
   MI.eraseFromParent();
+  LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump(););
 }
 
 /// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register
@@ -580,6 +580,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
 void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
   MachineBasicBlock &MBB = *MI.getParent();
+  LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
 
   const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
   assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
@@ -646,8 +647,8 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
 
   // Transfer memoperands.
   MIB.cloneMemRefs(MI);
-
   MI.eraseFromParent();
+  LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump(););
 }
 
 /// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ
@@ -655,6 +656,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
 void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
   MachineBasicBlock &MBB = *MI.getParent();
+  LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
 
   const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
   assert(TableEntry && "NEONLdStTable lookup failed");
@@ -745,6 +747,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
                                  unsigned Opc, bool IsExt) {
   MachineInstr &MI = *MBBI;
   MachineBasicBlock &MBB = *MI.getParent();
+  LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
 
   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
   unsigned OpIdx = 0;
@@ -774,6 +777,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
   MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill));
   TransferImpOps(MI, MIB, MIB);
   MI.eraseFromParent();
+  LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump(););
 }
 
 static bool IsAnAddressOperand(const MachineOperand &MO) {
@@ -830,6 +834,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
   const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
   bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO);
   MachineInstrBuilder LO16, HI16;
+  LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
 
   if (!STI->hasV6T2Ops() &&
       (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) {
@@ -911,6 +916,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
     LO16.add(makeImplicit(MI.getOperand(1)));
   TransferImpOps(MI, LO16, HI16);
   MI.eraseFromParent();
+  LLVM_DEBUG(dbgs() << "To: "; LO16.getInstr()->dump(););
+  LLVM_DEBUG(dbgs() << "And: "; HI16.getInstr()->dump(););
 }
 
 /// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
@@ -1930,11 +1937,16 @@ bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
   TRI = STI->getRegisterInfo();
   AFI = MF.getInfo<ARMFunctionInfo>();
 
+  LLVM_DEBUG(dbgs() << "********** ARM EXPAND PSEUDO INSTRUCTIONS **********\n"
+                    << "********** Function: " << MF.getName() << '\n');
+
   bool Modified = false;
   for (MachineBasicBlock &MBB : MF)
     Modified |= ExpandMBB(MBB);
   if (VerifyARMPseudo)
     MF.verify(this, "After expanding ARM pseudo instructions.");
+
+  LLVM_DEBUG(dbgs() << "***************************************************\n");
   return Modified;
 }
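The LLVM_DEBUG before/after dumps added throughout this file follow the usual llvm/Support/Debug.h idiom: statements compiled in only for assertion-enabled builds and gated at runtime by -debug-only=<DEBUG_TYPE>. A sketch of the shape, assuming it is compiled inside the LLVM tree (the "arm-pseudo" debug-type string is an assumption for illustration):

// DEBUG_TYPE must be defined before LLVM_DEBUG is used in a file.
#define DEBUG_TYPE "arm-pseudo" // Assumed name, for illustration only.
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void expandOne(int Before, int After) {
  // Printed only with -debug-only=arm-pseudo in a +Asserts build.
  LLVM_DEBUG(dbgs() << "Expanding: " << Before << '\n');
  // ... rewrite the instruction here ...
  LLVM_DEBUG(dbgs() << "To: " << After << '\n');
}

int main() {
  expandOne(1, 2);
  return 0;
}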
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index a50abfdbee44..6e274d269bf2 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -1,9 +1,8 @@
 //===- ARMFastISel.cpp - ARM FastISel implementation ----------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -245,8 +244,6 @@ class ARMFastISel final : public FastISel {
 
 } // end anonymous namespace
 
-#include "ARMGenCallingConv.inc"
-
 // DefinesOptionalPredicate - This is different from DefinesPredicate in that
 // we don't care about implicit defs here, just places we'll need to add a
 // default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR.
@@ -444,7 +441,7 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
   }
 
   // Require VFP2 for loading fp constants.
-  if (!Subtarget->hasVFP2()) return false;
+  if (!Subtarget->hasVFP2Base()) return false;
 
   // MachineConstantPool wants an explicit alignment.
   unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
@@ -500,7 +497,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
   }
 
   unsigned ResultReg = 0;
-  if (Subtarget->useMovt(*FuncInfo.MF))
+  if (Subtarget->useMovt())
     ResultReg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
 
   if (ResultReg)
@@ -558,7 +555,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
   bool IsPositionIndependent = isPositionIndependent();
   // Use movw+movt when possible, it avoids constant pool entries.
   // Non-darwin targets only support static movt relocations in FastISel.
-  if (Subtarget->useMovt(*FuncInfo.MF) &&
+  if (Subtarget->useMovt() &&
       (Subtarget->isTargetMachO() || !IsPositionIndependent)) {
     unsigned Opc;
     unsigned char TF = 0;
@@ -972,7 +969,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
         RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
       break;
     case MVT::f32:
-      if (!Subtarget->hasVFP2()) return false;
+      if (!Subtarget->hasVFP2Base()) return false;
       // Unaligned loads need special handling. Floats require word-alignment.
       if (Alignment && Alignment < 4) {
         needVMOV = true;
@@ -985,7 +982,8 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
       }
       break;
     case MVT::f64:
-      if (!Subtarget->hasVFP2()) return false;
+      // Can load and store double precision even without FeatureFP64
+      if (!Subtarget->hasVFP2Base()) return false;
       // FIXME: Unaligned loads need special handling. Doublewords require
       // word-alignment.
       if (Alignment && Alignment < 4)
@@ -1110,7 +1108,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
       }
       break;
    case MVT::f32:
-      if (!Subtarget->hasVFP2()) return false;
+      if (!Subtarget->hasVFP2Base()) return false;
       // Unaligned stores need special handling. Floats require word-alignment.
       if (Alignment && Alignment < 4) {
         unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
@@ -1125,7 +1123,8 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
       }
       break;
    case MVT::f64:
-      if (!Subtarget->hasVFP2()) return false;
+      // Can load and store double precision even without FeatureFP64
+      if (!Subtarget->hasVFP2Base()) return false;
      // FIXME: Unaligned stores need special handling. Doublewords require
      // word-alignment.
       if (Alignment && Alignment < 4)
@@ -1356,10 +1355,10 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
   if (!SrcEVT.isSimple()) return false;
   MVT SrcVT = SrcEVT.getSimpleVT();
 
-  if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+  if (Ty->isFloatTy() && !Subtarget->hasVFP2Base())
     return false;
 
-  if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
+  if (Ty->isDoubleTy() && (!Subtarget->hasVFP2Base() || !Subtarget->hasFP64()))
     return false;
 
   // Check to see if the 2nd operand is a constant that we can encode directly
@@ -1509,7 +1508,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
 
 bool ARMFastISel::SelectFPExt(const Instruction *I) {
   // Make sure we have VFP and that we're extending float to double.
-  if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
+  if (!Subtarget->hasVFP2Base() || !Subtarget->hasFP64()) return false;
 
   Value *V = I->getOperand(0);
   if (!I->getType()->isDoubleTy() ||
@@ -1528,7 +1527,7 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) {
 
 bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
   // Make sure we have VFP and that we're truncating double to float.
-  if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
+  if (!Subtarget->hasVFP2Base() || !Subtarget->hasFP64()) return false;
 
   Value *V = I->getOperand(0);
   if (!(I->getType()->isFloatTy() &&
@@ -1547,7 +1546,7 @@ bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
 
 bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
   // Make sure we have VFP.
-  if (!Subtarget->hasVFP2()) return false;
+  if (!Subtarget->hasVFP2Base()) return false;
 
   MVT DstVT;
   Type *Ty = I->getType();
@@ -1579,7 +1578,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
   unsigned Opc;
   if (Ty->isFloatTy()) Opc = isSigned ? ARM::VSITOS : ARM::VUITOS;
-  else if (Ty->isDoubleTy() && !Subtarget->isFPOnlySP())
+  else if (Ty->isDoubleTy() && Subtarget->hasFP64())
     Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
   else return false;
@@ -1592,7 +1591,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
 
 bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
   // Make sure we have VFP.
-  if (!Subtarget->hasVFP2()) return false;
+  if (!Subtarget->hasVFP2Base()) return false;
 
   MVT DstVT;
   Type *RetTy = I->getType();
@@ -1605,7 +1604,7 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
   unsigned Opc;
   Type *OpTy = I->getOperand(0)->getType();
   if (OpTy->isFloatTy()) Opc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS;
-  else if (OpTy->isDoubleTy() && !Subtarget->isFPOnlySP())
+  else if (OpTy->isDoubleTy() && Subtarget->hasFP64())
     Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
   else return false;
 
@@ -1811,9 +1810,9 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
   // if we have them.
   // FIXME: It'd be nice to use NEON instructions.
   Type *Ty = I->getType();
-  if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+  if (Ty->isFloatTy() && !Subtarget->hasVFP2Base())
     return false;
-  if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
+  if (Ty->isDoubleTy() && (!Subtarget->hasVFP2Base() || !Subtarget->hasFP64()))
     return false;
 
   unsigned Opc;
@@ -1855,7 +1854,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
     default:
       report_fatal_error("Unsupported calling convention");
     case CallingConv::Fast:
-      if (Subtarget->hasVFP2() && !isVarArg) {
+      if (Subtarget->hasVFP2Base() && !isVarArg) {
         if (!Subtarget->isAAPCS_ABI())
           return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
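A sketch of the hard-float dispatch CCAssignFnForCall performs above, reduced to its inputs (pickCC and CCKind are hypothetical names; the real function returns CCAssignFn pointers and distinguishes Return from non-Return variants): VFP register-passing conventions are only picked when VFP2 registers exist, the float ABI is hard, and the call is not variadic.

#include <cassert>

enum class CCKind { APCS, AAPCS, AAPCS_VFP };

CCKind pickCC(bool IsAAPCS_ABI, bool HasVFP2Base, bool HardFloat, bool VarArg) {
  if (!IsAAPCS_ABI)
    return CCKind::APCS;
  if (HasVFP2Base && HardFloat && !VarArg)
    return CCKind::AAPCS_VFP;
  return CCKind::AAPCS;
}

int main() {
  assert(pickCC(true, true, true, false) == CCKind::AAPCS_VFP);
  assert(pickCC(true, true, true, true) == CCKind::AAPCS);   // Variadic.
  assert(pickCC(true, false, true, false) == CCKind::AAPCS); // No VFP regs.
  return 0;
}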
        // For AAPCS ABI targets, just use VFP variant of the calling convention.
@@ -1866,7 +1865,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
     case CallingConv::CXX_FAST_TLS:
       // Use target triple & subtarget features to do actual dispatch.
       if (Subtarget->isAAPCS_ABI()) {
-        if (Subtarget->hasVFP2() &&
+        if (Subtarget->hasVFP2Base() &&
             TM.Options.FloatABIType == FloatABI::Hard && !isVarArg)
           return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
         else
@@ -1935,11 +1934,11 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value *> &Args,
       case MVT::i32:
         break;
       case MVT::f32:
-        if (!Subtarget->hasVFP2())
+        if (!Subtarget->hasVFP2Base())
           return false;
         break;
      case MVT::f64:
-        if (!Subtarget->hasVFP2())
+        if (!Subtarget->hasVFP2Base())
          return false;
        break;
    }
diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
index 8c0df4c2cbf9..5cd7006c22fc 100644
--- a/lib/Target/ARM/ARMFeatures.h
+++ b/lib/Target/ARM/ARMFeatures.h
@@ -1,9 +1,8 @@
 //===-- ARMFeatures.h - Checks for ARM instruction features -----*- C++ -*-===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index a9d87ced31f3..bedb779bcba0 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1,9 +1,8 @@
 //===- ARMFrameLowering.cpp - ARM Frame Information -----------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -30,6 +29,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -344,6 +344,10 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
 /// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use
 /// this to produce a conservative estimate that we check in an assert() later.
 static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) {
+  // For Thumb1, push.w isn't available, so the first push will always push
+  // r7 and lr onto the stack first.
+  if (AFI.isThumb1OnlyFunction())
+    return -AFI.getArgRegsSaveSize() - (2 * 4);
   // This is a conservative estimation: Assume the frame pointer being r7 and
   // pc("r15") up to r8 getting spilled before (= 8 registers).
   return -AFI.getArgRegsSaveSize() - (8 * 4);
@@ -954,8 +958,12 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
     }
   }
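The new Thumb1 case in getMaxFPOffset above encodes a simple worst case: with no push.w available, the first push saves just r7 and lr before fp is established, so fp sits at most ArgRegsSaveSize + 8 bytes below the incoming sp, versus the 8-register worst case assumed for other subtargets. Worked as a sketch (maxFPOffset is a hypothetical standalone reduction):

#include <cassert>

int maxFPOffset(bool Thumb1Only, int ArgRegsSaveSize) {
  if (Thumb1Only)
    return -ArgRegsSaveSize - (2 * 4); // First push holds only r7 and lr.
  return -ArgRegsSaveSize - (8 * 4);   // Up to 8 registers spilled first.
}

int main() {
  // With an 8-byte argument-register save area:
  assert(maxFPOffset(/*Thumb1Only=*/true, 8) == -16);
  assert(maxFPOffset(false, 8) == -40);
  return 0;
}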
   // Use the base pointer if we have one.
-  if (RegInfo->hasBasePointer(MF))
+  // FIXME: Maybe prefer sp on Thumb1 if it's legal and the offset is cheaper?
+  // That can happen if we forced a base pointer for a large call frame.
+  if (RegInfo->hasBasePointer(MF)) {
     FrameReg = RegInfo->getBaseRegister();
+    Offset -= SPAdj;
+  }
   return Offset;
 }
 
@@ -1476,13 +1484,17 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 }
 
 // FIXME: Make generic?
-static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
-                                       const ARMBaseInstrInfo &TII) {
+static unsigned EstimateFunctionSizeInBytes(const MachineFunction &MF,
+                                            const ARMBaseInstrInfo &TII) {
   unsigned FnSize = 0;
   for (auto &MBB : MF) {
     for (auto &MI : MBB)
       FnSize += TII.getInstSizeInBytes(MI);
   }
+  if (MF.getJumpTableInfo())
+    for (auto &Table: MF.getJumpTableInfo()->getJumpTables())
+      FnSize += Table.MBBs.size() * 4;
+  FnSize += MF.getConstantPool()->getConstants().size() * 4;
   return FnSize;
 }
 
@@ -1726,7 +1738,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   bool ForceLRSpill = false;
   if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
-    unsigned FnSize = GetFunctionSizeInBytes(MF, TII);
+    unsigned FnSize = EstimateFunctionSizeInBytes(MF, TII);
     // Force LR to be spilled if the Thumb function size is > 2048. This enables
     // use of BL to implement far jump. If it turns out that it's not needed
     // then the branch fix up path will undo it.
@@ -1771,13 +1783,59 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
   }
 
   EstimatedStackSize += 16; // For possible paddings.
 
-  unsigned EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+  unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit;
+  if (AFI->isThumb1OnlyFunction()) {
+    // For Thumb1, don't bother to iterate over the function. The only
+    // instruction that requires an emergency spill slot is a store to a
+    // frame index.
+    //
+    // tSTRspi, which is used for sp-relative accesses, has an 8-bit unsigned
+    // immediate. tSTRi, which is used for bp- and fp-relative accesses, has
+    // a 5-bit unsigned immediate.
+    //
+    // We could try to check if the function actually contains a tSTRspi
+    // that might need the spill slot, but it's not really important.
+    // Functions with VLAs or extremely large call frames are rare, and
+    // if a function is allocating more than 1KB of stack, an extra 4-byte
+    // slot probably isn't relevant.
+    if (RegInfo->hasBasePointer(MF))
+      EstimatedRSStackSizeLimit = (1U << 5) * 4;
+    else
+      EstimatedRSStackSizeLimit = (1U << 8) * 4;
+    EstimatedRSFixedSizeLimit = (1U << 5) * 4;
+  } else {
+    EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+    EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit;
+  }
+  // Final estimate of whether sp or bp-relative accesses might require
+  // scavenging.
+  bool HasLargeStack = EstimatedStackSize > EstimatedRSStackSizeLimit;
+
+  // If the stack pointer moves and we don't have a base pointer, the
+  // estimate logic doesn't work. The actual offsets might be larger when
+  // we're constructing a call frame, or we might need to use negative
+  // offsets from fp.
+  bool HasMovingSP = MFI.hasVarSizedObjects() ||
+      (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF));
+  bool HasBPOrFixedSP = RegInfo->hasBasePointer(MF) || !HasMovingSP;
+
+  // If we have a frame pointer, we assume arguments will be accessed
+  // relative to the frame pointer. Check whether fp-relative accesses to
+  // arguments require scavenging.
+  //
+  // We could do slightly better on Thumb1; in some cases, an sp-relative
+  // offset would be legal even though an fp-relative offset is not.
   int MaxFPOffset = getMaxFPOffset(MF.getFunction(), *AFI);
-  bool BigFrameOffsets = EstimatedStackSize >= EstimatedRSStackSizeLimit ||
-    MFI.hasVarSizedObjects() ||
-    (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF)) ||
-    // For large argument stacks fp relative addressed may overflow.
-    (HasFP && (MaxFixedOffset - MaxFPOffset) >= (int)EstimatedRSStackSizeLimit);
+  bool HasLargeArgumentList =
+      HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;
+
+  bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP ||
+                         HasLargeArgumentList;
+  LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit
+                    << "; EstimatedStack" << EstimatedStackSize
+                    << "; EstimatedFPStack" << MaxFixedOffset - MaxFPOffset
+                    << "; BigFrameOffsets: " << BigFrameOffsets
+                    << "\n");
   if (BigFrameOffsets ||
       !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
     AFI->setHasStackFrame(true);
@@ -1802,8 +1860,17 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
       CS1Spilled = true;
     }
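The Thumb1 limits chosen above fall straight out of the store encodings quoted in the comments: tSTRspi scales an 8-bit unsigned immediate by 4, so sp-relative slots up to (1 << 8) * 4 = 1024 bytes away are reachable, while tSTRi scales a 5-bit immediate, giving (1 << 5) * 4 = 128 bytes off bp or fp. A sketch of the resulting "might we need an emergency spill slot" test (mayNeedScavengingSlot is a hypothetical reduction of the logic above):

#include <cassert>

constexpr unsigned SPLimit = (1u << 8) * 4; // tSTRspi: imm8, scaled by 4.
constexpr unsigned BPLimit = (1u << 5) * 4; // tSTRi:   imm5, scaled by 4.

bool mayNeedScavengingSlot(unsigned EstimatedStackSize, bool HasBasePointer) {
  unsigned Limit = HasBasePointer ? BPLimit : SPLimit;
  return EstimatedStackSize > Limit;
}

int main() {
  assert(!mayNeedScavengingSlot(512, /*HasBasePointer=*/false)); // sp reaches.
  assert(mayNeedScavengingSlot(512, true));   // imm5 off bp cannot.
  assert(mayNeedScavengingSlot(2048, false)); // Beyond even tSTRspi.
  return 0;
}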
-  // This is true when we inserted a spill for an unused register that can now
-  // be used for register scavenging.
+  // This is true when we inserted a spill for a callee-save GPR which is
+  // not otherwise used by the function. This guarantees it is possible
+  // to scavenge a register to hold the address of a stack slot. On Thumb1,
+  // the register must be a valid operand to tSTRi, i.e. r4-r7. For other
+  // subtargets, this is any GPR, i.e. r4-r11 or lr.
+  //
+  // If we don't insert a spill, we instead allocate an emergency spill
+  // slot, which can be used by scavenging to spill an arbitrary register.
+  //
+  // We currently don't try to figure out whether any specific instruction
+  // requires scavenging an additional register.
   bool ExtraCSSpill = false;
 
   if (AFI->isThumb1OnlyFunction()) {
@@ -1912,7 +1979,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
         NumGPRSpills++;
         CS1Spilled = true;
         assert(!MRI.isReserved(Reg) && "Should not be reserved");
-        if (!MRI.isPhysRegUsed(Reg))
+        if (Reg != ARM::LR && !MRI.isPhysRegUsed(Reg))
           ExtraCSSpill = true;
         UnspilledCS1GPRs.erase(llvm::find(UnspilledCS1GPRs, Reg));
         if (Reg == ARM::LR)
@@ -1937,7 +2004,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
         UnspilledCS1GPRs.erase(LRPos);
 
       ForceLRSpill = false;
-      if (!MRI.isReserved(ARM::LR) && !MRI.isPhysRegUsed(ARM::LR))
+      if (!MRI.isReserved(ARM::LR) && !MRI.isPhysRegUsed(ARM::LR) &&
+          !AFI->isThumb1OnlyFunction())
         ExtraCSSpill = true;
     }
 
@@ -1959,7 +2027,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
           SavedRegs.set(Reg);
           LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
                             << " to make up alignment\n");
-          if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg))
+          if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg) &&
+              !(Reg == ARM::LR && AFI->isThumb1OnlyFunction()))
            ExtraCSSpill = true;
          break;
        }
@@ -1988,8 +2057,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
           unsigned Reg = UnspilledCS1GPRs.back();
           UnspilledCS1GPRs.pop_back();
           if (!MRI.isReserved(Reg) &&
-              (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
-               Reg == ARM::LR)) {
+              (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg))) {
             Extras.push_back(Reg);
             NumExtras--;
           }
@@ -2012,10 +2080,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
             ExtraCSSpill = true;
           }
         }
-        if (!ExtraCSSpill && !AFI->isThumb1OnlyFunction()) {
-          // note: Thumb1 functions spill to R12, not the stack.  Reserve a slot
-          // closest to SP or frame pointer.
+        if (!ExtraCSSpill) {
+          // Reserve a slot closest to SP or frame pointer.
           assert(RS && "Register scavenging not provided");
+          LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n");
           const TargetRegisterClass &RC = ARM::GPRRegClass;
           unsigned Size = TRI->getSpillSize(RC);
           unsigned Align = TRI->getSpillAlignment(RC);
@@ -2028,6 +2096,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
     SavedRegs.set(ARM::LR);
     AFI->setLRIsSpilledForFarJump(true);
   }
+  AFI->setLRIsSpilled(SavedRegs.test(ARM::LR));
 }
 
 MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 2f7e23840e75..7544ca3c38d6 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -1,9 +1,8 @@
 //===- ARMTargetFrameLowering.h - Define frame lowering for ARM -*- C++ -*-===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index d5dacbe08770..0fa32a0abeff 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -1,9 +1,8 @@
 //===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h
index ccf09db69937..b5ac694e01f7 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/lib/Target/ARM/ARMHazardRecognizer.h
@@ -1,9 +1,8 @@
 //===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 8e0e82388251..b349627b67b1 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
 //===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -120,8 +119,7 @@ public:
                            SDValue &Offset, SDValue &Opc);
   bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
                              SDValue &Offset, SDValue &Opc);
-  bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
-                         int Lwb, int Upb, bool FP16);
+  bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset, bool FP16);
   bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
   bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset);
   bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
@@ -131,6 +129,7 @@ public:
 
   // Thumb Addressing Modes:
   bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRRSext(SDValue N, SDValue &Base, SDValue &Offset);
   bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base,
                                 SDValue &OffImm);
   bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
@@ -147,6 +146,9 @@ public:
                                  SDValue &OffImm);
   bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
                                   SDValue &OffImm);
+  template <unsigned Shift>
+  bool SelectT2AddrModeImm7(SDValue N, SDValue &Base,
+                            SDValue &OffImm);
   bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
                              SDValue &OffReg, SDValue &ShImm);
   bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
@@ -452,8 +454,10 @@ unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
   if (Subtarget->isThumb()) {
     if (Val <= 255) return 1;                               // MOV
     if (Subtarget->hasV6T2Ops() &&
-        (Val <= 0xffff || ARM_AM::getT2SOImmValSplatVal(Val) != -1))
-      return 1; // MOVW
+        (Val <= 0xffff ||                                   // MOV
+         ARM_AM::getT2SOImmVal(Val) != -1 ||                // MOVW
+         ARM_AM::getT2SOImmVal(~Val) != -1))                // MVN
+      return 1;
     if (Val <= 510) return 2;                               // MOV + ADDi8
     if (~Val <= 255) return 2;                              // MOV + MVN
     if (ARM_AM::isThumbImmShiftedVal(Val)) return 2;        // MOV + LSL
@@ -463,7 +467,7 @@ unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
     if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
     if (ARM_AM::isSOImmTwoPartVal(Val)) return 2;           // two instrs
   }
-  if (Subtarget->useMovt(*MF)) return 2; // MOVW + MOVT
+  if (Subtarget->useMovt()) return 2;                       // MOVW + MOVT
   return 3;                                                 // Literal pool load
 }
 
@@ -900,7 +904,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
 }
 
 bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
-                                        int Lwb, int Upb, bool FP16) {
+                                        bool FP16) {
   if (!CurDAG->isBaseWithConstantOffset(N)) {
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
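ConstantMaterializationCost above now also counts a single MVN of the complemented value as one instruction on v6t2+. The same ladder in isolation, as a hedged sketch (thumbImmCost is hypothetical, and isT2SOImm stubs ARM_AM::getT2SOImmVal to the plain 8-bit case, ignoring rotated and splatted forms):

#include <cassert>
#include <cstdint>

bool isT2SOImm(uint32_t V) { return V <= 0xff; } // Deliberately simplified.

unsigned thumbImmCost(uint32_t Val, bool HasV6T2) {
  if (Val <= 255) return 1;                          // MOV
  if (HasV6T2 &&
      (Val <= 0xffff ||                              // MOVW
       isT2SOImm(Val) || isT2SOImm(~Val)))           // MOV / MVN
    return 1;
  if (Val <= 510) return 2;                          // MOV + ADDi8
  if (~Val <= 255) return 2;                         // MOV + MVN
  return 3;                                          // e.g. literal pool load
}

int main() {
  assert(thumbImmCost(42, true) == 1);
  assert(thumbImmCost(0xffffff00u, true) == 1);  // Single MVN on v6t2.
  assert(thumbImmCost(0xffffff00u, false) == 2); // MOV + MVN otherwise.
  return 0;
}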
@@ -922,7 +926,7 @@ bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offse
 
   int RHSC;
   const int Scale = FP16 ? 2 : 4;
-  if (isScaledConstantInRange(N.getOperand(1), Scale, Lwb, Upb, RHSC)) {
+  if (isScaledConstantInRange(N.getOperand(1), Scale, -255, 256, RHSC)) {
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
@@ -960,16 +964,12 @@ bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offse
 
 bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
                                       SDValue &Base, SDValue &Offset) {
-  int Lwb = -256 + 1;
-  int Upb = 256;
-  return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ false);
+  return IsAddressingMode5(N, Base, Offset, /*FP16=*/ false);
 }
 
 bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N,
                                           SDValue &Base, SDValue &Offset) {
-  int Lwb = -512 + 1;
-  int Upb = 512;
-  return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ true);
+  return IsAddressingMode5(N, Base, Offset, /*FP16=*/ true);
 }
 
 bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
@@ -1033,8 +1033,22 @@ bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
 // Thumb Addressing Modes
 //===----------------------------------------------------------------------===//
 
-bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
-                                            SDValue &Base, SDValue &Offset){
+static bool shouldUseZeroOffsetLdSt(SDValue N) {
+  // Negative numbers are difficult to materialise in thumb1. If we are
+  // selecting the add of a negative, instead try to select ri with a zero
+  // offset, so create the add node directly which will become a sub.
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+
+  // Look for an imm which is not legal for ld/st, but is legal for sub.
+  if (auto C = dyn_cast<ConstantSDNode>(N.getOperand(1)))
+    return C->getSExtValue() < 0 && C->getSExtValue() >= -255;
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeRRSext(SDValue N, SDValue &Base,
+                                                SDValue &Offset) {
   if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) {
     ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
     if (!NC || !NC->isNullValue())
@@ -1049,9 +1063,22 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
   return true;
 }
 
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N, SDValue &Base,
+                                            SDValue &Offset) {
+  if (shouldUseZeroOffsetLdSt(N))
+    return false; // Select ri instead
+  return SelectThumbAddrModeRRSext(N, Base, Offset);
+}
+
 bool
 ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
                                           SDValue &Base, SDValue &OffImm) {
+  if (shouldUseZeroOffsetLdSt(N)) {
+    Base = N;
+    OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+    return true;
+  }
+
   if (!CurDAG->isBaseWithConstantOffset(N)) {
     if (N.getOpcode() == ISD::ADD) {
       return false; // We want to select register offset instead
@@ -1117,25 +1144,28 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
   if (!CurDAG->isBaseWithConstantOffset(N))
     return false;
 
-  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
-  if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
-      (LHSR && LHSR->getReg() == ARM::SP)) {
+  if (N.getOperand(0).getOpcode() == ISD::FrameIndex) {
     // If the RHS is + imm8 * scale, fold into addr mode.
     int RHSC;
     if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) {
       Base = N.getOperand(0);
-      if (Base.getOpcode() == ISD::FrameIndex) {
-        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      // Make sure the offset is inside the object, or we might fail to
+      // allocate an emergency spill slot. (An out-of-range access is UB, but
+      // it could show up anyway.)
+      MachineFrameInfo &MFI = MF->getFrameInfo();
+      if (RHSC * 4 < MFI.getObjectSize(FI)) {
         // For LHS+RHS to result in an offset that's a multiple of 4 the object
         // indexed by the LHS must be 4-byte aligned.
-        MachineFrameInfo &MFI = MF->getFrameInfo();
-        if (MFI.getObjectAlignment(FI) < 4)
+        if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4)
           MFI.setObjectAlignment(FI, 4);
-        Base = CurDAG->getTargetFrameIndex(
-            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+        if (MFI.getObjectAlignment(FI) >= 4) {
+          Base = CurDAG->getTargetFrameIndex(
+              FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+          OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+          return true;
+        }
       }
-      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
-      return true;
     }
   }
 
@@ -1248,6 +1278,35 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
   return false;
 }
 
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N,
+                                           SDValue &Base, SDValue &OffImm) {
+  if (N.getOpcode() == ISD::SUB ||
+      CurDAG->isBaseWithConstantOffset(N)) {
+    if (auto RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if (N.getOpcode() == ISD::SUB)
+        RHSC = -RHSC;
+
+      if (isShiftedInt<7, Shift>(RHSC)) {
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(
+              FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+        }
+        OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+  return true;
+}
+
 bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
                                             SDValue &Base,
                                             SDValue &OffReg, SDValue &ShImm) {
@@ -2072,10 +2131,12 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
   default: llvm_unreachable("unhandled vld/vst lane type");
   // Double-register operations:
   case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4f16:
   case MVT::v4i16: OpcodeIndex = 1; break;
   case MVT::v2f32:
   case MVT::v2i32: OpcodeIndex = 2; break;
   // Quad-register operations:
+  case MVT::v8f16:
   case MVT::v8i16: OpcodeIndex = 0; break;
   case MVT::v4f32:
   case MVT::v4i32: OpcodeIndex = 1; break;
@@ -2192,7 +2253,10 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
   case MVT::v8i8:
   case MVT::v16i8: OpcodeIndex = 0; break;
   case MVT::v4i16:
-  case MVT::v8i16: OpcodeIndex = 1; break;
+  case MVT::v8i16:
+  case MVT::v4f16:
+  case MVT::v8f16:
+                   OpcodeIndex = 1; break;
   case MVT::v2f32:
   case MVT::v2i32:
   case MVT::v4f32:
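SelectT2AddrModeImm7 above accepts exactly the offsets isShiftedInt<7, Shift> (from llvm/Support/MathExtras.h) admits: a multiple of 2^Shift whose scaled value fits in a signed 7-bit field. A standalone restatement (shiftedInt is a hypothetical equivalent of the LLVM helper):

#include <cassert>
#include <cstdint>

template <unsigned N, unsigned S> bool shiftedInt(int64_t X) {
  if (X % (int64_t(1) << S) != 0)
    return false; // Must be a multiple of 2^S.
  int64_t Scaled = X >> S;
  return Scaled >= -(int64_t(1) << (N - 1)) && Scaled < (int64_t(1) << (N - 1));
}

int main() {
  // With Shift == 2 the legal offsets are multiples of 4 in [-256, 252].
  assert(shiftedInt<7, 2>(-256));
  assert(shiftedInt<7, 2>(252));
  assert(!shiftedInt<7, 2>(254)); // Not a multiple of 4.
  assert(!shiftedInt<7, 2>(256)); // One step past the top of the range.
  return 0;
}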
@@ -2577,6 +2641,44 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
 
   switch (N->getOpcode()) {
   default: break;
+  case ISD::STORE: {
+    // For Thumb1, match an sp-relative store in C++. This is a little
+    // unfortunate, but I don't think I can make the chain check work
+    // otherwise. (The chain of the store has to be the same as the chain
+    // of the CopyFromReg, or else we can't replace the CopyFromReg with
+    // a direct reference to "SP".)
+    //
+    // This is only necessary on Thumb1 because Thumb1 sp-relative stores use
+    // a different addressing mode from other four-byte stores.
+    //
+    // This pattern usually comes up with call arguments.
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    SDValue Ptr = ST->getBasePtr();
+    if (Subtarget->isThumb1Only() && ST->isUnindexed()) {
+      int RHSC = 0;
+      if (Ptr.getOpcode() == ISD::ADD &&
+          isScaledConstantInRange(Ptr.getOperand(1), /*Scale=*/4, 0, 256, RHSC))
+        Ptr = Ptr.getOperand(0);
+
+      if (Ptr.getOpcode() == ISD::CopyFromReg &&
+          cast<RegisterSDNode>(Ptr.getOperand(1))->getReg() == ARM::SP &&
+          Ptr.getOperand(0) == ST->getChain()) {
+        SDValue Ops[] = {ST->getValue(),
+                         CurDAG->getRegister(ARM::SP, MVT::i32),
+                         CurDAG->getTargetConstant(RHSC, dl, MVT::i32),
+                         getAL(CurDAG, dl),
+                         CurDAG->getRegister(0, MVT::i32),
+                         ST->getChain()};
+        MachineSDNode *ResNode =
+            CurDAG->getMachineNode(ARM::tSTRspi, dl, MVT::Other, Ops);
+        MachineMemOperand *MemOp = ST->getMemOperand();
+        CurDAG->setNodeMemRefs(cast<MachineSDNode>(ResNode), {MemOp});
+        ReplaceNode(N, ResNode);
+        return;
+      }
+    }
+    break;
+  }
   case ISD::WRITE_REGISTER:
     if (tryWriteRegister(N))
       return;
@@ -2586,6 +2688,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       return;
     break;
   case ISD::INLINEASM:
+  case ISD::INLINEASM_BR:
     if (tryInlineAsm(N))
       return;
     break;
@@ -2895,6 +2998,16 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     // Other cases are autogenerated.
     break;
   }
+  case ARMISD::WLS: {
+    SDValue Ops[] = { N->getOperand(1),   // Loop count
+                      N->getOperand(2),   // Exit target
+                      N->getOperand(0) };
+    SDNode *LoopStart =
+        CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops);
+    ReplaceUses(N, LoopStart);
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
   case ARMISD::BRCOND: {
     // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
     // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
@@ -2922,6 +3035,36 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     unsigned CC = (unsigned) cast<ConstantSDNode>(N2)->getZExtValue();
 
     if (InFlag.getOpcode() == ARMISD::CMPZ) {
+      if (InFlag.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+        SDValue Int = InFlag.getOperand(0);
+        uint64_t ID = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
+
+        // Handle low-overhead loops.
+        if (ID == Intrinsic::loop_decrement_reg) {
+          SDValue Elements = Int.getOperand(2);
+          SDValue Size = CurDAG->getTargetConstant(
+              cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl,
+              MVT::i32);
+
+          SDValue Args[] = { Elements, Size, Int.getOperand(0) };
+          SDNode *LoopDec =
+              CurDAG->getMachineNode(ARM::t2LoopDec, dl,
+                                     CurDAG->getVTList(MVT::i32, MVT::Other),
+                                     Args);
+          ReplaceUses(Int.getNode(), LoopDec);
+
+          SDValue EndArgs[] = { SDValue(LoopDec, 0), N1, Chain };
+          SDNode *LoopEnd =
+              CurDAG->getMachineNode(ARM::t2LoopEnd, dl, MVT::Other, EndArgs);
+
+          ReplaceUses(N, LoopEnd);
+          CurDAG->RemoveDeadNode(N);
+          CurDAG->RemoveDeadNode(InFlag.getNode());
+          CurDAG->RemoveDeadNode(Int.getNode());
+          return;
+        }
+      }
+
       bool SwitchEQNEToPLMI;
       SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI);
       InFlag = N->getOperand(4);
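The ARMISD::WLS and loop_decrement_reg selections above target the v8.1-M low-overhead-loop pseudos: t2WhileLoopStart branches to the exit when the trip count is already zero, and t2LoopDec/t2LoopEnd decrement the counter and branch back while it stays non-zero. Their intended control flow, sketched at the C++ level (sumFirst is purely illustrative):

#include <cassert>

int sumFirst(const int *Data, unsigned Count) {
  int Sum = 0;
  if (Count == 0) // t2WhileLoopStart: skip the loop body entirely.
    return Sum;
  do {
    Sum += *Data++;
  } while (--Count != 0); // t2LoopDec + t2LoopEnd: decrement, branch back.
  return Sum;
}

int main() {
  int D[] = {1, 2, 3};
  assert(sumFirst(D, 3) == 6);
  assert(sumFirst(D, 0) == 0);
  return 0;
}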
@@ -3979,9 +4122,9 @@ bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){
   // If an opcode was found then we can lower the read to a VFP instruction.
   if (Opcode) {
-    if (!Subtarget->hasVFP2())
+    if (!Subtarget->hasVFP2Base())
       return false;
-    if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8())
+    if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8Base())
       return false;
 
     Ops = { getAL(CurDAG, DL),
             CurDAG->getRegister(0, MVT::i32),
@@ -4090,7 +4233,7 @@ bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){
                     .Default(0);
 
   if (Opcode) {
-    if (!Subtarget->hasVFP2())
+    if (!Subtarget->hasVFP2Base())
       return false;
     Ops = { N->getOperand(2),
             getAL(CurDAG, DL),
             CurDAG->getRegister(0, MVT::i32),
             N->getOperand(0) };
@@ -4290,7 +4433,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
   if (!Changed)
     return false;
 
-  SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
+  SDValue New = CurDAG->getNode(N->getOpcode(), SDLoc(N),
       CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
   New->setNodeId(-1);
   ReplaceNode(N, New.getNode());
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 21de0f6a7630..18bb9bf3eccc 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1,9 +1,8 @@
 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -80,6 +79,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
@@ -113,6 +113,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "arm-isel"
 
@@ -220,6 +221,121 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 }
 
+void ARMTargetLowering::setAllExpand(MVT VT) {
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, VT, Expand);
+
+  // We support these really simple operations even on types where all
+  // the actual arithmetic has to be broken down into simpler
+  // operations or turned into library calls.
+  setOperationAction(ISD::BITCAST, VT, Legal);
+  setOperationAction(ISD::LOAD, VT, Legal);
+  setOperationAction(ISD::STORE, VT, Legal);
+  setOperationAction(ISD::UNDEF, VT, Legal);
+}
+
+void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
+                                       LegalizeAction Action) {
+  setLoadExtAction(ISD::EXTLOAD,  From, To, Action);
+  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
+  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
+}
+
+void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
+  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
+
+  for (auto VT : IntTypes) {
+    addRegisterClass(VT, &ARM::QPRRegClass);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+    setOperationAction(ISD::SHL, VT, Custom);
+    setOperationAction(ISD::SRA, VT, Custom);
+    setOperationAction(ISD::SRL, VT, Custom);
+    setOperationAction(ISD::SMIN, VT, Legal);
+    setOperationAction(ISD::SMAX, VT, Legal);
+    setOperationAction(ISD::UMIN, VT, Legal);
+    setOperationAction(ISD::UMAX, VT, Legal);
+    setOperationAction(ISD::ABS, VT, Legal);
+
+    // No native support for these.
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+
+    if (!HasMVEFP) {
+      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+    }
+  }
+
+  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
+  for (auto VT : FloatTypes) {
+    addRegisterClass(VT, &ARM::QPRRegClass);
+    if (!HasMVEFP)
+      setAllExpand(VT);
+
+    // These are legal or custom whether we have MVE.fp or not
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+
+    if (HasMVEFP) {
+      setOperationAction(ISD::FMINNUM, VT, Legal);
+      setOperationAction(ISD::FMAXNUM, VT, Legal);
+      setOperationAction(ISD::FROUND, VT, Legal);
+
+      // No native support for these.
+      setOperationAction(ISD::FDIV, VT, Expand);
+      setOperationAction(ISD::FREM, VT, Expand);
+      setOperationAction(ISD::FSQRT, VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FPOW, VT, Expand);
+      setOperationAction(ISD::FLOG, VT, Expand);
+      setOperationAction(ISD::FLOG2, VT, Expand);
+      setOperationAction(ISD::FLOG10, VT, Expand);
+      setOperationAction(ISD::FEXP, VT, Expand);
+      setOperationAction(ISD::FEXP2, VT, Expand);
+      setOperationAction(ISD::FNEARBYINT, VT, Expand);
+    }
+  }
+
+  // We 'support' these types up to bitcast/load/store level, regardless of
+  // MVE integer-only / float support. Only doing FP data processing on the FP
+  // vector types is inhibited at integer-only level.
+ const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; + for (auto VT : LongTypes) { + addRegisterClass(VT, &ARM::QPRRegClass); + setAllExpand(VT); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + } + // We can do bitwise operations on v2i64 vectors + setOperationAction(ISD::AND, MVT::v2i64, Legal); + setOperationAction(ISD::OR, MVT::v2i64, Legal); + setOperationAction(ISD::XOR, MVT::v2i64, Legal); + + // It is legal to extload from v4i8 to v4i16 or v4i32. + addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal); + addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal); + addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal); + + // Some truncating stores are legal too. + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); +} + ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -240,7 +356,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->isTargetMachO()) { // Uses VFP for Thumb libfuncs if available. - if (Subtarget->isThumb() && Subtarget->hasVFP2() && + if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { static const struct { const RTLIB::Libcall Op; @@ -509,10 +625,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, else addRegisterClass(MVT::i32, &ARM::GPRRegClass); - if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && - !Subtarget->isThumb1Only()) { + if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && + Subtarget->hasFPRegs()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); addRegisterClass(MVT::f64, &ARM::DPRRegClass); + if (!Subtarget->hasVFP2Base()) + setAllExpand(MVT::f32); + if (!Subtarget->hasFP64()) + setAllExpand(MVT::f64); } if (Subtarget->hasFullFP16()) { @@ -528,9 +648,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + addAllExtLoads(VT, InnerVT, Expand); } setOperationAction(ISD::MULHS, VT, Expand); @@ -547,6 +665,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); + if (Subtarget->hasMVEIntegerOps()) + addMVEVectorTypes(Subtarget->hasMVEFloatOps()); + + // Combine low-overhead loop intrinsics so that we can lower i1 types. + if (Subtarget->hasLOB()) + setTargetDAGCombine(ISD::BRCOND); + if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); @@ -565,11 +690,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); addDRTypeForNEON(MVT::v4f16); } + } + if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { // v2f64 is legal so that QR subregs can be extracted as f64 elements, but - // neither Neon nor VFP support any arithmetic operations on it. - // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively - // supported for v4f32. + // none of Neon, MVE or VFP supports any arithmetic operations on it. 
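// Standalone illustration (plain C++, no LLVM types) of what the extending
// loads and truncating stores marked Legal in addMVEVectorTypes above
// compute: a v4i8 in memory is widened per lane while being loaded, and a
// truncating store keeps only the low bits of each lane on the way out.
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t Mem[4] = {0x01, 0x7f, 0x80, 0xff}; // v4i8 in memory
  int32_t SExt[4];   // result of a SEXTLOAD to v4i32
  uint32_t ZExt[4];  // result of a ZEXTLOAD to v4i32
  for (int i = 0; i < 4; ++i) {
    SExt[i] = static_cast<int8_t>(Mem[i]);
    ZExt[i] = Mem[i];
  }
  // Lane 0x80 reads back as -128 sign-extended but 128 zero-extended.
  for (int i = 0; i < 4; ++i)
    std::printf("lane %d: sext=%d zext=%u\n", i, (int)SExt[i], (unsigned)ZExt[i]);

  uint16_t Wide[4] = {0x0102, 0x00ff, 0xabcd, 0x8000}; // v4i16 in registers
  uint8_t Narrow[4];                                   // v4i8 truncstore
  for (int i = 0; i < 4; ++i)
    Narrow[i] = uint8_t(Wide[i]); // the high byte of each lane is dropped
  std::printf("truncstore: %02x %02x %02x %02x\n",
              Narrow[0], Narrow[1], Narrow[2], Narrow[3]);
}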
setOperationAction(ISD::FADD, MVT::v2f64, Expand); setOperationAction(ISD::FSUB, MVT::v2f64, Expand); setOperationAction(ISD::FMUL, MVT::v2f64, Expand); @@ -603,7 +728,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); setOperationAction(ISD::FMA, MVT::v2f64, Expand); + } + if (Subtarget->hasNEON()) { + // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively + // supported for v4f32. setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); setOperationAction(ISD::FSIN, MVT::v4f32, Expand); setOperationAction(ISD::FCOS, MVT::v4f32, Expand); @@ -697,7 +826,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); // NEON only has FMA instructions as of VFP4. - if (!Subtarget->hasVFP4()) { + if (!Subtarget->hasVFP4Base()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); setOperationAction(ISD::FMA, MVT::v4f32, Expand); } @@ -711,9 +840,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::BUILD_VECTOR); - setTargetDAGCombine(ISD::VECTOR_SHUFFLE); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); @@ -731,7 +857,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } - if (Subtarget->isFPOnlySP()) { + if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + } + + if (!Subtarget->hasFP64()) { // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which // are present. However, no double-precision operations other than moves, @@ -767,9 +899,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + } + + if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); } + if (!Subtarget->hasFP16()) + setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + + if (!Subtarget->hasFP64()) + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + computeRegisterProperties(Subtarget->getRegisterInfo()); // ARM does not have floating-point extending loads. @@ -832,6 +974,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRA, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + // MVE lowers 64-bit shifts to lsll and lsrl, + // assuming that ISD::SRL and SRA of i64 are already marked custom. + if (Subtarget->hasMVEIntegerOps()) + setOperationAction(ISD::SHL, MVT::i64, Custom); + // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
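// Hedged sketch (not the compiler-rt source) of the word-pair shift that an
// __aeabi_llsl-style helper performs when Thumb1 expands a 64-bit SHL: the
// i64 lives in two 32-bit halves and the bits shifted out of the low word
// are carried into the high word.
#include <cstdint>
#include <cstdio>

static uint64_t llsl(uint32_t Lo, uint32_t Hi, unsigned Amt) {
  uint32_t NewLo, NewHi;
  if (Amt == 0) {
    NewLo = Lo;
    NewHi = Hi;
  } else if (Amt < 32) {
    NewLo = Lo << Amt;
    NewHi = (Hi << Amt) | (Lo >> (32 - Amt)); // bits crossing the seam
  } else { // 32 <= Amt < 64
    NewLo = 0;
    NewHi = Lo << (Amt - 32); // the low word moves wholesale into the high word
  }
  return (uint64_t(NewHi) << 32) | NewLo;
}

int main() {
  uint64_t V = 0x0000000180000000ULL;
  uint64_t R = llsl(uint32_t(V), uint32_t(V >> 32), 4);
  std::printf("%llx == %llx\n", (unsigned long long)R,
              (unsigned long long)(V << 4)); // both print 1800000000
}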
if (Subtarget->isThumb1Only()) { setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); @@ -1029,7 +1176,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && + if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. @@ -1079,7 +1226,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); - if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && + if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); @@ -1087,7 +1234,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); - if (!Subtarget->hasVFP4()) { + if (!Subtarget->hasVFP4Base()) { setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); } @@ -1095,7 +1242,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Various VFP goodness if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. - if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) { + if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); } @@ -1115,7 +1262,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // FP-ARMv8 implements a lot of rounding-like FP operations. 
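// Standalone sketch (not LLVM code) of the one f64 operation that stays cheap
// without double-precision hardware: moving raw bits between a double and a
// pair of 32-bit GPRs, which is the VMOVRRD / VMOVDRR round trip the hunk
// above keeps custom-lowered.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  double D = 1.5;
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof D);    // f64 -> i64, the VMOVRRD direction
  uint32_t Lo = uint32_t(Bits);        // would land in the first GPR
  uint32_t Hi = uint32_t(Bits >> 32);  // would land in the second GPR

  uint64_t Back = (uint64_t(Hi) << 32) | Lo;
  double D2;
  std::memcpy(&D2, &Back, sizeof D2);  // i64 -> f64, the VMOVDRR direction
  std::printf("hi=%08x lo=%08x roundtrip=%g\n", (unsigned)Hi, (unsigned)Lo, D2);
}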
- if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasFPARMv8Base()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FROUND, MVT::f32, Legal); @@ -1124,12 +1271,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FRINT, MVT::f32, Legal); setOperationAction(ISD::FMINNUM, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + if (Subtarget->hasNEON()) { + setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + } - if (!Subtarget->isFPOnlySP()) { + if (Subtarget->hasFP64()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FROUND, MVT::f64, Legal); @@ -1141,6 +1290,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } + // FP16 operations often need to be promoted to call lib functions + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::FPOWI, MVT::f16, Promote); + setOperationAction(ISD::FPOW, MVT::f16, Promote); + setOperationAction(ISD::FEXP, MVT::f16, Promote); + setOperationAction(ISD::FEXP2, MVT::f16, Promote); + setOperationAction(ISD::FLOG, MVT::f16, Promote); + setOperationAction(ISD::FLOG10, MVT::f16, Promote); + setOperationAction(ISD::FLOG2, MVT::f16, Promote); + + setOperationAction(ISD::FROUND, MVT::f16, Legal); + } + if (Subtarget->hasNEON()) { // vmin and vmax aren't available in a scalar form, so we use // a NEON instruction with an undef lane instead. @@ -1177,11 +1344,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); + if (Subtarget->isThumb1Only()) + setTargetDAGCombine(ISD::SHL); setStackPointerRegisterToSaveRestore(ARM::SP); if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || - !Subtarget->hasVFP2()) + !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) setSchedulingPreference(Sched::RegPressure); else setSchedulingPreference(Sched::Hybrid); @@ -1204,6 +1373,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setPrefLoopAlignment(Subtarget->getPrefLoopAlignment()); setMinFunctionAlignment(Subtarget->isThumb() ?
1 : 2); + + if (Subtarget->isThumb() || Subtarget->isThumb2()) + setTargetDAGCombine(ISD::ABS); } bool ARMTargetLowering::useSoftFloat() const { @@ -1288,6 +1460,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::SSAT: return "ARMISD::SSAT"; case ARMISD::USAT: return "ARMISD::USAT"; + case ARMISD::ASRL: return "ARMISD::ASRL"; + case ARMISD::LSRL: return "ARMISD::LSRL"; + case ARMISD::LSLL: return "ARMISD::LSLL"; + case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; @@ -1332,23 +1508,25 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VCGTU: return "ARMISD::VCGTU"; case ARMISD::VTST: return "ARMISD::VTST"; - case ARMISD::VSHL: return "ARMISD::VSHL"; - case ARMISD::VSHRs: return "ARMISD::VSHRs"; - case ARMISD::VSHRu: return "ARMISD::VSHRu"; - case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; - case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; - case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; - case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; - case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; - case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; - case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; - case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; - case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; - case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; - case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; - case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; - case ARMISD::VSLI: return "ARMISD::VSLI"; - case ARMISD::VSRI: return "ARMISD::VSRI"; + case ARMISD::VSHLs: return "ARMISD::VSHLs"; + case ARMISD::VSHLu: return "ARMISD::VSHLu"; + case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM"; + case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM"; + case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM"; + case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM"; + case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM"; + case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM"; + case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM"; + case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM"; + case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM"; + case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM"; + case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM"; + case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM"; + case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM"; + case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM"; + case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM"; + case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM"; + case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; @@ -1410,6 +1588,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; + case ARMISD::WLS: return "ARMISD::WLS"; } return nullptr; } @@ -1423,11 +1602,14 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, /// getRegClassFor - Return the register class that should be used for the /// specified value type. 
-const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { +const TargetRegisterClass * +ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { + (void)isDivergent; // Map v4i64 to QQ registers but do not make the type legal. Similarly map // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to - // load / store 4 to 8 consecutive D registers. - if (Subtarget->hasNEON()) { + // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive + // MVE Q registers. + if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { if (VT == MVT::v4i64) return &ARM::QQPRRegClass; if (VT == MVT::v8i64) @@ -1590,8 +1772,6 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, // Calling Convention Implementation //===----------------------------------------------------------------------===// -#include "ARMGenCallingConv.inc" - /// getEffectiveCallingConv - Get the effective calling convention, taking into /// account presence of floating point hardware and calling convention /// limitations, such as support for variadic functions. @@ -1613,7 +1793,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, case CallingConv::C: if (!Subtarget->isAAPCS_ABI()) return CallingConv::ARM_APCS; - else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && + else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && getTargetMachine().Options.FloatABIType == FloatABI::Hard && !isVarArg) return CallingConv::ARM_AAPCS_VFP; @@ -1622,10 +1802,11 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, case CallingConv::Fast: case CallingConv::CXX_FAST_TLS: if (!Subtarget->isAAPCS_ABI()) { - if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) + if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; return CallingConv::ARM_APCS; - } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) + } else if (Subtarget->hasVFP2Base() && + !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::ARM_AAPCS_VFP; else return CallingConv::ARM_AAPCS; @@ -1807,29 +1988,42 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); - bool isThisReturn = false; - bool isSibCall = false; + bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool isThisReturn = false; auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); + bool PreferIndirect = false; // Disable tail calls if they're not supported. if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") isTailCall = false; + if (isa<GlobalAddressSDNode>(Callee)) { + // If we're optimizing for minimum size and the function is called three or + // more times in this block, we can improve codesize by calling indirectly + // as BLXr has a 16-bit encoding. + auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); + if (CLI.CS) { + auto *BB = CLI.CS.getParent(); + PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && + count_if(GV->users(), [&BB](const User *U) { + return isa<Instruction>(U) && + cast<Instruction>(U)->getParent() == BB; + }) > 2; + } + } if (isTailCall) { // Check if it's really possible to do a tail call.
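// Illustrative stand-alone model (plain STL, no SelectionDAG) of the
// PreferIndirect heuristic added above: under minsize on Thumb, if the same
// global is called more than twice in one basic block, materializing its
// address once and using the 16-bit BLXr encoding tends to beat repeated
// 32-bit BL encodings. The Call/Block types here are invented for the example.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Call {
  int Block;          // parent basic block of the call instruction
  const void *Callee; // pointer identity stands in for the GlobalValue
};

static bool preferIndirect(const std::vector<Call> &Users, int BB,
                           const void *GV) {
  return std::count_if(Users.begin(), Users.end(), [&](const Call &C) {
           return C.Block == BB && C.Callee == GV;
         }) > 2;
}

int main() {
  int Foo; // address used only as an identity token
  std::vector<Call> Users = {{0, &Foo}, {0, &Foo}, {0, &Foo}, {1, &Foo}};
  std::printf("block 0: %s\n",
              preferIndirect(Users, 0, &Foo) ? "indirect" : "direct");
  std::printf("block 1: %s\n",
              preferIndirect(Users, 1, &Foo) ? "indirect" : "direct");
}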
- isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, - isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), - Outs, OutVals, Ins, DAG); + isTailCall = IsEligibleForTailCallOptimization( + Callee, CallConv, isVarArg, isStructRet, + MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, + PreferIndirect); if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. - if (isTailCall) { + if (isTailCall) ++NumTailCalls; - isSibCall = true; - } } // Analyze operands of the call, assigning locations to each operand. @@ -1841,14 +2035,14 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); - // For tail calls, memory operands are available in our caller's stack. - if (isSibCall) + if (isTailCall) { + // For tail calls, memory operands are available in our caller's stack. NumBytes = 0; - - // Adjust the stack pointer for the new arguments... - // These operations are automatically eliminated by the prolog/epilog pass - if (!isSibCall) + } else { + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); + } SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); @@ -1970,7 +2164,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); } - } else if (!isSibCall) { + } else if (!isTailCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1984,32 +2178,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; - // Tail call byval lowering might overwrite argument registers so in case of - // tail call optimization the copies to registers are lowered later. - if (!isTailCall) - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - - // For tail calls lower the arguments to the 'real' stack slot. - if (isTailCall) { - // Force all the incoming stack arguments to be loaded from the stack - // before any new outgoing arguments are stored to the stack, because the - // outgoing stack slots may alias the incoming argument stack slots, and - // the alias isn't otherwise explicit. This is slightly more conservative - // than necessary, because it means that each store effectively depends - // on every argument instead of just those arguments it would clobber. - - // Do not flag preceding copytoreg stuff together with the following stuff. 
- InFlag = SDValue(); - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - InFlag = SDValue(); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every @@ -2064,17 +2236,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } else if (isa<GlobalAddressSDNode>(Callee)) { - // If we're optimizing for minimum size and the function is called three or - // more times in this block, we can improve codesize by calling indirectly - // as BLXr has a 16-bit encoding. - auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - auto *BB = CLI.CS.getParent(); - bool PreferIndirect = - Subtarget->isThumb() && MF.getFunction().optForMinSize() && - count_if(GV->users(), [&BB](const User *U) { - return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB; - }) > 2; - if (!PreferIndirect) { isDirect = true; bool isDef = GV->isStrongDefinitionForLinker(); @@ -2098,7 +2259,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned TargetFlags = GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG; - Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, TargetFlags); if (GV->hasDLLImportStorageClass()) Callee = @@ -2142,7 +2303,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallOpc = ARMISD::CALL_NOLINK; else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && // Emit regular call when code size is the priority - !MF.getFunction().optForMinSize()) + !Subtarget->hasMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else @@ -2306,28 +2467,25 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. -bool -ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool isVarArg, - bool isCalleeStructRet, - bool isCallerStructRet, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG& DAG) const { +bool ARMTargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG, + const bool isIndirect) const { MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); assert(Subtarget->supportsTailCall()); - // Tail calls to function pointers cannot be optimized for Thumb1 if the args + // Indirect tail calls cannot be optimized for Thumb1 if the args // to the call take up r0-r3. The reason is that there are no legal registers // left to hold the pointer to the function to be called.
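// Two C-level shapes for the Thumb1 rule above (illustrative only; the
// register assignments in the comments are the AAPCS ones and approximate):
// with four outgoing word arguments, r0-r3 are all live at the branch, so no
// free low register remains to hold the target of an indirect tail call.
#include <cstdio>

typedef int (*Fn4)(int, int, int, int);
typedef int (*Fn3)(int, int, int);

static int sum4(int a, int b, int c, int d) { return a + b + c + d; }
static int sum3(int a, int b, int c) { return a + b + c; }

static int noTail(Fn4 Fn, int A, int B, int C, int D) {
  return Fn(A, B, C, D); // outgoing args fill r0-r3; Fn needs a fifth register
}

static int canTail(Fn3 Fn, int A, int B, int C) {
  return Fn(A, B, C);    // only r0-r2 are taken, so Fn can sit in r3
}

int main() {
  std::printf("%d %d\n", noTail(sum4, 1, 2, 3, 4), canTail(sum3, 1, 2, 3));
}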
if (Subtarget->isThumb1Only() && Outs.size() >= 4 && - !isa<GlobalAddressSDNode>(Callee.getNode())) - return false; + (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) + return false; // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. @@ -2756,7 +2914,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, auto M = const_cast<Module*>(DAG.getMachineFunction(). getFunction().getParent()); auto GV = new GlobalVariable( - *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C, + *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + Twine(AFI->createPICLabelUId()) @@ -3225,7 +3383,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else if (Subtarget->isRWPI() && !IsRO) { // SB-relative. SDValue RelAddr; - if (Subtarget->useMovt(DAG.getMachineFunction())) { + if (Subtarget->useMovt()) { ++NumMovwMovt; SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); @@ -3245,7 +3403,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, // If we have T2 ops, we can materialize the address directly via movt/movw // pair. This is always cheaper. - if (Subtarget->useMovt(DAG.getMachineFunction())) { + if (Subtarget->useMovt()) { ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. @@ -3268,7 +3426,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - if (Subtarget->useMovt(DAG.getMachineFunction())) + if (Subtarget->useMovt()) ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register @@ -3288,7 +3446,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); - assert(Subtarget->useMovt(DAG.getMachineFunction()) && + assert(Subtarget->useMovt() && "Windows on ARM expects to use movw/movt"); assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && "ROPI/RWPI not currently supported for Windows"); @@ -3309,7 +3467,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, TargetFlags)); if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, @@ -3615,7 +3773,8 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, // argument passed via stack. int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(), - CCInfo.getNextStackOffset(), 4); + CCInfo.getNextStackOffset(), + std::max(4U, TotalArgRegsSaveSize)); AFI->setVarArgsFrameIndex(FrameIndex); } @@ -3891,6 +4050,22 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + + // If the RHS is a constant zero then the V (overflow) flag will never be
This can allow us to simplify GE to PL or LT to MI, which can be + // simpler for other passes (like the peephole optimiser) to deal with. + if (isNullConstant(RHS)) { + switch (CondCode) { + default: break; + case ARMCC::GE: + CondCode = ARMCC::PL; + break; + case ARMCC::LT: + CondCode = ARMCC::MI; + break; + } + } + ARMISD::NodeType CompareType; switch (CondCode) { default: @@ -3910,7 +4085,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl, bool InvalidOnQNaN) const { - assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); + assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); SDValue Cmp; SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32); if (!isFloatingPointZero(RHS)) @@ -4175,18 +4350,18 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, // Start by selecting the GE condition code for opcodes that return true for // 'equality' if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || - CC == ISD::SETULE) + CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) CondCode = ARMCC::GE; // and GT for opcodes that return false for 'equality'. else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || - CC == ISD::SETULT) + CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) CondCode = ARMCC::GT; // Since we are constrained to GE/GT, if the opcode contains 'less', we need // to swap the compare operands. if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || - CC == ISD::SETULT) + CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) swpCmpOps = true; // Both GT and GE are ordered comparisons, and return false for 'unordered'. @@ -4212,8 +4387,9 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, } // 'unordered or not equal' is 'anything but equal', so use the EQ condition - // code and swap the VSEL operands. - if (CC == ISD::SETUNE) { + // code and swap the VSEL operands. Also do this if we don't care about the + // unordered case. 
+ if (CC == ISD::SETUNE || CC == ISD::SETNE) { CondCode = ARMCC::EQ; swpVselOps = true; } @@ -4222,7 +4398,7 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const { - if (Subtarget->isFPOnlySP() && VT == MVT::f64) { + if (!Subtarget->hasFP64() && VT == MVT::f64) { FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), FalseVal); TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, @@ -4428,6 +4604,16 @@ static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, return false; } +bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const { + if (VT == MVT::f32) + return !Subtarget->hasVFP2Base(); + if (VT == MVT::f64) + return !Subtarget->hasFP64(); + if (VT == MVT::f16) + return !Subtarget->hasFullFP16(); + return false; +} + SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); @@ -4471,9 +4657,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); - if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { - DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, - dl); + if (isUnsupportedFloatingType(LHS.getValueType())) { + DAG.getTargetLoweringInfo().softenSetCCOperands( + DAG, LHS.getValueType(), LHS, RHS, CC, dl); // If softenSetCCOperands only returned one value, we should compare it to // zero. @@ -4494,8 +4680,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // inverting the compare condition, swapping 'less' and 'greater') and // sometimes need to swap the operands to the VSEL (which inverts the // condition in the sense of firing whenever the previous condition didn't) - if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || - TrueVal.getValueType() == MVT::f64)) { + if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 || + TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { ARMCC::CondCodes CondCode = IntCCToARMCC(CC); if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || CondCode == ARMCC::VC || CondCode == ARMCC::NE) { @@ -4507,6 +4694,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + // Choose GE over PL, which vsel does not support + if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL) + ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32); return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); } @@ -4514,12 +4704,15 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { bool InvalidOnQNaN; FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); - // Normalize the fp compare. If RHS is zero we keep it there so we match - // CMPFPw0 instead of CMPFP. - if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) && - (TrueVal.getValueType() == MVT::f16 || - TrueVal.getValueType() == MVT::f32 || - TrueVal.getValueType() == MVT::f64)) { + // Normalize the fp compare. If RHS is zero we prefer to keep it there so we + // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we + // must use VSEL (limited condition codes), due to not having conditional f16 + // moves.
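// Self-contained scalar model (not the LLVM routine) of the VSEL constraint
// handling used around here: the instruction only exists for GE/GT/EQ/VS, so
// "less" comparisons swap the compare operands and SETNE selects on EQ with
// the select operands swapped. NaN-ordering subtleties are ignored here.
#include <cstdio>

static float vselGE(float A, float B, float T, float F) { return A >= B ? T : F; }
static float vselGT(float A, float B, float T, float F) { return A > B ? T : F; }
static float vselEQ(float A, float B, float T, float F) { return A == B ? T : F; }

// A < B ? T : F  ==>  GT with the compare operands swapped.
static float lowerLT(float A, float B, float T, float F) { return vselGT(B, A, T, F); }
// A != B ? T : F ==>  EQ with the select operands swapped.
static float lowerNE(float A, float B, float T, float F) { return vselEQ(A, B, F, T); }

int main() {
  std::printf("%g %g\n", lowerLT(1, 2, 10, 20), lowerLT(2, 1, 10, 20)); // 10 20
  std::printf("%g %g\n", lowerNE(1, 2, 10, 20), lowerNE(2, 2, 10, 20)); // 10 20
}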
+ if (Subtarget->hasFPARMv8Base() && + !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && + (TrueVal.getValueType() == MVT::f16 || + TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); @@ -4708,9 +4901,9 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(4); SDLoc dl(Op); - if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { - DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, - dl); + if (isUnsupportedFloatingType(LHS.getValueType())) { + DAG.getTargetLoweringInfo().softenSetCCOperands( + DAG, LHS.getValueType(), LHS, RHS, CC, dl); // If softenSetCCOperands only returned one value, we should compare it to // zero. @@ -4855,7 +5048,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { + if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), @@ -4919,7 +5112,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { + if (isUnsupportedFloatingType(VT)) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), @@ -4952,7 +5145,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; if (VT == MVT::f64) - Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, + Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), DAG.getConstant(32, dl, MVT::i32)); else /*if (VT == MVT::f32)*/ @@ -4960,11 +5153,11 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { if (SrcVT == MVT::f32) { Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); if (VT == MVT::f64) - Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, + Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), DAG.getConstant(32, dl, MVT::i32)); } else if (VT == MVT::f32) - Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, + Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), DAG.getConstant(32, dl, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); @@ -5469,40 +5662,100 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, return Res; } +/// getVShiftImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts.
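// Minimal array-based analogue (no SelectionDAG) of the helper that begins
// above and continues below: a vector shift amount folds to an immediate form
// only when every lane holds the same constant, and a left-shift immediate
// must satisfy 0 <= Cnt < ElementBits.
#include <array>
#include <cstdint>
#include <cstdio>

static bool getSplat(const std::array<int64_t, 4> &Lanes, int64_t &Cnt) {
  for (int64_t L : Lanes)
    if (L != Lanes[0])
      return false; // not a splat: must stay a register (vector) shift
  Cnt = Lanes[0];
  return true;
}

static bool isShiftLImm(const std::array<int64_t, 4> &Lanes, unsigned EltBits,
                        int64_t &Cnt) {
  return getSplat(Lanes, Cnt) && Cnt >= 0 && Cnt < int64_t(EltBits);
}

int main() {
  int64_t Cnt = 0;
  std::array<int64_t, 4> Splat = {3, 3, 3, 3};
  std::array<int64_t, 4> Mixed = {3, 2, 3, 3};
  bool S = isShiftLImm(Splat, 32, Cnt);
  std::printf("splat: %d (Cnt=%d)\n", (int)S, (int)Cnt);
  bool M = isShiftLImm(Mixed, 32, Cnt);
  std::printf("mixed: %d\n", (int)M);
}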
+ while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || + !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, + ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + int64_t ElementBits = VT.getScalarSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + int64_t ElementBits = VT.getScalarSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (!isIntrinsic) + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); + if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { + Cnt = -Cnt; + return true; + } + return false; +} + static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc dl(N); + int64_t Cnt; if (!VT.isVector()) return SDValue(); - // Lower vector shifts on NEON to use VSHL. - assert(ST->hasNEON() && "unexpected vector shift"); + // We essentially have two forms here. Shift by an immediate and shift by a + // vector register (there are also shifts by a gpr, but that is just handled + // with a tablegen pattern). We cannot easily match shift by an immediate in + // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. + // For shifting by a vector, we don't have VSHR, only VSHL (which can be + // signed or unsigned, and a negative shift indicates a shift right). + if (N->getOpcode() == ISD::SHL) { + if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) + return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); + return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), + N->getOperand(1)); + } - // Left shifts translate directly to the vshiftu intrinsic. - if (N->getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, - MVT::i32), - N->getOperand(0), N->getOperand(1)); + assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && + "unexpected vector shift opcode"); - assert((N->getOpcode() == ISD::SRA || - N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); + if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { + unsigned VShiftOpc = + (N->getOpcode() == ISD::SRA ?
ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); + return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); + } - // NEON uses the same intrinsics for both left and right shifts. For - // right shifts, the shift amounts are negative, so negate the vector of - // shift amounts. + // Other right shifts we don't have operations for (we use a shift left by a + // negative number). EVT ShiftVT = N->getOperand(1).getValueType(); - SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, - getZeroVector(ShiftVT, DAG, dl), - N->getOperand(1)); - Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? - Intrinsic::arm_neon_vshifts : - Intrinsic::arm_neon_vshiftu); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(vshiftInt, dl, MVT::i32), - N->getOperand(0), NegatedCount); + SDValue NegatedCount = DAG.getNode( + ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); + unsigned VShiftOpc = + (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu); + return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); } static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, @@ -5514,15 +5767,59 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, if (VT != MVT::i64) return SDValue(); - assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && + assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SHL) && "Unknown shift to lower!"); + unsigned ShOpc = N->getOpcode(); + if (ST->hasMVEIntegerOps()) { + SDValue ShAmt = N->getOperand(1); + unsigned ShPartsOpc = ARMISD::LSLL; + ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); + + // If the shift amount is greater than 32 then do the default optimisation. + if (Con && Con->getZExtValue() > 32) + return SDValue(); + + // Extract the lower 32 bits of the shift amount if it's an i64 + if (ShAmt->getValueType(0) == MVT::i64) + ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt, + DAG.getConstant(0, dl, MVT::i32)); + + if (ShOpc == ISD::SRL) { + if (!Con) + // There is no t2LSRLr instruction so negate and perform an lsll if the + // shift amount is in a register, emulating a right shift. + ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(0, dl, MVT::i32), ShAmt); + else + // Else generate an lsrl on the immediate shift amount + ShPartsOpc = ARMISD::LSRL; + } else if (ShOpc == ISD::SRA) + ShPartsOpc = ARMISD::ASRL; + + // Lower 32 bits of the destination/source + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(0, dl, MVT::i32)); + // Upper 32 bits of the destination/source + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(1, dl, MVT::i32)); + + // Generate the shift operation as computed above + Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, + ShAmt); + // The upper 32 bits come from the second return value of lsll + Hi = SDValue(Lo.getNode(), 1); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); + } + // We only lower SRA, SRL of 1 here, all others use generic lowering. - if (!isOneConstant(N->getOperand(1))) + if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) return SDValue(); // If we are in thumb mode, we don't have RRX. - if (ST->isThumb1Only()) return SDValue(); + if (ST->isThumb1Only()) + return SDValue(); // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
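// Standalone model (not the MVE instruction itself, just the behaviour the
// lowering above relies on) of LSLL over a 64-bit value split into two
// 32-bit halves: a positive amount shifts left, and because there is no
// LSRL-by-register form, a right shift by a register amount is emulated by
// negating the amount so the shift runs the other way.
#include <cstdint>
#include <cstdio>

static uint64_t lsll(uint32_t Lo, uint32_t Hi, int32_t Amt) {
  uint64_t V = (uint64_t(Hi) << 32) | Lo;
  if (Amt >= 0)
    return Amt < 64 ? V << Amt : 0; // left shift
  int32_t R = -Amt;                 // negative amount: logical right shift
  return R < 64 ? V >> R : 0;
}

int main() {
  uint64_t V = 0xf000000000000001ULL;
  uint64_t R = lsll(uint32_t(V), uint32_t(V >> 32), -4); // emulated lsr #4
  std::printf("%llx == %llx\n", (unsigned long long)R,
              (unsigned long long)(V >> 4));
}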
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), @@ -5731,7 +6028,7 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { } /// isNEONModifiedImm - Check if the specified splat value corresponds to a -/// valid vector constant for a NEON instruction with a "modified immediate" +/// valid vector constant for a NEON or MVE instruction with a "modified immediate" /// operand (e.g., VMOV). If so, return the encoded value. static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, @@ -5817,6 +6114,10 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, break; } + // cmode == 0b1101 is not supported for MVE VMVN + if (type == MVEVMVNModImm) + return SDValue(); + if ((SplatBits & ~0xffffff) == 0 && ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { // Value = 0x00nnffff: Op=x, Cmode=1101. @@ -5902,12 +6203,12 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, } } - if (!ST->hasVFP3()) + if (!ST->hasVFP3Base()) return SDValue(); // Use the default (constant pool) lowering for double constants when we have // an SP-only FPU - if (IsDouble && Subtarget->isFPOnlySP()) + if (IsDouble && !Subtarget->hasFP64()) return SDValue(); // Try splatting with a VMOV.f32... @@ -6383,13 +6684,15 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (SplatUndef.isAllOnesValue()) return DAG.getUNDEF(VT); - if (SplatBitSize <= 64) { + if ((ST->hasNEON() && SplatBitSize <= 64) || + (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { // Check if an immediate VMOV works. EVT VmovVT; SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), VMOVModImm); + if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -6397,10 +6700,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); - Val = isNEONModifiedImm(NegatedImm, - SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VmovVT, VT.is128BitVector(), - VMVNModImm); + Val = isNEONModifiedImm( + NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VmovVT, VT.is128BitVector(), + ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -6515,10 +6818,13 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; + MVT FVT = VT.getVectorElementType().getSimpleVT(); + assert(FVT == MVT::f32 || FVT == MVT::f16); + MVT IVT = (FVT == MVT::f32) ? 
MVT::i32 : MVT::i16; for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, Op.getOperand(i))); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) @@ -6544,7 +6850,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return shuffle; } - if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { + if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { // If we haven't found an efficient lowering, try splitting a 128-bit vector // into two 64-bit vectors; we might discover a better way to lower it. SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElts); @@ -6799,6 +7105,38 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } +enum ShuffleOpCodes { + OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result +}; + +static bool isLegalMVEShuffleOp(unsigned PFEntry) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + switch (OpNum) { + case OP_COPY: + case OP_VREV: + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: + return true; + } + return false; +} + /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values @@ -6820,7 +7158,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); - if (Cost <= 4) + if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) return true; } @@ -6828,15 +7166,22 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { unsigned Imm, WhichResult; unsigned EltSize = VT.getScalarSizeInBits(); - return (EltSize >= 32 || - ShuffleVectorSDNode::isSplatMask(&M[0], VT) || - isVREVMask(M, VT, 64) || - isVREVMask(M, VT, 32) || - isVREVMask(M, VT, 16) || - isVEXTMask(M, VT, ReverseVEXT, Imm) || - isVTBLMask(M, VT) || - isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || - ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); + if (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isVREVMask(M, VT, 64) || + isVREVMask(M, VT, 32) || + isVREVMask(M, VT, 16)) + return true; + else if (Subtarget->hasNEON() && + (isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTBLMask(M, VT) || + isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) + return true; + else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && + isReverseMask(M, VT)) + return true; + else + return false; } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit @@ -6848,24 +7193,6 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); - enum { - OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> - OP_VREV, - OP_VDUP0, - OP_VDUP1, - OP_VDUP2, - OP_VDUP3, - OP_VEXT1, - OP_VEXT2, - OP_VEXT3, - OP_VUZPL, // VUZP, left result - OP_VUZPR, // VUZP, right result - OP_VZIPL, // VZIP, left result - OP_VZIPR, // VZIP, right result - OP_VTRNL, // VTRN, left result - OP_VTRNR // VTRN, right result - }; - if (OpNum == OP_COPY) { if (LHSID == (1*9+2)*9+3) return LHS; assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); @@ -6955,7 +7282,8 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, DAG.getConstant(ExtractNum, DL, MVT::i32)); } -static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc dl(Op); @@ -6999,9 +7327,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(Lane, dl, MVT::i32)); } - bool ReverseVEXT; - unsigned Imm; - if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + bool ReverseVEXT = false; + unsigned Imm = 0; + if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { if (ReverseVEXT) std::swap(V1, V2); return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, @@ -7015,7 +7343,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (isVREVMask(ShuffleMask, VT, 16)) return DAG.getNode(ARMISD::VREV16, dl, VT, V1); - if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { + if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } @@ -7025,14 +7353,16 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // source operands and with masks corresponding to both results of one of // these operations, DAG 
memoization will ensure that a single node is used for both shuffles. - unsigned WhichResult; - bool isV_UNDEF; - if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( - ShuffleMask, VT, WhichResult, isV_UNDEF)) { - if (isV_UNDEF) - V2 = V1; - return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) - .getValue(WhichResult); + unsigned WhichResult = 0; + bool isV_UNDEF = false; + if (ST->hasNEON()) { + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, VT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + V2 = V1; + return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) + .getValue(WhichResult); + } } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize @@ -7050,7 +7380,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // -> // concat(VZIP(v1, v2):0, :1) // - if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { + if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { SDValue SubV1 = V1->getOperand(0); SDValue SubV2 = V1->getOperand(1); EVT SubVT = SubV1.getValueType(); @@ -7092,8 +7422,18 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + if (Cost <= 4) { + if (ST->hasNEON()) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + else if (isLegalMVEShuffleOp(PFEntry)) { + unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); + unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; + unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; + if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + } } // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. @@ -7118,22 +7458,50 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) + if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); - if (VT == MVT::v8i8) + if (ST->hasNEON() && VT == MVT::v8i8) if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) return NewOp; return SDValue(); } -static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering:: +LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // INSERT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(2); if (!isa<ConstantSDNode>(Lane)) return SDValue(); + SDValue Elt = Op.getOperand(1); + EVT EltVT = Elt.getValueType(); + if (getTypeAction(*DAG.getContext(), EltVT) == + TargetLowering::TypePromoteFloat) { + // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, + // but the type system will try to do that if we don't intervene. + // Reinterpret any such vector-element insertion as one with the + // corresponding integer types.
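// Plain-C++ sketch (no LLVM types) of the reinterpretation described above:
// an f16 lane is inserted without ever being promoted to f32 by moving its
// raw 16 bits through the matching integer vector type instead.
#include <cstdint>
#include <cstdio>

int main() {
  uint16_t IVec[8] = {0};    // v8f16 viewed as v8i16 (the IVecVT of the code)
  uint16_t EltBits = 0x3c00; // IEEE-754 half-precision bit pattern of 1.0
  unsigned Lane = 3;
  IVec[Lane] = EltBits;      // integer INSERT_VECTOR_ELT: no f16 -> f32 hop
  std::printf("lane %u holds 0x%04x\n", Lane, (unsigned)IVec[Lane]);
}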
+ + SDLoc dl(Op); + + EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits()); + assert(getTypeAction(*DAG.getContext(), IEltVT) != + TargetLowering::TypePromoteFloat); + + SDValue VecIn = Op.getOperand(0); + EVT VecVT = VecIn.getValueType(); + EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT, + VecVT.getVectorNumElements()); + + SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt); + SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn); + SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, + IVecIn, IElt, Lane); + return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut); + } + return Op; } @@ -7809,8 +8177,7 @@ ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, return SDValue(); const auto &ST = static_cast<const ARMSubtarget &>(DAG.getSubtarget()); - const auto &MF = DAG.getMachineFunction(); - const bool MinSize = MF.getFunction().optForMinSize(); + const bool MinSize = ST.hasMinSize(); const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() : ST.hasDivideInARMMode(); @@ -8063,7 +8430,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); - case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); @@ -8149,6 +8516,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, break; case ISD::SRL: case ISD::SRA: + case ISD::SHL: Res = Expand64BitShift(N, DAG, Subtarget); break; case ISD::SREM: @@ -8175,6 +8543,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::INTRINSIC_WO_CHAIN: return ReplaceLongIntrinsic(N, Results, DAG); + case ISD::ABS: + lowerABS(N, Results, DAG); + return; + + } if (Res.getNode()) Results.push_back(Res); @@ -8980,7 +9352,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // Load an immediate to varEnd.
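// Standalone sketch of the movw/movt materialization used by the hunk that
// follows (and of why it can skip one instruction when the high half of
// LoopSize is zero): MOVW sets the low 16 bits and clears the rest, MOVT
// overwrites only the high 16 bits.
#include <cstdint>
#include <cstdio>

static uint32_t movw(uint16_t Imm16) { return Imm16; }
static uint32_t movt(uint32_t Rd, uint16_t Imm16) {
  return (Rd & 0xFFFFu) | (uint32_t(Imm16) << 16); // keep the low half
}

int main() {
  uint32_t LoopSize = 0x00012345;
  uint32_t Rd = movw(uint16_t(LoopSize));
  if (LoopSize & 0xFFFF0000u) // mirrors the (LoopSize & 0xFFFF0000) != 0 test
    Rd = movt(Rd, uint16_t(LoopSize >> 16));
  std::printf("0x%08x\n", (unsigned)Rd); // 0x00012345
}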
unsigned varEnd = MRI.createVirtualRegister(TRC); - if (Subtarget->useMovt(*MF)) { + if (Subtarget->useMovt()) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) Vtmp = MRI.createVirtualRegister(TRC); @@ -9003,18 +9375,23 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + MachineMemOperand *CPMMO = + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); if (IsThumb) BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) .addReg(varEnd, RegState::Define) .addConstantPoolIndex(Idx) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .addMemOperand(CPMMO); else BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) .addReg(varEnd, RegState::Define) .addConstantPoolIndex(Idx) .addImm(0) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .addMemOperand(CPMMO); } BB->addSuccessor(loopMBB); @@ -9262,7 +9639,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .add(MI.getOperand(2)) // Rn .add(MI.getOperand(3)) // PredImm .add(MI.getOperand(4)) // PredReg - .add(MI.getOperand(0)); // Rt + .add(MI.getOperand(0)) // Rt + .cloneMemRefs(MI); MI.eraseFromParent(); return BB; } @@ -10372,6 +10750,22 @@ static SDValue PerformAddeSubeCombine(SDNode *N, return SDValue(); } +static SDValue PerformABSCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SDValue res; + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) + return SDValue(); + + if (!TLI.expandABS(N, res, DAG)) + return SDValue(); + + return res; +} + /// PerformADDECombine - Target-specific dag combine transform from /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL @@ -10419,11 +10813,28 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, if (Level == BeforeLegalizeTypes) return true; - if (Subtarget->isThumb() && Subtarget->isThumb1Only()) + if (N->getOpcode() != ISD::SHL) return true; - if (N->getOpcode() != ISD::SHL) + if (Subtarget->isThumb1Only()) { + // Avoid making expensive immediates by commuting shifts. (This logic + // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted + // for free.) + if (N->getOpcode() != ISD::SHL) + return true; + SDValue N1 = N->getOperand(0); + if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND && + N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR) + return true; + if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) { + if (Const->getAPIntValue().ult(256)) + return false; + if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) && + Const->getAPIntValue().sgt(-256)) + return false; + } return true; + } // Turn off commute-with-shift transform after legalization, so it doesn't // conflict with PerformSHLSimplify.
(We could try to detect when @@ -10432,9 +10843,8 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, return false; } -bool -ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N, - CombineLevel Level) const { +bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( + const SDNode *N, CombineLevel Level) const { if (!Subtarget->isThumb1Only()) return true; @@ -10444,6 +10854,15 @@ ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N, return false; } +bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { + if (!Subtarget->hasNEON()) { + if (Subtarget->isThumb1Only()) + return VT.getScalarSizeInBits() <= 32; + return true; + } + return VT.isScalarInteger(); +} + static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { @@ -10830,7 +11249,7 @@ static SDValue PerformANDCombine(SDNode *N, APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && + if (BVN && Subtarget->hasNEON() && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VbicVT; @@ -11308,7 +11727,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, const ARMSubtarget *Subtarget) { // vmovrrd(vmovdrr x, y) -> x,y SDValue InDouble = N->getOperand(0); - if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) + if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64()) return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); // vmovrrd(load f64) -> (load i32), (load i32) @@ -11329,9 +11748,11 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); - SDValue NewLD2 = DAG.getLoad( - MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), - std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags()); + + SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, + LD->getPointerInfo().getWithOffset(4), + std::min(4U, LD->getAlignment()), + LD->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); if (DCI.DAG.getDataLayout().isBigEndian()) @@ -11922,10 +12343,14 @@ static SDValue PerformVDUPLANECombine(SDNode *N, /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. static SDValue PerformVDUPCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; SDValue Op = N->getOperand(0); + if (!Subtarget->hasNEON()) + return SDValue(); + // Match VDUP(LOAD) -> VLD1DUP. // We match this pattern here rather than waiting for isel because the // transform is only legal for unindexed loads. @@ -12132,11 +12557,11 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { + if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would - // be lossy. We also can't handle more then 4 lanes, since these intructions - // only support v2i32/v4i32 types. + // be lossy. 
We also can't handle anything other than 2 or 4 lanes, since + these instructions only support v2i32/v4i32 types. return SDValue(); } @@ -12190,11 +12615,11 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { + if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { // These instructions only exist converting from i32 to f32. We can handle // smaller integers by generating an extra extend, but larger ones would - // be lossy. We also can't handle more then 4 lanes, since these intructions - // only support v2i32/v4i32 types. + // be lossy. We also can't handle anything other than 2 or 4 lanes, since + // these instructions only support v2i32/v4i32 types. return SDValue(); } @@ -12220,58 +12645,6 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, ConvInput, DAG.getConstant(C, dl, MVT::i32)); } -/// getVShiftImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift operation, where all the elements of the -/// build_vector must have the same constant integer value. -static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { - // Ignore bit_converts. - while (Op.getOpcode() == ISD::BITCAST) - Op = Op.getOperand(0); - BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ElementBits) || - SplatBitSize > ElementBits) - return false; - Cnt = SplatBits.getSExtValue(); - return true; -} - -/// isVShiftLImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift left operation. That value must be in the range: -/// 0 <= Value < ElementBits for a left shift; or -/// 0 <= Value <= ElementBits for a long left shift. -static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - int64_t ElementBits = VT.getScalarSizeInBits(); - if (! getVShiftImm(Op, ElementBits, Cnt)) - return false; - return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); } - -/// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - int64_t ElementBits = VT.getScalarSizeInBits(); - if (! getVShiftImm(Op, ElementBits, Cnt)) - return false; - if (!isIntrinsic) - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); - if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { - Cnt = -Cnt; - return true; - } - return false; -} - /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); @@ -12307,12 +12680,12 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { - VShiftOpc = ARMISD::VSHL; + VShiftOpc = ARMISD::VSHLIMM; break; } if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { - VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? - ARMISD::VSHRs : ARMISD::VSHRu); + VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM + : ARMISD::VSHRuIMM); break; } return SDValue(); @@ -12357,29 +12730,41 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Opcode already set above. break; case Intrinsic::arm_neon_vrshifts: - VShiftOpc = ARMISD::VRSHRs; break; + VShiftOpc = ARMISD::VRSHRsIMM; + break; case Intrinsic::arm_neon_vrshiftu: - VShiftOpc = ARMISD::VRSHRu; break; + VShiftOpc = ARMISD::VRSHRuIMM; + break; case Intrinsic::arm_neon_vrshiftn: - VShiftOpc = ARMISD::VRSHRN; break; + VShiftOpc = ARMISD::VRSHRNIMM; + break; case Intrinsic::arm_neon_vqshifts: - VShiftOpc = ARMISD::VQSHLs; break; + VShiftOpc = ARMISD::VQSHLsIMM; + break; case Intrinsic::arm_neon_vqshiftu: - VShiftOpc = ARMISD::VQSHLu; break; + VShiftOpc = ARMISD::VQSHLuIMM; + break; case Intrinsic::arm_neon_vqshiftsu: - VShiftOpc = ARMISD::VQSHLsu; break; + VShiftOpc = ARMISD::VQSHLsuIMM; + break; case Intrinsic::arm_neon_vqshiftns: - VShiftOpc = ARMISD::VQSHRNs; break; + VShiftOpc = ARMISD::VQSHRNsIMM; + break; case Intrinsic::arm_neon_vqshiftnu: - VShiftOpc = ARMISD::VQSHRNu; break; + VShiftOpc = ARMISD::VQSHRNuIMM; + break; case Intrinsic::arm_neon_vqshiftnsu: - VShiftOpc = ARMISD::VQSHRNsu; break; + VShiftOpc = ARMISD::VQSHRNsuIMM; + break; case Intrinsic::arm_neon_vqrshiftns: - VShiftOpc = ARMISD::VQRSHRNs; break; + VShiftOpc = ARMISD::VQRSHRNsIMM; + break; case Intrinsic::arm_neon_vqrshiftnu: - VShiftOpc = ARMISD::VQRSHRNu; break; + VShiftOpc = ARMISD::VQRSHRNuIMM; + break; case Intrinsic::arm_neon_vqrshiftnsu: - VShiftOpc = ARMISD::VQRSHRNsu; break; + VShiftOpc = ARMISD::VQRSHRNsuIMM; + break; } SDLoc dl(N); @@ -12393,9 +12778,9 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { unsigned VShiftOpc = 0; if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) - VShiftOpc = ARMISD::VSLI; + VShiftOpc = ARMISD::VSLIIMM; else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) - VShiftOpc = ARMISD::VSRI; + VShiftOpc = ARMISD::VSRIIMM; else { llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); } @@ -12420,8 +12805,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { /// combining instead of DAG legalizing because the build_vectors for 64-bit /// vector element shift counts are generally not legal, and it is hard to see /// their values after they get legalized to loads from a constant pool. 
-static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, +static SDValue PerformShiftCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { + SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high @@ -12436,12 +12823,47 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, } } + if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && + N->getOperand(0)->getOpcode() == ISD::AND && + N->getOperand(0)->hasOneUse()) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't + // usually show up because instcombine prefers to canonicalize it to + // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come + // out of GEP lowering in some cases. + SDValue N0 = N->getOperand(0); + ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!ShiftAmtNode) + return SDValue(); + uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); + ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!AndMaskNode) + return SDValue(); + uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); + // Don't transform uxtb/uxth. + if (AndMask == 255 || AndMask == 65535) + return SDValue(); + if (isMask_32(AndMask)) { + uint32_t MaskedBits = countLeadingZeros(AndMask); + if (MaskedBits > ShiftAmt) { + SDLoc DL(N); + SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), + DAG.getConstant(MaskedBits, DL, MVT::i32)); + return DAG.getNode( + ISD::SRL, DL, MVT::i32, SHL, + DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); + } + } + } + // Nothing to be done for scalar shifts. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); + if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) + return SDValue(); - assert(ST->hasNEON() && "unexpected vector shift"); int64_t Cnt; switch (N->getOpcode()) { @@ -12450,7 +12872,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, case ISD::SHL: if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { SDLoc dl(N); - return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), + return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); } break; @@ -12458,8 +12880,8 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, case ISD::SRA: case ISD::SRL: if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { - unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? - ARMISD::VSHRs : ARMISD::VSHRu); + unsigned VShiftOpc = + (N->getOpcode() == ISD::SRA ?
ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); SDLoc dl(N); return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); @@ -12606,6 +13028,45 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D return V; } +static SDValue PerformHWLoopCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *ST) { + // Look for (brcond (xor test.set.loop.iterations, -1)) + SDValue CC = N->getOperand(1); + unsigned Opc = CC->getOpcode(); + SDValue Int; + + if ((Opc == ISD::XOR || Opc == ISD::SETCC) && + (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) { + + assert((isa<ConstantSDNode>(CC->getOperand(1)) && + cast<ConstantSDNode>(CC->getOperand(1))->isOne()) && + "Expected to compare against 1"); + + Int = CC->getOperand(0); + } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN) + Int = CC; + else + return SDValue(); + + unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue(); + if (IntOp != Intrinsic::test_set_loop_iterations) + return SDValue(); + + SDLoc dl(Int); + SDValue Chain = N->getOperand(0); + SDValue Elements = Int.getOperand(2); + SDValue ExitBlock = N->getOperand(2); + + // TODO: Once we start supporting tail predication, we can add another + // operand to WLS for the number of elements processed in a vector loop. + + SDValue Ops[] = { Chain, Elements, ExitBlock }; + SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); + return Res; +} + /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. SDValue ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { @@ -12779,15 +13240,21 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { // On Thumb1, the DAG above may be further combined if z is a power of 2 // (z == 2 ^ K). // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> - // merge t3, t4 - // where t1 = (SUBCARRY (SUB x, y), z, 0) - // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) - // t3 = if K != 0 then (SHL t2:0, K) else t2:0 - // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ] + // t1 = (USUBO (SUB x, y), 1) + // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) + // Result = if K != 0 then (SHL t2:0, K) else t2:0 + // + // This also handles the special case of comparing against zero; it's + // essentially the same pattern, except there's no SUBS: + // CMOV x, z, !=, (CMPZ x, 0) -> + // t1 = (USUBO x, 1) + // t2 = (SUBCARRY x, t1:0, t1:1) + // Result = if K != 0 then (SHL t2:0, K) else t2:0 const APInt *TrueConst; if (Subtarget->isThumb1Only() && CC == ARMCC::NE && - (FalseVal.getOpcode() == ARMISD::SUBS) && - (FalseVal.getOperand(0) == LHS) && (FalseVal.getOperand(1) == RHS) && + ((FalseVal.getOpcode() == ARMISD::SUBS && + FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || + (FalseVal == LHS && isNullConstant(RHS))) && (TrueConst = isPowerOf2Constant(TrueVal))) { SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned ShiftAmount = TrueConst->logBase2(); @@ -12795,10 +13262,6 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { TrueVal = DAG.getConstant(1, dl, VT); SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); - // Make it a carry, not a borrow.
- SDValue Carry = DAG.getNode( - ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1)); - Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry); if (ShiftAmount) Res = DAG.getNode(ISD::SHL, dl, VT, Res, @@ -12826,6 +13289,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; + case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); @@ -12834,6 +13298,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); + case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); @@ -12845,7 +13310,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); - case ARMISD::VDUP: return PerformVDUPCombine(N, DCI); + case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); @@ -12854,7 +13319,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: - case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); + case ISD::SRL: + return PerformShiftCombine(N, DCI, Subtarget); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); @@ -12957,9 +13423,9 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); } -bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned, - unsigned, +bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, + unsigned Alignment, + MachineMemOperand::Flags, bool *Fast) const { // Depends what it gets converted into if the type is weird. if (!VT.isSimple()) @@ -12967,23 +13433,18 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); + auto Ty = VT.getSimpleVT().SimpleTy; - switch (VT.getSimpleVT().SimpleTy) { - default: - return false; - case MVT::i8: - case MVT::i16: - case MVT::i32: { + if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) { // Unaligned access can use (for example) LDRB, LDRH, LDR if (AllowsUnaligned) { if (Fast) *Fast = Subtarget->hasV7Ops(); return true; } - return false; } - case MVT::f64: - case MVT::v2f64: { + + if (Ty == MVT::f64 || Ty == MVT::v2f64) { // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
// A big-endian target may also explicitly support unaligned accesses @@ -12992,9 +13453,54 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, *Fast = true; return true; } - return false; } + + if (!Subtarget->hasMVEIntegerOps()) + return false; + if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 && + Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 && + Ty != MVT::v2f64 && + // These are for truncated stores + Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16) + return false; + + if (Subtarget->isLittle()) { + // In little-endian MVE, the store instructions VSTRB.U8, + // VSTRH.U16 and VSTRW.U32 all store the vector register in + // exactly the same format, and differ only in the range of + // their immediate offset field and the required alignment. + // + // In particular, VSTRB.U8 can store a vector at byte alignment. + // So at this stage we can simply say that loads/stores of all + // 128-bit wide vector types are permitted at any alignment, + // because we know at least _one_ instruction can manage that. + // + // Later on we might find that some of those loads are better + // generated as VLDRW.U32 if alignment permits, to take + // advantage of the larger immediate range. But for the moment, + // all that matters is that if we don't lower the load then + // _some_ instruction can handle it. + if (Fast) + *Fast = true; + return true; + } else { + // In big-endian MVE, those instructions aren't so similar + // after all, because they reorder the bytes of the vector + // differently. So this time we can only store a particular + // kind of vector if its alignment is at least the element + // type. And we can't store vectors of i64 or f64 at all + // without having to do some postprocessing, because there's + // no VSTRD.U64. + if (Ty == MVT::v16i8 || + ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) || + ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) { + if (Fast) + *Fast = true; + return true; + } } + + return false; } static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, @@ -13003,24 +13509,24 @@ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, (DstAlign == 0 || DstAlign % AlignCheck == 0)); } -EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, - unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { - const Function &F = MF.getFunction(); - +EVT ARMTargetLowering::getOptimalMemOpType( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { // See if we can use NEON instructions for this... 
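// The NEON type choice below, restated as a standalone sketch (hypothetical
// enum and helper; it also collapses the separate source/destination
// alignment checks into a single Align parameter):
#include <cstdint>

enum class MemOpTy { V2F64, F64, Other };

// Prefer a 128-bit vector type for copies of 16+ bytes and a 64-bit type for
// 8+ bytes, provided the alignment suffices or misaligned accesses are fast.
static MemOpTy pickMemOpTy(uint64_t Size, unsigned Align, bool MisalignedFast) {
  if (Size >= 16 && (Align >= 16 || MisalignedFast))
    return MemOpTy::V2F64;
  if (Size >= 8 && (Align >= 8 || MisalignedFast))
    return MemOpTy::F64;
  return MemOpTy::Other;
}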
if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && - !F.hasFnAttribute(Attribute::NoImplicitFloat)) { + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { bool Fast; if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) || - (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) { + (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, + MachineMemOperand::MONone, &Fast) && + Fast))) { return MVT::v2f64; } else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) || - (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) && + (allowsMisalignedMemoryAccesses( + MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { return MVT::f64; } @@ -13089,6 +13595,46 @@ bool ARMTargetLowering::isFNegFree(EVT VT) const { return false; } +/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth +/// of the vector elements. +static bool areExtractExts(Value *Ext1, Value *Ext2) { + auto areExtDoubled = [](Instruction *Ext) { + return Ext->getType()->getScalarSizeInBits() == + 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); + }; + + if (!match(Ext1, m_ZExtOrSExt(m_Value())) || + !match(Ext2, m_ZExtOrSExt(m_Value())) || + !areExtDoubled(cast<Instruction>(Ext1)) || + !areExtDoubled(cast<Instruction>(Ext2))) + return false; + + return true; +} + +/// Check if sinking \p I's operands to I's basic block is profitable, because +/// the operands can be folded into a target instruction, e.g. +/// sext/zext can be folded into vsubl. +bool ARMTargetLowering::shouldSinkOperands(Instruction *I, +SmallVectorImpl<Use *> &Ops) const { + if (!Subtarget->hasNEON() || !I->getType()->isVectorTy()) + return false; + + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + return true; + } + default: + return false; + } + return false; +} + bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT VT = ExtVal.getValueType(); @@ -13105,7 +13651,7 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { SDNode *U = *ExtVal->use_begin(); if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || - U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) + U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM)) return false; return true; @@ -13142,7 +13688,6 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { unsigned Scale = 1; switch (VT.getSimpleVT().SimpleTy) { - default: return false; case MVT::i1: case MVT::i8: // Scale == 1; break; case MVT::i16: // Scale == 2; Scale = 2; break; - case MVT::i32: + default: + // On thumb1 we load most things (i32, i64, floats, etc) with a LDR // Scale == 4; Scale = 4; break; } if ((V & (Scale - 1)) != 0) return false; - V /= Scale; - return V == (V & ((1LL << 5) - 1)); + return isUInt<5>(V / Scale); } static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget) { - bool isNeg = false; + if (!VT.isInteger() && !VT.isFloatingPoint()) + return false; + if (VT.isVector() && Subtarget->hasNEON()) + return false; + if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() && + !Subtarget->hasMVEFloatOps()) + return false; + + bool IsNeg = false; if (V < 0) { - isNeg = true; - V = - V; + IsNeg = true; + V = -V; } - switch
(VT.getSimpleVT().SimpleTy) { - default: return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - case MVT::i32: - // + imm12 or - imm8 - if (isNeg) - return V == (V & ((1LL << 8) - 1)); - return V == (V & ((1LL << 12) - 1)); - case MVT::f32: - case MVT::f64: - // Same as ARM mode. FIXME: NEON? - if (!Subtarget->hasVFP2()) - return false; - if ((V & 3) != 0) + unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U); + + // MVE: size * imm7 + if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { + switch (VT.getSimpleVT().getVectorElementType().SimpleTy) { + case MVT::i32: + case MVT::f32: + return isShiftedUInt<7,2>(V); + case MVT::i16: + case MVT::f16: + return isShiftedUInt<7,1>(V); + case MVT::i8: + return isUInt<7>(V); + default: return false; - V >>= 2; - return V == (V & ((1LL << 8) - 1)); + } } + + // half VLDR: 2 * imm8 + if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16()) + return isShiftedUInt<8, 1>(V); + // VLDR and LDRD: 4 * imm8 + if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8) + return isShiftedUInt<8, 2>(V); + + if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) { + // + imm12 or - imm8 + if (IsNeg) + return isUInt<8>(V); + return isUInt<12>(V); + } + + return false; } /// isLegalAddressImmediate - Return true if the integer value can be used @@ -13218,18 +13784,15 @@ static bool isLegalAddressImmediate(int64_t V, EVT VT, case MVT::i8: case MVT::i32: // +- imm12 - return V == (V & ((1LL << 12) - 1)); + return isUInt<12>(V); case MVT::i16: // +- imm8 - return V == (V & ((1LL << 8) - 1)); + return isUInt<8>(V); case MVT::f32: case MVT::f64: - if (!Subtarget->hasVFP2()) // FIXME: NEON? + if (!Subtarget->hasVFP2Base()) // FIXME: NEON? return false; - if ((V & 3) != 0) - return false; - V >>= 2; - return V == (V & ((1LL << 8) - 1)); + return isShiftedUInt<8, 2>(V); } } @@ -13649,13 +14212,13 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, EVT VT = Op.getValueType(); const unsigned DstSz = VT.getScalarSizeInBits(); const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); + (void)SrcSz; assert(SrcSz == Known.getBitWidth()); assert(DstSz > SrcSz); if (Op.getOpcode() == ARMISD::VGETLANEs) Known = Known.sext(DstSz); else { - Known = Known.zext(DstSz); - Known.Zero.setBitsFrom(SrcSz); + Known = Known.zext(DstSz, true /* extended bits are known zero */); } assert(DstSz == Known.getBitWidth()); break; @@ -13790,7 +14353,7 @@ const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { // Although we are correct (we are free to emit anything, without // constraints), we might break use cases that would expect us to be more // efficient and emit something else. - if (!Subtarget->hasVFP2()) + if (!Subtarget->hasVFP2Base()) return "r"; if (ConstraintVT.isFloatingPoint()) return "w"; @@ -13822,6 +14385,7 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const { } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; + case 'T': return C_RegisterClass; // All 'U+' constraints are addresses. case 'U': return C_Memory; } @@ -13867,7 +14431,8 @@ using RCPair = std::pair; RCPair ARMTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint.size() == 1) { + switch (Constraint.size()) { + case 1: // GCC ARM Constraint Letters switch (Constraint[0]) { case 'l': // Low regs or general regs. 
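      // Usage note (illustrative, not from the patch): in GNU inline assembly,
      // "l" restricts an operand to the Thumb low registers r0-r7, e.g.
      //   int A = 1, B = 2, Sum;
      //   asm("adds %0, %1, %2" : "=l"(Sum) : "l"(A), "l"(B));
      // The two-letter "Te"/"To" constraints handled in the next hunk select
      // the even and odd Thumb GPR halves, respectively.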
@@ -13913,7 +14478,25 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( return RCPair(0U, &ARM::QPR_VFP2RegClass); break; } + break; + + case 2: + if (Constraint[0] == 'T') { + switch (Constraint[1]) { + default: + break; + case 'e': + return RCPair(0U, &ARM::tGPREvenRegClass); + case 'o': + return RCPair(0U, &ARM::tGPROddRegClass); + } + } + break; + + default: + break; } + if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); @@ -14272,28 +14855,107 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const } SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && + SDValue SrcVal = Op.getOperand(0); + const unsigned DstSz = Op.getValueType().getSizeInBits(); + const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); + assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && "Unexpected type for custom-lowering FP_EXTEND"); + assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && + "With both FP DP and 16, any FP conversion is legal!"); + + assert(!(DstSz == 32 && Subtarget->hasFP16()) && + "With FP16, 16 to 32 conversion is legal!"); + + // Either we are converting from 16 -> 64, without FP16 and/or + // FP.double-precision or without Armv8-fp. So we must do it in two + // steps. + // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32 + // without FP16. So we must do a function call. + SDLoc Loc(Op); RTLIB::Libcall LC; - LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); + if (SrcSz == 16) { + // Instruction from 16 -> 32 + if (Subtarget->hasFP16()) + SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal); + // Lib call from 16 -> 32 + else { + LC = RTLIB::getFPEXT(MVT::f16, MVT::f32); + assert(LC != RTLIB::UNKNOWN_LIBCALL && + "Unexpected type for custom-lowering FP_EXTEND"); + SrcVal = + makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first; + } + } - SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, - SDLoc(Op)).first; + if (DstSz != 64) + return SrcVal; + // For sure now SrcVal is 32 bits + if (Subtarget->hasFP64()) // Instruction from 32 -> 64 + return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal); + + LC = RTLIB::getFPEXT(MVT::f32, MVT::f64); + assert(LC != RTLIB::UNKNOWN_LIBCALL && + "Unexpected type for custom-lowering FP_EXTEND"); + return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getOperand(0).getValueType() == MVT::f64 && - Subtarget->isFPOnlySP() && + SDValue SrcVal = Op.getOperand(0); + EVT SrcVT = SrcVal.getValueType(); + EVT DstVT = Op.getValueType(); + const unsigned DstSz = Op.getValueType().getSizeInBits(); + const unsigned SrcSz = SrcVT.getSizeInBits(); + (void)DstSz; + assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 && "Unexpected type for custom-lowering FP_ROUND"); - RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && + "With both FP DP and 16, any FP conversion is legal!"); - SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, - SDLoc(Op)).first; + SDLoc Loc(Op); + + // Instruction from 32 -> 16 if hasFP16 is valid + if (SrcSz == 32 && 
Subtarget->hasFP16()) + return Op; + + // Lib call from 32 -> 16 / 64 -> [32, 16] + RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); + assert(LC != RTLIB::UNKNOWN_LIBCALL && + "Unexpected type for custom-lowering FP_ROUND"); + return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first; +} + +void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const { + assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); + MVT HalfT = MVT::i32; + SDLoc dl(N); + SDValue Hi, Lo, Tmp; + + if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || + !isOperationLegalOrCustom(ISD::UADDO, HalfT)) + return ; + + unsigned OpTypeBits = HalfT.getScalarSizeInBits(); + SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); + + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(0, dl, HalfT)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(1, dl, HalfT)); + + Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, + DAG.getConstant(OpTypeBits - 1, dl, + getShiftAmountTy(HalfT, DAG.getDataLayout()))); + Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); + Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, + SDValue(Lo.getNode(), 1)); + Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); + Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); + + Results.push_back(Lo); + Results.push_back(Hi); } bool @@ -14314,14 +14976,15 @@ bool ARM::isBitFieldInvertedMask(unsigned v) { /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. -bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - if (!Subtarget->hasVFP3()) +bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { + if (!Subtarget->hasVFP3Base()) return false; if (VT == MVT::f16 && Subtarget->hasFullFP16()) return ARM_AM::getFP16Imm(Imm) != -1; if (VT == MVT::f32) return ARM_AM::getFP32Imm(Imm) != -1; - if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) + if (VT == MVT::f64 && Subtarget->hasFP64()) return ARM_AM::getFP64Imm(Imm) != -1; return false; } @@ -14590,6 +15253,9 @@ ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // and up to 64 bits on the non-M profiles TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + if (AI->isFloatingPointOperation()) + return AtomicExpansionKind::CmpXChg; + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) @@ -14621,6 +15287,36 @@ bool ARMTargetLowering::useLoadStackGuardNode() const { return Subtarget->isTargetMachO(); } +void ARMTargetLowering::insertSSPDeclarations(Module &M) const { + if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + return TargetLowering::insertSSPDeclarations(M); + + // MSVC CRT has a global variable holding security cookie. + M.getOrInsertGlobal("__security_cookie", + Type::getInt8PtrTy(M.getContext())); + + // MSVC CRT has a function to validate security cookie. 
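// In C terms, the two CRT symbols referenced here look roughly like this
// (signatures simplified to what this lowering assumes; MSVC's own headers
// declare the cookie as an integer type):
//   extern void *__security_cookie;                  // global cookie value
//   void __security_check_cookie(void *StackCookie); // traps on mismatch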
+ FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( + "__security_check_cookie", Type::getVoidTy(M.getContext()), + Type::getInt8PtrTy(M.getContext())); + if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) + F->addAttribute(1, Attribute::AttrKind::InReg); +} + +Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const { + // MSVC CRT has a global variable holding security cookie. + if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + return M.getGlobalVariable("__security_cookie"); + return TargetLowering::getSDagStackGuard(M); +} + +Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { + // MSVC CRT has a function to validate security cookie. + if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + return M.getFunction("__security_check_cookie"); + return TargetLowering::getSSPStackGuardCheck(M); +} + bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const { // If we do not have NEON, vector types are not natively supported. @@ -14658,6 +15354,10 @@ bool ARMTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasV6T2Ops(); } +bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { + return !Subtarget->hasMinSize(); +} + Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -14850,8 +15550,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) - BaseAddr = Builder.CreateConstGEP1_32( - BaseAddr, VecTy->getVectorNumElements() * Factor); + BaseAddr = + Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, + VecTy->getVectorNumElements() * Factor); SmallVector<Value *, 2> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); @@ -14990,7 +15691,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // If we're generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); + BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), + BaseAddr, LaneLen * Factor); SmallVector<Value *, 6> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 7a9fc739fc13..1675ec59a354 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -1,9 +1,8 @@ //===- ARMISelLowering.h - ARM DAG Lowering Interface -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -77,6 +76,10 @@ class VectorType; PIC_ADD, // Add with a PC operand and a PIC label. + ASRL, // MVE long arithmetic shift right. + LSRL, // MVE long shift right. + LSLL, // MVE long shift left. + CMP, // ARM compare instructions. CMN, // ARM CMN instructions. CMPZ, // ARM compare that sets only Z flag.
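// A standalone sketch of what the LSLL long shift added above computes on a
// 64-bit value split across two 32-bit halves (lsll is a hypothetical helper;
// it assumes Amt < 64):
#include <cstdint>

static void lsll(uint32_t &Lo, uint32_t &Hi, unsigned Amt) {
  uint64_t V = (uint64_t(Hi) << 32) | Lo; // reassemble the register pair
  V <<= Amt;                              // one coherent 64-bit shift
  Lo = uint32_t(V);
  Hi = uint32_t(V >> 32);
}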
@@ -122,6 +125,8 @@ class VectorType; WIN__CHKSTK, // Windows' __chkstk call to do stack probing. WIN__DBZCHK, // Windows' divide by zero check + WLS, // Low-overhead loops, While Loop Start + VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. VCGE, // Vector compare greater than or equal. @@ -134,32 +139,36 @@ class VectorType; VCGTU, // Vector compare unsigned greater than. VTST, // Vector test bits. + // Vector shift by vector + VSHLs, // ...left/right by signed + VSHLu, // ...left/right by unsigned + // Vector shift by immediate: - VSHL, // ...left - VSHRs, // ...right (signed) - VSHRu, // ...right (unsigned) + VSHLIMM, // ...left + VSHRsIMM, // ...right (signed) + VSHRuIMM, // ...right (unsigned) // Vector rounding shift by immediate: - VRSHRs, // ...right (signed) - VRSHRu, // ...right (unsigned) - VRSHRN, // ...right narrow + VRSHRsIMM, // ...right (signed) + VRSHRuIMM, // ...right (unsigned) + VRSHRNIMM, // ...right narrow // Vector saturating shift by immediate: - VQSHLs, // ...left (signed) - VQSHLu, // ...left (unsigned) - VQSHLsu, // ...left (signed to unsigned) - VQSHRNs, // ...right narrow (signed) - VQSHRNu, // ...right narrow (unsigned) - VQSHRNsu, // ...right narrow (signed to unsigned) + VQSHLsIMM, // ...left (signed) + VQSHLuIMM, // ...left (unsigned) + VQSHLsuIMM, // ...left (signed to unsigned) + VQSHRNsIMM, // ...right narrow (signed) + VQSHRNuIMM, // ...right narrow (unsigned) + VQSHRNsuIMM, // ...right narrow (signed to unsigned) // Vector saturating rounding shift by immediate: - VQRSHRNs, // ...right narrow (signed) - VQRSHRNu, // ...right narrow (unsigned) - VQRSHRNsu, // ...right narrow (signed to unsigned) + VQRSHRNsIMM, // ...right narrow (signed) + VQRSHRNuIMM, // ...right narrow (unsigned) + VQRSHRNsuIMM, // ...right narrow (signed to unsigned) // Vector shift and insert: - VSLI, // ...left - VSRI, // ...right + VSLIIMM, // ...left + VSRIIMM, // ...right // Vector get lane (VMOV scalar to ARM core register) // (These are used for 8- and 16-bit element types only.) @@ -322,17 +331,21 @@ class VectorType; /// is "fast" by reference in the second argument. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, + MachineMemOperand::Flags Flags, bool *Fast) const override; EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; + const AttributeList &FuncAttributes) const override; bool isTruncateFree(Type *SrcTy, Type *DstTy) const override; bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + bool shouldSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const override; + bool isFNegFree(EVT VT) const override; bool isVectorLoadExtDesirable(SDValue ExtVal) const override; @@ -454,7 +467,8 @@ class VectorType; /// getRegClassFor - Return the register class that should be used for the /// specified value type. - const TargetRegisterClass *getRegClassFor(MVT VT) const override; + const TargetRegisterClass * + getRegClassFor(MVT VT, bool isDivergent = false) const override; /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { @@ -479,7 +493,8 @@ class VectorType; /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. 
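    /// As a standalone sketch (simplified model, not the patch's code): a VFP
    /// "modified immediate" is any value expressible as +/- (N/16) * 2^E with
    /// integer N in [16, 31] and E in [-3, 4], so legality can be checked by
    /// enumeration (requires <cmath>):
    ///   static bool isVFPImmF32(float F) {
    ///     for (int S = -1; S <= 1; S += 2)
    ///       for (int N = 16; N <= 31; ++N)
    ///         for (int E = -3; E <= 4; ++E)
    ///           if (F == S * std::ldexp(float(N) / 16.0f, E))
    ///             return true;
    ///     return false;
    ///   }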
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize = false) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, @@ -544,6 +559,10 @@ class VectorType; bool useLoadStackGuardNode() const override; + void insertSSPDeclarations(Module &M) const override; + Value *getSDagStackGuard(const Module &M) const override; + Function *getSSPStackGuardCheck(const Module &M) const override; + bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; @@ -568,6 +587,8 @@ class VectorType; return HasStandaloneRem; } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const; CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const; @@ -593,8 +614,11 @@ class VectorType; bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; - bool shouldFoldShiftPairToMask(const SDNode *N, - CombineLevel Level) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N, + CombineLevel Level) const override; + + bool preferIncOfAddToSubOfNot(EVT VT) const override; + protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -680,6 +704,7 @@ class VectorType; const ARMSubtarget *ST) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; @@ -693,6 +718,8 @@ class VectorType; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + void lowerABS(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; @@ -755,15 +782,13 @@ class VectorType; /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. 
- bool IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool isVarArg, - bool isCalleeStructRet, - bool isCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, - SelectionDAG& DAG) const; + bool IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG, + const bool isIndirect) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, @@ -781,6 +806,8 @@ class VectorType; bool shouldConsiderGEPOffsetSplit() const override { return true; } + bool isUnsupportedFloatingType(EVT VT) const; + SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const; @@ -806,11 +833,15 @@ class VectorType; MachineBasicBlock *MBB) const; MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const; + void addMVEVectorTypes(bool HasMVEFP); + void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action); + void setAllExpand(MVT VT); }; enum NEONModImmType { VMOVModImm, VMVNModImm, + MVEVMVNModImm, OtherModImm }; diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 0df48ba61299..bc93a058720c 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -1,9 +1,8 @@ //===-- ARMInstrFormats.td - ARM Instruction Formats -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -110,6 +109,9 @@ def AddrModeT2_i8s4 : AddrMode<15>; def AddrMode_i12 : AddrMode<16>; def AddrMode5FP16 : AddrMode<17>; def AddrModeT2_ldrex : AddrMode<18>; +def AddrModeT2_i7s4 : AddrMode<19>; +def AddrModeT2_i7s2 : AddrMode<20>; +def AddrModeT2_i7 : AddrMode<21>; // Load / store index mode. class IndexMode val> { @@ -121,14 +123,15 @@ def IndexModePost : IndexMode<2>; def IndexModeUpd : IndexMode<3>; // Instruction execution domain. -class Domain val> { - bits<3> Value = val; +class Domain val> { + bits<4> Value = val; } def GenericDomain : Domain<0>; def VFPDomain : Domain<1>; // Instructions in VFP domain only def NeonDomain : Domain<2>; // Instructions in Neon domain only def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8 +def MVEDomain : Domain<8>; // Instructions in MVE and ARMv8.1m //===----------------------------------------------------------------------===// // ARM special operands. @@ -185,6 +188,86 @@ def s_cc_out : OptionalDefOperand { let DecoderMethod = "DecodeCCOutOperand"; } +// VPT predicate + +def VPTPredNOperand : AsmOperandClass { + let Name = "VPTPredN"; + let PredicateMethod = "isVPTPred"; +} +def VPTPredROperand : AsmOperandClass { + let Name = "VPTPredR"; + let PredicateMethod = "isVPTPred"; +} +def undef_tied_input; + +// Operand classes for the cluster of MC operands describing a +// VPT-predicated MVE instruction. 
+// +// There are two of these classes. Both of them have the same first +// two options: +// +// $cond (an integer) indicates the instruction's predication status: +// * ARMVCC::None means it's unpredicated +// * ARMVCC::Then means it's in a VPT block and appears with the T suffix +// * ARMVCC::Else means it's in a VPT block and appears with the E suffix. +// During code generation, unpredicated and predicated instructions +// are indicated by setting this parameter to 'None' or to 'Then'; the +// third value 'Else' is only used for assembly and disassembly. +// +// $cond_reg (type VCCR) gives the input predicate register. This is +// always either zero_reg or VPR, but needs to be modelled as an +// explicit operand so that it can be register-allocated and spilled +// when these operands are used in code generation). +// +// For 'vpred_r', there's an extra operand $inactive, which specifies +// the vector register which will supply any lanes of the output +// register that the predication mask prevents from being written by +// this instruction. It's always tied to the actual output register +// (i.e. must be allocated into the same physical reg), but again, +// code generation will need to model it as a separate input value. +// +// 'vpred_n' doesn't have that extra operand: it only has $cond and +// $cond_reg. This variant is used for any instruction that can't, or +// doesn't want to, tie $inactive to the output register. Sometimes +// that's because another input parameter is already tied to it (e.g. +// instructions that both read and write their Qd register even when +// unpredicated, either because they only partially overwrite it like +// a narrowing integer conversion, or simply because the instruction +// encoding doesn't have enough register fields to make the output +// independent of all inputs). It can also be because the instruction +// is defined to set disabled output lanes to zero rather than leaving +// them unchanged (vector loads), or because it doesn't output a +// vector register at all (stores, compares). In any of these +// situations it's unnecessary to have an extra operand tied to the +// output, and inconvenient to leave it there unused. + +// Base class for both kinds of vpred. +class vpred_ops : OperandWithDefaultOps { + let PrintMethod = "printVPTPredicateOperand"; + let OperandNamespace = "ARM"; + let MIOperandInfo = !con((ops i32imm:$cond, VCCR:$cond_reg), extra_mi); + + // For convenience, we provide a string value that can be appended + // to the constraints string. It's empty for vpred_n, and for + // vpred_r it ties the $inactive operand to the output q-register + // (which by convention will be called $Qd). + string vpred_constraint; +} + +def vpred_r : vpred_ops<(ops (v4i32 undef_tied_input)), (ops MQPR:$inactive)> { + let ParserMatchClass = VPTPredROperand; + let OperandType = "OPERAND_VPRED_R"; + let DecoderMethod = "DecodeVpredROperand"; + let vpred_constraint = ",$Qd = $vp.inactive"; +} + +def vpred_n : vpred_ops<(ops), (ops)> { + let ParserMatchClass = VPTPredNOperand; + let OperandType = "OPERAND_VPRED_N"; + let vpred_constraint = ""; +} + // ARM special operands for disassembly only. 
// def SetEndAsmOperand : ImmAsmOperand<0,1> { @@ -285,6 +368,8 @@ class VFP3InstAlias : InstAlias, Requires<[HasVFP3]>; class NEONInstAlias : InstAlias, Requires<[HasNEON]>; +class MVEInstAlias + : InstAlias, Requires<[HasMVEInt, IsThumb]>; class VFP2MnemonicAlias : MnemonicAlias, @@ -325,8 +410,8 @@ class InstTemplate : AsmPseudoInst, Requires<[HasVFP2]>; class NEONAsmPseudo : AsmPseudoInst, Requires<[HasNEON]>; +class MVEAsmPseudo + : AsmPseudoInst, Requires<[HasMVEInt]>; // Pseudo instructions for the code generator. class PseudoInst pattern> @@ -1556,6 +1643,8 @@ class AHI5 opcod1, bits<2> opcod2, dag oops, dag iops, // Loads & stores operate on both NEON and VFP pipelines. let D = VFPNeonDomain; + + let isUnpredicable = 1; // FP16 instructions cannot in general be conditional } // VFP Load / store multiple pseudo instructions. @@ -1903,6 +1992,8 @@ class AHuI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, let Inst{11-8} = 0b1001; // Half precision let Inst{7-6} = opcod4; let Inst{4} = opcod5; + + let isUnpredicable = 1; // FP16 instructions cannot in general be conditional } // Half precision, unary, non-predicated @@ -1931,6 +2022,8 @@ class AHuInp opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, let Inst{11-8} = 0b1001; // Half precision let Inst{7-6} = opcod4; let Inst{4} = opcod5; + + let isUnpredicable = 1; // FP16 instructions cannot in general be conditional } // Half precision, binary @@ -1957,6 +2050,8 @@ class AHbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, let Inst{11-8} = 0b1001; // Half precision let Inst{6} = op6; let Inst{4} = op4; + + let isUnpredicable = 1; // FP16 instructions cannot in general be conditional } // Half precision, binary, not predicated @@ -1986,6 +2081,8 @@ class AHbInp opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops, let Inst{11-8} = 0b1001; // Half precision let Inst{6} = opcod3; let Inst{4} = 0; + + let isUnpredicable = 1; // FP16 instructions cannot in general be conditional } // VFP conversion instructions @@ -2494,7 +2591,7 @@ class NEONFPPat : Pat { // VFP/NEON Instruction aliases for type suffices. // Note: When EmitPriority == 1, the alias will be used for printing class VFPDataTypeInstAlias : - InstAlias, Requires<[HasVFP2]>; + InstAlias, Requires<[HasFPRegs]>; // Note: When EmitPriority == 1, the alias will be used for printing multiclass VFPDTAnyInstAlias { diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index bcc31f5fa4cc..388c889349b7 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- ARMInstrInfo.cpp - ARM Instruction Information --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -95,7 +94,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const { const ARMSubtarget &Subtarget = MF.getSubtarget(); const TargetMachine &TM = MF.getTarget(); - if (!Subtarget.useMovt(MF)) { + if (!Subtarget.useMovt()) { if (TM.isPositionIndependent()) expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12); else diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index c87fb97448c9..042b53f0f8c3 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -1,9 +1,8 @@ //===-- ARMInstrInfo.h - ARM Instruction Information ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 13abdc9687ec..e35145463852 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -1,9 +1,8 @@ //===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -100,6 +99,18 @@ def SDT_LongMac : SDTypeProfile<2, 4, [SDTCisVT<0, i32>, SDTCisSameAs<0, 4>, SDTCisSameAs<0, 5>]>; +// ARMlsll, ARMlsrl, ARMasrl +def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisInt<4>]>; + +// TODO Add another operand for 'Size' so that we can re-use this node when we +// start supporting *TP versions. +def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, OtherVT>]>; + def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>; def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>; def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>; @@ -172,6 +183,10 @@ def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp, def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>; +def ARMasrl : SDNode<"ARMISD::ASRL", SDT_ARMIntShiftParts, []>; +def ARMlsrl : SDNode<"ARMISD::LSRL", SDT_ARMIntShiftParts, []>; +def ARMlsll : SDNode<"ARMISD::LSLL", SDT_ARMIntShiftParts, []>; + def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>; def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>; def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>; @@ -214,189 +229,44 @@ def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>; def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>; def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>; -//===----------------------------------------------------------------------===// -// ARM Instruction Predicate Definitions. 
-// -def HasV4T : Predicate<"Subtarget->hasV4TOps()">, - AssemblerPredicate<"HasV4TOps", "armv4t">; -def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; -def HasV5T : Predicate<"Subtarget->hasV5TOps()">, - AssemblerPredicate<"HasV5TOps", "armv5t">; -def NoV5T : Predicate<"!Subtarget->hasV5TOps()">; -def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">, - AssemblerPredicate<"HasV5TEOps", "armv5te">; -def HasV6 : Predicate<"Subtarget->hasV6Ops()">, - AssemblerPredicate<"HasV6Ops", "armv6">; -def NoV6 : Predicate<"!Subtarget->hasV6Ops()">; -def HasV6M : Predicate<"Subtarget->hasV6MOps()">, - AssemblerPredicate<"HasV6MOps", - "armv6m or armv6t2">; -def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">, - AssemblerPredicate<"HasV8MBaselineOps", - "armv8m.base">; -def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">, - AssemblerPredicate<"HasV8MMainlineOps", - "armv8m.main">; -def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, - AssemblerPredicate<"HasV6T2Ops", "armv6t2">; -def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; -def HasV6K : Predicate<"Subtarget->hasV6KOps()">, - AssemblerPredicate<"HasV6KOps", "armv6k">; -def NoV6K : Predicate<"!Subtarget->hasV6KOps()">; -def HasV7 : Predicate<"Subtarget->hasV7Ops()">, - AssemblerPredicate<"HasV7Ops", "armv7">; -def HasV8 : Predicate<"Subtarget->hasV8Ops()">, - AssemblerPredicate<"HasV8Ops", "armv8">; -def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, - AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; -def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, - AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; -def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, - AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; -def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, - AssemblerPredicate<"HasV8_3aOps", "armv8.3a">; -def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">, - AssemblerPredicate<"HasV8_4aOps", "armv8.4a">; -def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, - AssemblerPredicate<"HasV8_5aOps", "armv8.5a">; -def NoVFP : Predicate<"!Subtarget->hasVFP2()">; -def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, - AssemblerPredicate<"FeatureVFP2", "VFP2">; -def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, - AssemblerPredicate<"FeatureVFP3", "VFP3">; -def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, - AssemblerPredicate<"FeatureVFP4", "VFP4">; -def HasDPVFP : Predicate<"!Subtarget->isFPOnlySP()">, - AssemblerPredicate<"!FeatureVFPOnlySP", - "double precision VFP">; -def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">; -def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON", "NEON">; -def HasSHA2 : Predicate<"Subtarget->hasSHA2()">, - AssemblerPredicate<"FeatureSHA2", "sha2">; -def HasAES : Predicate<"Subtarget->hasAES()">, - AssemblerPredicate<"FeatureAES", "aes">; -def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto", "crypto">; -def HasDotProd : Predicate<"Subtarget->hasDotProd()">, - AssemblerPredicate<"FeatureDotProd", "dotprod">; -def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<"FeatureCRC", "crc">; -def HasRAS : Predicate<"Subtarget->hasRAS()">, - AssemblerPredicate<"FeatureRAS", "ras">; -def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16","half-float conversions">; -def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, - AssemblerPredicate<"FeatureFullFP16","full half-float">; -def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, - 
AssemblerPredicate<"FeatureFP16FML","full half-float fml">; -def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">, - AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">; -def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, - AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">; -def HasDSP : Predicate<"Subtarget->hasDSP()">, - AssemblerPredicate<"FeatureDSP", "dsp">; -def HasDB : Predicate<"Subtarget->hasDataBarrier()">, - AssemblerPredicate<"FeatureDB", - "data-barriers">; -def HasDFB : Predicate<"Subtarget->hasFullDataBarrier()">, - AssemblerPredicate<"FeatureDFB", - "full-data-barrier">; -def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">, - AssemblerPredicate<"FeatureV7Clrex", - "v7 clrex">; -def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">, - AssemblerPredicate<"FeatureAcquireRelease", - "acquire/release">; -def HasMP : Predicate<"Subtarget->hasMPExtension()">, - AssemblerPredicate<"FeatureMP", - "mp-extensions">; -def HasVirtualization: Predicate<"false">, - AssemblerPredicate<"FeatureVirtualization", - "virtualization-extensions">; -def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">, - AssemblerPredicate<"FeatureTrustZone", - "TrustZone">; -def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">, - AssemblerPredicate<"Feature8MSecExt", - "ARMv8-M Security Extensions">; -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; -def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; -def IsThumb : Predicate<"Subtarget->isThumb()">, - AssemblerPredicate<"ModeThumb", "thumb">; -def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; -def IsThumb2 : Predicate<"Subtarget->isThumb2()">, - AssemblerPredicate<"ModeThumb,FeatureThumb2", - "thumb2">; -def IsMClass : Predicate<"Subtarget->isMClass()">, - AssemblerPredicate<"FeatureMClass", "armv*m">; -def IsNotMClass : Predicate<"!Subtarget->isMClass()">, - AssemblerPredicate<"!FeatureMClass", - "!armv*m">; -def IsARM : Predicate<"!Subtarget->isThumb()">, - AssemblerPredicate<"!ModeThumb", "arm-mode">; -def IsMachO : Predicate<"Subtarget->isTargetMachO()">; -def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">; -def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; -def IsWindows : Predicate<"Subtarget->isTargetWindows()">; -def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">; -def IsReadTPHard : Predicate<"Subtarget->isReadTPHard()">; -def IsReadTPSoft : Predicate<"!Subtarget->isReadTPHard()">; -def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">, - AssemblerPredicate<"FeatureNaClTrap", "NaCl">; -def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">; - -def UseNegativeImmediates : - Predicate<"false">, - AssemblerPredicate<"!FeatureNoNegativeImmediates", - "NegativeImmediates">; - -// FIXME: Eventually this will be just "hasV6T2Ops". 
-let RecomputePerFunction = 1 in { - def UseMovt : Predicate<"Subtarget->useMovt(*MF)">; - def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">; - def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">; - def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">; - - def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&" - " TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||" - "MF->getFunction().optForMinSize())">; -} -def UseMulOps : Predicate<"Subtarget->useMulOps()">; - -// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available. -// But only select them if more precision in FP computation is allowed, and when -// they are not slower than a mul + add sequence. -// Do not use them for Darwin platforms. -def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" - " FPOpFusion::Fast && " - " Subtarget->hasVFP4()) && " - "!Subtarget->isTargetDarwin() &&" - "Subtarget->useFPVMLx()">; - -def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">; -def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">; - -def HasFastVDUP32 : Predicate<"!Subtarget->hasSlowVDUP32()">; -def HasSlowVDUP32 : Predicate<"Subtarget->hasSlowVDUP32()">; - -def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||" - "!Subtarget->useNEONForSinglePrecisionFP()">; -def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&" - "Subtarget->useNEONForSinglePrecisionFP()">; - -let RecomputePerFunction = 1 in { - def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">; - def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">; -} - -def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">; - -// Armv8.5-A extensions -def HasSB : Predicate<"Subtarget->hasSB()">, - AssemblerPredicate<"FeatureSB", "sb">; +// Vector operations shared between NEON and MVE + +def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; + +// VDUPLANE can produce a quad-register result from a double-register source, +// so the result is not constrained to match the source. 
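+// For example (an illustrative pattern, not added by this patch):
+//   (v4i32 (ARMvduplane (v2i32 DPR:$src), (i32 1)))
+// broadcasts lane 1 of a 64-bit source into all four lanes of a
+// 128-bit result.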
+def ARMvduplane : SDNode<"ARMISD::VDUPLANE", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisVT<2, i32>]>>; + +def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; +def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; +def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; + +def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; +def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; + +def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; +def ARMvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; +def ARMvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; +def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>; + + +def SDTARMVSHIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>,]>; +def ARMvshlImm : SDNode<"ARMISD::VSHLIMM", SDTARMVSHIMM>; +def ARMvshrsImm : SDNode<"ARMISD::VSHRsIMM", SDTARMVSHIMM>; +def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>; +def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>; +def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>; + +def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop, + [SDNPHasChain]>; //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -552,6 +422,16 @@ def reglist : Operand { let DecoderMethod = "DecodeRegListOperand"; } +// A list of general purpose registers and APSR separated by comma. +// Used by CLRM +def RegListWithAPSRAsmOperand : AsmOperandClass { let Name = "RegListWithAPSR"; } +def reglist_with_apsr : Operand { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = RegListWithAPSRAsmOperand; + let PrintMethod = "printRegisterList"; + let DecoderMethod = "DecodeRegListOperand"; +} + def GPRPairOp : RegisterOperand; def DPRRegListAsmOperand : AsmOperandClass { @@ -576,6 +456,21 @@ def spr_reglist : Operand { let DecoderMethod = "DecodeSPRRegListOperand"; } +def FPSRegListWithVPRAsmOperand : AsmOperandClass { let Name = + "FPSRegListWithVPR"; } +def fp_sreglist_with_vpr : Operand { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = FPSRegListWithVPRAsmOperand; + let PrintMethod = "printRegisterList"; +} +def FPDRegListWithVPRAsmOperand : AsmOperandClass { let Name = + "FPDRegListWithVPR"; } +def fp_dreglist_with_vpr : Operand { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = FPDRegListWithVPRAsmOperand; + let PrintMethod = "printRegisterList"; +} + // An operand for the CONSTPOOL_ENTRY pseudo-instruction. def cpinst_operand : Operand { let PrintMethod = "printCPInstOperand"; @@ -621,6 +516,55 @@ def rot_imm : Operand, PatLeaf<(i32 imm), [{ let ParserMatchClass = RotImmAsmOperand; } +// Power-of-two operand for MVE VIDUP and friends, which encode +// {1,2,4,8} as its log to base 2, i.e. 
as {0,1,2,3} respectively +def MVE_VIDUP_imm_asmoperand : AsmOperandClass { + let Name = "VIDUP_imm"; + let PredicateMethod = "isPowerTwoInRange<1,8>"; + let RenderMethod = "addPowerTwoOperands"; + let DiagnosticString = "vector increment immediate must be 1, 2, 4 or 8"; +} +def MVE_VIDUP_imm : Operand { + let EncoderMethod = "getPowerTwoOpValue"; + let DecoderMethod = "DecodePowerTwoOperand<0,3>"; + let ParserMatchClass = MVE_VIDUP_imm_asmoperand; +} + +// Pair vector indexing +class MVEPairVectorIndexOperand : AsmOperandClass { + let Name = "MVEPairVectorIndex"#start; + let RenderMethod = "addMVEPairVectorIndexOperands"; + let PredicateMethod = "isMVEPairVectorIndex<"#start#", "#end#">"; +} + +class MVEPairVectorIndex : Operand { + let PrintMethod = "printVectorIndex"; + let EncoderMethod = "getMVEPairVectorIndexOpValue<"#opval#">"; + let DecoderMethod = "DecodeMVEPairVectorIndexOperand<"#opval#">"; + let MIOperandInfo = (ops i32imm); +} + +def MVEPairVectorIndex0 : MVEPairVectorIndex<"0"> { + let ParserMatchClass = MVEPairVectorIndexOperand<"0", "1">; +} + +def MVEPairVectorIndex2 : MVEPairVectorIndex<"2"> { + let ParserMatchClass = MVEPairVectorIndexOperand<"2", "3">; +} + +// Vector indexing +class MVEVectorIndexOperand : AsmOperandClass { + let Name = "MVEVectorIndex"#NumLanes; + let RenderMethod = "addMVEVectorIndexOperands"; + let PredicateMethod = "isVectorIndexInRange<"#NumLanes#">"; +} + +class MVEVectorIndex : Operand { + let PrintMethod = "printVectorIndex"; + let ParserMatchClass = MVEVectorIndexOperand; + let MIOperandInfo = (ops i32imm); +} + // shift_imm: An integer that encodes a shift amount and the type of shift // (asr or lsl). The 6-bit immediate encodes as: // {5} 0 ==> lsl @@ -718,24 +662,11 @@ def mod_imm_neg : Operand, PatLeaf<(imm), [{ } /// arm_i32imm - True for +V6T2, or when isSOImmTwoParVal() -def arm_i32imm : PatLeaf<(imm), [{ - if (Subtarget->useMovt(*MF)) +def arm_i32imm : IntImmLeafuseMovt()) return true; - return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue()); -}]> { - // Ideally this would be an IntImmLeaf, but then we wouldn't have access to - // the MachineFunction. - let GISelPredicateCode = [{ - const auto &MF = *MI.getParent()->getParent(); - if (STI.useMovt(MF)) - return true; - - const auto &MO = MI.getOperand(1); - if (!MO.isCImm()) - return false; - return ARM_AM::isSOImmTwoPartVal(MO.getCImm()->getZExtValue()); - }]; -} + return ARM_AM::isSOImmTwoPartVal(Imm.getZExtValue()); +}]>; /// imm0_1 predicate - Immediate in the range [0,1]. def Imm0_1AsmOperand: ImmAsmOperand<0,1> { let Name = "Imm0_1"; } @@ -952,6 +883,32 @@ def imm1_16 : Operand, ImmLeaf { + let Name = "MVEShiftImm1_7"; + // Reason we're doing this is because instruction vshll.s8 t1 encoding + // accepts 1,7 but the t2 encoding accepts 8. By doing this we can get a + // better diagnostic message if someone uses bigger immediate than the t1/t2 + // encodings allow. + let DiagnosticString = "operand must be an immediate in the range [1,8]"; +} +def mve_shift_imm1_7 : Operand { + let ParserMatchClass = MVEShiftImm1_7AsmOperand; + let EncoderMethod = "getMVEShiftImmOpValue"; +} + +def MVEShiftImm1_15AsmOperand: ImmAsmOperand<1,15> { + let Name = "MVEShiftImm1_15"; + // Reason we're doing this is because instruction vshll.s16 t1 encoding + // accepts 1,15 but the t2 encoding accepts 16. By doing this we can get a + // better diagnostic message if someone uses bigger immediate than the t1/t2 + // encodings allow. 
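+  // (For instance, "vshllb.s16 q0, q1, #15" fits this T1 operand,
+  // "#16" is encodable only as T2, and "#17" should be diagnosed
+  // against the combined range [1,16].)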
+ let DiagnosticString = "operand must be an immediate in the range [1,16]"; +} +def mve_shift_imm1_15 : Operand { + let ParserMatchClass = MVEShiftImm1_15AsmOperand; + let EncoderMethod = "getMVEShiftImmOpValue"; +} + // Define ARM specific addressing modes. // addrmode_imm12 := reg +/- imm12 // @@ -1332,6 +1289,15 @@ def addr_offset_none : MemOperand, let MIOperandInfo = (ops GPR:$base); } +// t_addr_offset_none := reg [r0-r7] +def MemNoOffsetTAsmOperand : AsmOperandClass { let Name = "MemNoOffsetT"; } +def t_addr_offset_none : MemOperand { + let PrintMethod = "printAddrMode7Operand"; + let DecoderMethod = "DecodetGPRRegisterClass"; + let ParserMatchClass = MemNoOffsetTAsmOperand; + let MIOperandInfo = (ops tGPR:$base); +} + def nohash_imm : Operand { let PrintMethod = "printNoHashImmediate"; } @@ -5931,6 +5897,12 @@ include "ARMInstrVFP.td" include "ARMInstrNEON.td" +//===----------------------------------------------------------------------===// +// MVE Support +// + +include "ARMInstrMVE.td" + //===----------------------------------------------------------------------===// // Assembler aliases // diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td new file mode 100644 index 000000000000..3e7ae55c7fc8 --- /dev/null +++ b/lib/Target/ARM/ARMInstrMVE.td @@ -0,0 +1,4591 @@ +//===-- ARMInstrMVE.td - MVE support for ARM ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM MVE instruction set. +// +//===----------------------------------------------------------------------===// + +class ExpandImmAsmOp : AsmOperandClass { + let Name = !strconcat("ExpandImm", shift); + let PredicateMethod = !strconcat("isExpImm<", shift, ">"); + let RenderMethod = "addImmOperands"; +} +class InvertedExpandImmAsmOp : AsmOperandClass { + let Name = !strconcat("InvertedExpandImm", shift, "_", size); + let PredicateMethod = !strconcat("isInvertedExpImm<", shift, ",", size, ">"); + let RenderMethod = "addImmOperands"; +} + +class ExpandImm : Operand { + let ParserMatchClass = ExpandImmAsmOp; + let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",false>"); + let DecoderMethod = !strconcat("DecodeExpandedImmOperand<",shift,">"); + let PrintMethod = "printExpandedImmOperand"; +} +class InvertedExpandImm : Operand { + let ParserMatchClass = InvertedExpandImmAsmOp; + let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",true>"); + let PrintMethod = "printExpandedImmOperand"; + // No decoder method needed, because this operand type is only used + // by aliases (VAND and VORN) +} + +def expzero00 : ExpandImm<"0">; +def expzero08 : ExpandImm<"8">; +def expzero16 : ExpandImm<"16">; +def expzero24 : ExpandImm<"24">; + +def expzero00inv16 : InvertedExpandImm<"0", "16">; +def expzero08inv16 : InvertedExpandImm<"8", "16">; + +def expzero00inv32 : InvertedExpandImm<"0", "32">; +def expzero08inv32 : InvertedExpandImm<"8", "32">; +def expzero16inv32 : InvertedExpandImm<"16", "32">; +def expzero24inv32 : InvertedExpandImm<"24", "32">; + +// VPT condition mask +def vpt_mask : Operand { + let PrintMethod = "printVPTMask"; + let ParserMatchClass = it_mask_asmoperand; + let EncoderMethod = "getVPTMaskOpValue"; + let DecoderMethod = "DecodeVPTMaskOperand"; +} + +// VPT/VCMP restricted 
predicate for sign invariant types +def pred_restricted_i_asmoperand : AsmOperandClass { + let Name = "CondCodeRestrictedI"; + let RenderMethod = "addITCondCodeOperands"; + let PredicateMethod = "isITCondCodeRestrictedI"; + let ParserMethod = "parseITCondCode"; + let DiagnosticString = "condition code for sign-independent integer "# + "comparison must be EQ or NE"; +} + +// VPT/VCMP restricted predicate for signed types +def pred_restricted_s_asmoperand : AsmOperandClass { + let Name = "CondCodeRestrictedS"; + let RenderMethod = "addITCondCodeOperands"; + let PredicateMethod = "isITCondCodeRestrictedS"; + let ParserMethod = "parseITCondCode"; + let DiagnosticString = "condition code for signed integer "# + "comparison must be EQ, NE, LT, GT, LE or GE"; +} + +// VPT/VCMP restricted predicate for unsigned types +def pred_restricted_u_asmoperand : AsmOperandClass { + let Name = "CondCodeRestrictedU"; + let RenderMethod = "addITCondCodeOperands"; + let PredicateMethod = "isITCondCodeRestrictedU"; + let ParserMethod = "parseITCondCode"; + let DiagnosticString = "condition code for unsigned integer "# + "comparison must be EQ, NE, HS or HI"; +} + +// VPT/VCMP restricted predicate for floating point +def pred_restricted_fp_asmoperand : AsmOperandClass { + let Name = "CondCodeRestrictedFP"; + let RenderMethod = "addITCondCodeOperands"; + let PredicateMethod = "isITCondCodeRestrictedFP"; + let ParserMethod = "parseITCondCode"; + let DiagnosticString = "condition code for floating-point "# + "comparison must be EQ, NE, LT, GT, LE or GE"; +} + +class VCMPPredicateOperand : Operand; + +def pred_basic_i : VCMPPredicateOperand { + let PrintMethod = "printMandatoryRestrictedPredicateOperand"; + let ParserMatchClass = pred_restricted_i_asmoperand; + let DecoderMethod = "DecodeRestrictedIPredicateOperand"; + let EncoderMethod = "getRestrictedCondCodeOpValue"; +} + +def pred_basic_u : VCMPPredicateOperand { + let PrintMethod = "printMandatoryRestrictedPredicateOperand"; + let ParserMatchClass = pred_restricted_u_asmoperand; + let DecoderMethod = "DecodeRestrictedUPredicateOperand"; + let EncoderMethod = "getRestrictedCondCodeOpValue"; +} + +def pred_basic_s : VCMPPredicateOperand { + let PrintMethod = "printMandatoryRestrictedPredicateOperand"; + let ParserMatchClass = pred_restricted_s_asmoperand; + let DecoderMethod = "DecodeRestrictedSPredicateOperand"; + let EncoderMethod = "getRestrictedCondCodeOpValue"; +} + +def pred_basic_fp : VCMPPredicateOperand { + let PrintMethod = "printMandatoryRestrictedPredicateOperand"; + let ParserMatchClass = pred_restricted_fp_asmoperand; + let DecoderMethod = "DecodeRestrictedFPPredicateOperand"; + let EncoderMethod = "getRestrictedCondCodeOpValue"; +} + +// Register list operands for interleaving load/stores +def VecList2QAsmOperand : AsmOperandClass { + let Name = "VecListTwoMQ"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addMVEVecListOperands"; + let DiagnosticString = "operand must be a list of two consecutive "# + "q-registers in range [q0,q7]"; +} + +def VecList2Q : RegisterOperand { + let ParserMatchClass = VecList2QAsmOperand; + let PrintMethod = "printMVEVectorList<2>"; +} + +def VecList4QAsmOperand : AsmOperandClass { + let Name = "VecListFourMQ"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addMVEVecListOperands"; + let DiagnosticString = "operand must be a list of four consecutive "# + "q-registers in range [q0,q7]"; +} + +def VecList4Q : RegisterOperand { + let ParserMatchClass = VecList4QAsmOperand; + let PrintMethod = 
"printMVEVectorList<4>"; +} + +// taddrmode_imm7 := reg[r0-r7] +/- (imm7 << shift) +class TMemImm7ShiftOffsetAsmOperand : AsmOperandClass { + let Name = "TMemImm7Shift"#shift#"Offset"; + let PredicateMethod = "isMemImm7ShiftedOffset<"#shift#",ARM::tGPRRegClassID>"; + let RenderMethod = "addMemImmOffsetOperands"; +} + +class taddrmode_imm7 : MemOperand { + let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand; + // They are printed the same way as the T2 imm8 version + let PrintMethod = "printT2AddrModeImm8Operand"; + // This can also be the same as the T2 version. + let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">"; + let DecoderMethod = "DecodeTAddrModeImm7<"#shift#">"; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); +} + +// t2addrmode_imm7 := reg +/- (imm7) +class MemImm7ShiftOffsetAsmOperand : AsmOperandClass { + let Name = "MemImm7Shift"#shift#"Offset"; + let PredicateMethod = "isMemImm7ShiftedOffset<" # shift # + ",ARM::GPRnopcRegClassID>"; + let RenderMethod = "addMemImmOffsetOperands"; +} + +def MemImm7Shift0OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<0>; +def MemImm7Shift1OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<1>; +def MemImm7Shift2OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<2>; +class T2AddrMode_Imm7 : MemOperand, + ComplexPattern", []> { + let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">"; + let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 0>"; + let ParserMatchClass = + !cast("MemImm7Shift"#shift#"OffsetAsmOperand"); + let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm); +} + +class t2addrmode_imm7 : T2AddrMode_Imm7 { + // They are printed the same way as the imm8 version + let PrintMethod = "printT2AddrModeImm8Operand"; +} + +class MemImm7ShiftOffsetWBAsmOperand : AsmOperandClass { + let Name = "MemImm7Shift"#shift#"OffsetWB"; + let PredicateMethod = "isMemImm7ShiftedOffset<" # shift # + ",ARM::rGPRRegClassID>"; + let RenderMethod = "addMemImmOffsetOperands"; +} + +def MemImm7Shift0OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<0>; +def MemImm7Shift1OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<1>; +def MemImm7Shift2OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<2>; + +class t2addrmode_imm7_pre : T2AddrMode_Imm7 { + // They are printed the same way as the imm8 version + let PrintMethod = "printT2AddrModeImm8Operand"; + let ParserMatchClass = + !cast("MemImm7Shift"#shift#"OffsetWBAsmOperand"); + let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 1>"; + let MIOperandInfo = (ops rGPR:$base, i32imm:$offsim); +} + +class t2am_imm7shiftOffsetAsmOperand + : AsmOperandClass { let Name = "Imm7Shift"#shift; } +def t2am_imm7shift0OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<0>; +def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>; +def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>; + +class t2am_imm7_offset : MemOperand { + // They are printed the same way as the imm8 version + let PrintMethod = "printT2AddrModeImm8OffsetOperand"; + let ParserMatchClass = + !cast("t2am_imm7shift"#shift#"OffsetAsmOperand"); + let EncoderMethod = "getT2ScaledImmOpValue<7,"#shift#">"; + let DecoderMethod = "DecodeT2Imm7<"#shift#">"; +} + +// Operands for gather/scatter loads of the form [Rbase, Qoffsets] +class MemRegRQOffsetAsmOperand : AsmOperandClass { + let Name = "MemRegRQS"#shift#"Offset"; + let PredicateMethod = "isMemRegRQOffset<"#shift#">"; + let RenderMethod = "addMemRegRQOffsetOperands"; +} + +def MemRegRQS0OffsetAsmOperand : MemRegRQOffsetAsmOperand<0>; +def 
MemRegRQS1OffsetAsmOperand : MemRegRQOffsetAsmOperand<1>; +def MemRegRQS2OffsetAsmOperand : MemRegRQOffsetAsmOperand<2>; +def MemRegRQS3OffsetAsmOperand : MemRegRQOffsetAsmOperand<3>; + +// mve_addr_rq_shift := reg + vreg{ << UXTW #shift} +class mve_addr_rq_shift : MemOperand { + let EncoderMethod = "getMveAddrModeRQOpValue"; + let PrintMethod = "printMveAddrModeRQOperand<"#shift#">"; + let ParserMatchClass = + !cast("MemRegRQS"#shift#"OffsetAsmOperand"); + let DecoderMethod = "DecodeMveAddrModeRQ"; + let MIOperandInfo = (ops GPRnopc:$base, MQPR:$offsreg); +} + +class MemRegQOffsetAsmOperand : AsmOperandClass { + let Name = "MemRegQS"#shift#"Offset"; + let PredicateMethod = "isMemRegQOffset<"#shift#">"; + let RenderMethod = "addMemImmOffsetOperands"; +} + +def MemRegQS2OffsetAsmOperand : MemRegQOffsetAsmOperand<2>; +def MemRegQS3OffsetAsmOperand : MemRegQOffsetAsmOperand<3>; + +// mve_addr_q_shift := vreg {+ #imm7s2/4} +class mve_addr_q_shift : MemOperand { + let EncoderMethod = "getMveAddrModeQOpValue<"#shift#">"; + // Can be printed same way as other reg + imm operands + let PrintMethod = "printT2AddrModeImm8Operand"; + let ParserMatchClass = + !cast("MemRegQS"#shift#"OffsetAsmOperand"); + let DecoderMethod = "DecodeMveAddrModeQ<"#shift#">"; + let MIOperandInfo = (ops MQPR:$base, i32imm:$imm); +} + +// --------- Start of base classes for the instructions themselves + +class MVE_MI pattern> + : Thumb2XI, + Requires<[HasMVEInt]> { + let D = MVEDomain; + let DecoderNamespace = "MVE"; +} + +// MVE_p is used for most predicated instructions, to add the cluster +// of input operands that provides the VPT suffix (none, T or E) and +// the input predicate register. +class MVE_p pattern=[]> + : MVE_MI { + let Inst{31-29} = 0b111; + let Inst{27-26} = 0b11; +} + +class MVE_f pattern=[]> + : MVE_p { + let Predicates = [HasMVEFloat]; +} + +class MVE_MI_with_pred pattern> + : Thumb2I, + Requires<[HasV8_1MMainline, HasMVEInt]> { + let D = MVEDomain; + let DecoderNamespace = "MVE"; +} + +class MVE_VMOV_lane_base pattern> + : Thumb2I, + Requires<[HasV8_1MMainline, HasMVEInt]> { + let D = MVEDomain; + let DecoderNamespace = "MVE"; +} + +class MVE_ScalarShift pattern=[]> + : MVE_MI_with_pred { + let Inst{31-20} = 0b111010100101; + let Inst{8} = 0b1; + +} + +class MVE_ScalarShiftSingleReg pattern=[]> + : MVE_ScalarShift { + bits<4> RdaDest; + + let Inst{19-16} = RdaDest{3-0}; +} + +class MVE_ScalarShiftSRegImm op5_4, list pattern=[]> + : MVE_ScalarShiftSingleReg { + bits<5> imm; + + let Inst{15} = 0b0; + let Inst{14-12} = imm{4-2}; + let Inst{11-8} = 0b1111; + let Inst{7-6} = imm{1-0}; + let Inst{5-4} = op5_4{1-0}; + let Inst{3-0} = 0b1111; +} + +def MVE_SQSHL : MVE_ScalarShiftSRegImm<"sqshl", 0b11>; +def MVE_SRSHR : MVE_ScalarShiftSRegImm<"srshr", 0b10>; +def MVE_UQSHL : MVE_ScalarShiftSRegImm<"uqshl", 0b00>; +def MVE_URSHR : MVE_ScalarShiftSRegImm<"urshr", 0b01>; + +class MVE_ScalarShiftSRegReg op5_4, list pattern=[]> + : MVE_ScalarShiftSingleReg { + bits<4> Rm; + + let Inst{15-12} = Rm{3-0}; + let Inst{11-8} = 0b1111; + let Inst{7-6} = 0b00; + let Inst{5-4} = op5_4{1-0}; + let Inst{3-0} = 0b1101; +} + +def MVE_SQRSHR : MVE_ScalarShiftSRegReg<"sqrshr", 0b10>; +def MVE_UQRSHL : MVE_ScalarShiftSRegReg<"uqrshl", 0b00>; + +class MVE_ScalarShiftDoubleReg pattern=[]> + : MVE_ScalarShift { + bits<4> RdaLo; + bits<4> RdaHi; + + let Inst{19-17} = RdaLo{3-1}; + let Inst{11-9} = RdaHi{3-1}; +} + +class MVE_ScalarShiftDRegImm op5_4, bit op16, + list pattern=[]> + : MVE_ScalarShiftDoubleReg< + iname, (ins 
tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, long_shift:$imm), + "$RdaLo, $RdaHi, $imm", "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", + pattern> { + bits<5> imm; + + let Inst{16} = op16; + let Inst{15} = 0b0; + let Inst{14-12} = imm{4-2}; + let Inst{7-6} = imm{1-0}; + let Inst{5-4} = op5_4{1-0}; + let Inst{3-0} = 0b1111; +} + +class MVE_ScalarShiftDRegReg pattern=[]> + : MVE_ScalarShiftDoubleReg< + iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm), + "$RdaLo, $RdaHi, $Rm", "@earlyclobber $RdaHi,@earlyclobber $RdaLo," + "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", + pattern> { + bits<4> Rm; + + let Inst{16} = op16; + let Inst{15-12} = Rm{3-0}; + let Inst{7-6} = 0b00; + let Inst{5} = op5; + let Inst{4} = 0b0; + let Inst{3-0} = 0b1101; + + // Custom decoder method because of the following overlapping encodings: + // ASRL and SQRSHR + // LSLL and UQRSHL + // SQRSHRL and SQRSHR + // UQRSHLL and UQRSHL + let DecoderMethod = "DecodeMVEOverlappingLongShift"; +} + +def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, + (ARMasrl tGPREven:$RdaLo_src, + tGPROdd:$RdaHi_src, rGPR:$Rm))]>; +def MVE_ASRLi : MVE_ScalarShiftDRegImm<"asrl", 0b10, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, + (ARMasrl tGPREven:$RdaLo_src, + tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; +def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, + (ARMlsll tGPREven:$RdaLo_src, + tGPROdd:$RdaHi_src, rGPR:$Rm))]>; +def MVE_LSLLi : MVE_ScalarShiftDRegImm<"lsll", 0b00, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, + (ARMlsll tGPREven:$RdaLo_src, + tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; +def MVE_LSRL : MVE_ScalarShiftDRegImm<"lsrl", 0b01, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, + (ARMlsrl tGPREven:$RdaLo_src, + tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; + +def MVE_SQRSHRL : MVE_ScalarShiftDRegReg<"sqrshrl", 0b1, 0b1>; +def MVE_SQSHLL : MVE_ScalarShiftDRegImm<"sqshll", 0b11, 0b1>; +def MVE_SRSHRL : MVE_ScalarShiftDRegImm<"srshrl", 0b10, 0b1>; + +def MVE_UQRSHLL : MVE_ScalarShiftDRegReg<"uqrshll", 0b0, 0b1>; +def MVE_UQSHLL : MVE_ScalarShiftDRegImm<"uqshll", 0b00, 0b1>; +def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>; + +// start of mve_rDest instructions + +class MVE_rDest pattern=[]> +// Always use vpred_n and not vpred_r: with the output register being +// a GPR and not a vector register, there can't be any question of +// what to put in its inactive lanes. 
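+// (For instance, a predicated VADDV just leaves the masked-out lanes
+// out of its reduction; the scalar destination is still written, so
+// there is no vector lane whose previous contents need preserving.)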
+ : MVE_p { + + let Inst{25-23} = 0b101; + let Inst{11-9} = 0b111; + let Inst{4} = 0b0; +} + +class MVE_VABAV size, list pattern=[]> + : MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm), + NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src", + pattern> { + bits<4> Qm; + bits<4> Qn; + bits<4> Rda; + + let Inst{28} = U; + let Inst{22} = 0b0; + let Inst{21-20} = size{1-0}; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b0; + let Inst{15-12} = Rda{3-0}; + let Inst{8} = 0b1; + let Inst{7} = Qn{3}; + let Inst{6} = 0b0; + let Inst{5} = Qm{3}; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = 0b1; +} + +def MVE_VABAVs8 : MVE_VABAV<"s8", 0b0, 0b00>; +def MVE_VABAVs16 : MVE_VABAV<"s16", 0b0, 0b01>; +def MVE_VABAVs32 : MVE_VABAV<"s32", 0b0, 0b10>; +def MVE_VABAVu8 : MVE_VABAV<"u8", 0b1, 0b00>; +def MVE_VABAVu16 : MVE_VABAV<"u16", 0b1, 0b01>; +def MVE_VABAVu32 : MVE_VABAV<"u32", 0b1, 0b10>; + +class MVE_VADDV size, list pattern=[]> + : MVE_rDest<(outs tGPREven:$Rda), iops, NoItinerary, + iname, suffix, "$Rda, $Qm", cstr, pattern> { + bits<3> Qm; + bits<4> Rda; + + let Inst{28} = U; + let Inst{22-20} = 0b111; + let Inst{19-18} = size{1-0}; + let Inst{17-16} = 0b01; + let Inst{15-13} = Rda{3-1}; + let Inst{12} = 0b0; + let Inst{8-6} = 0b100; + let Inst{5} = A; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = 0b0; +} + +multiclass MVE_VADDV_A size, + list pattern=[]> { + def acc : MVE_VADDV<"vaddva", suffix, + (ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src", + 0b1, U, size, pattern>; + def no_acc : MVE_VADDV<"vaddv", suffix, + (ins MQPR:$Qm), "", + 0b0, U, size, pattern>; +} + +defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>; +defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>; +defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>; +defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>; +defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>; +defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>; + +class MVE_VADDLV pattern=[]> + : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname, + suffix, "$RdaLo, $RdaHi, $Qm", cstr, pattern> { + bits<3> Qm; + bits<4> RdaLo; + bits<4> RdaHi; + + let Inst{28} = U; + let Inst{22-20} = RdaHi{3-1}; + let Inst{19-18} = 0b10; + let Inst{17-16} = 0b01; + let Inst{15-13} = RdaLo{3-1}; + let Inst{12} = 0b0; + let Inst{8-6} = 0b100; + let Inst{5} = A; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = 0b0; +} + +multiclass MVE_VADDLV_A pattern=[]> { + def acc : MVE_VADDLV<"vaddlva", suffix, + (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm), + "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", + 0b1, U, pattern>; + def no_acc : MVE_VADDLV<"vaddlv", suffix, + (ins MQPR:$Qm), "", + 0b0, U, pattern>; +} + + +defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>; +defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>; + +class MVE_VMINMAXNMV pattern=[]> + : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), + NoItinerary, iname, suffix, "$RdaSrc, $Qm", + "$RdaDest = $RdaSrc", pattern> { + bits<3> Qm; + bits<4> RdaDest; + + let Inst{28} = sz; + let Inst{22-20} = 0b110; + let Inst{19-18} = 0b11; + let Inst{17} = bit_17; + let Inst{16} = 0b0; + let Inst{15-12} = RdaDest{3-0}; + let Inst{8} = 0b1; + let Inst{7} = bit_7; + let Inst{6-5} = 0b00; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = 0b0; + + let Predicates = [HasMVEFloat]; +} + +multiclass MVE_VMINMAXNMV_fty pattern=[]> { + def f32 : MVE_VMINMAXNMV; + def f16 : MVE_VMINMAXNMV; +} + +defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>; +defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>; + +multiclass 
MVE_VMINMAXNMAV_fty pattern=[]> { + def f32 : MVE_VMINMAXNMV; + def f16 : MVE_VMINMAXNMV; +} + +defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>; +defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>; + +class MVE_VMINMAXV size, + bit bit_17, bit bit_7, list pattern=[]> + : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), NoItinerary, + iname, suffix, "$RdaSrc, $Qm", "$RdaDest = $RdaSrc", pattern> { + bits<3> Qm; + bits<4> RdaDest; + + let Inst{28} = U; + let Inst{22-20} = 0b110; + let Inst{19-18} = size{1-0}; + let Inst{17} = bit_17; + let Inst{16} = 0b0; + let Inst{15-12} = RdaDest{3-0}; + let Inst{8} = 0b1; + let Inst{7} = bit_7; + let Inst{6-5} = 0b00; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = 0b0; +} + +multiclass MVE_VMINMAXV_ty pattern=[]> { + def s8 : MVE_VMINMAXV; + def s16 : MVE_VMINMAXV; + def s32 : MVE_VMINMAXV; + def u8 : MVE_VMINMAXV; + def u16 : MVE_VMINMAXV; + def u32 : MVE_VMINMAXV; +} + +defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>; +defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>; + +multiclass MVE_VMINMAXAV_ty pattern=[]> { + def s8 : MVE_VMINMAXV; + def s16 : MVE_VMINMAXV; + def s32 : MVE_VMINMAXV; +} + +defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>; +defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>; + +class MVE_VMLAMLSDAV pattern=[]> + : MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix, + "$RdaDest, $Qn, $Qm", cstr, pattern> { + bits<4> RdaDest; + bits<3> Qm; + bits<3> Qn; + + let Inst{28} = bit_28; + let Inst{22-20} = 0b111; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = sz; + let Inst{15-13} = RdaDest{3-1}; + let Inst{12} = X; + let Inst{8} = bit_8; + let Inst{7-6} = 0b00; + let Inst{5} = A; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = bit_0; +} + +multiclass MVE_VMLAMLSDAV_X pattern=[]> { + def _noexch : MVE_VMLAMLSDAV; + def _exch : MVE_VMLAMLSDAV; +} + +multiclass MVE_VMLAMLSDAV_XA pattern=[]> { + defm _noacc : MVE_VMLAMLSDAV_X; + defm _acc : MVE_VMLAMLSDAV_X; +} + +multiclass MVE_VMLADAV_multi pattern=[]> { + defm "" : MVE_VMLAMLSDAV_XA<"vmladav", suffix, sz, U, bit_8, 0b0, pattern>; +} + +defm MVE_VMLADAVs16 : MVE_VMLADAV_multi<"s16", 0b0, 0b0, 0b0>; +defm MVE_VMLADAVs32 : MVE_VMLADAV_multi<"s32", 0b1, 0b0, 0b0>; +defm MVE_VMLADAVu16 : MVE_VMLADAV_multi<"u16", 0b0, 0b1, 0b0>; +defm MVE_VMLADAVu32 : MVE_VMLADAV_multi<"u32", 0b1, 0b1, 0b0>; + +defm MVE_VMLADAVs8 : MVE_VMLADAV_multi<"s8", 0b0, 0b0, 0b1>; +defm MVE_VMLADAVu8 : MVE_VMLADAV_multi<"u8", 0b0, 0b1, 0b1>; + +// vmlav aliases vmladav +foreach acc = ["_acc", "_noacc"] in { + foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in { + def : MVEInstAlias("MVE_VMLADAV"#suffix#acc#"_noexch") + tGPREven:$RdaDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; + } +} + +multiclass MVE_VMLSDAV_multi pattern=[]> { + defm "" : MVE_VMLAMLSDAV_XA<"vmlsdav", suffix, sz, bit_28, 0b0, 0b1, pattern>; +} + +defm MVE_VMLSDAVs8 : MVE_VMLSDAV_multi<"s8", 0, 0b1>; +defm MVE_VMLSDAVs16 : MVE_VMLSDAV_multi<"s16", 0, 0b0>; +defm MVE_VMLSDAVs32 : MVE_VMLSDAV_multi<"s32", 1, 0b0>; + +// Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH +class MVE_VMLALDAVBase pattern=[]> + : MVE_rDest<(outs tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest), iops, NoItinerary, + iname, suffix, "$RdaLoDest, $RdaHiDest, $Qn, $Qm", cstr, pattern> { + bits<4> RdaLoDest; + bits<4> RdaHiDest; + bits<3> Qm; + bits<3> Qn; + + let Inst{28} = bit_28; + let Inst{22-20} = RdaHiDest{3-1}; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = sz; + let Inst{15-13} = RdaLoDest{3-1}; + let Inst{12} = X; + let Inst{8} = bit_8; + 
let Inst{7-6} = 0b00; + let Inst{5} = A; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = bit_0; +} + +multiclass MVE_VMLALDAVBase_X pattern=[]> { + def _noexch : MVE_VMLALDAVBase; + def _exch : MVE_VMLALDAVBase; +} + +multiclass MVE_VMLALDAVBase_XA pattern=[]> { + defm _noacc : MVE_VMLALDAVBase_X< + iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", + sz, bit_28, 0b0, bit_8, bit_0, pattern>; + defm _acc : MVE_VMLALDAVBase_X< + iname # "a", suffix, (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, + MQPR:$Qn, MQPR:$Qm), + "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc", + sz, bit_28, 0b1, bit_8, bit_0, pattern>; +} + +multiclass MVE_VRMLALDAVH_multi pattern=[]> { + defm "" : MVE_VMLALDAVBase_XA< + "vrmlaldavh", suffix, 0b0, U, 0b1, 0b0, pattern>; +} + +defm MVE_VRMLALDAVHs32 : MVE_VRMLALDAVH_multi<"s32", 0>; +defm MVE_VRMLALDAVHu32 : MVE_VRMLALDAVH_multi<"u32", 1>; + +// vrmlalvh aliases for vrmlaldavh +def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", + (MVE_VRMLALDAVHs32_noacc_noexch + tGPREven:$RdaLo, tGPROdd:$RdaHi, + MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; +def : MVEInstAlias<"vrmlalvha${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", + (MVE_VRMLALDAVHs32_acc_noexch + tGPREven:$RdaLo, tGPROdd:$RdaHi, + MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; +def : MVEInstAlias<"vrmlalvh${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", + (MVE_VRMLALDAVHu32_noacc_noexch + tGPREven:$RdaLo, tGPROdd:$RdaHi, + MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; +def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", + (MVE_VRMLALDAVHu32_acc_noexch + tGPREven:$RdaLo, tGPROdd:$RdaHi, + MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; + +multiclass MVE_VMLALDAV_multi pattern=[]> { + defm "" : MVE_VMLALDAVBase_XA<"vmlaldav", suffix, sz, U, 0b0, 0b0, pattern>; +} + +defm MVE_VMLALDAVs16 : MVE_VMLALDAV_multi<"s16", 0b0, 0b0>; +defm MVE_VMLALDAVs32 : MVE_VMLALDAV_multi<"s32", 0b1, 0b0>; +defm MVE_VMLALDAVu16 : MVE_VMLALDAV_multi<"u16", 0b0, 0b1>; +defm MVE_VMLALDAVu32 : MVE_VMLALDAV_multi<"u32", 0b1, 0b1>; + +// vmlalv aliases vmlaldav +foreach acc = ["_acc", "_noacc"] in { + foreach suffix = ["s16", "s32", "u16", "u32"] in { + def : MVEInstAlias("MVE_VMLALDAV"#suffix#acc#"_noexch") + tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest, + MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; + } +} + +multiclass MVE_VMLSLDAV_multi pattern=[]> { + defm "" : MVE_VMLALDAVBase_XA; +} + +defm MVE_VMLSLDAVs16 : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>; +defm MVE_VMLSLDAVs32 : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>; +defm MVE_VRMLSLDAVHs32 : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>; + +// end of mve_rDest instructions + +// start of mve_comp instructions + +class MVE_comp pattern=[]> + : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), itin, iname, suffix, + "$Qd, $Qn, $Qm", vpred_r, cstr, pattern> { + bits<4> Qd; + bits<4> Qn; + bits<4> Qm; + + let Inst{22} = Qd{3}; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b0; + let Inst{15-13} = Qd{2-0}; + let Inst{12} = 0b0; + let Inst{10-9} = 0b11; + let Inst{7} = Qn{3}; + let Inst{5} = Qm{3}; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = 0b0; +} + +class MVE_VMINMAXNM pattern=[]> + : MVE_comp { + + let Inst{28} = 0b1; + let Inst{25-24} = 0b11; + let Inst{23} = 0b0; + let Inst{21} = bit_21; + let Inst{20} = sz; + let Inst{11} = 0b1; + let Inst{8} = 0b1; + let Inst{6} = 0b1; + let Inst{4} = 0b1; + + let Predicates = [HasMVEFloat]; +} + +def MVE_VMAXNMf32 : MVE_VMINMAXNM<"vmaxnm", "f32", 0b0, 0b0>; +def MVE_VMAXNMf16 : MVE_VMINMAXNM<"vmaxnm", "f16", 0b1, 0b0>; + +let Predicates = [HasMVEFloat] in { + def : Pat<(v4f32 (fmaxnum (v4f32 
MQPR:$val1), (v4f32 MQPR:$val2))), + (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; + def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), + (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; +} + +def MVE_VMINNMf32 : MVE_VMINMAXNM<"vminnm", "f32", 0b0, 0b1>; +def MVE_VMINNMf16 : MVE_VMINMAXNM<"vminnm", "f16", 0b1, 0b1>; + +let Predicates = [HasMVEFloat] in { + def : Pat<(v4f32 (fminnum (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), + (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; + def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), + (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; +} + + +class MVE_VMINMAX size, + bit bit_4, list pattern=[]> + : MVE_comp { + + let Inst{28} = U; + let Inst{25-24} = 0b11; + let Inst{23} = 0b0; + let Inst{21-20} = size{1-0}; + let Inst{11} = 0b0; + let Inst{8} = 0b0; + let Inst{6} = 0b1; + let Inst{4} = bit_4; +} + +multiclass MVE_VMINMAX_all_sizes { + def s8 : MVE_VMINMAX; + def s16 : MVE_VMINMAX; + def s32 : MVE_VMINMAX; + def u8 : MVE_VMINMAX; + def u16 : MVE_VMINMAX; + def u32 : MVE_VMINMAX; +} + +defm MVE_VMAX : MVE_VMINMAX_all_sizes<"vmax", 0b0>; +defm MVE_VMIN : MVE_VMINMAX_all_sizes<"vmin", 0b1>; + +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (smin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (v16i8 (MVE_VMINs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(v8i16 (smin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (v8i16 (MVE_VMINs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(v4i32 (smin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), + (v4i32 (MVE_VMINs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + + def : Pat<(v16i8 (smax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (v16i8 (MVE_VMAXs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(v8i16 (smax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (v8i16 (MVE_VMAXs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(v4i32 (smax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), + (v4i32 (MVE_VMAXs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + + def : Pat<(v16i8 (umin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (v16i8 (MVE_VMINu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(v8i16 (umin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (v8i16 (MVE_VMINu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(v4i32 (umin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), + (v4i32 (MVE_VMINu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + + def : Pat<(v16i8 (umax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (v16i8 (MVE_VMAXu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(v8i16 (umax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (v8i16 (MVE_VMAXu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(v4i32 (umax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), + (v4i32 (MVE_VMAXu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; +} + +// end of mve_comp instructions + +// start of mve_bit instructions + +class MVE_bit_arith pattern=[]> + : MVE_p { + bits<4> Qd; + bits<4> Qm; + + let Inst{22} = Qd{3}; + let Inst{15-13} = Qd{2-0}; + let Inst{5} = Qm{3}; + let Inst{3-1} = Qm{2-0}; +} + +def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), + "vbic", "", "$Qd, $Qn, $Qm", ""> { + bits<4> Qn; + + let Inst{28} = 0b0; + let Inst{25-23} = 0b110; + let Inst{21-20} = 0b01; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b0; + let Inst{12-8} = 0b00001; + let Inst{7} = Qn{3}; + let Inst{6} = 0b1; + let Inst{4} = 0b1; + let Inst{0} = 0b0; +} + +class MVE_VREV size, bits<2> bit_8_7> + : 
MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, + suffix, "$Qd, $Qm", ""> { + + let Inst{28} = 0b1; + let Inst{25-23} = 0b111; + let Inst{21-20} = 0b11; + let Inst{19-18} = size; + let Inst{17-16} = 0b00; + let Inst{12-9} = 0b0000; + let Inst{8-7} = bit_8_7; + let Inst{6} = 0b1; + let Inst{4} = 0b0; + let Inst{0} = 0b0; +} + +def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00>; +def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00>; +def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00>; + +def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>; +def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>; + +def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>; + +let Predicates = [HasMVEInt] in { + def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))), + (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>; + def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))), + (v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>; + def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))), + (v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>; + + def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))), + (v8i16 (MVE_VREV32_16 (v8i16 MQPR:$src)))>; + def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))), + (v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>; + + def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))), + (v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>; + + def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))), + (v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>; + def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))), + (v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>; + def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))), + (v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>; +} + +def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), + "vmvn", "", "$Qd, $Qm", ""> { + let Inst{28} = 0b1; + let Inst{25-23} = 0b111; + let Inst{21-16} = 0b110000; + let Inst{12-6} = 0b0010111; + let Inst{4} = 0b0; + let Inst{0} = 0b0; +} + +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (vnotq (v16i8 MQPR:$val1))), + (v16i8 (MVE_VMVN (v16i8 MQPR:$val1)))>; + def : Pat<(v8i16 (vnotq (v8i16 MQPR:$val1))), + (v8i16 (MVE_VMVN (v8i16 MQPR:$val1)))>; + def : Pat<(v4i32 (vnotq (v4i32 MQPR:$val1))), + (v4i32 (MVE_VMVN (v4i32 MQPR:$val1)))>; + def : Pat<(v2i64 (vnotq (v2i64 MQPR:$val1))), + (v2i64 (MVE_VMVN (v2i64 MQPR:$val1)))>; +} + +class MVE_bit_ops bit_21_20, bit bit_28> + : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), + iname, "", "$Qd, $Qn, $Qm", ""> { + bits<4> Qn; + + let Inst{28} = bit_28; + let Inst{25-23} = 0b110; + let Inst{21-20} = bit_21_20; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b0; + let Inst{12-8} = 0b00001; + let Inst{7} = Qn{3}; + let Inst{6} = 0b1; + let Inst{4} = 0b1; + let Inst{0} = 0b0; +} + +def MVE_VEOR : MVE_bit_ops<"veor", 0b00, 0b1>; +def MVE_VORN : MVE_bit_ops<"vorn", 0b11, 0b0>; +def MVE_VORR : MVE_bit_ops<"vorr", 0b10, 0b0>; +def MVE_VAND : MVE_bit_ops<"vand", 0b00, 0b0>; + +// add ignored suffixes as aliases + +foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f32"] in { + def : MVEInstAlias<"vbic${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc", + (MVE_VBIC MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; + def : MVEInstAlias<"veor${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc", + (MVE_VEOR MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; + def : MVEInstAlias<"vorn${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc", + (MVE_VORN MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; + def : MVEInstAlias<"vorr${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc", + (MVE_VORR MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; + def : MVEInstAlias<"vand${vp}." 
# s # "\t$QdSrc, $QnSrc, $QmSrc", + (MVE_VAND MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; +} + +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (v16i8 (MVE_VAND (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (v8i16 (MVE_VAND (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), + (v4i32 (MVE_VAND (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), + (v2i64 (MVE_VAND (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; + + def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (v16i8 (MVE_VORR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (v8i16 (MVE_VORR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), + (v4i32 (MVE_VORR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), + (v2i64 (MVE_VORR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; + + def : Pat<(v16i8 (xor (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (v16i8 (MVE_VEOR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(v8i16 (xor (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (v8i16 (MVE_VEOR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(v4i32 (xor (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), + (v4i32 (MVE_VEOR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + def : Pat<(v2i64 (xor (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), + (v2i64 (MVE_VEOR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; + + def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (vnotq MQPR:$val2))), + (v16i8 (MVE_VBIC (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (vnotq MQPR:$val2))), + (v8i16 (MVE_VBIC (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (vnotq MQPR:$val2))), + (v4i32 (MVE_VBIC (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (vnotq MQPR:$val2))), + (v2i64 (MVE_VBIC (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; + + def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (vnotq MQPR:$val2))), + (v16i8 (MVE_VORN (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (vnotq MQPR:$val2))), + (v8i16 (MVE_VORN (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (vnotq MQPR:$val2))), + (v4i32 (MVE_VORN (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (vnotq MQPR:$val2))), + (v2i64 (MVE_VORN (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; +} + +class MVE_bit_cmode cmode, dag inOps> + : MVE_p<(outs MQPR:$Qd), inOps, NoItinerary, + iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> { + bits<8> imm; + bits<4> Qd; + + let Inst{28} = imm{7}; + let Inst{27-23} = 0b11111; + let Inst{22} = Qd{3}; + let Inst{21-19} = 0b000; + let Inst{18-16} = imm{6-4}; + let Inst{15-13} = Qd{2-0}; + let Inst{12} = 0b0; + let Inst{11-8} = cmode; + let Inst{7-6} = 0b01; + let Inst{4} = 0b1; + let Inst{3-0} = imm{3-0}; +} + +class MVE_VORR cmode, ExpandImm imm_type> + : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { + let Inst{5} = 0b0; +} + +def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>; +def MVE_VORRIZ0v8i16 : MVE_VORR<"i16", 0b1001, expzero00>; +def MVE_VORRIZ8v4i32 : MVE_VORR<"i32", 0b0011, expzero08>; +def MVE_VORRIZ8v8i16 : MVE_VORR<"i16", 0b1011, 
expzero08>; +def MVE_VORRIZ16v4i32 : MVE_VORR<"i32", 0b0101, expzero16>; +def MVE_VORRIZ24v4i32 : MVE_VORR<"i32", 0b0111, expzero24>; + +def MVE_VORNIZ0v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", + (ins MQPR:$Qd_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; +def MVE_VORNIZ0v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm", + (ins MQPR:$Qd_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; +def MVE_VORNIZ8v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", + (ins MQPR:$Qd_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; +def MVE_VORNIZ8v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm", + (ins MQPR:$Qd_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; +def MVE_VORNIZ16v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", + (ins MQPR:$Qd_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; +def MVE_VORNIZ24v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", + (ins MQPR:$Qd_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; + +def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm", + (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>; + +class MVE_VBIC cmode, ExpandImm imm_type> + : MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { + let Inst{5} = 0b1; +} + +def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>; +def MVE_VBICIZ0v8i16 : MVE_VBIC<"i16", 0b1001, expzero00>; +def MVE_VBICIZ8v4i32 : MVE_VBIC<"i32", 0b0011, expzero08>; +def MVE_VBICIZ8v8i16 : MVE_VBIC<"i16", 0b1011, expzero08>; +def MVE_VBICIZ16v4i32 : MVE_VBIC<"i32", 0b0101, expzero16>; +def MVE_VBICIZ24v4i32 : MVE_VBIC<"i32", 0b0111, expzero24>; + +def MVE_VANDIZ0v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", + (ins MQPR:$Qda_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; +def MVE_VANDIZ0v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm", + (ins MQPR:$Qda_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; +def MVE_VANDIZ8v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", + (ins MQPR:$Qda_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; +def MVE_VANDIZ8v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm", + (ins MQPR:$Qda_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; +def MVE_VANDIZ16v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", + (ins MQPR:$Qda_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; +def MVE_VANDIZ24v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", + (ins MQPR:$Qda_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; + +class MVE_VMOV_lane_direction { + bit bit_20; + dag oops; + dag iops; + string ops; + string cstr; +} +def MVE_VMOV_from_lane : MVE_VMOV_lane_direction { + let bit_20 = 0b1; + let oops = (outs rGPR:$Rt); + let iops = (ins MQPR:$Qd); + let ops = "$Rt, $Qd$Idx"; + let cstr = ""; +} +def MVE_VMOV_to_lane : MVE_VMOV_lane_direction { + let bit_20 = 0b0; + let oops = (outs MQPR:$Qd); + let iops = (ins MQPR:$Qd_src, rGPR:$Rt); + let ops = "$Qd$Idx, $Rt"; + let cstr = "$Qd = $Qd_src"; +} + +class MVE_VMOV_lane + : MVE_VMOV_lane_base { + bits<4> Qd; + bits<4> Rt; + + let Inst{31-24} = 0b11101110; + let Inst{23} = U; + let Inst{20} = dir.bit_20; + let Inst{19-17} = Qd{2-0}; + let Inst{15-12} = Rt{3-0}; + let Inst{11-8} = 0b1011; + let Inst{7} = Qd{3}; + let Inst{4-0} = 0b10000; +} + +class MVE_VMOV_lane_32 + : MVE_VMOV_lane<"32", 0b0, (ins MVEVectorIndex<4>:$Idx), dir> { + bits<2> Idx; + let Inst{22} = 0b0; + let Inst{6-5} = 0b00; + let Inst{16} = Idx{1}; + let Inst{21} = Idx{0}; + + let Predicates = [HasFPRegsV8_1M]; +} + +class MVE_VMOV_lane_16 + : MVE_VMOV_lane:$Idx), dir> { + bits<3> Idx; + 
+  let Inst{22} = 0b0;
+  let Inst{5} = 0b1;
+  let Inst{16} = Idx{2};
+  let Inst{21} = Idx{1};
+  let Inst{6} = Idx{0};
+}
+
+class MVE_VMOV_lane_8<string suffix, bit U, MVE_VMOV_lane_direction dir>
+  : MVE_VMOV_lane<suffix, U, (ins MVEVectorIndex<16>:$Idx), dir> {
+  bits<4> Idx;
+  let Inst{22} = 0b1;
+  let Inst{16} = Idx{3};
+  let Inst{21} = Idx{2};
+  let Inst{6} = Idx{1};
+  let Inst{5} = Idx{0};
+}
+
+def MVE_VMOV_from_lane_32  : MVE_VMOV_lane_32< MVE_VMOV_from_lane>;
+def MVE_VMOV_to_lane_32    : MVE_VMOV_lane_32< MVE_VMOV_to_lane>;
+def MVE_VMOV_from_lane_s16 : MVE_VMOV_lane_16<"s16", 0b0, MVE_VMOV_from_lane>;
+def MVE_VMOV_from_lane_u16 : MVE_VMOV_lane_16<"u16", 0b1, MVE_VMOV_from_lane>;
+def MVE_VMOV_to_lane_16    : MVE_VMOV_lane_16< "16", 0b0, MVE_VMOV_to_lane>;
+def MVE_VMOV_from_lane_s8  : MVE_VMOV_lane_8 < "s8", 0b0, MVE_VMOV_from_lane>;
+def MVE_VMOV_from_lane_u8  : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>;
+def MVE_VMOV_to_lane_8     : MVE_VMOV_lane_8 <  "8", 0b0, MVE_VMOV_to_lane>;
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(extractelt (v2f64 MQPR:$src), imm:$lane),
+            (f64 (EXTRACT_SUBREG MQPR:$src, (DSubReg_f64_reg imm:$lane)))>;
+  def : Pat<(insertelt (v2f64 MQPR:$src1), DPR:$src2, imm:$lane),
+            (INSERT_SUBREG (v2f64 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
+                           DPR:$src2, (DSubReg_f64_reg imm:$lane))>;
+
+  def : Pat<(extractelt (v4i32 MQPR:$src), imm:$lane),
+            (COPY_TO_REGCLASS
+              (i32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))),
+              rGPR)>;
+  def : Pat<(insertelt (v4i32 MQPR:$src1), rGPR:$src2, imm:$lane),
+            (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+
+  def : Pat<(vector_insert (v16i8 MQPR:$src1), rGPR:$src2, imm:$lane),
+            (MVE_VMOV_to_lane_8 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+  def : Pat<(vector_insert (v8i16 MQPR:$src1), rGPR:$src2, imm:$lane),
+            (MVE_VMOV_to_lane_16 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+
+  def : Pat<(ARMvgetlanes (v16i8 MQPR:$src), imm:$lane),
+            (MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>;
+  def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane),
+            (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
+  def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane),
+            (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>;
+  def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane),
+            (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
+
+  def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+            (MVE_VMOV_to_lane_8  (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+  def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+            (MVE_VMOV_to_lane_16 (v8i16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+  def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+            (MVE_VMOV_to_lane_32 (v4i32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+
+  // Floating point patterns, still enabled under HasMVEInt
+  def : Pat<(extractelt (v4f32 MQPR:$src), imm:$lane),
+            (COPY_TO_REGCLASS
+              (f32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))),
+              SPR)>;
+  def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane),
+            (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
+                           SPR:$src2, (SSubReg_f32_reg imm:$lane))>;
+
+  def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane),
+            (MVE_VMOV_to_lane_16 MQPR:$src1,
+                                 (COPY_TO_REGCLASS HPR:$src2, rGPR),
+                                 imm:$lane)>;
+  def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane),
+            (COPY_TO_REGCLASS
+              (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>;
+
+  def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+            (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+  def : Pat<(v4f32 (scalar_to_vector GPR:$src)),
+            (MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+  def : Pat<(v8f16 (scalar_to_vector HPR:$src)),
+            (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>;
+  def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
+            (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+}
+
+// end of mve_bit instructions
+
+// start of MVE Integer instructions
+
+class MVE_int<string iname, string suffix, bits<2> size,
+              list<dag> pattern=[]>
+  : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
+          iname, suffix, "$Qd, $Qn, $Qm", vpred_r, "", pattern> {
+  bits<4> Qd;
+  bits<4> Qn;
+  bits<4> Qm;
+
+  let Inst{22} = Qd{3};
+  let Inst{21-20} = size;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{7} = Qn{3};
+  let Inst{6} = 0b1;
+  let Inst{5} = Qm{3};
+  let Inst{3-1} = Qm{2-0};
+}
+
+class MVE_VMULt1<string suffix, bits<2> size, list<dag> pattern=[]>
+  : MVE_int<"vmul", suffix, size, pattern> {
+
+  let Inst{28} = 0b0;
+  let Inst{25-23} = 0b110;
+  let Inst{16} = 0b0;
+  let Inst{12-8} = 0b01001;
+  let Inst{4} = 0b1;
+  let Inst{0} = 0b0;
+}
+
+def MVE_VMULt1i8  : MVE_VMULt1<"i8",  0b00>;
+def MVE_VMULt1i16 : MVE_VMULt1<"i16", 0b01>;
+def MVE_VMULt1i32 : MVE_VMULt1<"i32", 0b10>;
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+            (v16i8 (MVE_VMULt1i8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+  def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+            (v8i16 (MVE_VMULt1i16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+  def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+            (v4i32 (MVE_VMULt1i32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+}
+
+class MVE_VQxDMULH<string iname, string suffix, bits<2> size, bit rounding,
+                   list<dag> pattern=[]>
+  : MVE_int<iname, suffix, size, pattern> {
+
+  let Inst{28} = rounding;
+  let Inst{25-23} = 0b110;
+  let Inst{16} = 0b0;
+  let Inst{12-8} = 0b01011;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+class MVE_VQDMULH<string suffix, bits<2> size, list<dag> pattern=[]>
+  : MVE_VQxDMULH<"vqdmulh", suffix, size, 0b0, pattern>;
+class MVE_VQRDMULH<string suffix, bits<2> size, list<dag> pattern=[]>
+  : MVE_VQxDMULH<"vqrdmulh", suffix, size, 0b1, pattern>;
+
+def MVE_VQDMULHi8  : MVE_VQDMULH<"s8",  0b00>;
+def MVE_VQDMULHi16 : MVE_VQDMULH<"s16", 0b01>;
+def MVE_VQDMULHi32 : MVE_VQDMULH<"s32", 0b10>;
+
+def MVE_VQRDMULHi8  : MVE_VQRDMULH<"s8",  0b00>;
+def MVE_VQRDMULHi16 : MVE_VQRDMULH<"s16", 0b01>;
+def MVE_VQRDMULHi32 : MVE_VQRDMULH<"s32", 0b10>;
+
+class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
+                  list<dag> pattern=[]>
+  : MVE_int<iname, suffix, size, pattern> {
+
+  let Inst{28} = subtract;
+  let Inst{25-23} = 0b110;
+  let Inst{16} = 0b0;
+  let Inst{12-8} = 0b01000;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]>
+  : MVE_VADDSUB<"vadd", suffix, size, 0b0, pattern>;
+class MVE_VSUB<string suffix, bits<2> size, list<dag> pattern=[]>
+  : MVE_VADDSUB<"vsub", suffix, size, 0b1, pattern>;
+
+def MVE_VADDi8  : MVE_VADD<"i8",  0b00>;
+def MVE_VADDi16 : MVE_VADD<"i16", 0b01>;
+def MVE_VADDi32 : MVE_VADD<"i32", 0b10>;
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+            (v16i8 (MVE_VADDi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+  def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+            (v8i16 (MVE_VADDi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+  def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+            (v4i32 (MVE_VADDi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+}
+
+def MVE_VSUBi8  : MVE_VSUB<"i8",  0b00>;
+def MVE_VSUBi16 : MVE_VSUB<"i16", 0b01>;
+def MVE_VSUBi32 : MVE_VSUB<"i32", 0b10>;
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+            (v16i8 (MVE_VSUBi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+  def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+            (v8i16 (MVE_VSUBi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+  def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+            (v4i32 (MVE_VSUBi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+}
+
+class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
+                   bits<2> size, list<dag> pattern=[]>
+  : MVE_int<iname, suffix, size, pattern> {
+
+  let Inst{28} = U;
+  let Inst{25-23} = 0b110;
+  let Inst{16} = 0b0;
+  let Inst{12-10} = 0b000;
+  let Inst{9} = subtract;
+  let Inst{8} = 0b0;
+  let Inst{4} = 0b1;
+  let Inst{0} = 0b0;
+}
+
+class MVE_VQADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+  : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, pattern>;
+class MVE_VQSUB<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+  : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, pattern>;
+
+def MVE_VQADDs8  : MVE_VQADD<"s8",  0b0, 0b00>;
+def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01>;
+def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10>;
+def MVE_VQADDu8  : MVE_VQADD<"u8",  0b1, 0b00>;
+def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01>;
+def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10>;
+
+def MVE_VQSUBs8  : MVE_VQSUB<"s8",  0b0, 0b00>;
+def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01>;
+def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10>;
+def MVE_VQSUBu8  : MVE_VQSUB<"u8",  0b1, 0b00>;
+def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01>;
+def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10>;
+
+class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+  : MVE_int<"vabd", suffix, size, pattern> {
+
+  let Inst{28} = U;
+  let Inst{25-23} = 0b110;
+  let Inst{16} = 0b0;
+  let Inst{12-8} = 0b00111;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+def MVE_VABDs8  : MVE_VABD_int<"s8",  0b0, 0b00>;
+def MVE_VABDs16 : MVE_VABD_int<"s16", 0b0, 0b01>;
+def MVE_VABDs32 : MVE_VABD_int<"s32", 0b0, 0b10>;
+def MVE_VABDu8  : MVE_VABD_int<"u8",  0b1, 0b00>;
+def MVE_VABDu16 : MVE_VABD_int<"u16", 0b1, 0b01>;
+def MVE_VABDu32 : MVE_VABD_int<"u32", 0b1, 0b10>;
+
+class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+  : MVE_int<"vrhadd", suffix, size, pattern> {
+
+  let Inst{28} = U;
+  let Inst{25-23} = 0b110;
+  let Inst{16} = 0b0;
+  let Inst{12-8} = 0b00001;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+def MVE_VRHADDs8  : MVE_VRHADD<"s8",  0b0, 0b00>;
+def MVE_VRHADDs16 : MVE_VRHADD<"s16", 0b0, 0b01>;
+def MVE_VRHADDs32 : MVE_VRHADD<"s32", 0b0, 0b10>;
+def MVE_VRHADDu8  : MVE_VRHADD<"u8",  0b1, 0b00>;
+def MVE_VRHADDu16 : MVE_VRHADD<"u16", 0b1, 0b01>;
+def MVE_VRHADDu32 : MVE_VRHADD<"u32", 0b1, 0b10>;
+
+class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
+                   bits<2> size, list<dag> pattern=[]>
+  : MVE_int<iname, suffix, size, pattern> {
+
+  let Inst{28} = U;
+  let Inst{25-23} = 0b110;
+  let Inst{16} = 0b0;
+  let Inst{12-10} = 0b000;
+  let Inst{9} = subtract;
+  let Inst{8} = 0b0;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+class MVE_VHADD<string suffix, bit U, bits<2> size,
+                list<dag> pattern=[]>
+  : MVE_VHADDSUB<"vhadd", suffix, U, 0b0, size, pattern>;
+class MVE_VHSUB<string suffix, bit U, bits<2> size,
+                list<dag> pattern=[]>
+  : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>;
+
+def MVE_VHADDs8  : MVE_VHADD<"s8",  0b0, 0b00>;
+def MVE_VHADDs16 : MVE_VHADD<"s16", 0b0, 0b01>;
+def MVE_VHADDs32 : MVE_VHADD<"s32", 0b0, 0b10>;
+def MVE_VHADDu8  : MVE_VHADD<"u8",  0b1, 0b00>;
+def MVE_VHADDu16 : MVE_VHADD<"u16", 0b1, 0b01>;
+def MVE_VHADDu32 : MVE_VHADD<"u32", 0b1, 0b10>;
+
+def MVE_VHSUBs8  : MVE_VHSUB<"s8",  0b0, 0b00>;
+def MVE_VHSUBs16 : MVE_VHSUB<"s16", 0b0, 0b01>;
+def MVE_VHSUBs32 : MVE_VHSUB<"s32", 0b0, 0b10>;
+def MVE_VHSUBu8  : MVE_VHSUB<"u8",  0b1, 0b00>;
+def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>;
+def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>;
+
+class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
+  : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
+          "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> {
+  bits<4> Qd;
+  bits<4> Rt;
+
+  let Inst{28} = 0b0;
+  let Inst{25-23} = 0b101;
+  let Inst{22} = B;
+  let Inst{21-20} = 0b10;
+  let Inst{19-17} = Qd{2-0};
+  let Inst{16} = 0b0;
+  let Inst{15-12} = Rt;
+  let Inst{11-8} = 0b1011;
+  let Inst{7} = Qd{3};
+  let Inst{6} = 0b0;
+  let Inst{5} = E;
+  let Inst{4-0} = 0b10000;
+}
+
+def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>;
+def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1>;
+def MVE_VDUP8  : MVE_VDUP<"8",  0b1, 0b0>;
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i8 (ARMvdup (i32 rGPR:$elem))),
+            (MVE_VDUP8  rGPR:$elem)>;
+  def : Pat<(v8i16 (ARMvdup (i32 rGPR:$elem))),
+            (MVE_VDUP16 rGPR:$elem)>;
+  def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))),
+            (MVE_VDUP32 rGPR:$elem)>;
+
+  def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)),
+            (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
+  // For the 16-bit and 8-bit vduplanes we don't care about the signedness
+  // of the lane move operation as we only want the lowest 8/16 bits anyway.
+  def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)),
+            (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
+  def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)),
+            (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>;
+
+  def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))),
+            (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>;
+  def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))),
+            (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>;
+
+  def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)),
+            (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
+  def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)),
+            (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
+}
+
+
+class MVEIntSingleSrc<string iname, string suffix, bits<2> size,
+                      list<dag> pattern=[]>
+  : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm), NoItinerary,
+          iname, suffix, "$Qd, $Qm", vpred_r, "", pattern> {
+  bits<4> Qd;
+  bits<4> Qm;
+
+  let Inst{22} = Qd{3};
+  let Inst{19-18} = size{1-0};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{5} = Qm{3};
+  let Inst{3-1} = Qm{2-0};
+}
+
+class MVE_VCLSCLZ<string iname, string suffix, bits<2> size,
+                  bit count_zeroes, list<dag> pattern=[]>
+  : MVEIntSingleSrc<iname, suffix, size, pattern> {
+
+  let Inst{28} = 0b1;
+  let Inst{25-23} = 0b111;
+  let Inst{21-20} = 0b11;
+  let Inst{17-16} = 0b00;
+  let Inst{12-8} = 0b00100;
+  let Inst{7} = count_zeroes;
+  let Inst{6} = 0b1;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+def MVE_VCLSs8  : MVE_VCLSCLZ<"vcls", "s8",  0b00, 0b0>;
+def MVE_VCLSs16 : MVE_VCLSCLZ<"vcls", "s16", 0b01, 0b0>;
+def MVE_VCLSs32 : MVE_VCLSCLZ<"vcls", "s32", 0b10, 0b0>;
+
+def MVE_VCLZs8  : MVE_VCLSCLZ<"vclz", "i8",  0b00, 0b1>;
+def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>;
+def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>;
+
+class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
+                      list<dag> pattern=[]>
+  : MVEIntSingleSrc<iname, suffix, size, pattern> {
+
+  let Inst{28} = 0b1;
+  let Inst{25-23} = 0b111;
+  let Inst{21-20} = 0b11;
+  let Inst{17-16} = 0b01;
+  let Inst{12-8} = 0b00011;
+  let Inst{7} = negate;
+  let Inst{6} = 0b1;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+def MVE_VABSs8  : MVE_VABSNEG_int<"vabs", "s8",  0b00, 0b0>;
+def MVE_VABSs16 : MVE_VABSNEG_int<"vabs", "s16", 0b01, 0b0>;
+def MVE_VABSs32 : MVE_VABSNEG_int<"vabs", "s32", 0b10, 0b0>;
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i8 (abs (v16i8 MQPR:$v))),
+            (v16i8 (MVE_VABSs8 $v))>;
+  def : Pat<(v8i16 (abs (v8i16 MQPR:$v))),
+            (v8i16 (MVE_VABSs16 $v))>;
+  def : Pat<(v4i32 (abs (v4i32 MQPR:$v))),
+            (v4i32 (MVE_VABSs32 $v))>;
+}
+
+def MVE_VNEGs8  : MVE_VABSNEG_int<"vneg", "s8",  0b00, 0b1>;
+def MVE_VNEGs16 : MVE_VABSNEG_int<"vneg", "s16", 0b01, 0b1>;
+def MVE_VNEGs32 : MVE_VABSNEG_int<"vneg", "s32", 0b10, 0b1>;
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i8 (vnegq (v16i8 MQPR:$v))),
+            (v16i8 (MVE_VNEGs8 $v))>;
+  def : Pat<(v8i16 (vnegq (v8i16 MQPR:$v))),
+            (v8i16 (MVE_VNEGs16 $v))>;
+  def : Pat<(v4i32 (vnegq (v4i32 MQPR:$v))),
+            (v4i32 (MVE_VNEGs32 $v))>;
+}
+
+class MVE_VQABSNEG<string iname, string suffix, bits<2> size,
+                   bit negate, list<dag> pattern=[]>
+  : MVEIntSingleSrc<iname, suffix, size, pattern> {
+
+  let Inst{28} = 0b1;
+  let Inst{25-23} = 0b111;
+  let Inst{21-20} = 0b11;
+  let Inst{17-16} = 0b00;
+  let Inst{12-8} = 0b00111;
+  let Inst{7} = negate;
+  let Inst{6} = 0b1;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+def MVE_VQABSs8  : MVE_VQABSNEG<"vqabs", "s8",  0b00, 0b0>;
+def MVE_VQABSs16 : MVE_VQABSNEG<"vqabs", "s16", 0b01, 0b0>;
+def MVE_VQABSs32 : MVE_VQABSNEG<"vqabs", "s32", 0b10, 0b0>;
+
+def MVE_VQNEGs8  : MVE_VQABSNEG<"vqneg", "s8",  0b00, 0b1>;
+def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", "s16", 0b01, 0b1>;
+def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>;
+
+class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op,
+                  dag iops, list<dag> pattern=[]>
+  : MVE_p<(outs MQPR:$Qd), iops, NoItinerary, iname, suffix, "$Qd, $imm",
+          vpred_r, "", pattern> {
+  bits<13> imm;
+  bits<4> Qd;
+
+  let Inst{28} = imm{7};
+  let Inst{25-23} = 0b111;
+  let Inst{22} = Qd{3};
+  let Inst{21-19} = 0b000;
+  let Inst{18-16} = imm{6-4};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12} = 0b0;
+  let Inst{11-8} = cmode{3-0};
+  let Inst{7-6} = 0b01;
+  let Inst{5} = op;
+  let Inst{4} = 0b1;
+  let Inst{3-0} = imm{3-0};
+
+  let DecoderMethod = "DecodeMVEModImmInstruction";
+}
+
+let isReMaterializable = 1 in {
+let isAsCheapAsAMove = 1 in {
+def MVE_VMOVimmi8  : MVE_mod_imm<"vmov", "i8",  {1,1,1,0}, 0b0, (ins nImmSplatI8:$imm)>;
+def MVE_VMOVimmi16 : MVE_mod_imm<"vmov", "i16", {1,0,?,0}, 0b0, (ins nImmSplatI16:$imm)> {
+  let Inst{9} = imm{9};
+}
+def MVE_VMOVimmi32 : MVE_mod_imm<"vmov", "i32", {?,?,?,?}, 0b0, (ins nImmVMOVI32:$imm)> {
+  let Inst{11-8} = imm{11-8};
+}
+def MVE_VMOVimmi64 : MVE_mod_imm<"vmov", "i64", {1,1,1,0}, 0b1, (ins nImmSplatI64:$imm)>;
+def MVE_VMOVimmf32 : MVE_mod_imm<"vmov", "f32", {1,1,1,1}, 0b0, (ins nImmVMOVF32:$imm)>;
+} // let isAsCheapAsAMove = 1
+
+def MVE_VMVNimmi16 : MVE_mod_imm<"vmvn", "i16", {1,0,?,0}, 0b1, (ins nImmSplatI16:$imm)> {
+  let Inst{9} = imm{9};
+}
+def MVE_VMVNimmi32 : MVE_mod_imm<"vmvn", "i32", {?,?,?,?}, 0b1, (ins nImmVMOVI32:$imm)> {
+  let Inst{11-8} = imm{11-8};
+}
+} // let isReMaterializable = 1
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i8 (ARMvmovImm timm:$simm)),
+            (v16i8 (MVE_VMOVimmi8  nImmSplatI8:$simm))>;
+  def : Pat<(v8i16 (ARMvmovImm timm:$simm)),
+            (v8i16 (MVE_VMOVimmi16 nImmSplatI16:$simm))>;
+  def : Pat<(v4i32 (ARMvmovImm timm:$simm)),
+            (v4i32 (MVE_VMOVimmi32 nImmVMOVI32:$simm))>;
+
+  def : Pat<(v8i16 (ARMvmvnImm timm:$simm)),
+            (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm))>;
+  def : Pat<(v4i32 (ARMvmvnImm timm:$simm)),
+            (v4i32 (MVE_VMVNimmi32 nImmVMOVI32:$simm))>;
+
+  def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)),
+            (v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>;
+}
+
+class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
+                   bit bit_12, list<dag> pattern=[]>
+  : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
+          NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src",
+          pattern> {
+  bits<4> Qd;
+  bits<4> Qm;
+
+  let Inst{28} = 0b0;
+  let Inst{25-23} = 0b100;
+  let Inst{22} = Qd{3};
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = size;
+  let Inst{17-16} = 0b11;
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12} = bit_12;
+  let Inst{11-6} = 0b111010;
+  let Inst{5} = Qm{3};
+  let Inst{4} = 0b0;
+  let Inst{3-1} = Qm{2-0};
+  let Inst{0} = 0b1;
+}
+
MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>; +def MVE_VMAXAs16 : MVE_VMINMAXA<"vmaxa", "s16", 0b01, 0b0>; +def MVE_VMAXAs32 : MVE_VMINMAXA<"vmaxa", "s32", 0b10, 0b0>; + +def MVE_VMINAs8 : MVE_VMINMAXA<"vmina", "s8", 0b00, 0b1>; +def MVE_VMINAs16 : MVE_VMINMAXA<"vmina", "s16", 0b01, 0b1>; +def MVE_VMINAs32 : MVE_VMINMAXA<"vmina", "s32", 0b10, 0b1>; + +// end of MVE Integer instructions + +// start of mve_imm_shift instructions + +def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd), + (ins MQPR:$QdSrc, rGPR:$RdmSrc, long_shift:$imm), + NoItinerary, "vshlc", "", "$QdSrc, $RdmSrc, $imm", + vpred_n, "$RdmDest = $RdmSrc,$Qd = $QdSrc"> { + bits<5> imm; + bits<4> Qd; + bits<4> RdmDest; + + let Inst{28} = 0b0; + let Inst{25-23} = 0b101; + let Inst{22} = Qd{3}; + let Inst{21} = 0b1; + let Inst{20-16} = imm{4-0}; + let Inst{15-13} = Qd{2-0}; + let Inst{12-4} = 0b011111100; + let Inst{3-0} = RdmDest{3-0}; +} + +class MVE_shift_imm pattern=[]> + : MVE_p { + bits<4> Qd; + bits<4> Qm; + + let Inst{22} = Qd{3}; + let Inst{15-13} = Qd{2-0}; + let Inst{5} = Qm{3}; + let Inst{3-1} = Qm{2-0}; +} + +class MVE_VMOVL sz, bit U, + list pattern=[]> + : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm), + iname, suffix, "$Qd, $Qm", vpred_r, "", + pattern> { + let Inst{28} = U; + let Inst{25-23} = 0b101; + let Inst{21} = 0b1; + let Inst{20-19} = sz{1-0}; + let Inst{18-16} = 0b000; + let Inst{11-6} = 0b111101; + let Inst{4} = 0b0; + let Inst{0} = 0b0; +} + +multiclass MVE_VMOVL_shift_half sz, bit U, + list pattern=[]> { + def bh : MVE_VMOVL { + let Inst{12} = 0b0; + } + def th : MVE_VMOVL { + let Inst{12} = 0b1; + } +} + +defm MVE_VMOVLs8 : MVE_VMOVL_shift_half<"vmovl", "s8", 0b01, 0b0>; +defm MVE_VMOVLu8 : MVE_VMOVL_shift_half<"vmovl", "u8", 0b01, 0b1>; +defm MVE_VMOVLs16 : MVE_VMOVL_shift_half<"vmovl", "s16", 0b10, 0b0>; +defm MVE_VMOVLu16 : MVE_VMOVL_shift_half<"vmovl", "u16", 0b10, 0b1>; + +let Predicates = [HasMVEInt] in { + def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i16), + (MVE_VMOVLs16bh MQPR:$src)>; + def : Pat<(sext_inreg (v8i16 MQPR:$src), v8i8), + (MVE_VMOVLs8bh MQPR:$src)>; + def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8), + (MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>; + + // zext_inreg 16 -> 32 + def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))), + (MVE_VMOVLu16bh MQPR:$src)>; + // zext_inreg 8 -> 16 + def : Pat<(and (v8i16 MQPR:$src), (v8i16 (ARMvmovImm (i32 0x8FF)))), + (MVE_VMOVLu8bh MQPR:$src)>; +} + + +class MVE_VSHLL_imm pattern=[]> + : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$Qm), immops), + iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", pattern> { + let Inst{28} = U; + let Inst{25-23} = 0b101; + let Inst{21} = 0b1; + let Inst{12} = th; + let Inst{11-6} = 0b111101; + let Inst{4} = 0b0; + let Inst{0} = 0b0; +} + +// The immediate VSHLL instructions accept shift counts from 1 up to +// the lane width (8 or 16), but the full-width shifts have an +// entirely separate encoding, given below with 'lw' in the name. 
+
+class MVE_VSHLL_imm8<string iname, string suffix,
+                     bit U, bit th, list<dag> pattern=[]>
+  : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_7:$imm), pattern> {
+  bits<3> imm;
+  let Inst{20-19} = 0b01;
+  let Inst{18-16} = imm;
+}
+
+class MVE_VSHLL_imm16<string iname, string suffix,
+                      bit U, bit th, list<dag> pattern=[]>
+  : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_15:$imm), pattern> {
+  bits<4> imm;
+  let Inst{20} = 0b1;
+  let Inst{19-16} = imm;
+}
+
+def MVE_VSHLL_imms8bh  : MVE_VSHLL_imm8 <"vshllb", "s8", 0b0, 0b0>;
+def MVE_VSHLL_imms8th  : MVE_VSHLL_imm8 <"vshllt", "s8", 0b0, 0b1>;
+def MVE_VSHLL_immu8bh  : MVE_VSHLL_imm8 <"vshllb", "u8", 0b1, 0b0>;
+def MVE_VSHLL_immu8th  : MVE_VSHLL_imm8 <"vshllt", "u8", 0b1, 0b1>;
+def MVE_VSHLL_imms16bh : MVE_VSHLL_imm16<"vshllb", "s16", 0b0, 0b0>;
+def MVE_VSHLL_imms16th : MVE_VSHLL_imm16<"vshllt", "s16", 0b0, 0b1>;
+def MVE_VSHLL_immu16bh : MVE_VSHLL_imm16<"vshllb", "u16", 0b1, 0b0>;
+def MVE_VSHLL_immu16th : MVE_VSHLL_imm16<"vshllt", "u16", 0b1, 0b1>;
+
+class MVE_VSHLL_by_lane_width<string iname, string suffix, bits<2> size,
+                              bit U, string ops, list<dag> pattern=[]>
+  : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm),
+                  iname, suffix, ops, vpred_r, "", pattern> {
+  let Inst{28} = U;
+  let Inst{25-23} = 0b100;
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = size{1-0};
+  let Inst{17-16} = 0b01;
+  let Inst{11-6} = 0b111000;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b1;
+}
+
+multiclass MVE_VSHLL_lw<string iname, string suffix, bits<2> sz, bit U,
+                        string ops, list<dag> pattern=[]> {
+  def bh : MVE_VSHLL_by_lane_width<iname # "b", suffix, sz, U, ops, pattern> {
+    let Inst{12} = 0b0;
+  }
+  def th : MVE_VSHLL_by_lane_width<iname # "t", suffix, sz, U, ops, pattern> {
+    let Inst{12} = 0b1;
+  }
+}
+
+defm MVE_VSHLL_lws8  : MVE_VSHLL_lw<"vshll", "s8",  0b00, 0b0, "$Qd, $Qm, #8">;
+defm MVE_VSHLL_lws16 : MVE_VSHLL_lw<"vshll", "s16", 0b01, 0b0, "$Qd, $Qm, #16">;
+defm MVE_VSHLL_lwu8  : MVE_VSHLL_lw<"vshll", "u8",  0b00, 0b1, "$Qd, $Qm, #8">;
+defm MVE_VSHLL_lwu16 : MVE_VSHLL_lw<"vshll", "u16", 0b01, 0b1, "$Qd, $Qm, #16">;
+
+class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
+                 dag immops, list<dag> pattern=[]>
+  : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
+                  iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc",
+                  pattern> {
+  bits<5> imm;
+
+  let Inst{28} = bit_28;
+  let Inst{25-23} = 0b101;
+  let Inst{21} = 0b0;
+  let Inst{20-16} = imm{4-0};
+  let Inst{12} = bit_12;
+  let Inst{11-6} = 0b111111;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b1;
+}
+
+def MVE_VRSHRNi16bh : MVE_VxSHRN<
+    "vrshrnb", "i16", 0b0, 0b1, (ins shr_imm8:$imm)> {
+  let Inst{20-19} = 0b01;
+}
+def MVE_VRSHRNi16th : MVE_VxSHRN<
+    "vrshrnt", "i16", 0b1, 0b1, (ins shr_imm8:$imm)> {
+  let Inst{20-19} = 0b01;
+}
+def MVE_VRSHRNi32bh : MVE_VxSHRN<
+    "vrshrnb", "i32", 0b0, 0b1, (ins shr_imm16:$imm)> {
+  let Inst{20} = 0b1;
+}
+def MVE_VRSHRNi32th : MVE_VxSHRN<
+    "vrshrnt", "i32", 0b1, 0b1, (ins shr_imm16:$imm)> {
+  let Inst{20} = 0b1;
+}
+
+def MVE_VSHRNi16bh : MVE_VxSHRN<
+    "vshrnb", "i16", 0b0, 0b0, (ins shr_imm8:$imm)> {
+  let Inst{20-19} = 0b01;
+}
+def MVE_VSHRNi16th : MVE_VxSHRN<
+    "vshrnt", "i16", 0b1, 0b0, (ins shr_imm8:$imm)> {
+  let Inst{20-19} = 0b01;
+}
+def MVE_VSHRNi32bh : MVE_VxSHRN<
+    "vshrnb", "i32", 0b0, 0b0, (ins shr_imm16:$imm)> {
+  let Inst{20} = 0b1;
+}
+def MVE_VSHRNi32th : MVE_VxSHRN<
+    "vshrnt", "i32", 0b1, 0b0, (ins shr_imm16:$imm)> {
+  let Inst{20} = 0b1;
+}
+
+class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12,
+                    dag immops, list<dag> pattern=[]>
+  : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
+                  iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc",
+                  pattern> {
+  bits<5> imm;
+
+  let Inst{28} = bit_28;
+  let Inst{25-23} = 0b101;
+  let Inst{21} = 0b0;
+  let Inst{20-16} = imm{4-0};
+  let Inst{12} = bit_12;
+  let Inst{11-6} = 0b111111;
+  let Inst{4} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN<
+    "vqrshrunb", "s16", 0b1, 0b0, (ins shr_imm8:$imm)> {
+  let Inst{20-19} = 0b01;
+}
+def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN<
+    "vqrshrunt", "s16", 0b1, 0b1, (ins shr_imm8:$imm)> {
+  let Inst{20-19} = 0b01;
+}
+def MVE_VQRSHRUNs32bh : MVE_VxQRSHRUN<
+    "vqrshrunb", "s32", 0b1, 0b0, (ins shr_imm16:$imm)> {
+  let Inst{20} = 0b1;
+}
+def MVE_VQRSHRUNs32th : MVE_VxQRSHRUN<
+    "vqrshrunt", "s32", 0b1, 0b1, (ins shr_imm16:$imm)> {
+  let Inst{20} = 0b1;
+}
+
+def MVE_VQSHRUNs16bh : MVE_VxQRSHRUN<
+    "vqshrunb", "s16", 0b0, 0b0, (ins shr_imm8:$imm)> {
+  let Inst{20-19} = 0b01;
+}
+def MVE_VQSHRUNs16th : MVE_VxQRSHRUN<
+    "vqshrunt", "s16", 0b0, 0b1, (ins shr_imm8:$imm)> {
+  let Inst{20-19} = 0b01;
+}
+def MVE_VQSHRUNs32bh : MVE_VxQRSHRUN<
+    "vqshrunb", "s32", 0b0, 0b0, (ins shr_imm16:$imm)> {
+  let Inst{20} = 0b1;
+}
+def MVE_VQSHRUNs32th : MVE_VxQRSHRUN<
+    "vqshrunt", "s32", 0b0, 0b1, (ins shr_imm16:$imm)> {
+  let Inst{20} = 0b1;
+}
+
+class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12,
+                   dag immops, list<dag> pattern=[]>
+  : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
+                  iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc",
+                  pattern> {
+  bits<5> imm;
+
+  let Inst{25-23} = 0b101;
+  let Inst{21} = 0b0;
+  let Inst{20-16} = imm{4-0};
+  let Inst{12} = bit_12;
+  let Inst{11-6} = 0b111101;
+  let Inst{4} = 0b0;
+  let Inst{0} = bit_0;
+}
+
+multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> {
+  def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, (ins shr_imm8:$imm)> {
+    let Inst{28} = 0b0;
+    let Inst{20-19} = 0b01;
+  }
+  def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, (ins shr_imm8:$imm)> {
+    let Inst{28} = 0b1;
+    let Inst{20-19} = 0b01;
+  }
+  def s32 : MVE_VxQRSHRN<iname, "s32", bit_0, bit_12, (ins shr_imm16:$imm)> {
+    let Inst{28} = 0b0;
+    let Inst{20} = 0b1;
+  }
+  def u32 : MVE_VxQRSHRN<iname, "u32", bit_0, bit_12, (ins shr_imm16:$imm)> {
+    let Inst{28} = 0b1;
+    let Inst{20} = 0b1;
+  }
+}
+
+defm MVE_VQRSHRNbh : MVE_VxQRSHRN_types<"vqrshrnb", 0b1, 0b0>;
+defm MVE_VQRSHRNth : MVE_VxQRSHRN_types<"vqrshrnt", 0b1, 0b1>;
+defm MVE_VQSHRNbh  : MVE_VxQRSHRN_types<"vqshrnb", 0b0, 0b0>;
+defm MVE_VQSHRNth  : MVE_VxQRSHRN_types<"vqshrnt", 0b0, 0b1>;
+
+// end of mve_imm_shift instructions
+
+// start of mve_shift instructions
+
+class MVE_shift_by_vec<string iname, string suffix, bit U,
+                       bits<2> size, bit bit_4, bit bit_8>
+  : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm, MQPR:$Qn), NoItinerary,
+          iname, suffix, "$Qd, $Qm, $Qn", vpred_r, "", []> {
+  // Shift instructions which take a vector of shift counts
+  bits<4> Qd;
+  bits<4> Qm;
+  bits<4> Qn;
+
+  let Inst{28} = U;
+  let Inst{25-24} = 0b11;
+  let Inst{23} = 0b0;
+  let Inst{22} = Qd{3};
+  let Inst{21-20} = size;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{16} = 0b0;
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12-9} = 0b0010;
+  let Inst{8} = bit_8;
+  let Inst{7} = Qn{3};
+  let Inst{6} = 0b1;
+  let Inst{5} = Qm{3};
+  let Inst{4} = bit_4;
+  let Inst{3-1} = Qm{2-0};
+  let Inst{0} = 0b0;
+}
+
+multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> {
+  def s8  : MVE_shift_by_vec<iname, "s8",  0b0, 0b00, bit_4, bit_8>;
+  def s16 : MVE_shift_by_vec<iname, "s16", 0b0, 0b01, bit_4, bit_8>;
+  def s32 : MVE_shift_by_vec<iname, "s32", 0b0, 0b10, bit_4, bit_8>;
+  def u8  : MVE_shift_by_vec<iname, "u8",  0b1, 0b00, bit_4, bit_8>;
+  def u16 : MVE_shift_by_vec<iname, "u16", 0b1, 0b01, bit_4, bit_8>;
+  def u32 : MVE_shift_by_vec<iname, "u32", 0b1, 0b10, bit_4, bit_8>;
+}
+
+defm MVE_VSHL_by_vec   : mve_shift_by_vec_multi<"vshl",   0b0, 0b0>;
+defm MVE_VQSHL_by_vec  : mve_shift_by_vec_multi<"vqshl",  0b1, 0b0>;
+defm MVE_VQRSHL_by_vec : mve_shift_by_vec_multi<"vqrshl", 0b1, 0b1>;
+defm MVE_VRSHL_by_vec  : mve_shift_by_vec_multi<"vrshl",  0b0, 0b1>;
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))),
+            (v4i32 (MVE_VSHL_by_vecu32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>;
+  def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))),
+            (v8i16 (MVE_VSHL_by_vecu16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>;
+  def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))),
+            (v16i8 (MVE_VSHL_by_vecu8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>;
+
+  def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))),
+            (v4i32 (MVE_VSHL_by_vecs32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>;
+  def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))),
+            (v8i16 (MVE_VSHL_by_vecs16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>;
+  def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))),
+            (v16i8 (MVE_VSHL_by_vecs8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>;
+}
+
+class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops,
+                         string ops, vpred_ops vpred, string cstr,
+                         list<dag> pattern=[]>
+  : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+  bits<4> Qd;
+  bits<4> Qm;
+
+  let Inst{23} = 0b1;
+  let Inst{22} = Qd{3};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12-11} = 0b00;
+  let Inst{7-6} = 0b01;
+  let Inst{5} = Qm{3};
+  let Inst{4} = 0b1;
+  let Inst{3-1} = Qm{2-0};
+  let Inst{0} = 0b0;
+}
+
+class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm>
+  : MVE_shift_with_imm<iname, suffix, (outs MQPR:$Qd),
+                       !con((ins MQPR:$Qd_src, MQPR:$Qm), imm),
+                       "$Qd, $Qm, $imm", vpred_n, "$Qd = $Qd_src"> {
+  bits<6> imm;
+  let Inst{28} = 0b1;
+  let Inst{25-24} = 0b11;
+  let Inst{21-16} = imm;
+  let Inst{10-9} = 0b10;
+  let Inst{8} = bit_8;
+}
+
+def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> {
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, (ins shr_imm16:$imm)> {
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, (ins shr_imm32:$imm)> {
+  let Inst{21} = 0b1;
+}
+
+def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, (ins imm0_7:$imm)> {
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, (ins imm0_15:$imm)> {
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1, (ins imm0_31:$imm)> {
+  let Inst{21} = 0b1;
+}
+
+class MVE_VQSHL_imm<string suffix, dag imm>
+  : MVE_shift_with_imm<"vqshl", suffix, (outs MQPR:$Qd),
+                       !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+                       vpred_r, ""> {
+  bits<6> imm;
+
+  let Inst{25-24} = 0b11;
+  let Inst{21-16} = imm;
+  let Inst{10-8} = 0b111;
+}
+
+def MVE_VSLIimms8 : MVE_VQSHL_imm<"s8", (ins imm0_7:$imm)> {
+  let Inst{28} = 0b0;
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VSLIimmu8 : MVE_VQSHL_imm<"u8", (ins imm0_7:$imm)> {
+  let Inst{28} = 0b1;
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VSLIimms16 : MVE_VQSHL_imm<"s16", (ins imm0_15:$imm)> {
+  let Inst{28} = 0b0;
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VSLIimmu16 : MVE_VQSHL_imm<"u16", (ins imm0_15:$imm)> {
+  let Inst{28} = 0b1;
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VSLIimms32 : MVE_VQSHL_imm<"s32", (ins imm0_31:$imm)> {
+  let Inst{28} = 0b0;
+  let Inst{21} = 0b1;
+}
+
+def MVE_VSLIimmu32 : MVE_VQSHL_imm<"u32", (ins imm0_31:$imm)> {
+  let Inst{28} = 0b1;
+  let Inst{21} = 0b1;
+}
+
+class MVE_VQSHLU_imm<string suffix, dag imm>
+  : MVE_shift_with_imm<"vqshlu", suffix, (outs MQPR:$Qd),
+                       !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+                       vpred_r, ""> {
+  bits<6> imm;
+
+  let Inst{28} = 0b1;
+  let Inst{25-24} = 0b11;
+  let Inst{21-16} = imm;
+  let Inst{10-8} = 0b110;
+}
+
+def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<"s8", (ins imm0_7:$imm)> {
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<"s16", (ins imm0_15:$imm)> {
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<"s32", (ins imm0_31:$imm)> {
+  let Inst{21} = 0b1;
+}
+
+class MVE_VRSHR_imm<string suffix, dag imm>
+  : MVE_shift_with_imm<"vrshr", suffix, (outs MQPR:$Qd),
+                       !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+                       vpred_r, ""> {
+  bits<6> imm;
+
+  let Inst{25-24} = 0b11;
+  let Inst{21-16} = imm;
+  let Inst{10-8} = 0b010;
+}
+
+def MVE_VRSHR_imms8 : MVE_VRSHR_imm<"s8", (ins shr_imm8:$imm)> {
+  let Inst{28} = 0b0;
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VRSHR_immu8 : MVE_VRSHR_imm<"u8", (ins shr_imm8:$imm)> {
+  let Inst{28} = 0b1;
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VRSHR_imms16 : MVE_VRSHR_imm<"s16", (ins shr_imm16:$imm)> {
+  let Inst{28} = 0b0;
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VRSHR_immu16 : MVE_VRSHR_imm<"u16", (ins shr_imm16:$imm)> {
+  let Inst{28} = 0b1;
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VRSHR_imms32 : MVE_VRSHR_imm<"s32", (ins shr_imm32:$imm)> {
+  let Inst{28} = 0b0;
+  let Inst{21} = 0b1;
+}
+
+def MVE_VRSHR_immu32 : MVE_VRSHR_imm<"u32", (ins shr_imm32:$imm)> {
+  let Inst{28} = 0b1;
+  let Inst{21} = 0b1;
+}
+
+class MVE_VSHR_imm<string suffix, dag imm>
+  : MVE_shift_with_imm<"vshr", suffix, (outs MQPR:$Qd),
+                       !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+                       vpred_r, ""> {
+  bits<6> imm;
+
+  let Inst{25-24} = 0b11;
+  let Inst{21-16} = imm;
+  let Inst{10-8} = 0b000;
+}
+
+def MVE_VSHR_imms8 : MVE_VSHR_imm<"s8", (ins shr_imm8:$imm)> {
+  let Inst{28} = 0b0;
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VSHR_immu8 : MVE_VSHR_imm<"u8", (ins shr_imm8:$imm)> {
+  let Inst{28} = 0b1;
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VSHR_imms16 : MVE_VSHR_imm<"s16", (ins shr_imm16:$imm)> {
+  let Inst{28} = 0b0;
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VSHR_immu16 : MVE_VSHR_imm<"u16", (ins shr_imm16:$imm)> {
+  let Inst{28} = 0b1;
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VSHR_imms32 : MVE_VSHR_imm<"s32", (ins shr_imm32:$imm)> {
+  let Inst{28} = 0b0;
+  let Inst{21} = 0b1;
+}
+
+def MVE_VSHR_immu32 : MVE_VSHR_imm<"u32", (ins shr_imm32:$imm)> {
+  let Inst{28} = 0b1;
+  let Inst{21} = 0b1;
+}
+
+class MVE_VSHL_imm<string suffix, dag imm>
+  : MVE_shift_with_imm<"vshl", suffix, (outs MQPR:$Qd),
+                       !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+                       vpred_r, ""> {
+  bits<6> imm;
+
+  let Inst{28} = 0b0;
+  let Inst{25-24} = 0b11;
+  let Inst{21-16} = imm;
+  let Inst{10-8} = 0b101;
+}
+
+def MVE_VSHL_immi8 : MVE_VSHL_imm<"i8", (ins imm0_7:$imm)> {
+  let Inst{21-19} = 0b001;
+}
+
+def MVE_VSHL_immi16 : MVE_VSHL_imm<"i16", (ins imm0_15:$imm)> {
+  let Inst{21-20} = 0b01;
+}
+
+def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> {
+  let Inst{21} = 0b1;
+}
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v4i32 (ARMvshlImm (v4i32 MQPR:$src), imm0_31:$imm)),
+            (v4i32 (MVE_VSHL_immi32 (v4i32 MQPR:$src), imm0_31:$imm))>;
+  def : Pat<(v8i16 (ARMvshlImm (v8i16 MQPR:$src), imm0_15:$imm)),
+            (v8i16 (MVE_VSHL_immi16 (v8i16 MQPR:$src), imm0_15:$imm))>;
+  def : Pat<(v16i8 (ARMvshlImm (v16i8 MQPR:$src), imm0_7:$imm)),
+            (v16i8 (MVE_VSHL_immi8 (v16i8 MQPR:$src), imm0_7:$imm))>;
+
+  def : Pat<(v4i32 (ARMvshruImm (v4i32 MQPR:$src), imm0_31:$imm)),
+            (v4i32 (MVE_VSHR_immu32 (v4i32 MQPR:$src), imm0_31:$imm))>;
+  def : Pat<(v8i16 (ARMvshruImm (v8i16 MQPR:$src), imm0_15:$imm)),
+            (v8i16 (MVE_VSHR_immu16 (v8i16 MQPR:$src), imm0_15:$imm))>;
+  def : Pat<(v16i8 (ARMvshruImm (v16i8 MQPR:$src), imm0_7:$imm)),
+            (v16i8 (MVE_VSHR_immu8 (v16i8 MQPR:$src), imm0_7:$imm))>;
+
+  def : Pat<(v4i32 (ARMvshrsImm (v4i32 MQPR:$src), imm0_31:$imm)),
+            (v4i32 (MVE_VSHR_imms32 (v4i32 MQPR:$src), imm0_31:$imm))>;
+  def : Pat<(v8i16 (ARMvshrsImm (v8i16 MQPR:$src), imm0_15:$imm)),
+            (v8i16 (MVE_VSHR_imms16 (v8i16 MQPR:$src), imm0_15:$imm))>;
+  def : Pat<(v16i8 (ARMvshrsImm (v16i8 MQPR:$src), imm0_7:$imm)),
+            (v16i8 (MVE_VSHR_imms8 (v16i8 MQPR:$src), imm0_7:$imm))>;
+}
+
+// end of mve_shift instructions
+
+// start of MVE Floating Point instructions
+
+class MVE_float<string iname, string suffix, dag oops, dag iops, string ops,
+                vpred_ops vpred, string cstr, list<dag> pattern=[]>
+  : MVE_f<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+  bits<4> Qm;
+
+  let Inst{12} = 0b0;
+  let Inst{6} = 0b1;
+  let Inst{5} = Qm{3};
+  let Inst{3-1} = Qm{2-0};
+  let Inst{0} = 0b0;
+}
+
+class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size,
+                list<dag> pattern=[]>
+  : MVE_float<!strconcat("vrint", rmode), suffix,
+              (outs MQPR:$Qd), (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "",
+              pattern> {
+  bits<4> Qd;
+
+  let Inst{28} = 0b1;
+  let Inst{25-23} = 0b111;
+  let Inst{22} = Qd{3};
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = size;
+  let Inst{17-16} = 0b10;
+  let Inst{15-13} = Qd{2-0};
+  let Inst{11-10} = 0b01;
+  let Inst{9-7} = op{2-0};
+  let Inst{4} = 0b0;
+
+}
+
+multiclass MVE_VRINT_ops<string suffix, bits<2> size, list<dag> pattern=[]> {
+  def N : MVE_VRINT<"n", 0b000, suffix, size, pattern>;
+  def X : MVE_VRINT<"x", 0b001, suffix, size, pattern>;
+  def A : MVE_VRINT<"a", 0b010, suffix, size, pattern>;
+  def Z : MVE_VRINT<"z", 0b011, suffix, size, pattern>;
+  def M : MVE_VRINT<"m", 0b101, suffix, size, pattern>;
+  def P : MVE_VRINT<"p", 0b111, suffix, size, pattern>;
+}
+
+defm MVE_VRINTf16 : MVE_VRINT_ops<"f16", 0b01>;
+defm MVE_VRINTf32 : MVE_VRINT_ops<"f32", 0b10>;
+
+let Predicates = [HasMVEFloat] in {
+  def : Pat<(v4f32 (frint (v4f32 MQPR:$val1))),
+            (v4f32 (MVE_VRINTf32X (v4f32 MQPR:$val1)))>;
+  def : Pat<(v8f16 (frint (v8f16 MQPR:$val1))),
+            (v8f16 (MVE_VRINTf16X (v8f16 MQPR:$val1)))>;
+  def : Pat<(v4f32 (fround (v4f32 MQPR:$val1))),
+            (v4f32 (MVE_VRINTf32A (v4f32 MQPR:$val1)))>;
+  def : Pat<(v8f16 (fround (v8f16 MQPR:$val1))),
+            (v8f16 (MVE_VRINTf16A (v8f16 MQPR:$val1)))>;
+  def : Pat<(v4f32 (ftrunc (v4f32 MQPR:$val1))),
+            (v4f32 (MVE_VRINTf32Z (v4f32 MQPR:$val1)))>;
+  def : Pat<(v8f16 (ftrunc (v8f16 MQPR:$val1))),
+            (v8f16 (MVE_VRINTf16Z (v8f16 MQPR:$val1)))>;
+  def : Pat<(v4f32 (ffloor (v4f32 MQPR:$val1))),
+            (v4f32 (MVE_VRINTf32M (v4f32 MQPR:$val1)))>;
+  def : Pat<(v8f16 (ffloor (v8f16 MQPR:$val1))),
+            (v8f16 (MVE_VRINTf16M (v8f16 MQPR:$val1)))>;
+  def : Pat<(v4f32 (fceil (v4f32 MQPR:$val1))),
+            (v4f32 (MVE_VRINTf32P (v4f32 MQPR:$val1)))>;
+  def : Pat<(v8f16 (fceil (v8f16 MQPR:$val1))),
+            (v8f16 (MVE_VRINTf16P (v8f16 MQPR:$val1)))>;
+}
+
+class MVEFloatArithNeon<string iname, string suffix, bit size,
+                        dag oops, dag iops, string ops,
+                        vpred_ops vpred, string cstr, list<dag> pattern=[]>
+  : MVE_float<iname, suffix, oops, iops, ops, vpred, cstr, pattern> {
+  let Inst{20} = size;
+  let Inst{16} = 0b0;
+}
+
+class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]>
+  : MVEFloatArithNeon<"vmul", suffix, size, (outs MQPR:$Qd),
+                      (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, "",
+                      pattern> {
+  bits<4> Qd;
+  bits<4> Qn;
+
+  let Inst{28} = 0b1;
+  let Inst{25-23} = 0b110;
+  let Inst{22} = Qd{3};
+  let Inst{21} = 0b0;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12-8} = 0b01101;
+  let Inst{7} = Qn{3};
+  let Inst{4} = 0b1;
+}
+
+def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>;
+def MVE_VMULf16 : MVE_VMUL_fp<"f16", 0b1>;
+
+let Predicates = [HasMVEFloat] in {
+  def : Pat<(v4f32 (fmul (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
+            (v4f32 (MVE_VMULf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
+  def : Pat<(v8f16 (fmul (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
+            (v8f16 (MVE_VMULf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+}
+
+class MVE_VCMLA<string suffix, bit size, list<dag> pattern=[]>
+  : MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd),
+                      (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm,
+                           complexrotateop:$rot),
+                      "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src",
+                      pattern> {
+  bits<4> Qd;
+  bits<4> Qn;
+  bits<2> rot;
+
+  let Inst{28} = 0b1;
+  let Inst{25} = 0b0;
+  let Inst{24-23} = rot;
+  let Inst{22} = Qd{3};
+  let Inst{21} = 0b1;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12-8} = 0b01000;
+  let Inst{7} = Qn{3};
+  let Inst{4} = 0b0;
+}
+
+def MVE_VCMLAf16 : MVE_VCMLA<"f16", 0b0>;
+def MVE_VCMLAf32 : MVE_VCMLA<"f32", 0b1>;
+
+class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4,
+                        bit bit_8, bit bit_21, dag iops=(ins),
+                        vpred_ops vpred=vpred_r, string cstr="",
+                        list<dag> pattern=[]>
+  : MVEFloatArithNeon<iname, suffix, size, (outs MQPR:$Qd),
+                      !con(iops, (ins MQPR:$Qn, MQPR:$Qm)), "$Qd, $Qn, $Qm",
+                      vpred, cstr, pattern> {
+  bits<4> Qd;
+  bits<4> Qn;
+
+  let Inst{28} = 0b0;
+  let Inst{25-23} = 0b110;
+  let Inst{22} = Qd{3};
+  let Inst{21} = bit_21;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{11-9} = 0b110;
+  let Inst{8} = bit_8;
+  let Inst{7} = Qn{3};
+  let Inst{4} = bit_4;
+}
+
+def MVE_VFMAf32 : MVE_VADDSUBFMA_fp<"vfma", "f32", 0b0, 0b1, 0b0, 0b0,
+    (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+def MVE_VFMAf16 : MVE_VADDSUBFMA_fp<"vfma", "f16", 0b1, 0b1, 0b0, 0b0,
+    (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+
+def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
+    (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
+    (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+
+def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>;
+def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>;
+
+let Predicates = [HasMVEFloat] in {
+  def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
+            (v4f32 (MVE_VADDf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
+  def : Pat<(v8f16 (fadd (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
+            (v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+}
+
+def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>;
+def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>;
+
+let Predicates = [HasMVEFloat] in {
+  def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
+            (v4f32 (MVE_VSUBf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
+  def : Pat<(v8f16 (fsub (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
+            (v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+}
+
+class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]>
+  : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd),
+                      (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
+                      "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> {
+  bits<4> Qd;
+  bits<4> Qn;
+  bit rot;
+
+  let Inst{28} = 0b1;
+  let Inst{25} = 0b0;
+  let Inst{24} = rot;
+  let Inst{23} = 0b1;
+  let Inst{22} = Qd{3};
+  let Inst{21} = 0b0;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12-8} = 0b01000;
+  let Inst{7} = Qn{3};
+  let Inst{4} = 0b0;
+}
+
+def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>;
+def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1>;
+
+class MVE_VABD_fp<string suffix, bit size>
+  : MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
+              "$Qd, $Qn, $Qm", vpred_r, ""> {
+  bits<4> Qd;
+  bits<4> Qn;
+
+  let Inst{28} = 0b1;
+  let Inst{25-23} = 0b110;
+  let Inst{22} = Qd{3};
+  let Inst{21} = 0b1;
+  let Inst{20} = size;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{16} = 0b0;
+  let Inst{15-13} = Qd{2-0};
+  let Inst{11-8} = 0b1101;
+  let Inst{7} = Qn{3};
+  let Inst{4} = 0b0;
+}
+
+def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>;
+def MVE_VABDf16 : MVE_VABD_fp<"f16", 0b1>;
+
+class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
+                   Operand imm_operand_type, list<dag> pattern=[]>
+  : MVE_float<"vcvt", suffix,
+              (outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6),
+              "$Qd, $Qm, $imm6", vpred_r, "", pattern> {
+  bits<4> Qd;
+  bits<6> imm6;
+
+  let Inst{28} = U;
+  let Inst{25-23} = 0b111;
+  let Inst{22} = Qd{3};
+  let Inst{21} = 0b1;
+  let Inst{19-16} = imm6{3-0};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{11-10} = 0b11;
+  let Inst{9} = fsi;
+  let Inst{8} = op;
+  let Inst{7} = 0b0;
+  let Inst{4} = 0b1;
+
+  let DecoderMethod = "DecodeMVEVCVTt1fp";
+}
+
+class MVE_VCVT_imm_asmop<int Bits> : AsmOperandClass {
+  let PredicateMethod = "isImmediate<1," # Bits # ">";
+  let DiagnosticString =
+      "MVE fixed-point immediate operand must be between 1 and " # Bits;
+  let Name = "MVEVcvtImm" # Bits;
+  let RenderMethod = "addImmOperands";
+}
+class MVE_VCVT_imm<int Bits> : Operand<i32> {
+  let ParserMatchClass = MVE_VCVT_imm_asmop<Bits>;
+  let EncoderMethod = "getNEONVcvtImm32OpValue";
+  let DecoderMethod = "DecodeVCVTImmOperand";
+}
+
+class MVE_VCVT_fix_f32<string suffix, bit U, bit op>
+  : MVE_VCVT_fix<suffix, 0b1, U, op, MVE_VCVT_imm<32>> {
+  let Inst{20} = imm6{4};
+}
+class MVE_VCVT_fix_f16<string suffix, bit U, bit op>
+  : MVE_VCVT_fix<suffix, 0b0, U, op, MVE_VCVT_imm<16>> {
+  let Inst{20} = 0b1;
+}
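+
+// The imm6 operand is a fixed-point position: the conversion divides
+// (int->float) or multiplies (float->int) by 2^imm6. For example,
+// "vcvt.f32.s32 q0, q1, #16" treats each s32 lane as a Q16 value, so a
+// lane holding 0x00018000 converts to the float 1.5.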
+
+def MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16<"f16.s16", 0b0, 0b0>;
+def MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16<"s16.f16", 0b0, 0b1>;
+def MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16<"f16.u16", 0b1, 0b0>;
+def MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16<"u16.f16", 0b1, 0b1>;
+def MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32<"f32.s32", 0b0, 0b0>;
+def MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32<"s32.f32", 0b0, 0b1>;
+def MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32<"f32.u32", 0b1, 0b0>;
+def MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32<"u32.f32", 0b1, 0b1>;
+
+class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
+                           bits<2> rm, list<dag> pattern=[]>
+  : MVE_float<!strconcat("vcvt", anpm), suffix, (outs MQPR:$Qd),
+              (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+  bits<4> Qd;
+
+  let Inst{28} = 0b1;
+  let Inst{25-23} = 0b111;
+  let Inst{22} = Qd{3};
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = size;
+  let Inst{17-16} = 0b11;
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12-10} = 0b000;
+  let Inst{9-8} = rm;
+  let Inst{7} = op;
+  let Inst{4} = 0b0;
+}
+
+multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op,
+                                      list<dag> pattern=[]> {
+  def a : MVE_VCVT_fp_int_anpm<suffix, size, op, "a", 0b00>;
+  def n : MVE_VCVT_fp_int_anpm<suffix, size, op, "n", 0b01>;
+  def p : MVE_VCVT_fp_int_anpm<suffix, size, op, "p", 0b10>;
+  def m : MVE_VCVT_fp_int_anpm<suffix, size, op, "m", 0b11>;
+}
+
+// This defines instructions such as MVE_VCVTu16f16a, with an explicit
+// rounding-mode suffix on the mnemonic. The class below will define
+// the bare MVE_VCVTu16f16 (with implied rounding toward zero).
+defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_multi<"s16.f16", 0b01, 0b0>;
+defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_multi<"u16.f16", 0b01, 0b1>;
+defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_multi<"s32.f32", 0b10, 0b0>;
+defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_multi<"u32.f32", 0b10, 0b1>;
+
+class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op,
+                      list<dag> pattern=[]>
+  : MVE_float<"vcvt", suffix, (outs MQPR:$Qd),
+              (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+  bits<4> Qd;
+
+  let Inst{28} = 0b1;
+  let Inst{25-23} = 0b111;
+  let Inst{22} = Qd{3};
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = size;
+  let Inst{17-16} = 0b11;
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12-9} = 0b0011;
+  let Inst{8-7} = op;
+  let Inst{4} = 0b0;
+}
+
+// The unsuffixed VCVT for float->int implicitly rounds toward zero,
+// which I reflect here in the llvm instruction names
+def MVE_VCVTs16f16z : MVE_VCVT_fp_int<"s16.f16", 0b01, 0b10>;
+def MVE_VCVTu16f16z : MVE_VCVT_fp_int<"u16.f16", 0b01, 0b11>;
+def MVE_VCVTs32f32z : MVE_VCVT_fp_int<"s32.f32", 0b10, 0b10>;
+def MVE_VCVTu32f32z : MVE_VCVT_fp_int<"u32.f32", 0b10, 0b11>;
+// Whereas VCVT for int->float rounds to nearest
+def MVE_VCVTf16s16n : MVE_VCVT_fp_int<"f16.s16", 0b01, 0b00>;
+def MVE_VCVTf16u16n : MVE_VCVT_fp_int<"f16.u16", 0b01, 0b01>;
+def MVE_VCVTf32s32n : MVE_VCVT_fp_int<"f32.s32", 0b10, 0b00>;
+def MVE_VCVTf32u32n : MVE_VCVT_fp_int<"f32.u32", 0b10, 0b01>;
+
+let Predicates = [HasMVEFloat] in {
+  def : Pat<(v4i32 (fp_to_sint (v4f32 MQPR:$src))),
+            (v4i32 (MVE_VCVTs32f32z (v4f32 MQPR:$src)))>;
+  def : Pat<(v4i32 (fp_to_uint (v4f32 MQPR:$src))),
+            (v4i32 (MVE_VCVTu32f32z (v4f32 MQPR:$src)))>;
+  def : Pat<(v8i16 (fp_to_sint (v8f16 MQPR:$src))),
+            (v8i16 (MVE_VCVTs16f16z (v8f16 MQPR:$src)))>;
+  def : Pat<(v8i16 (fp_to_uint (v8f16 MQPR:$src))),
+            (v8i16 (MVE_VCVTu16f16z (v8f16 MQPR:$src)))>;
+  def : Pat<(v4f32 (sint_to_fp (v4i32 MQPR:$src))),
+            (v4f32 (MVE_VCVTf32s32n (v4i32 MQPR:$src)))>;
+  def : Pat<(v4f32 (uint_to_fp (v4i32 MQPR:$src))),
+            (v4f32 (MVE_VCVTf32u32n (v4i32 MQPR:$src)))>;
+  def : Pat<(v8f16 (sint_to_fp (v8i16 MQPR:$src))),
+            (v8f16 (MVE_VCVTf16s16n (v8i16 MQPR:$src)))>;
+  def : Pat<(v8f16 (uint_to_fp (v8i16 MQPR:$src))),
+            (v8f16 (MVE_VCVTf16u16n (v8i16 MQPR:$src)))>;
+}
+
+class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
+                     list<dag> pattern=[]>
+  : MVE_float<iname, suffix, (outs MQPR:$Qd),
+              (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+  bits<4> Qd;
+
+  let Inst{28} = 0b1;
+  let Inst{25-23} = 0b111;
+  let Inst{22} = Qd{3};
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = size;
+  let Inst{17-16} = 0b01;
+  let Inst{15-13} = Qd{2-0};
+  let Inst{11-8} = 0b0111;
+  let Inst{7} = negate;
+  let Inst{4} = 0b0;
+}
+
+def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>;
+def MVE_VABSf32 : MVE_VABSNEG_fp<"vabs", "f32", 0b10, 0b0>;
+
+let Predicates = [HasMVEFloat] in {
+  def : Pat<(v8f16 (fabs MQPR:$src)),
+            (MVE_VABSf16 MQPR:$src)>;
+  def : Pat<(v4f32 (fabs MQPR:$src)),
+            (MVE_VABSf32 MQPR:$src)>;
+}
+
+def MVE_VNEGf16 : MVE_VABSNEG_fp<"vneg", "f16", 0b01, 0b1>;
+def MVE_VNEGf32 : MVE_VABSNEG_fp<"vneg", "f32", 0b10, 0b1>;
+
+let Predicates = [HasMVEFloat] in {
+  def : Pat<(v8f16 (fneg MQPR:$src)),
+            (MVE_VNEGf16 MQPR:$src)>;
+  def : Pat<(v4f32 (fneg MQPR:$src)),
+            (MVE_VNEGf32 MQPR:$src)>;
+}
+
+class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
+                     list<dag> pattern=[]>
+  : MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
+          NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src",
+          pattern> {
+  bits<4> Qd;
+  bits<4> Qm;
+
+  let Inst{28} = size;
+  let Inst{25-23} = 0b100;
+  let Inst{22} = Qd{3};
+  let Inst{21-16} = 0b111111;
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12} = bit_12;
+  let Inst{11-6} = 0b111010;
+  let Inst{5} = Qm{3};
+  let Inst{4} = 0b0;
+  let Inst{3-1} = Qm{2-0};
+  let Inst{0} = 0b1;
+}
+
+def MVE_VMAXNMAf32 : MVE_VMAXMINNMA<"vmaxnma", "f32", 0b0, 0b0>;
+def MVE_VMAXNMAf16 : MVE_VMAXMINNMA<"vmaxnma", "f16", 0b1, 0b0>;
+
+def MVE_VMINNMAf32 : MVE_VMAXMINNMA<"vminnma", "f32", 0b0, 0b1>;
+def MVE_VMINNMAf16 : MVE_VMAXMINNMA<"vminnma", "f16", 0b1, 0b1>;
+
+// end of MVE Floating Point instructions
+
+// start of MVE compares
+
+class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20,
+                 VCMPPredicateOperand predtype, list<dag> pattern=[]>
+  : MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, MQPR:$Qm, predtype:$fc),
+          NoItinerary, "vcmp", suffix, "$fc, $Qn, $Qm", vpred_n, "", pattern> {
+  // Base class for comparing two vector registers
+  bits<3> fc;
+  bits<4> Qn;
+  bits<4> Qm;
+
+  let Inst{28} = bit_28;
+  let Inst{25-22} = 0b1000;
+  let Inst{21-20} = bits_21_20;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{16-13} = 0b1000;
+  let Inst{12} = fc{2};
+  let Inst{11-8} = 0b1111;
+  let Inst{7} = fc{0};
+  let Inst{6} = 0b0;
+  let Inst{5} = Qm{3};
+  let Inst{4} = 0b0;
+  let Inst{3-1} = Qm{2-0};
+  let Inst{0} = fc{1};
+
+  let Constraints = "";
+
+  // We need a custom decoder method for these instructions because of
+  // the output VCCR operand, which isn't encoded in the instruction
+  // bits anywhere (there is only one choice for it) but has to be
+  // included in the MC operands so that codegen will be able to track
+  // its data flow between instructions, spill/reload it when
+  // necessary, etc. There seems to be no way to get the Tablegen
+  // decoder to emit an operand that isn't affected by any instruction
+  // bit.
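+  // For example, the VCCR value a VCMP defines here is exactly what a
+  // later predicated instruction (such as VPSEL) consumes as its $P0
+  // input; without the explicit operand the register allocator could
+  // not see that dependency, let alone spill and reload it.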
+  let DecoderMethod = "DecodeMVEVCMP";
+}
+
+class MVE_VCMPqqf<string suffix, bit size>
+  : MVE_VCMPqq<suffix, size, 0b11, pred_basic_fp> {
+  let Predicates = [HasMVEFloat];
+}
+
+class MVE_VCMPqqi<string suffix, bits<2> size>
+  : MVE_VCMPqq<suffix, 0b1, size, pred_basic_i> {
+  let Inst{12} = 0b0;
+  let Inst{0} = 0b0;
+}
+
+class MVE_VCMPqqu<string suffix, bits<2> size>
+  : MVE_VCMPqq<suffix, 0b1, size, pred_basic_u> {
+  let Inst{12} = 0b0;
+  let Inst{0} = 0b1;
+}
+
+class MVE_VCMPqqs<string suffix, bits<2> size>
+  : MVE_VCMPqq<suffix, 0b1, size, pred_basic_s> {
+  let Inst{12} = 0b1;
+}
+
+def MVE_VCMPf32 : MVE_VCMPqqf<"f32", 0b0>;
+def MVE_VCMPf16 : MVE_VCMPqqf<"f16", 0b1>;
+
+def MVE_VCMPi8  : MVE_VCMPqqi<"i8",  0b00>;
+def MVE_VCMPi16 : MVE_VCMPqqi<"i16", 0b01>;
+def MVE_VCMPi32 : MVE_VCMPqqi<"i32", 0b10>;
+
+def MVE_VCMPu8  : MVE_VCMPqqu<"u8",  0b00>;
+def MVE_VCMPu16 : MVE_VCMPqqu<"u16", 0b01>;
+def MVE_VCMPu32 : MVE_VCMPqqu<"u32", 0b10>;
+
+def MVE_VCMPs8  : MVE_VCMPqqs<"s8",  0b00>;
+def MVE_VCMPs16 : MVE_VCMPqqs<"s16", 0b01>;
+def MVE_VCMPs32 : MVE_VCMPqqs<"s32", 0b10>;
+
+class MVE_VCMPqr<string suffix, bit bit_28, bits<2> bits_21_20,
+                 VCMPPredicateOperand predtype, list<dag> pattern=[]>
+  : MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, GPRwithZR:$Rm, predtype:$fc),
+          NoItinerary, "vcmp", suffix, "$fc, $Qn, $Rm", vpred_n, "", pattern> {
+  // Base class for comparing a vector register with a scalar
+  bits<3> fc;
+  bits<4> Qn;
+  bits<4> Rm;
+
+  let Inst{28} = bit_28;
+  let Inst{25-22} = 0b1000;
+  let Inst{21-20} = bits_21_20;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{16-13} = 0b1000;
+  let Inst{12} = fc{2};
+  let Inst{11-8} = 0b1111;
+  let Inst{7} = fc{0};
+  let Inst{6} = 0b1;
+  let Inst{5} = fc{1};
+  let Inst{4} = 0b0;
+  let Inst{3-0} = Rm{3-0};
+
+  let Constraints = "";
+  // Custom decoder method, for the same reason as MVE_VCMPqq
+  let DecoderMethod = "DecodeMVEVCMP";
+}
+
+class MVE_VCMPqrf<string suffix, bit size>
+  : MVE_VCMPqr<suffix, size, 0b11, pred_basic_fp> {
+  let Predicates = [HasMVEFloat];
+}
+
+class MVE_VCMPqri<string suffix, bits<2> size>
+  : MVE_VCMPqr<suffix, 0b1, size, pred_basic_i> {
+  let Inst{12} = 0b0;
+  let Inst{5} = 0b0;
+}
+
+class MVE_VCMPqru<string suffix, bits<2> size>
+  : MVE_VCMPqr<suffix, 0b1, size, pred_basic_u> {
+  let Inst{12} = 0b0;
+  let Inst{5} = 0b1;
+}
+
+class MVE_VCMPqrs<string suffix, bits<2> size>
+  : MVE_VCMPqr<suffix, 0b1, size, pred_basic_s> {
+  let Inst{12} = 0b1;
+}
+
+def MVE_VCMPf32r : MVE_VCMPqrf<"f32", 0b0>;
+def MVE_VCMPf16r : MVE_VCMPqrf<"f16", 0b1>;
+
+def MVE_VCMPi8r  : MVE_VCMPqri<"i8",  0b00>;
+def MVE_VCMPi16r : MVE_VCMPqri<"i16", 0b01>;
+def MVE_VCMPi32r : MVE_VCMPqri<"i32", 0b10>;
+
+def MVE_VCMPu8r  : MVE_VCMPqru<"u8",  0b00>;
+def MVE_VCMPu16r : MVE_VCMPqru<"u16", 0b01>;
+def MVE_VCMPu32r : MVE_VCMPqru<"u32", 0b10>;
+
+def MVE_VCMPs8r  : MVE_VCMPqrs<"s8",  0b00>;
+def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>;
+def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>;
+
+// end of MVE compares
+
+// start of MVE_qDest_qSrc
+
+class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops,
+                     string ops, vpred_ops vpred, string cstr,
+                     list<dag> pattern=[]>
+  : MVE_p<oops, iops, NoItinerary, iname, suffix,
+          ops, vpred, cstr, pattern> {
+  bits<4> Qd;
+  bits<4> Qm;
+
+  let Inst{25-23} = 0b100;
+  let Inst{22} = Qd{3};
+  let Inst{15-13} = Qd{2-0};
+  let Inst{11-9} = 0b111;
+  let Inst{6} = 0b0;
+  let Inst{5} = Qm{3};
+  let Inst{4} = 0b0;
+  let Inst{3-1} = Qm{2-0};
+}
+
+class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract,
+                    string suffix, bits<2> size, list<dag> pattern=[]>
+  : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+                   (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
+                   vpred_n, "$Qd = $Qd_src", pattern> {
+  bits<4> Qn;
+
+  let Inst{28} = subtract;
+  let Inst{21-20} = size;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{16} = 0b0;
+  let Inst{12} = exch;
+  let Inst{8} = 0b0;
+  let Inst{7} = Qn{3};
+  let Inst{0} = round;
+}
+
+multiclass MVE_VQxDMLxDH_multi<string iname,
+                               bit exch, bit round, bit subtract> {
+  def s8  : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8",  0b00>;
+  def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>;
+  def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10>;
+}
+
+defm MVE_VQDMLADH   : MVE_VQxDMLxDH_multi<"vqdmladh",   0b0, 0b0, 0b0>;
+defm MVE_VQDMLADHX  : MVE_VQxDMLxDH_multi<"vqdmladhx",  0b1, 0b0, 0b0>;
+defm MVE_VQRDMLADH  : MVE_VQxDMLxDH_multi<"vqrdmladh",  0b0, 0b1, 0b0>;
+defm MVE_VQRDMLADHX : MVE_VQxDMLxDH_multi<"vqrdmladhx", 0b1, 0b1, 0b0>;
+defm MVE_VQDMLSDH   : MVE_VQxDMLxDH_multi<"vqdmlsdh",   0b0, 0b0, 0b1>;
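+// The three bits in the defms above and below are <exch, round,
+// subtract>: for example MVE_VQRDMLSDHX expands to vqrdmlsdhx, the
+// exchanging, rounding, subtracting variant, with all three bits set.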
+defm MVE_VQDMLSDHX  : MVE_VQxDMLxDH_multi<"vqdmlsdhx",  0b1, 0b0, 0b1>;
+defm MVE_VQRDMLSDH  : MVE_VQxDMLxDH_multi<"vqrdmlsdh",  0b0, 0b1, 0b1>;
+defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>;
+
+class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]>
+  : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+                   (ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
+                   "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> {
+  bits<4> Qn;
+  bits<2> rot;
+
+  let Inst{28} = size;
+  let Inst{21-20} = 0b11;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{16} = 0b0;
+  let Inst{12} = rot{1};
+  let Inst{8} = 0b0;
+  let Inst{7} = Qn{3};
+  let Inst{0} = rot{0};
+
+  let Predicates = [HasMVEFloat];
+}
+
+def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>;
+def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1>;
+
+class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
+                bit T, list<dag> pattern=[]>
+  : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+                   (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
+                   vpred_r, "", pattern> {
+  bits<4> Qd;
+  bits<4> Qn;
+  bits<4> Qm;
+
+  let Inst{28} = bit_28;
+  let Inst{21-20} = bits_21_20;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{16} = 0b1;
+  let Inst{12} = T;
+  let Inst{8} = 0b0;
+  let Inst{7} = Qn{3};
+  let Inst{0} = 0b0;
+}
+
+multiclass MVE_VMULL_multi<string iname, string suffix,
+                           bit bit_28, bits<2> bits_21_20> {
+  def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0>;
+  def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1>;
+}
+
+// For integer multiplies, bits 21:20 encode size, and bit 28 signedness.
+// For polynomial multiplies, bits 21:20 take the unused value 0b11, and
+// bit 28 switches to encoding the size.
+
+defm MVE_VMULLs8  : MVE_VMULL_multi<"vmull", "s8",  0b0, 0b00>;
+defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>;
+defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10>;
+defm MVE_VMULLu8  : MVE_VMULL_multi<"vmull", "u8",  0b1, 0b00>;
+defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>;
+defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10>;
+defm MVE_VMULLp8  : MVE_VMULL_multi<"vmull", "p8",  0b0, 0b11>;
+defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>;
+
+class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size,
+                 bit round, list<dag> pattern=[]>
+  : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+                   (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
+                   vpred_r, "", pattern> {
+  bits<4> Qn;
+
+  let Inst{28} = U;
+  let Inst{21-20} = size;
+  let Inst{19-17} = Qn{2-0};
+  let Inst{16} = 0b1;
+  let Inst{12} = round;
+  let Inst{8} = 0b0;
+  let Inst{7} = Qn{3};
+  let Inst{0} = 0b1;
+}
+
+def MVE_VMULHs8   : MVE_VxMULH<"vmulh",  "s8",  0b0, 0b00, 0b0>;
+def MVE_VMULHs16  : MVE_VxMULH<"vmulh",  "s16", 0b0, 0b01, 0b0>;
+def MVE_VMULHs32  : MVE_VxMULH<"vmulh",  "s32", 0b0, 0b10, 0b0>;
+def MVE_VMULHu8   : MVE_VxMULH<"vmulh",  "u8",  0b1, 0b00, 0b0>;
+def MVE_VMULHu16  : MVE_VxMULH<"vmulh",  "u16", 0b1, 0b01, 0b0>;
+def MVE_VMULHu32  : MVE_VxMULH<"vmulh",  "u32", 0b1, 0b10, 0b0>;
+
+def MVE_VRMULHs8  : MVE_VxMULH<"vrmulh", "s8",  0b0, 0b00, 0b1>;
+def MVE_VRMULHs16 : MVE_VxMULH<"vrmulh", "s16", 0b0, 0b01, 0b1>;
+def MVE_VRMULHs32 : MVE_VxMULH<"vrmulh", "s32", 0b0, 0b10, 0b1>;
+def MVE_VRMULHu8  : MVE_VxMULH<"vrmulh", "u8",  0b1, 0b00, 0b1>;
+def MVE_VRMULHu16 : MVE_VxMULH<"vrmulh", "u16", 0b1, 0b01, 0b1>;
+def MVE_VRMULHu32 : MVE_VxMULH<"vrmulh", "u32", 0b1, 0b10, 0b1>;
+
+class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17,
+                  bits<2> size, bit T, list<dag> pattern=[]>
+  : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+                   (ins MQPR:$Qd_src, MQPR:$Qm), "$Qd, $Qm",
+                   vpred_n, "$Qd = $Qd_src", pattern> {
+
+  let Inst{28} = bit_28;
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = size;
+  let Inst{17} = bit_17;
+  let Inst{16} = 0b1;
+  let Inst{12} = T;
+  let Inst{8} = 0b0;
+  let Inst{7} = !if(!eq(bit_17, 0), 1, 0);
+  let Inst{0} = 0b1;
+}
+
+multiclass MVE_VxMOVxN_halves<string iname, string suffix,
+                              bit bit_28, bit bit_17, bits<2> size> {
+  def bh : MVE_VxMOVxN<iname # "b", suffix, bit_28, bit_17, size, 0b0>;
+  def th : MVE_VxMOVxN<iname # "t", suffix, bit_28, bit_17, size, 0b1>;
+}
+
+defm MVE_VMOVNi16   : MVE_VxMOVxN_halves<"vmovn",   "i16", 0b1, 0b0, 0b00>;
+defm MVE_VMOVNi32   : MVE_VxMOVxN_halves<"vmovn",   "i32", 0b1, 0b0, 0b01>;
+defm MVE_VQMOVNs16  : MVE_VxMOVxN_halves<"vqmovn",  "s16", 0b0, 0b1, 0b00>;
+defm MVE_VQMOVNs32  : MVE_VxMOVxN_halves<"vqmovn",  "s32", 0b0, 0b1, 0b01>;
MVE_VxMOVxN_halves<"vqmovn", "u16", 0b1, 0b1, 0b00>; +defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>; +defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>; +defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>; + +class MVE_VCVT_ff pattern=[]> + : MVE_qDest_qSrc { + let Inst{28} = op; + let Inst{21-16} = 0b111111; + let Inst{12} = T; + let Inst{8-7} = 0b00; + let Inst{0} = 0b1; + + let Predicates = [HasMVEFloat]; +} + +multiclass MVE_VCVT_ff_halves { + def bh : MVE_VCVT_ff<"vcvtb", suffix, op, 0b0>; + def th : MVE_VCVT_ff<"vcvtt", suffix, op, 0b1>; +} + +defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>; +defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>; + +class MVE_VxCADD size, bit halve, + list pattern=[]> + : MVE_qDest_qSrc { + bits<4> Qn; + bit rot; + + let Inst{28} = halve; + let Inst{21-20} = size; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b0; + let Inst{12} = rot; + let Inst{8} = 0b1; + let Inst{7} = Qn{3}; + let Inst{0} = 0b0; +} + +def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>; +def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>; +def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1>; + +def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>; +def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>; +def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0>; + +class MVE_VADCSBC pattern=[]> + : MVE_qDest_qSrc { + bits<4> Qn; + + let Inst{28} = subtract; + let Inst{21-20} = 0b11; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b0; + let Inst{12} = I; + let Inst{8} = 0b1; + let Inst{7} = Qn{3}; + let Inst{0} = 0b0; + + // Custom decoder method in order to add the FPSCR operand(s), which + // Tablegen won't do right + let DecoderMethod = "DecodeMVEVADCInstruction"; +} + +def MVE_VADC : MVE_VADCSBC<"vadc", 0b0, 0b0, (ins cl_FPSCR_NZCV:$carryin)>; +def MVE_VADCI : MVE_VADCSBC<"vadci", 0b1, 0b0, (ins)>; + +def MVE_VSBC : MVE_VADCSBC<"vsbc", 0b0, 0b1, (ins cl_FPSCR_NZCV:$carryin)>; +def MVE_VSBCI : MVE_VADCSBC<"vsbci", 0b1, 0b1, (ins)>; + +class MVE_VQDMULL pattern=[]> + : MVE_qDest_qSrc { + bits<4> Qn; + + let Inst{28} = size; + let Inst{21-20} = 0b11; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b0; + let Inst{12} = T; + let Inst{8} = 0b1; + let Inst{7} = Qn{3}; + let Inst{0} = 0b1; +} + +multiclass MVE_VQDMULL_halves { + def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0>; + def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1>; +} + +defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>; +defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1>; + +// end of mve_qDest_qSrc + +// start of mve_qDest_rSrc + +class MVE_qr_base pattern=[]> + : MVE_p { + bits<4> Qd; + bits<4> Qn; + bits<4> Rm; + + let Inst{25-23} = 0b100; + let Inst{22} = Qd{3}; + let Inst{19-17} = Qn{2-0}; + let Inst{15-13} = Qd{2-0}; + let Inst{11-9} = 0b111; + let Inst{7} = Qn{3}; + let Inst{6} = 0b1; + let Inst{4} = 0b0; + let Inst{3-0} = Rm{3-0}; +} + +class MVE_qDest_rSrc pattern=[]> + : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm), + NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, "", + pattern>; + +class MVE_qDestSrc_rSrc pattern=[]> + : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, rGPR:$Rm), + NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_n, "$Qd = $Qd_src", + pattern>; + +class MVE_qDest_single_rSrc pattern=[]> + : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, rGPR:$Rm), NoItinerary, iname, + suffix, "$Qd, $Rm", vpred_n, "$Qd = $Qd_src", pattern> { + bits<4> Qd; + bits<4> Rm; 
+ + let Inst{22} = Qd{3}; + let Inst{15-13} = Qd{2-0}; + let Inst{3-0} = Rm{3-0}; +} + +class MVE_VADDSUB_qr size, + bit bit_5, bit bit_12, bit bit_16, + bit bit_28, list pattern=[]> + : MVE_qDest_rSrc { + + let Inst{28} = bit_28; + let Inst{21-20} = size; + let Inst{16} = bit_16; + let Inst{12} = bit_12; + let Inst{8} = 0b1; + let Inst{5} = bit_5; +} + +multiclass MVE_VADDSUB_qr_sizes pattern=[]> { + def "8" : MVE_VADDSUB_qr; + def "16" : MVE_VADDSUB_qr; + def "32" : MVE_VADDSUB_qr; +} + +defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>; +defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>; +defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>; + +defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>; +defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>; +defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>; + +class MVE_VQDMULL_qr pattern=[]> + : MVE_qDest_rSrc { + + let Inst{28} = size; + let Inst{21-20} = 0b11; + let Inst{16} = 0b0; + let Inst{12} = T; + let Inst{8} = 0b1; + let Inst{5} = 0b1; +} + +multiclass MVE_VQDMULL_qr_halves { + def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0>; + def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1>; +} + +defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>; +defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1>; + +class MVE_VxADDSUB_qr bits_21_20, bit subtract, + list pattern=[]> + : MVE_qDest_rSrc { + + let Inst{28} = bit_28; + let Inst{21-20} = bits_21_20; + let Inst{16} = 0b0; + let Inst{12} = subtract; + let Inst{8} = 0b1; + let Inst{5} = 0b0; +} + +def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>; +def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>; +def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>; +def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>; +def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>; +def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>; + +def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>; +def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>; +def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>; +def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>; +def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>; +def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>; + +let Predicates = [HasMVEFloat] in { + def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>; + def MVE_VADD_qr_f16 : MVE_VxADDSUB_qr<"vadd", "f16", 0b1, 0b11, 0b0>; + + def MVE_VSUB_qr_f32 : MVE_VxADDSUB_qr<"vsub", "f32", 0b0, 0b11, 0b1>; + def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>; +} + +class MVE_VxSHL_qr size, + bit bit_7, bit bit_17, list pattern=[]> + : MVE_qDest_single_rSrc { + + let Inst{28} = U; + let Inst{25-23} = 0b100; + let Inst{21-20} = 0b11; + let Inst{19-18} = size; + let Inst{17} = bit_17; + let Inst{16} = 0b1; + let Inst{12-8} = 0b11110; + let Inst{7} = bit_7; + let Inst{6-4} = 0b110; +} + +multiclass MVE_VxSHL_qr_types { + def s8 : MVE_VxSHL_qr; + def s16 : MVE_VxSHL_qr; + def s32 : MVE_VxSHL_qr; + def u8 : MVE_VxSHL_qr; + def u16 : MVE_VxSHL_qr; + def u32 : MVE_VxSHL_qr; +} + +defm MVE_VSHL_qr : MVE_VxSHL_qr_types<"vshl", 0b0, 0b0>; +defm MVE_VRSHL_qr : MVE_VxSHL_qr_types<"vrshl", 0b0, 0b1>; +defm MVE_VQSHL_qr : 
MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>; +defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>; + +let Predicates = [HasMVEInt] in { + def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), + (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), + (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), + (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>; + + def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), + (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), + (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), + (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>; +} + +class MVE_VBRSR size, list pattern=[]> + : MVE_qDest_rSrc { + + let Inst{28} = 0b1; + let Inst{21-20} = size; + let Inst{16} = 0b1; + let Inst{12} = 0b1; + let Inst{8} = 0b0; + let Inst{5} = 0b1; +} + +def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>; +def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>; +def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>; + +class MVE_VMUL_qr_int size, list pattern=[]> + : MVE_qDest_rSrc { + + let Inst{28} = 0b0; + let Inst{21-20} = size; + let Inst{16} = 0b1; + let Inst{12} = 0b1; + let Inst{8} = 0b0; + let Inst{5} = 0b1; +} + +def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>; +def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>; +def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>; + +class MVE_VxxMUL_qr bits_21_20, list pattern=[]> + : MVE_qDest_rSrc { + + let Inst{28} = bit_28; + let Inst{21-20} = bits_21_20; + let Inst{16} = 0b1; + let Inst{12} = 0b0; + let Inst{8} = 0b0; + let Inst{5} = 0b1; +} + +def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>; +def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>; +def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>; + +def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>; +def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>; +def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>; + +let Predicates = [HasMVEFloat] in { + def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>; + def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>; +} + +class MVE_VFMAMLA_qr bits_21_20, bit S, + list pattern=[]> + : MVE_qDestSrc_rSrc { + + let Inst{28} = bit_28; + let Inst{21-20} = bits_21_20; + let Inst{16} = 0b1; + let Inst{12} = S; + let Inst{8} = 0b0; + let Inst{5} = 0b0; +} + +def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>; +def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>; +def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>; +def MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>; +def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>; +def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>; + +def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>; +def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>; +def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>; +def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>; +def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>; +def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 
0b1, 0b10, 0b1>; + +let Predicates = [HasMVEFloat] in { + def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>; + def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>; + def MVE_VFMA_qr_Sf16 : MVE_VFMAMLA_qr<"vfmas", "f16", 0b1, 0b11, 0b1>; + def MVE_VFMA_qr_Sf32 : MVE_VFMAMLA_qr<"vfmas", "f32", 0b0, 0b11, 0b1>; +} + +class MVE_VQDMLAH_qr size, + bit bit_5, bit bit_12, list pattern=[]> + : MVE_qDestSrc_rSrc { + + let Inst{28} = U; + let Inst{21-20} = size; + let Inst{16} = 0b0; + let Inst{12} = bit_12; + let Inst{8} = 0b0; + let Inst{5} = bit_5; +} + +multiclass MVE_VQDMLAH_qr_types { + def s8 : MVE_VQDMLAH_qr; + def s16 : MVE_VQDMLAH_qr; + def s32 : MVE_VQDMLAH_qr; +} + +defm MVE_VQDMLAH_qr : MVE_VQDMLAH_qr_types<"vqdmlah", 0b1, 0b0>; +defm MVE_VQRDMLAH_qr : MVE_VQDMLAH_qr_types<"vqrdmlah", 0b0, 0b0>; +defm MVE_VQDMLASH_qr : MVE_VQDMLAH_qr_types<"vqdmlash", 0b1, 0b1>; +defm MVE_VQRDMLASH_qr : MVE_VQDMLAH_qr_types<"vqrdmlash", 0b0, 0b1>; + +class MVE_VxDUP size, bit bit_12, + list pattern=[]> + : MVE_p<(outs MQPR:$Qd, tGPREven:$Rn), + (ins tGPREven:$Rn_src, MVE_VIDUP_imm:$imm), NoItinerary, + iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src", + pattern> { + bits<4> Qd; + bits<4> Rn; + bits<2> imm; + + let Inst{28} = 0b0; + let Inst{25-23} = 0b100; + let Inst{22} = Qd{3}; + let Inst{21-20} = size; + let Inst{19-17} = Rn{3-1}; + let Inst{16} = 0b1; + let Inst{15-13} = Qd{2-0}; + let Inst{12} = bit_12; + let Inst{11-8} = 0b1111; + let Inst{7} = imm{1}; + let Inst{6-1} = 0b110111; + let Inst{0} = imm{0}; +} + +def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>; +def MVE_VIDUPu16 : MVE_VxDUP<"vidup", "u16", 0b01, 0b0>; +def MVE_VIDUPu32 : MVE_VxDUP<"vidup", "u32", 0b10, 0b0>; + +def MVE_VDDUPu8 : MVE_VxDUP<"vddup", "u8", 0b00, 0b1>; +def MVE_VDDUPu16 : MVE_VxDUP<"vddup", "u16", 0b01, 0b1>; +def MVE_VDDUPu32 : MVE_VxDUP<"vddup", "u32", 0b10, 0b1>; + +class MVE_VxWDUP size, bit bit_12, + list pattern=[]> + : MVE_p<(outs MQPR:$Qd, tGPREven:$Rn), + (ins tGPREven:$Rn_src, tGPROdd:$Rm, MVE_VIDUP_imm:$imm), NoItinerary, + iname, suffix, "$Qd, $Rn, $Rm, $imm", vpred_r, "$Rn = $Rn_src", + pattern> { + bits<4> Qd; + bits<4> Rm; + bits<4> Rn; + bits<2> imm; + + let Inst{28} = 0b0; + let Inst{25-23} = 0b100; + let Inst{22} = Qd{3}; + let Inst{21-20} = size; + let Inst{19-17} = Rn{3-1}; + let Inst{16} = 0b1; + let Inst{15-13} = Qd{2-0}; + let Inst{12} = bit_12; + let Inst{11-8} = 0b1111; + let Inst{7} = imm{1}; + let Inst{6-4} = 0b110; + let Inst{3-1} = Rm{3-1}; + let Inst{0} = imm{0}; +} + +def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>; +def MVE_VIWDUPu16 : MVE_VxWDUP<"viwdup", "u16", 0b01, 0b0>; +def MVE_VIWDUPu32 : MVE_VxWDUP<"viwdup", "u32", 0b10, 0b0>; + +def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>; +def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; +def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; + +class MVE_VCTP size, list pattern=[]> + : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, + "$Rn", vpred_n, "", pattern> { + bits<4> Rn; + + let Inst{28-27} = 0b10; + let Inst{26-22} = 0b00000; + let Inst{21-20} = size; + let Inst{19-16} = Rn{3-0}; + let Inst{15-11} = 0b11101; + let Inst{10-0} = 0b00000000001; + let Unpredictable{10-0} = 0b11111111111; + + let Constraints = ""; + let DecoderMethod = "DecodeMveVCTP"; +} + +def MVE_VCTP8 : MVE_VCTP<"8", 0b00>; +def MVE_VCTP16 : MVE_VCTP<"16", 0b01>; +def MVE_VCTP32 : MVE_VCTP<"32", 0b10>; +def MVE_VCTP64 : MVE_VCTP<"64", 0b11>; + +// end of 
mve_qDest_rSrc + +// start of coproc mov + +class MVE_VMOV_64bit + : MVE_VMOV_lane_base { + bits<5> Rt; + bits<5> Rt2; + bits<4> Qd; + bit idx; + bit idx2; + + let Inst{31-23} = 0b111011000; + let Inst{22} = Qd{3}; + let Inst{21} = 0b0; + let Inst{20} = to_qreg; + let Inst{19-16} = Rt2{3-0}; + let Inst{15-13} = Qd{2-0}; + let Inst{12-5} = 0b01111000; + let Inst{4} = idx2; + let Inst{3-0} = Rt{3-0}; +} + +// The assembly syntax for these instructions mentions the vector +// register name twice, e.g. +// +// vmov q2[2], q2[0], r0, r1 +// vmov r0, r1, q2[2], q2[0] +// +// which needs a bit of juggling with MC operand handling. +// +// For the move _into_ a vector register, the MC operand list also has +// to mention the register name twice: once as the output, and once as +// an extra input to represent where the unchanged half of the output +// register comes from (when this instruction is used in code +// generation). So we arrange that the first mention of the vector reg +// in the instruction is considered by the AsmMatcher to be the output +// ($Qd), and the second one is the input ($QdSrc). Binding them +// together with the existing 'tie' constraint is enough to enforce at +// register allocation time that they have to be the same register. +// +// For the move _from_ a vector register, there's no way to get round +// the fact that both instances of that register name have to be +// inputs. They have to be the same register again, but this time, we +// can't use a tie constraint, because that has to be between an +// output and an input operand. So this time, we have to arrange that +// the q-reg appears just once in the MC operand list, in spite of +// being mentioned twice in the asm syntax - which needs a custom +// AsmMatchConverter. + +def MVE_VMOV_q_rr : MVE_VMOV_64bit<(outs MQPR:$Qd), + (ins MQPR:$QdSrc, rGPR:$Rt, rGPR:$Rt2), + 0b1, "$Qd$idx, $QdSrc$idx2, $Rt, $Rt2", + "$Qd = $QdSrc"> { + let DecoderMethod = "DecodeMVEVMOVDRegtoQ"; +} + +def MVE_VMOV_rr_q : MVE_VMOV_64bit<(outs rGPR:$Rt, rGPR:$Rt2), (ins MQPR:$Qd), + 0b0, "$Rt, $Rt2, $Qd$idx, $Qd$idx2", ""> { + let DecoderMethod = "DecodeMVEVMOVQtoDReg"; + let AsmMatchConverter = "cvtMVEVMOVQtoDReg"; +} + +// end of coproc mov + +// start of MVE interleaving load/store + +// Base class for the family of interleaving/deinterleaving +// load/stores with names like VLD20.8 and VST43.32. +class MVE_vldst24_base stage, bits<2> size, + bit load, dag Oops, dag loadIops, dag wbIops, + string iname, string ops, + string cstr, list pattern=[]> + : MVE_MI { + bits<4> VQd; + bits<4> Rn; + + let Inst{31-22} = 0b1111110010; + let Inst{21} = writeback; + let Inst{20} = load; + let Inst{19-16} = Rn; + let Inst{15-13} = VQd{2-0}; + let Inst{12-9} = 0b1111; + let Inst{8-7} = size; + let Inst{6-5} = stage; + let Inst{4-1} = 0b0000; + let Inst{0} = fourregs; + + let mayLoad = load; + let mayStore = !eq(load,0); +} + +// A parameter class used to encapsulate all the ways the writeback +// variants of VLD20 and friends differ from the non-writeback ones. +class MVE_vldst24_writeback { + bit writeback = b; + dag Oops = Oo; + dag Iops = Io; + string syntax = sy; + string cstr = c; + string id_suffix = n; +} + +// Another parameter class that encapsulates the differences between VLD2x +// and VLD4x. +class MVE_vldst24_nvecs s, bit b, RegisterOperand vl> { + int nvecs = n; + list stages = s; + bit bit0 = b; + RegisterOperand VecList = vl; +} + +// A third parameter class that distinguishes VLDnn.8 from .16 from .32. 
+class MVE_vldst24_lanesize b> { + int lanesize = i; + bits<2> sizebits = b; +} + +// A base class for each direction of transfer: one for load, one for +// store. I can't make these a fourth independent parametric tuple +// class, because they have to take the nvecs tuple class as a +// parameter, in order to find the right VecList operand type. + +class MVE_vld24_base pat, bits<2> size, + MVE_vldst24_writeback wb, string iname, + list pattern=[]> + : MVE_vldst24_base; + +class MVE_vst24_base pat, bits<2> size, + MVE_vldst24_writeback wb, string iname, + list pattern=[]> + : MVE_vldst24_base; + +// Actually define all the interleaving loads and stores, by a series +// of nested foreaches over number of vectors (VLD2/VLD4); stage +// within one of those series (VLDx0/VLDx1/VLDx2/VLDx3); size of +// vector lane; writeback or no writeback. +foreach n = [MVE_vldst24_nvecs<2, [0,1], 0, VecList2Q>, + MVE_vldst24_nvecs<4, [0,1,2,3], 1, VecList4Q>] in +foreach stage = n.stages in +foreach s = [MVE_vldst24_lanesize< 8, 0b00>, + MVE_vldst24_lanesize<16, 0b01>, + MVE_vldst24_lanesize<32, 0b10>] in +foreach wb = [MVE_vldst24_writeback< + 1, (outs rGPR:$wb), (ins t2_nosp_addr_offset_none:$Rn), + "!", "$Rn.base = $wb", "_wb">, + MVE_vldst24_writeback<0, (outs), (ins t2_addr_offset_none:$Rn)>] in { + + // For each case within all of those foreaches, define the actual + // instructions. The def names are made by gluing together pieces + // from all the parameter classes, and will end up being things like + // MVE_VLD20_8 and MVE_VST43_16_wb. + + def "MVE_VLD" # n.nvecs # stage # "_" # s.lanesize # wb.id_suffix + : MVE_vld24_base; + + def "MVE_VST" # n.nvecs # stage # "_" # s.lanesize # wb.id_suffix + : MVE_vst24_base; +} + +// end of MVE interleaving load/store + +// start of MVE predicable load/store + +// A parameter class for the direction of transfer. +class MVE_ldst_direction { + bit load = b; + dag Oops = Oo; + dag Iops = Io; + string cstr = c; +} +def MVE_ld: MVE_ldst_direction<1, (outs MQPR:$Qd), (ins), ",@earlyclobber $Qd">; +def MVE_st: MVE_ldst_direction<0, (outs), (ins MQPR:$Qd)>; + +// A parameter class for the size of memory access in a load. +class MVE_memsz e, int s, AddrMode m, string mn, list types> { + bits<2> encoding = e; // opcode bit(s) for encoding + int shift = s; // shift applied to immediate load offset + AddrMode AM = m; + + // For instruction aliases: define the complete list of type + // suffixes at this size, and the canonical ones for loads and + // stores. + string MnemonicLetter = mn; + int TypeBits = !shl(8, s); + string CanonLoadSuffix = ".u" # TypeBits; + string CanonStoreSuffix = "." # TypeBits; + list suffixes = !foreach(letter, types, "." # letter # TypeBits); +} + +// Instances of MVE_memsz. +// +// (memD doesn't need an AddrMode, because those are only for +// contiguous loads, and memD is only used by gather/scatters.) +def MVE_memB: MVE_memsz<0b00, 0, AddrModeT2_i7, "b", ["", "u", "s"]>; +def MVE_memH: MVE_memsz<0b01, 1, AddrModeT2_i7s2, "h", ["", "u", "s", "f"]>; +def MVE_memW: MVE_memsz<0b10, 2, AddrModeT2_i7s4, "w", ["", "u", "s", "f"]>; +def MVE_memD: MVE_memsz<0b11, 3, ?, "d", ["", "u", "s", "f"]>; + +// This is the base class for all the MVE loads and stores other than +// the interleaving ones. All the non-interleaving loads/stores share +// the characteristic that they operate on just one vector register, +// so they are VPT-predicable. +// +// The predication operand is vpred_n, for both loads and stores. 
For +// store instructions, the reason is obvious: if there is no output +// register, there can't be a need for an input parameter giving the +// output register's previous value. Load instructions also don't need +// that input parameter, because unlike MVE data processing +// instructions, predicated loads are defined to set the inactive +// lanes of the output register to zero, instead of preserving their +// input values. +class MVE_VLDRSTR_base pattern=[]> + : MVE_p { + bits<3> Qd; + + let Inst{28} = U; + let Inst{25} = 0b0; + let Inst{24} = P; + let Inst{22} = 0b0; + let Inst{21} = W; + let Inst{20} = dir.load; + let Inst{15-13} = Qd{2-0}; + let Inst{12} = opc; + let Inst{11-9} = 0b111; + + let mayLoad = dir.load; + let mayStore = !eq(dir.load,0); +} + +// Contiguous load and store instructions. These come in two main +// categories: same-size loads/stores in which 128 bits of vector +// register is transferred to or from 128 bits of memory in the most +// obvious way, and widening loads / narrowing stores, in which the +// size of memory accessed is less than the size of a vector register, +// so the load instructions sign- or zero-extend each memory value +// into a wider vector lane, and the store instructions truncate +// correspondingly. +// +// The instruction mnemonics for these two classes look reasonably +// similar, but the actual encodings are different enough to need two +// separate base classes. + +// Contiguous, same size +class MVE_VLDRSTR_cs + : MVE_VLDRSTR_base { + bits<12> addr; + let Inst{23} = addr{7}; + let Inst{19-16} = addr{11-8}; + let Inst{8-7} = memsz.encoding; + let Inst{6-0} = addr{6-0}; +} + +// Contiguous, widening/narrowing +class MVE_VLDRSTR_cw size, dag oops, dag iops, + string asm, string suffix, IndexMode im, + string ops, string cstr> + : MVE_VLDRSTR_base { + bits<11> addr; + let Inst{23} = addr{7}; + let Inst{19} = memsz.encoding{0}; // enough to tell 16- from 32-bit + let Inst{18-16} = addr{10-8}; + let Inst{8-7} = size; + let Inst{6-0} = addr{6-0}; + + let IM = im; +} + +// Multiclass wrapper on each of the _cw and _cs base classes, to +// generate three writeback modes (none, preindex, postindex). 
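+//
+// Illustrative only (these assembly forms are not part of the original
+// patch text; they follow from the operand strings in the multiclasses
+// below, assuming standard MVE syntax): the three writeback modes of a
+// widening load look like
+//
+//   vldrh.u32 q0, [r0]        ; no writeback
+//   vldrh.u32 q0, [r0, #8]!   ; preindex: r0 updated before the access
+//   vldrh.u32 q0, [r0], #8    ; postindex: r0 updated after the access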
+ +multiclass MVE_VLDRSTR_cw_m size> { + let AM = memsz.AM in { + def "" : MVE_VLDRSTR_cw< + dir, memsz, U, 1, 0, size, + dir.Oops, !con(dir.Iops, (ins taddrmode_imm7:$addr)), + asm, suffix, IndexModeNone, "$Qd, $addr", "">; + + def _pre : MVE_VLDRSTR_cw< + dir, memsz, U, 1, 1, size, + !con((outs tGPR:$wb), dir.Oops), + !con(dir.Iops, (ins taddrmode_imm7:$addr)), + asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> { + let DecoderMethod = "DecodeMVE_MEM_1_pre<"#memsz.shift#">"; + } + + def _post : MVE_VLDRSTR_cw< + dir, memsz, U, 0, 1, size, + !con((outs tGPR:$wb), dir.Oops), + !con(dir.Iops, (ins t_addr_offset_none:$Rn, + t2am_imm7_offset:$addr)), + asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> { + bits<4> Rn; + let Inst{18-16} = Rn{2-0}; + } + } +} + +multiclass MVE_VLDRSTR_cs_m { + let AM = memsz.AM in { + def "" : MVE_VLDRSTR_cs< + dir, memsz, 1, 0, + dir.Oops, !con(dir.Iops, (ins t2addrmode_imm7:$addr)), + asm, suffix, IndexModeNone, "$Qd, $addr", "">; + + def _pre : MVE_VLDRSTR_cs< + dir, memsz, 1, 1, + !con((outs rGPR:$wb), dir.Oops), + !con(dir.Iops, (ins t2addrmode_imm7_pre:$addr)), + asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> { + let DecoderMethod = "DecodeMVE_MEM_2_pre<"#memsz.shift#">"; + } + + def _post : MVE_VLDRSTR_cs< + dir, memsz, 0, 1, + !con((outs rGPR:$wb), dir.Oops), + // We need an !if here to select the base register class, + // because it's legal to write back to SP in a load of this + // type, but not in a store. + !con(dir.Iops, (ins !if(dir.load, t2_addr_offset_none, + t2_nosp_addr_offset_none):$Rn, + t2am_imm7_offset:$addr)), + asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> { + bits<4> Rn; + let Inst{19-16} = Rn{3-0}; + } + } +} + +// Now actually declare all the contiguous load/stores, via those +// multiclasses. The instruction ids coming out of this are the bare +// names shown in the defm, with _pre or _post appended for writeback, +// e.g. MVE_VLDRBS16, MVE_VSTRB16_pre, MVE_VSTRHU16_post. + +defm MVE_VLDRBS16: MVE_VLDRSTR_cw_m; +defm MVE_VLDRBS32: MVE_VLDRSTR_cw_m; +defm MVE_VLDRBU16: MVE_VLDRSTR_cw_m; +defm MVE_VLDRBU32: MVE_VLDRSTR_cw_m; +defm MVE_VLDRHS32: MVE_VLDRSTR_cw_m; +defm MVE_VLDRHU32: MVE_VLDRSTR_cw_m; + +defm MVE_VLDRBU8: MVE_VLDRSTR_cs_m; +defm MVE_VLDRHU16: MVE_VLDRSTR_cs_m; +defm MVE_VLDRWU32: MVE_VLDRSTR_cs_m; + +defm MVE_VSTRB16: MVE_VLDRSTR_cw_m; +defm MVE_VSTRB32: MVE_VLDRSTR_cw_m; +defm MVE_VSTRH32: MVE_VLDRSTR_cw_m; + +defm MVE_VSTRBU8 : MVE_VLDRSTR_cs_m; +defm MVE_VSTRHU16: MVE_VLDRSTR_cs_m; +defm MVE_VSTRWU32: MVE_VLDRSTR_cs_m; + +// Gather loads / scatter stores whose address operand is of the form +// [Rn,Qm], i.e. a single GPR as the common base address, plus a +// vector of offset from it. ('Load/store this sequence of elements of +// the same array.') +// +// Like the contiguous family, these loads and stores can widen the +// loaded values / truncate the stored ones, or they can just +// load/store the same size of memory and vector lane. But unlike the +// contiguous family, there's no particular difference in encoding +// between those two cases. +// +// This family also comes with the option to scale the offset values +// in Qm by the size of the loaded memory (i.e. to treat them as array +// indices), or not to scale them (to treat them as plain byte offsets +// in memory, so that perhaps the loaded values are unaligned). The +// scaled instructions' address operand in assembly looks like +// [Rn,Qm,UXTW #2] or similar. + +// Base class. 
+class MVE_VLDRSTR_rq size, bit os, string asm, string suffix, int shift> + : MVE_VLDRSTR_base:$addr)), + asm, suffix, "$Qd, $addr", dir.cstr> { + bits<7> addr; + let Inst{23} = 0b1; + let Inst{19-16} = addr{6-3}; + let Inst{8-7} = size; + let Inst{6} = memsz.encoding{1}; + let Inst{5} = 0; + let Inst{4} = memsz.encoding{0}; + let Inst{3-1} = addr{2-0}; + let Inst{0} = os; +} + +// Multiclass that defines the scaled and unscaled versions of an +// instruction, when the memory size is wider than a byte. The scaled +// version gets the default name like MVE_VLDRBU16_rq; the unscaled / +// potentially unaligned version gets a "_u" suffix, e.g. +// MVE_VLDRBU16_rq_u. +multiclass MVE_VLDRSTR_rq_w size> { + def _u : MVE_VLDRSTR_rq; + def "" : MVE_VLDRSTR_rq; +} + +// Subclass of MVE_VLDRSTR_rq with the same API as that multiclass, +// for use when the memory size is one byte, so there's no 'scaled' +// version of the instruction at all. (This is encoded as if it were +// unscaled, but named in the default way with no _u suffix.) +class MVE_VLDRSTR_rq_b size> + : MVE_VLDRSTR_rq; + +// Actually define all the loads and stores in this family. + +def MVE_VLDRBU8_rq : MVE_VLDRSTR_rq_b; +def MVE_VLDRBU16_rq: MVE_VLDRSTR_rq_b; +def MVE_VLDRBS16_rq: MVE_VLDRSTR_rq_b; +def MVE_VLDRBU32_rq: MVE_VLDRSTR_rq_b; +def MVE_VLDRBS32_rq: MVE_VLDRSTR_rq_b; + +defm MVE_VLDRHU16_rq: MVE_VLDRSTR_rq_w; +defm MVE_VLDRHU32_rq: MVE_VLDRSTR_rq_w; +defm MVE_VLDRHS32_rq: MVE_VLDRSTR_rq_w; +defm MVE_VLDRWU32_rq: MVE_VLDRSTR_rq_w; +defm MVE_VLDRDU64_rq: MVE_VLDRSTR_rq_w; + +def MVE_VSTRB8_rq : MVE_VLDRSTR_rq_b; +def MVE_VSTRB16_rq : MVE_VLDRSTR_rq_b; +def MVE_VSTRB32_rq : MVE_VLDRSTR_rq_b; + +defm MVE_VSTRH16_rq : MVE_VLDRSTR_rq_w; +defm MVE_VSTRH32_rq : MVE_VLDRSTR_rq_w; +defm MVE_VSTRW32_rq : MVE_VLDRSTR_rq_w; +defm MVE_VSTRD64_rq : MVE_VLDRSTR_rq_w; + +// Gather loads / scatter stores whose address operand is of the form +// [Qm,#imm], i.e. a vector containing a full base address for each +// loaded item, plus an immediate offset applied consistently to all +// of them. ('Load/store the same field from this vector of pointers +// to a structure type.') +// +// This family requires the vector lane size to be at least 32 bits +// (so there's room for an address in each lane at all). It has no +// widening/narrowing variants. But it does support preindex +// writeback, in which the address vector is updated to hold the +// addresses actually loaded from. + +// Base class. +class MVE_VLDRSTR_qi + : MVE_VLDRSTR_base:$addr)), + asm, suffix, "$Qd, $addr" # wbAsm, cstr # dir.cstr> { + bits<11> addr; + let Inst{23} = addr{7}; + let Inst{19-17} = addr{10-8}; + let Inst{16} = 0; + let Inst{8} = memsz.encoding{0}; // enough to distinguish 32- from 64-bit + let Inst{7} = 0; + let Inst{6-0} = addr{6-0}; +} + +// Multiclass that generates the non-writeback and writeback variants. +multiclass MVE_VLDRSTR_qi_m { + def "" : MVE_VLDRSTR_qi; + def _pre : MVE_VLDRSTR_qi { + let DecoderMethod="DecodeMVE_MEM_3_pre<"#memsz.shift#">"; + } +} + +// Actual instruction definitions. +defm MVE_VLDRWU32_qi: MVE_VLDRSTR_qi_m; +defm MVE_VLDRDU64_qi: MVE_VLDRSTR_qi_m; +defm MVE_VSTRW32_qi: MVE_VLDRSTR_qi_m; +defm MVE_VSTRD64_qi: MVE_VLDRSTR_qi_m; + +// Define aliases for all the instructions where memory size and +// vector lane size are the same. These are mnemonic aliases, so they +// apply consistently across all of the above families - contiguous +// loads, and both the rq and qi types of gather/scatter. 
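+//
+// For example (an illustrative expansion, not text from the original
+// patch): given the MVE_memH suffix list above, the aliases make the
+// assembler accept
+//
+//   vldrh.s16 q0, [r0]
+//
+// and encode it exactly as the canonical
+//
+//   vldrh.u16 q0, [r0]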
+// +// Rationale: As long as you're loading (for example) 16-bit memory +// values into 16-bit vector lanes, you can think of them as signed or +// unsigned integers, fp16 or just raw 16-bit blobs and it makes no +// difference. So we permit all of vldrh.16, vldrh.u16, vldrh.s16, +// vldrh.f16 and treat them all as equivalent to the canonical +// spelling (which happens to be .u16 for loads, and just .16 for +// stores). + +foreach vpt_cond = ["", "t", "e"] in +foreach memsz = [MVE_memB, MVE_memH, MVE_memW, MVE_memD] in +foreach suffix = memsz.suffixes in { + + // These foreaches are conceptually ifs, implemented by iterating a + // dummy variable over a list with 0 or 1 elements depending on the + // condition. The idea is to iterate over _nearly_ all the suffixes + // in memsz.suffixes, but omit the one we want all the others to alias. + + foreach _ = !if(!ne(suffix, memsz.CanonLoadSuffix), [1], []) in + def : MnemonicAlias< + "vldr" # memsz.MnemonicLetter # vpt_cond # suffix, + "vldr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonLoadSuffix>; + + foreach _ = !if(!ne(suffix, memsz.CanonStoreSuffix), [1], []) in + def : MnemonicAlias< + "vstr" # memsz.MnemonicLetter # vpt_cond # suffix, + "vstr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonStoreSuffix>; +} + +// end of MVE predicable load/store + +class MVE_VPT size, dag iops, string asm, list pattern=[]> + : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", pattern> { + bits<3> fc; + bits<4> Mk; + bits<3> Qn; + + let Inst{31-23} = 0b111111100; + let Inst{22} = Mk{3}; + let Inst{21-20} = size; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b1; + let Inst{15-13} = Mk{2-0}; + let Inst{12} = fc{2}; + let Inst{11-8} = 0b1111; + let Inst{7} = fc{0}; + let Inst{4} = 0b0; + + let Defs = [VPR, P0]; +} + +class MVE_VPTt1 size, dag iops> + : MVE_VPT { + bits<4> Qm; + bits<4> Mk; + + let Inst{6} = 0b0; + let Inst{5} = Qm{3}; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = fc{1}; +} + +class MVE_VPTt1i size> + : MVE_VPTt1 { + let Inst{12} = 0b0; + let Inst{0} = 0b0; +} + +def MVE_VPTv4i32 : MVE_VPTt1i<"i32", 0b10>; +def MVE_VPTv8i16 : MVE_VPTt1i<"i16", 0b01>; +def MVE_VPTv16i8 : MVE_VPTt1i<"i8", 0b00>; + +class MVE_VPTt1u size> + : MVE_VPTt1 { + let Inst{12} = 0b0; + let Inst{0} = 0b1; +} + +def MVE_VPTv4u32 : MVE_VPTt1u<"u32", 0b10>; +def MVE_VPTv8u16 : MVE_VPTt1u<"u16", 0b01>; +def MVE_VPTv16u8 : MVE_VPTt1u<"u8", 0b00>; + +class MVE_VPTt1s size> + : MVE_VPTt1 { + let Inst{12} = 0b1; +} + +def MVE_VPTv4s32 : MVE_VPTt1s<"s32", 0b10>; +def MVE_VPTv8s16 : MVE_VPTt1s<"s16", 0b01>; +def MVE_VPTv16s8 : MVE_VPTt1s<"s8", 0b00>; + +class MVE_VPTt2 size, dag iops> + : MVE_VPT { + bits<4> Rm; + bits<3> fc; + bits<4> Mk; + + let Inst{6} = 0b1; + let Inst{5} = fc{1}; + let Inst{3-0} = Rm{3-0}; +} + +class MVE_VPTt2i size> + : MVE_VPTt2 { + let Inst{12} = 0b0; + let Inst{5} = 0b0; +} + +def MVE_VPTv4i32r : MVE_VPTt2i<"i32", 0b10>; +def MVE_VPTv8i16r : MVE_VPTt2i<"i16", 0b01>; +def MVE_VPTv16i8r : MVE_VPTt2i<"i8", 0b00>; + +class MVE_VPTt2u size> + : MVE_VPTt2 { + let Inst{12} = 0b0; + let Inst{5} = 0b1; +} + +def MVE_VPTv4u32r : MVE_VPTt2u<"u32", 0b10>; +def MVE_VPTv8u16r : MVE_VPTt2u<"u16", 0b01>; +def MVE_VPTv16u8r : MVE_VPTt2u<"u8", 0b00>; + +class MVE_VPTt2s size> + : MVE_VPTt2 { + let Inst{12} = 0b1; +} + +def MVE_VPTv4s32r : MVE_VPTt2s<"s32", 0b10>; +def MVE_VPTv8s16r : MVE_VPTt2s<"s16", 0b01>; +def MVE_VPTv16s8r : MVE_VPTt2s<"s8", 0b00>; + + +class MVE_VPTf pattern=[]> + : MVE_MI<(outs ), iops, NoItinerary, 
!strconcat("vpt", "${Mk}", ".", suffix), asm, + "", pattern> { + bits<3> fc; + bits<4> Mk; + bits<3> Qn; + + let Inst{31-29} = 0b111; + let Inst{28} = size; + let Inst{27-23} = 0b11100; + let Inst{22} = Mk{3}; + let Inst{21-20} = 0b11; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b1; + let Inst{15-13} = Mk{2-0}; + let Inst{12} = fc{2}; + let Inst{11-8} = 0b1111; + let Inst{7} = fc{0}; + let Inst{4} = 0b0; + + let Defs = [P0]; + let Predicates = [HasMVEFloat]; +} + +class MVE_VPTft1 + : MVE_VPTf { + bits<3> fc; + bits<4> Qm; + + let Inst{6} = 0b0; + let Inst{5} = Qm{3}; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = fc{1}; +} + +def MVE_VPTv4f32 : MVE_VPTft1<"f32", 0b0>; +def MVE_VPTv8f16 : MVE_VPTft1<"f16", 0b1>; + +class MVE_VPTft2 + : MVE_VPTf { + bits<3> fc; + bits<4> Rm; + + let Inst{6} = 0b1; + let Inst{5} = fc{1}; + let Inst{3-0} = Rm{3-0}; +} + +def MVE_VPTv4f32r : MVE_VPTft2<"f32", 0b0>; +def MVE_VPTv8f16r : MVE_VPTft2<"f16", 0b1>; + +def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary, + !strconcat("vpst", "${Mk}"), "", "", []> { + bits<4> Mk; + + let Inst{31-23} = 0b111111100; + let Inst{22} = Mk{3}; + let Inst{21-16} = 0b110001; + let Inst{15-13} = Mk{2-0}; + let Inst{12-0} = 0b0111101001101; + let Unpredictable{12} = 0b1; + let Unpredictable{7} = 0b1; + let Unpredictable{5} = 0b1; + + let Defs = [P0]; +} + +def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, + "vpsel", "", "$Qd, $Qn, $Qm", vpred_n, "", []> { + bits<4> Qn; + bits<4> Qd; + bits<4> Qm; + + let Inst{28} = 0b1; + let Inst{25-23} = 0b100; + let Inst{22} = Qd{3}; + let Inst{21-20} = 0b11; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b1; + let Inst{15-13} = Qd{2-0}; + let Inst{12-9} = 0b0111; + let Inst{8} = 0b1; + let Inst{7} = Qn{3}; + let Inst{6} = 0b0; + let Inst{5} = Qm{3}; + let Inst{4} = 0b0; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = 0b1; +} + +foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32", + "i8", "i16", "i32", "f16", "f32"] in +def : MVEInstAlias<"vpsel${vp}." 
# suffix # "\t$Qd, $Qn, $Qm", + (MVE_VPSEL MQPR:$Qd, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; + +def MVE_VPNOT : MVE_p<(outs), (ins), NoItinerary, + "vpnot", "", "", vpred_n, "", []> { + let Inst{31-0} = 0b11111110001100010000111101001101; + let Unpredictable{19-17} = 0b111; + let Unpredictable{12} = 0b1; + let Unpredictable{7} = 0b1; + let Unpredictable{5} = 0b1; + let Defs = [P0]; + let Uses = [P0]; + + let Constraints = ""; +} + +class MVE_loltp_start size> + : t2LOL<(outs GPRlr:$LR), iops, asm, ops> { + bits<4> Rn; + let Predicates = [HasMVEInt]; + let Inst{22} = 0b0; + let Inst{21-20} = size; + let Inst{19-16} = Rn{3-0}; + let Inst{12} = 0b0; +} + +class MVE_DLSTP size> + : MVE_loltp_start<(ins rGPR:$Rn), asm, "$LR, $Rn", size> { + let Inst{13} = 0b1; + let Inst{11-1} = 0b00000000000; + let Unpredictable{10-1} = 0b1111111111; +} + +class MVE_WLSTP size> + : MVE_loltp_start<(ins rGPR:$Rn, wlslabel_u11:$label), + asm, "$LR, $Rn, $label", size> { + bits<11> label; + let Inst{13} = 0b0; + let Inst{11} = label{0}; + let Inst{10-1} = label{10-1}; +} + +def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>; +def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>; +def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>; +def MVE_DLSTP_64 : MVE_DLSTP<"dlstp.64", 0b11>; + +def MVE_WLSTP_8 : MVE_WLSTP<"wlstp.8", 0b00>; +def MVE_WLSTP_16 : MVE_WLSTP<"wlstp.16", 0b01>; +def MVE_WLSTP_32 : MVE_WLSTP<"wlstp.32", 0b10>; +def MVE_WLSTP_64 : MVE_WLSTP<"wlstp.64", 0b11>; + +class MVE_loltp_end + : t2LOL { + let Predicates = [HasMVEInt]; + let Inst{22-21} = 0b00; + let Inst{19-16} = 0b1111; + let Inst{12} = 0b0; +} + +def MVE_LETP : MVE_loltp_end<(outs GPRlr:$LRout), + (ins GPRlr:$LRin, lelabel_u11:$label), + "letp", "$LRin, $label"> { + bits<11> label; + let Inst{20} = 0b1; + let Inst{13} = 0b0; + let Inst{11} = label{0}; + let Inst{10-1} = label{10-1}; +} + +def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { + let Inst{20} = 0b0; + let Inst{13} = 0b1; + let Inst{11-1} = 0b00000000000; + let Unpredictable{21-20} = 0b11; + let Unpredictable{11-1} = 0b11111111111; +} + + +//===----------------------------------------------------------------------===// +// Patterns +//===----------------------------------------------------------------------===// + +class MVE_unpred_vector_store_typed + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr)>; + +multiclass MVE_unpred_vector_store { + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; +} + +class MVE_unpred_vector_load_typed + : Pat<(Ty (LoadKind t2addrmode_imm7:$addr)), + (Ty (RegImmInst t2addrmode_imm7:$addr))>; + +multiclass MVE_unpred_vector_load { + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; +} + +let Predicates = [HasMVEInt, IsLE] in { + defm : MVE_unpred_vector_store; + defm : MVE_unpred_vector_store; + defm : MVE_unpred_vector_store; + + defm : MVE_unpred_vector_load; + defm : MVE_unpred_vector_load; + defm : MVE_unpred_vector_load; + + def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)), + (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; + def : Pat<(v8i1 (load 
t2addrmode_imm7<2>:$addr)), + (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; + def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)), + (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; +} + +let Predicates = [HasMVEInt, IsBE] in { + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; +} + + +// Widening/Narrowing Loads/Stores + +let Predicates = [HasMVEInt] in { + def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<1>:$addr), + (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<1>:$addr)>; + def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr), + (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<1>:$addr)>; + def : Pat<(truncstorevi16 (v4i32 MQPR:$val), t2addrmode_imm7<2>:$addr), + (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<2>:$addr)>; +} + +multiclass MVEExtLoad { + def _Any : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) + (!cast("extloadvi" # SrcElemBits) am:$addr)), + (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) + am:$addr)>; + def _Z : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) + (!cast("zextloadvi" # SrcElemBits) am:$addr)), + (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) + am:$addr)>; + def _S : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) + (!cast("sextloadvi" # SrcElemBits) am:$addr)), + (!cast("MVE_VLDR" # SrcElemType # "S" # DestElemBits) + am:$addr)>; +} + +let Predicates = [HasMVEInt] in { + defm : MVEExtLoad<"4", "32", "8", "B", t2addrmode_imm7<1>>; + defm : MVEExtLoad<"8", "16", "8", "B", t2addrmode_imm7<1>>; + defm : MVEExtLoad<"4", "32", "16", "H", t2addrmode_imm7<2>>; +} + + +// Bit convert patterns + +let Predicates = [HasMVEInt] in { + def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; + + def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; + + def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>; +} + +let Predicates = [IsLE,HasMVEInt] in { + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; + + def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; + + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; + + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 
(bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; + + def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>; + + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; + + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; +} diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 96986e74415b..806681df102c 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -1,9 +1,8 @@ //===-- ARMInstrNEON.td - NEON support for ARM -------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -497,45 +496,30 @@ def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>; // Types for vector shift by immediates. The "SHX" version is for long and // narrow operations where the source and destination vectors have different // types. The "SHINS" version is for shift and insert operations. 
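+// For example (added here for illustration; not part of the original
+// patch): VSHRN is an "SHX"-type node, narrowing each v8i16 lane to a
+// v8i8 lane while shifting, so source and destination types differ;
+// VSLI is an "SHINS"-type node, shifting the source and inserting the
+// result into the destination, which therefore also appears as an
+// extra input operand.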
-def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, - SDTCisVT<2, i32>]>; -def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, - SDTCisVT<2, i32>]>; -def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; - -def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>; -def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>; -def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>; -def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>; +def SDTARMVSHXIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHINSIMM : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; -def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>; -def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>; -def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>; +def NEONvshrnImm : SDNode<"ARMISD::VSHRNIMM", SDTARMVSHXIMM>; -def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>; -def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>; -def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>; -def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>; -def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>; -def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>; +def NEONvrshrsImm : SDNode<"ARMISD::VRSHRsIMM", SDTARMVSHIMM>; +def NEONvrshruImm : SDNode<"ARMISD::VRSHRuIMM", SDTARMVSHIMM>; +def NEONvrshrnImm : SDNode<"ARMISD::VRSHRNIMM", SDTARMVSHXIMM>; -def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>; -def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>; -def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>; +def NEONvqshlsImm : SDNode<"ARMISD::VQSHLsIMM", SDTARMVSHIMM>; +def NEONvqshluImm : SDNode<"ARMISD::VQSHLuIMM", SDTARMVSHIMM>; +def NEONvqshlsuImm : SDNode<"ARMISD::VQSHLsuIMM", SDTARMVSHIMM>; +def NEONvqshrnsImm : SDNode<"ARMISD::VQSHRNsIMM", SDTARMVSHXIMM>; +def NEONvqshrnuImm : SDNode<"ARMISD::VQSHRNuIMM", SDTARMVSHXIMM>; +def NEONvqshrnsuImm : SDNode<"ARMISD::VQSHRNsuIMM", SDTARMVSHXIMM>; -def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>; -def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>; +def NEONvqrshrnsImm : SDNode<"ARMISD::VQRSHRNsIMM", SDTARMVSHXIMM>; +def NEONvqrshrnuImm : SDNode<"ARMISD::VQRSHRNuIMM", SDTARMVSHXIMM>; +def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuIMM", SDTARMVSHXIMM>; -def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, - SDTCisVT<2, i32>]>; -def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; -def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; - -def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; -def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; -def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; -def NEONvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>; +def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>; +def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>; def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; @@ -548,23 +532,10 @@ def NEONvbsl : SDNode<"ARMISD::VBSL", SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>>; -def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; - -// VDUPLANE can produce a quad-register result from a double-register source, -// so the result is not constrained to match the source. 
-def NEONvduplane : SDNode<"ARMISD::VDUPLANE", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisVT<2, i32>]>>; - def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>; -def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; -def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; -def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; -def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; - def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; @@ -585,14 +556,14 @@ def NEONvtbl1 : SDNode<"ARMISD::VTBL1", SDTARMVTBL1>; def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>; -def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ +def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast(N->getOperand(0)); unsigned EltBits = 0; uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); return (EltBits == 32 && EltVal == 0); }]>; -def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{ +def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast(N->getOperand(0)); unsigned EltBits = 0; uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); @@ -1118,6 +1089,13 @@ def VLD1LNq8Pseudo : VLD1QLNPseudo; def VLD1LNq16Pseudo : VLD1QLNPseudo; def VLD1LNq32Pseudo : VLD1QLNPseudo; +let Predicates = [HasNEON] in { +def : Pat<(vector_insert (v4f16 DPR:$src), + (f16 (load addrmode6:$addr)), imm:$lane), + (VLD1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>; +def : Pat<(vector_insert (v8f16 QPR:$src), + (f16 (load addrmode6:$addr)), imm:$lane), + (VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; def : Pat<(vector_insert (v2f32 DPR:$src), (f32 (load addrmode6:$addr)), imm:$lane), (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; @@ -1139,6 +1117,7 @@ def : Pat<(insert_subvector undef, (v4f16 DPR:$src), (i32 0)), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), DPR:$src, dsub_0)>; def : Pat<(insert_subvector (v16i8 undef), (v8i8 DPR:$src), (i32 0)), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), DPR:$src, dsub_0)>; +} let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { @@ -1404,7 +1383,7 @@ class VLD1DUP op7_4, string Dt, ValueType Ty, PatFrag LoadOp, (ins AddrMode:$Rn), IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", [(set VecListOneDAllLanes:$Vd, - (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]>, + (Ty (ARMvdup (i32 (LoadOp AddrMode:$Rn)))))]>, Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; @@ -1417,8 +1396,10 @@ def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16, def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load, addrmode6dupalign32>; -def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), +let Predicates = [HasNEON] in { +def : Pat<(v2f32 (ARMvdup (f32 (load addrmode6dup:$addr)))), (VLD1DUPd32 addrmode6:$addr)>; +} class VLD1QDUP op7_4, string Dt, ValueType Ty, PatFrag LoadOp, Operand AddrMode> @@ -1426,7 +1407,7 @@ class VLD1QDUP op7_4, string Dt, ValueType Ty, PatFrag LoadOp, (ins AddrMode:$Rn), IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", [(set VecListDPairAllLanes:$Vd, - (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> { + (Ty (ARMvdup (i32 (LoadOp AddrMode:$Rn)))))]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1439,8 +1420,10 @@ def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16, def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, 
"32", v4i32, load, addrmode6dupalign32>; -def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), +let Predicates = [HasNEON] in { +def : Pat<(v4f32 (ARMvdup (f32 (load addrmode6dup:$addr)))), (VLD1DUPq32 addrmode6:$addr)>; +} let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { // ...with address register writeback: @@ -2152,11 +2135,11 @@ class VST1QLNPseudo } def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8, - NEONvgetlaneu, addrmode6> { + ARMvgetlaneu, addrmode6> { let Inst{7-5} = lane{2-0}; } def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, - NEONvgetlaneu, addrmode6> { + ARMvgetlaneu, addrmode6> { let Inst{7-6} = lane{1-0}; let Inst{4} = Rn{4}; } @@ -2167,15 +2150,22 @@ def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt, let Inst{5-4} = Rn{5-4}; } -def VST1LNq8Pseudo : VST1QLNPseudo; -def VST1LNq16Pseudo : VST1QLNPseudo; +def VST1LNq8Pseudo : VST1QLNPseudo; +def VST1LNq16Pseudo : VST1QLNPseudo; def VST1LNq32Pseudo : VST1QLNPseudo; +let Predicates = [HasNEON] in { def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr), (VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr), (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; +def : Pat<(store (extractelt (v4f16 DPR:$src), imm:$lane), addrmode6:$addr), + (VST1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>; +def : Pat<(store (extractelt (v8f16 QPR:$src), imm:$lane), addrmode6:$addr), + (VST1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; +} + // ...with address register writeback: class VST1LNWB op11_8, bits<4> op7_4, string Dt, ValueType Ty, PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode> @@ -2196,11 +2186,11 @@ class VST1QLNWBPseudo } def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8, - NEONvgetlaneu, addrmode6> { + ARMvgetlaneu, addrmode6> { let Inst{7-5} = lane{2-0}; } def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16, - NEONvgetlaneu, addrmode6> { + ARMvgetlaneu, addrmode6> { let Inst{7-6} = lane{1-0}; let Inst{4} = Rn{4}; } @@ -2210,8 +2200,8 @@ def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store, let Inst{5-4} = Rn{5-4}; } -def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo; -def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo; +def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo; +def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo; def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo; let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { @@ -2440,37 +2430,45 @@ def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo, Sched<[WriteVST2]>; } // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 // Use vld1/vst1 for unaligned f64 load / store +let Predicates = [IsLE,HasNEON] in { def : Pat<(f64 (hword_alignedload addrmode6:$addr)), - (VLD1d16 addrmode6:$addr)>, Requires<[IsLE]>; + (VLD1d16 addrmode6:$addr)>; def : Pat<(hword_alignedstore (f64 DPR:$value), addrmode6:$addr), - (VST1d16 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>; + (VST1d16 addrmode6:$addr, DPR:$value)>; def : Pat<(f64 (byte_alignedload addrmode6:$addr)), - (VLD1d8 addrmode6:$addr)>, Requires<[IsLE]>; + (VLD1d8 addrmode6:$addr)>; def : Pat<(byte_alignedstore (f64 DPR:$value), addrmode6:$addr), - (VST1d8 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>; + (VST1d8 addrmode6:$addr, DPR:$value)>; +} +let Predicates = [IsBE,HasNEON] in { def : Pat<(f64 (non_word_alignedload addrmode6:$addr)), - (VLD1d64 addrmode6:$addr)>, Requires<[IsBE]>; + (VLD1d64 
addrmode6:$addr)>; def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr), - (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>; + (VST1d64 addrmode6:$addr, DPR:$value)>; +} // Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64 // load / store if it's legal. +let Predicates = [HasNEON] in { def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)), (VLD1q64 addrmode6:$addr)>; def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), (VST1q64 addrmode6:$addr, QPR:$value)>; +} +let Predicates = [IsLE,HasNEON] in { def : Pat<(v2f64 (word_alignedload addrmode6:$addr)), - (VLD1q32 addrmode6:$addr)>, Requires<[IsLE]>; + (VLD1q32 addrmode6:$addr)>; def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr), - (VST1q32 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; + (VST1q32 addrmode6:$addr, QPR:$value)>; def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)), - (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>; + (VLD1q16 addrmode6:$addr)>; def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), - (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; + (VST1q16 addrmode6:$addr, QPR:$value)>; def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)), - (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>; + (VLD1q8 addrmode6:$addr)>; def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr), - (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; + (VST1q8 addrmode6:$addr, QPR:$value)>; +} //===----------------------------------------------------------------------===// // NEON pattern fragments @@ -2505,6 +2503,13 @@ def SSubReg_f32_reg : SDNodeXForm; +// Extract S sub-registers of Q/D registers containing a given f16 lane. +def SSubReg_f16_reg : SDNodeXFormgetTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N), + MVT::i32); +}]>; + // Translate lane numbers from Q registers to D subregs. def SubReg_i8_lane : SDNodeXFormgetTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32); @@ -2666,7 +2671,7 @@ class N3VDSL op21_20, bits<4> op11_8, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$Vn), - (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> { + (Ty (ARMvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; @@ -2678,7 +2683,7 @@ class N3VDSL16 op21_20, bits<4> op11_8, NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane","", [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$Vn), - (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; @@ -2714,7 +2719,7 @@ class N3VQSL op21_20, bits<4> op11_8, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$Vn), - (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; @@ -2727,7 +2732,7 @@ class N3VQSL16 op21_20, bits<4> op11_8, string OpcodeStr, string Dt, NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$Vn), - (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + (ResTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]> { // All of these have a two-operand InstAlias. 
let TwoOperandAliasConstraint = "$Vn = $Vd"; @@ -2762,7 +2767,7 @@ class N3VDIntSL op21_20, bits<4> op11_8, InstrItinClass itin, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (Ty DPR:$Vd), (Ty (IntOp (Ty DPR:$Vn), - (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), + (Ty (ARMvduplane (Ty DPR_VFP2:$Vm), imm:$lane)))))]> { let isCommutable = 0; } @@ -2774,7 +2779,7 @@ class N3VDIntSL16 op21_20, bits<4> op11_8, InstrItinClass itin, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (Ty DPR:$Vd), (Ty (IntOp (Ty DPR:$Vn), - (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { let isCommutable = 0; } class N3VDIntSh op21_20, bits<4> op11_8, bit op4, @@ -2829,7 +2834,7 @@ class N3VQIntSL op21_20, bits<4> op11_8, InstrItinClass itin, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$Vn), - (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]> { let isCommutable = 0; } @@ -2841,7 +2846,7 @@ class N3VQIntSL16 op21_20, bits<4> op11_8, InstrItinClass itin, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$Vn), - (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + (ResTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]> { let isCommutable = 0; } @@ -2877,7 +2882,7 @@ class N3VDMulOpSL op21_20, bits<4> op11_8, InstrItinClass itin, [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$src1), (Ty (MulOp DPR:$Vn, - (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), + (Ty (ARMvduplane (Ty DPR_VFP2:$Vm), imm:$lane)))))))]>; class N3VDMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -2890,7 +2895,7 @@ class N3VDMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$src1), (Ty (MulOp DPR:$Vn, - (Ty (NEONvduplane (Ty DPR_8:$Vm), + (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))))]>; class N3VQMulOp op21_20, bits<4> op11_8, bit op4, @@ -2912,7 +2917,7 @@ class N3VQMulOpSL op21_20, bits<4> op11_8, InstrItinClass itin, [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (MulOp QPR:$Vn, - (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))))]>; class N3VQMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -2926,7 +2931,7 @@ class N3VQMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (MulOp QPR:$Vn, - (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + (ResTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))))]>; // Neon Intrinsic-Op instructions (VABA): double- and quad-register. @@ -2986,7 +2991,7 @@ class N3VLMulOpSL op21_20, bits<4> op11_8, [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), (TyQ (MulOp (TyD DPR:$Vn), - (TyD (NEONvduplane (TyD DPR_VFP2:$Vm), + (TyD (ARMvduplane (TyD DPR_VFP2:$Vm), imm:$lane))))))]>; class N3VLMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -2998,7 +3003,7 @@ class N3VLMulOpSL16 op21_20, bits<4> op11_8, [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), (TyQ (MulOp (TyD DPR:$Vn), - (TyD (NEONvduplane (TyD DPR_8:$Vm), + (TyD (ARMvduplane (TyD DPR_8:$Vm), imm:$lane))))))]>; // Long Intrinsic-Op vector operations with explicit extend (VABAL). 
@@ -3034,7 +3039,7 @@ class N3VLInt3SL op21_20, bits<4> op11_8, InstrItinClass itin, [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$src1), (OpTy DPR:$Vn), - (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (OpTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]>; class N3VLInt3SL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -3047,7 +3052,7 @@ class N3VLInt3SL16 op21_20, bits<4> op11_8, [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$src1), (OpTy DPR:$Vn), - (OpTy (NEONvduplane (OpTy DPR_8:$Vm), + (OpTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]>; // Narrowing 3-register intrinsics. @@ -3080,7 +3085,7 @@ class N3VLSL op21_20, bits<4> op11_8, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), - (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>; + (TyD (ARMvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>; class N3VLSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> @@ -3089,7 +3094,7 @@ class N3VLSL16 op21_20, bits<4> op11_8, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), - (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>; + (TyD (ARMvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>; // Long 3-register operations with explicitly extended operands. class N3VLExt op21_20, bits<4> op11_8, bit op4, @@ -3145,7 +3150,7 @@ class N3VLIntSL op21_20, bits<4> op11_8, InstrItinClass itin, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (IntOp (OpTy DPR:$Vn), - (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (OpTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]>; class N3VLIntSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -3155,7 +3160,7 @@ class N3VLIntSL16 op21_20, bits<4> op11_8, NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (IntOp (OpTy DPR:$Vn), - (OpTy (NEONvduplane (OpTy DPR_8:$Vm), + (OpTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]>; // Wide 3-register operations. @@ -4087,72 +4092,72 @@ multiclass N2VShInsL_QHSD op11_8, bit op4, string OpcodeStr> { // 64-bit vector types. def v8i8 : N2VDShIns { + N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsliImm> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VDShIns { + N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsliImm> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VDShIns { + N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsliImm> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v1i64 : N2VDShIns; + N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsliImm>; // imm6 = xxxxxx // 128-bit vector types. def v16i8 : N2VQShIns { + N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsliImm> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v8i16 : N2VQShIns { + N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsliImm> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v4i32 : N2VQShIns { + N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsliImm> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v2i64 : N2VQShIns; + N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsliImm>; // imm6 = xxxxxx } multiclass N2VShInsR_QHSD op11_8, bit op4, string OpcodeStr> { // 64-bit vector types. 
def v8i8 : N2VDShIns { + N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsriImm> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VDShIns { + N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsriImm> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VDShIns { + N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsriImm> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v1i64 : N2VDShIns; + N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsriImm>; // imm6 = xxxxxx // 128-bit vector types. def v16i8 : N2VQShIns { + N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsriImm> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v8i16 : N2VQShIns { + N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsriImm> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v4i32 : N2VQShIns { + N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsriImm> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v2i64 : N2VQShIns; + N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsriImm>; // imm6 = xxxxxx } @@ -4251,12 +4256,14 @@ defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>; defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i", int_arm_neon_vraddhn, 1>; -def : Pat<(v8i8 (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))), +let Predicates = [HasNEON] in { +def : Pat<(v8i8 (trunc (ARMvshruImm (add (v8i16 QPR:$Vn), QPR:$Vm), 8))), (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>; -def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))), +def : Pat<(v4i16 (trunc (ARMvshruImm (add (v4i32 QPR:$Vn), QPR:$Vm), 16))), (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>; -def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))), +def : Pat<(v2i32 (trunc (ARMvshruImm (add (v2i64 QPR:$Vn), QPR:$Vm), 32))), (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>; +} // Vector Multiply Operations. @@ -4287,47 +4294,49 @@ def VMULslhq : N3VQSL16<0b01, 0b1001, "vmul", "f16", v8f16, v4f16, fmul>, Requires<[HasNEON,HasFullFP16]>; +let Predicates = [HasNEON] in { def : Pat<(v8i16 (mul (v8i16 QPR:$src1), - (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), + (v8i16 (ARMvduplane (v8i16 QPR:$src2), imm:$lane)))), (v8i16 (VMULslv8i16 (v8i16 QPR:$src1), (v4i16 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (mul (v4i32 QPR:$src1), - (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), + (v4i32 (ARMvduplane (v4i32 QPR:$src2), imm:$lane)))), (v4i32 (VMULslv4i32 (v4i32 QPR:$src1), (v2i32 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), - (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))), + (v4f32 (ARMvduplane (v4f32 QPR:$src2), imm:$lane)))), (v4f32 (VMULslfq (v4f32 QPR:$src1), (v2f32 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; def : Pat<(v8f16 (fmul (v8f16 QPR:$src1), - (v8f16 (NEONvduplane (v8f16 QPR:$src2), imm:$lane)))), + (v8f16 (ARMvduplane (v8f16 QPR:$src2), imm:$lane)))), (v8f16 (VMULslhq(v8f16 QPR:$src1), (v4f16 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; -def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), +def : Pat<(v2f32 (fmul DPR:$Rn, (ARMvdup (f32 SPR:$Rm)))), (VMULslfd DPR:$Rn, (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), (i32 0))>; -def : Pat<(v4f16 (fmul DPR:$Rn, (NEONvdup (f16 HPR:$Rm)))), +def : Pat<(v4f16 (fmul DPR:$Rn, (ARMvdup (f16 HPR:$Rm)))), (VMULslhd DPR:$Rn, (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0), (i32 0))>; -def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), +def : Pat<(v4f32 (fmul 
QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))), (VMULslfq QPR:$Rn, (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), (i32 0))>; -def : Pat<(v8f16 (fmul QPR:$Rn, (NEONvdup (f16 HPR:$Rm)))), +def : Pat<(v8f16 (fmul QPR:$Rn, (ARMvdup (f16 HPR:$Rm)))), (VMULslhq QPR:$Rn, (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0), (i32 0))>; +} // VQDMULH : Vector Saturating Doubling Multiply Returning High Half defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, @@ -4336,20 +4345,23 @@ defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, "vqdmulh", "s", int_arm_neon_vqdmulh>; + +let Predicates = [HasNEON] in { def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1), - (v8i16 (NEONvduplane (v8i16 QPR:$src2), + (v8i16 (ARMvduplane (v8i16 QPR:$src2), imm:$lane)))), (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1), (v4i16 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1), - (v4i32 (NEONvduplane (v4i32 QPR:$src2), + (v4i32 (ARMvduplane (v4i32 QPR:$src2), imm:$lane)))), (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1), (v2i32 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; +} // VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm, @@ -4358,20 +4370,23 @@ defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm, defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, "vqrdmulh", "s", int_arm_neon_vqrdmulh>; + +let Predicates = [HasNEON] in { def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1), - (v8i16 (NEONvduplane (v8i16 QPR:$src2), + (v8i16 (ARMvduplane (v8i16 QPR:$src2), imm:$lane)))), (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1), (v4i16 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), - (v4i32 (NEONvduplane (v4i32 QPR:$src2), + (v4i32 (ARMvduplane (v4i32 QPR:$src2), imm:$lane)))), (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1), (v2i32 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; +} // VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) let PostEncoderMethod = "NEONThumb2DataIPostEncoder", @@ -4427,9 +4442,10 @@ def VMLAslhq : N3VQMulOpSL16<0b01, 0b0001, IIC_VMACQ, "vmla", "f16", v8f16, v4f16, fmul, fadd>, Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +let Predicates = [HasNEON] in { def : Pat<(v8i16 (add (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane))))), (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i16_reg imm:$lane))), @@ -4437,15 +4453,16 @@ def : Pat<(v8i16 (add (v8i16 QPR:$src1), def : Pat<(v4i32 (add (v4i32 QPR:$src1), (mul (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane))))), (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v2i32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; +} def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1), (fmul_su (v4f32 QPR:$src2), - (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (ARMvduplane (v4f32 QPR:$src3), imm:$lane))))), (v4f32 (VMLAslfq 
(v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, @@ -4497,7 +4514,7 @@ let Predicates = [HasNEON, HasV8_1a] in { (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), - (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; @@ -4505,7 +4522,7 @@ let Predicates = [HasNEON, HasV8_1a] in { (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), - (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; @@ -4513,7 +4530,7 @@ let Predicates = [HasNEON, HasV8_1a] in { (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane)))))), (v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), @@ -4525,7 +4542,7 @@ let Predicates = [HasNEON, HasV8_1a] in { (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane)))))), (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), @@ -4567,14 +4584,14 @@ let Predicates = [HasNEON, HasV8_1a] in { (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), - (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; def : Pat<(v2i32 (int_arm_neon_vqsubs (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), - (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; @@ -4582,7 +4599,7 @@ let Predicates = [HasNEON, HasV8_1a] in { (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane)))))), (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), @@ -4594,7 +4611,7 @@ let Predicates = [HasNEON, HasV8_1a] in { (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane)))))), (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), @@ -4608,6 +4625,7 @@ defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, "vqdmlal", "s", null_frag>; defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>; +let Predicates = [HasNEON] in { def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), @@ -4618,14 +4636,15 @@ def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), - (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), - (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; +} // VMLS : Vector Multiply Subtract (integer and floating-point) defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 
0, IIC_VMACi16D, IIC_VMACi32D, @@ -4657,9 +4676,10 @@ def VMLSslhq : N3VQMulOpSL16<0b01, 0b0101, IIC_VMACQ, "vmls", "f16", v8f16, v4f16, fmul, fsub>, Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +let Predicates = [HasNEON] in { def : Pat<(v8i16 (sub (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane))))), (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i16_reg imm:$lane))), @@ -4667,15 +4687,16 @@ def : Pat<(v8i16 (sub (v8i16 QPR:$src1), def : Pat<(v4i32 (sub (v4i32 QPR:$src1), (mul (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane))))), (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v2i32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; +} def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1), (fmul_su (v4f32 QPR:$src2), - (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (ARMvduplane (v4f32 QPR:$src3), imm:$lane))))), (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), @@ -4696,6 +4717,7 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, "vqdmlsl", "s", null_frag>; defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>; +let Predicates = [HasNEON] in { def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), @@ -4706,14 +4728,15 @@ def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), - (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), - (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; +} // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations. def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", @@ -4754,16 +4777,16 @@ def : Pat<(v8f16 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)), Requires<[HasNEON,HasFullFP16]>; def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, - Requires<[HasVFP4]>; + Requires<[HasNEON,HasVFP4]>; def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)), (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, - Requires<[HasVFP4]>; + Requires<[HasNEON,HasVFP4]>; def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)), (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, - Requires<[HasVFP4]>; + Requires<[HasNEON,HasVFP4]>; def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)), (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, - Requires<[HasVFP4]>; + Requires<[HasNEON,HasVFP4]>; // ARMv8.2a dot product instructions. 
// We put them in the VFPV8 decoder namespace because the ARM and Thumb @@ -4808,7 +4831,7 @@ multiclass DOTI(NAME) Ty:$Vd, Ty:$Vn, RHS, VectorIndex32:$lane)>; } @@ -4991,12 +5014,14 @@ defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>; defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i", int_arm_neon_vrsubhn, 0>; -def : Pat<(v8i8 (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))), +let Predicates = [HasNEON] in { +def : Pat<(v8i8 (trunc (ARMvshruImm (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))), (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>; -def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))), +def : Pat<(v4i16 (trunc (ARMvshruImm (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))), (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>; -def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), +def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>; +} // Vector Comparisons. @@ -5122,10 +5147,11 @@ class N3VCP8F16Q0; -class VFMQ0 S> +// Vd, Vs, Vs[0-15], Idx[0-1] +class VFMD S> : N3VLaneCP8<0, S, 0, 1, (outs DPR:$Vd), - (ins SPR:$Vn, SPR:$Vm, VectorIndex32:$idx), - IIC_VMACD, opc, "f16", "$Vd, $Vn, $Vm$idx", "", []> { + (ins SPR:$Vn, SPR_8:$Vm, VectorIndex32:$idx), + IIC_VMACD, opc, type, "$Vd, $Vn, $Vm$idx", "", []> { bit idx; let Inst{3} = idx; let Inst{19-16} = Vn{4-1}; @@ -5134,10 +5160,11 @@ class VFMQ0 S> let Inst{2-0} = Vm{3-1}; } -class VFMQ1 S> +// Vq, Vd, Vd[0-7], Idx[0-3] +class VFMQ S> : N3VLaneCP8<0, S, 1, 1, (outs QPR:$Vd), - (ins DPR:$Vn, DPR:$Vm, VectorIndex16:$idx), - IIC_VMACD, opc, "f16", "$Vd, $Vn, $Vm$idx", "", []> { + (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx), + IIC_VMACD, opc, type, "$Vd, $Vn, $Vm$idx", "", []> { bits<2> idx; let Inst{5} = idx{1}; let Inst{3} = idx{0}; @@ -5149,10 +5176,10 @@ def VFMALD : N3VCP8F16Q0<"vfmal", DPR, SPR, SPR, 0b00, 0b10, 1>; def VFMSLD : N3VCP8F16Q0<"vfmsl", DPR, SPR, SPR, 0b01, 0b10, 1>; def VFMALQ : N3VCP8F16Q1<"vfmal", QPR, DPR, DPR, 0b00, 0b10, 1>; def VFMSLQ : N3VCP8F16Q1<"vfmsl", QPR, DPR, DPR, 0b01, 0b10, 1>; -def VFMALDI : VFMQ0<"vfmal", 0b00>; -def VFMSLDI : VFMQ0<"vfmsl", 0b01>; -def VFMALQI : VFMQ1<"vfmal", 0b00>; -def VFMSLQI : VFMQ1<"vfmsl", 0b01>; +def VFMALDI : VFMD<"vfmal", "f16", 0b00>; +def VFMSLDI : VFMD<"vfmsl", "f16", 0b01>; +def VFMALQI : VFMQ<"vfmal", "f16", 0b00>; +def VFMSLQI : VFMQ<"vfmsl", "f16", 0b01>; } } // HasNEON, HasFP16FML @@ -5308,28 +5335,28 @@ let isReMaterializable = 1 in { def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd), (ins nImmSplatI16:$SIMM), IIC_VMOVImm, "vmvn", "i16", "$Vd, $SIMM", "", - [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> { + [(set DPR:$Vd, (v4i16 (ARMvmvnImm timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd), (ins nImmSplatI16:$SIMM), IIC_VMOVImm, "vmvn", "i16", "$Vd, $SIMM", "", - [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> { + [(set QPR:$Vd, (v8i16 (ARMvmvnImm timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd), (ins nImmVMOVI32:$SIMM), IIC_VMOVImm, "vmvn", "i32", "$Vd, $SIMM", "", - [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> { + [(set DPR:$Vd, (v2i32 (ARMvmvnImm timm:$SIMM)))]> { let Inst{11-8} = SIMM{11-8}; } def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd), (ins nImmVMOVI32:$SIMM), IIC_VMOVImm, "vmvn", "i32", "$Vd, $SIMM", "", - [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> { + [(set QPR:$Vd, (v4i32 (ARMvmvnImm 
timm:$SIMM)))]> { let Inst{11-8} = SIMM{11-8}; } } @@ -5343,8 +5370,10 @@ def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD, "vmvn", "$Vd, $Vm", "", [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>; +let Predicates = [HasNEON] in { def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>; def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>; +} // VBSL : Vector Bitwise Select def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), @@ -5353,36 +5382,31 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", [(set DPR:$Vd, (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; +let Predicates = [HasNEON] in { def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1), (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1), (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1), (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1), (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd), (and DPR:$Vm, (vnotd DPR:$Vd)))), - (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd), (and DPR:$Vm, (vnotd DPR:$Vd)))), - (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; +} def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), @@ -5391,35 +5415,30 @@ def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), [(set QPR:$Vd, (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; +let Predicates = [HasNEON] in { def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1), (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1), (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1), (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1), (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd), (and QPR:$Vm, (vnotq QPR:$Vd)))), - (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd), (and QPR:$Vm, (vnotq QPR:$Vd)))), - (VBSLq QPR:$Vd, QPR:$Vn, 
QPR:$Vm)>, - Requires<[HasNEON]>; + (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; +} // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", @@ -5479,24 +5498,28 @@ defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, "vabdl", "u", int_arm_neon_vabdu, zext, 1>; +let Predicates = [HasNEON] in { def : Pat<(v8i16 (abs (sub (zext (v8i8 DPR:$opA)), (zext (v8i8 DPR:$opB))))), (VABDLuv8i16 DPR:$opA, DPR:$opB)>; def : Pat<(v4i32 (abs (sub (zext (v4i16 DPR:$opA)), (zext (v4i16 DPR:$opB))))), (VABDLuv4i32 DPR:$opA, DPR:$opB)>; +} // ISD::ABS is not legal for v2i64, so VABDL needs to be matched from the // shift/xor pattern for ABS. def abd_shr : PatFrag<(ops node:$in1, node:$in2, node:$shift), - (NEONvshrs (sub (zext node:$in1), + (ARMvshrsImm (sub (zext node:$in1), (zext node:$in2)), (i32 $shift))>; +let Predicates = [HasNEON] in { def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))), (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)), (zext (v2i32 DPR:$opB))), (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))), (VABDLuv2i64 DPR:$opA, DPR:$opB)>; +} // VABA : Vector Absolute Difference and Accumulate defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, @@ -5536,22 +5559,22 @@ def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ, // VMAXNM let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, - N3RegFrm, NoItinerary, "vmaxnm", "f32", - v2f32, v2f32, fmaxnum, 1>, - Requires<[HasV8, HasNEON]>; - def VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, - N3RegFrm, NoItinerary, "vmaxnm", "f32", - v4f32, v4f32, fmaxnum, 1>, - Requires<[HasV8, HasNEON]>; - def VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1, - N3RegFrm, NoItinerary, "vmaxnm", "f16", - v4f16, v4f16, fmaxnum, 1>, - Requires<[HasV8, HasNEON, HasFullFP16]>; - def VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1, - N3RegFrm, NoItinerary, "vmaxnm", "f16", - v8f16, v8f16, fmaxnum, 1>, - Requires<[HasV8, HasNEON, HasFullFP16]>; + def NEON_VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f32", + v2f32, v2f32, fmaxnum, 1>, + Requires<[HasV8, HasNEON]>; + def NEON_VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f32", + v4f32, v4f32, fmaxnum, 1>, + Requires<[HasV8, HasNEON]>; + def NEON_VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v4f16, v4f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def NEON_VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v8f16, v8f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } // VMIN : Vector Minimum @@ -5578,22 +5601,22 @@ def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ, // VMINNM let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, - N3RegFrm, NoItinerary, "vminnm", "f32", - v2f32, v2f32, fminnum, 1>, - Requires<[HasV8, HasNEON]>; - def VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, - N3RegFrm, NoItinerary, "vminnm", "f32", - v4f32, v4f32, fminnum, 1>, - Requires<[HasV8, HasNEON]>; - def VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1, - N3RegFrm, NoItinerary, "vminnm", "f16", - v4f16, v4f16, fminnum, 1>, - Requires<[HasV8, HasNEON, HasFullFP16]>; - def VMINNMNQh 
: N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1, - N3RegFrm, NoItinerary, "vminnm", "f16", - v8f16, v8f16, fminnum, 1>, - Requires<[HasV8, HasNEON, HasFullFP16]>; + def NEON_VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vminnm", "f32", + v2f32, v2f32, fminnum, 1>, + Requires<[HasV8, HasNEON]>; + def NEON_VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vminnm", "f32", + v4f32, v4f32, fminnum, 1>, + Requires<[HasV8, HasNEON]>; + def NEON_VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v4f16, v4f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def NEON_VMINNMNQh : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v8f16, v8f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } // Vector Pairwise Operations. @@ -5754,20 +5777,57 @@ defm VSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, "vshl", "u", int_arm_neon_vshiftu>; +let Predicates = [HasNEON] in { +def : Pat<(v8i8 (ARMvshls (v8i8 DPR:$Dn), (v8i8 DPR:$Dm))), + (VSHLsv8i8 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v4i16 (ARMvshls (v4i16 DPR:$Dn), (v4i16 DPR:$Dm))), + (VSHLsv4i16 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v2i32 (ARMvshls (v2i32 DPR:$Dn), (v2i32 DPR:$Dm))), + (VSHLsv2i32 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v1i64 (ARMvshls (v1i64 DPR:$Dn), (v1i64 DPR:$Dm))), + (VSHLsv1i64 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v16i8 (ARMvshls (v16i8 QPR:$Dn), (v16i8 QPR:$Dm))), + (VSHLsv16i8 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v8i16 (ARMvshls (v8i16 QPR:$Dn), (v8i16 QPR:$Dm))), + (VSHLsv8i16 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v4i32 (ARMvshls (v4i32 QPR:$Dn), (v4i32 QPR:$Dm))), + (VSHLsv4i32 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v2i64 (ARMvshls (v2i64 QPR:$Dn), (v2i64 QPR:$Dm))), + (VSHLsv2i64 QPR:$Dn, QPR:$Dm)>; + +def : Pat<(v8i8 (ARMvshlu (v8i8 DPR:$Dn), (v8i8 DPR:$Dm))), + (VSHLuv8i8 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v4i16 (ARMvshlu (v4i16 DPR:$Dn), (v4i16 DPR:$Dm))), + (VSHLuv4i16 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v2i32 (ARMvshlu (v2i32 DPR:$Dn), (v2i32 DPR:$Dm))), + (VSHLuv2i32 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v1i64 (ARMvshlu (v1i64 DPR:$Dn), (v1i64 DPR:$Dm))), + (VSHLuv1i64 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v16i8 (ARMvshlu (v16i8 QPR:$Dn), (v16i8 QPR:$Dm))), + (VSHLuv16i8 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v8i16 (ARMvshlu (v8i16 QPR:$Dn), (v8i16 QPR:$Dm))), + (VSHLuv8i16 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v4i32 (ARMvshlu (v4i32 QPR:$Dn), (v4i32 QPR:$Dm))), + (VSHLuv4i32 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v2i64 (ARMvshlu (v2i64 QPR:$Dn), (v2i64 QPR:$Dm))), + (VSHLuv2i64 QPR:$Dn, QPR:$Dm)>; + +} + // VSHL : Vector Shift Left (Immediate) -defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; +defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", ARMvshlImm>; // VSHR : Vector Shift Right (Immediate) defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs", - NEONvshrs>; + ARMvshrsImm>; defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu", - NEONvshru>; + ARMvshruImm>; // VSHLL : Vector Shift Left Long defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", - PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (sext node:$LHS), node:$RHS)>>; + PatFrag<(ops node:$LHS, node:$RHS), (ARMvshlImm (sext node:$LHS), node:$RHS)>>; defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u", - PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (zext node:$LHS), node:$RHS)>>; + PatFrag<(ops node:$LHS, node:$RHS), (ARMvshlImm (zext node:$LHS), 
node:$RHS)>>; // VSHLL : Vector Shift Left Long (with maximum shift count) class N2VLShMax op21_16, bits<4> op11_8, bit op7, @@ -5785,36 +5845,40 @@ def VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16", def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32", v2i64, v2i32, imm32>; -def : Pat<(v8i16 (NEONvshl (zext (v8i8 DPR:$Rn)), (i32 8))), +let Predicates = [HasNEON] in { +def : Pat<(v8i16 (ARMvshlImm (zext (v8i8 DPR:$Rn)), (i32 8))), (VSHLLi8 DPR:$Rn, 8)>; -def : Pat<(v4i32 (NEONvshl (zext (v4i16 DPR:$Rn)), (i32 16))), +def : Pat<(v4i32 (ARMvshlImm (zext (v4i16 DPR:$Rn)), (i32 16))), (VSHLLi16 DPR:$Rn, 16)>; -def : Pat<(v2i64 (NEONvshl (zext (v2i32 DPR:$Rn)), (i32 32))), +def : Pat<(v2i64 (ARMvshlImm (zext (v2i32 DPR:$Rn)), (i32 32))), (VSHLLi32 DPR:$Rn, 32)>; -def : Pat<(v8i16 (NEONvshl (sext (v8i8 DPR:$Rn)), (i32 8))), +def : Pat<(v8i16 (ARMvshlImm (sext (v8i8 DPR:$Rn)), (i32 8))), (VSHLLi8 DPR:$Rn, 8)>; -def : Pat<(v4i32 (NEONvshl (sext (v4i16 DPR:$Rn)), (i32 16))), +def : Pat<(v4i32 (ARMvshlImm (sext (v4i16 DPR:$Rn)), (i32 16))), (VSHLLi16 DPR:$Rn, 16)>; -def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))), +def : Pat<(v2i64 (ARMvshlImm (sext (v2i32 DPR:$Rn)), (i32 32))), (VSHLLi32 DPR:$Rn, 32)>; -def : Pat<(v8i16 (NEONvshl (anyext (v8i8 DPR:$Rn)), (i32 8))), +def : Pat<(v8i16 (ARMvshlImm (anyext (v8i8 DPR:$Rn)), (i32 8))), (VSHLLi8 DPR:$Rn, 8)>; -def : Pat<(v4i32 (NEONvshl (anyext (v4i16 DPR:$Rn)), (i32 16))), +def : Pat<(v4i32 (ARMvshlImm (anyext (v4i16 DPR:$Rn)), (i32 16))), (VSHLLi16 DPR:$Rn, 16)>; -def : Pat<(v2i64 (NEONvshl (anyext (v2i32 DPR:$Rn)), (i32 32))), +def : Pat<(v2i64 (ARMvshlImm (anyext (v2i32 DPR:$Rn)), (i32 32))), (VSHLLi32 DPR:$Rn, 32)>; +} // VSHRN : Vector Shift Right and Narrow defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", PatFrag<(ops node:$Rn, node:$amt), - (trunc (NEONvshrs node:$Rn, node:$amt))>>; + (trunc (ARMvshrsImm node:$Rn, node:$amt))>>; -def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))), +let Predicates = [HasNEON] in { +def : Pat<(v8i8 (trunc (ARMvshruImm (v8i16 QPR:$Vn), shr_imm8:$amt))), (VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>; -def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))), +def : Pat<(v4i16 (trunc (ARMvshruImm (v4i32 QPR:$Vn), shr_imm16:$amt))), (VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>; -def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))), +def : Pat<(v2i32 (trunc (ARMvshruImm (v2i64 QPR:$Vn), shr_imm32:$amt))), (VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>; +} // VRSHL : Vector Rounding Shift defm VRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm, @@ -5825,13 +5889,13 @@ defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm, "vrshl", "u", int_arm_neon_vrshiftu>; // VRSHR : Vector Rounding Shift Right defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs", - NEONvrshrs>; + NEONvrshrsImm>; defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu", - NEONvrshru>; + NEONvrshruImm>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", - NEONvrshrn>; + NEONvrshrnImm>; // VQSHL : Vector Saturating Shift defm VQSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm, @@ -5841,21 +5905,21 @@ defm VQSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu>; // VQSHL : Vector Saturating Shift Left (Immediate) -defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, 
IIC_VSHLi4D, "vqshl", "s",NEONvqshls>; -defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>; +defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshlsImm>; +defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshluImm>; // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) -defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>; +defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsuImm>; // VQSHRN : Vector Saturating Shift Right and Narrow defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", - NEONvqshrns>; + NEONvqshrnsImm>; defm VQSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u", - NEONvqshrnu>; + NEONvqshrnuImm>; // VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned) defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", - NEONvqshrnsu>; + NEONvqshrnsuImm>; // VQRSHL : Vector Saturating Rounding Shift defm VQRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm, @@ -5867,20 +5931,20 @@ defm VQRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm, // VQRSHRN : Vector Saturating Rounding Shift Right and Narrow defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", - NEONvqrshrns>; + NEONvqrshrnsImm>; defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u", - NEONvqrshrnu>; + NEONvqrshrnuImm>; // VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned) defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s", - NEONvqrshrnsu>; + NEONvqrshrnsuImm>; // VSRA : Vector Shift Right and Accumulate -defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>; -defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>; +defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", ARMvshrsImm>; +defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", ARMvshruImm>; // VRSRA : Vector Rounding Shift Right and Accumulate -defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; -defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; +defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrsImm>; +defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshruImm>; // VSLI : Vector Shift Left and Insert defm VSLI : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">; @@ -5957,12 +6021,14 @@ def VNEGhq : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 1, 0, [(set QPR:$Vd, (v8f16 (fneg QPR:$Vm)))]>, Requires<[HasNEON, HasFullFP16]>; +let Predicates = [HasNEON] in { def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>; def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>; def : Pat<(v2i32 (vnegd DPR:$src)), (VNEGs32d DPR:$src)>; def : Pat<(v16i8 (vnegq QPR:$src)), (VNEGs8q QPR:$src)>; def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>; def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>; +} // VQNEG : Vector Saturating Negate defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, @@ -6014,57 +6080,57 @@ let isReMaterializable = 1, isAsCheapAsAMove=1 in { def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd), (ins nImmSplatI8:$SIMM), IIC_VMOVImm, "vmov", "i8", "$Vd, $SIMM", "", - [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>; + [(set DPR:$Vd, (v8i8 (ARMvmovImm timm:$SIMM)))]>; def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd), (ins nImmSplatI8:$SIMM), IIC_VMOVImm, "vmov", "i8", "$Vd, $SIMM", "", - [(set QPR:$Vd, (v16i8 (NEONvmovImm 
timm:$SIMM)))]>; + [(set QPR:$Vd, (v16i8 (ARMvmovImm timm:$SIMM)))]>; def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd), (ins nImmSplatI16:$SIMM), IIC_VMOVImm, "vmov", "i16", "$Vd, $SIMM", "", - [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> { + [(set DPR:$Vd, (v4i16 (ARMvmovImm timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd), (ins nImmSplatI16:$SIMM), IIC_VMOVImm, "vmov", "i16", "$Vd, $SIMM", "", - [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> { + [(set QPR:$Vd, (v8i16 (ARMvmovImm timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd), (ins nImmVMOVI32:$SIMM), IIC_VMOVImm, "vmov", "i32", "$Vd, $SIMM", "", - [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> { + [(set DPR:$Vd, (v2i32 (ARMvmovImm timm:$SIMM)))]> { let Inst{11-8} = SIMM{11-8}; } def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd), (ins nImmVMOVI32:$SIMM), IIC_VMOVImm, "vmov", "i32", "$Vd, $SIMM", "", - [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> { + [(set QPR:$Vd, (v4i32 (ARMvmovImm timm:$SIMM)))]> { let Inst{11-8} = SIMM{11-8}; } def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd), (ins nImmSplatI64:$SIMM), IIC_VMOVImm, "vmov", "i64", "$Vd, $SIMM", "", - [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>; + [(set DPR:$Vd, (v1i64 (ARMvmovImm timm:$SIMM)))]>; def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd), (ins nImmSplatI64:$SIMM), IIC_VMOVImm, "vmov", "i64", "$Vd, $SIMM", "", - [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>; + [(set QPR:$Vd, (v2i64 (ARMvmovImm timm:$SIMM)))]>; def VMOVv2f32 : N1ModImm<1, 0b000, 0b1111, 0, 0, 0, 1, (outs DPR:$Vd), (ins nImmVMOVF32:$SIMM), IIC_VMOVImm, "vmov", "f32", "$Vd, $SIMM", "", - [(set DPR:$Vd, (v2f32 (NEONvmovFPImm timm:$SIMM)))]>; + [(set DPR:$Vd, (v2f32 (ARMvmovFPImm timm:$SIMM)))]>; def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd), (ins nImmVMOVF32:$SIMM), IIC_VMOVImm, "vmov", "f32", "$Vd, $SIMM", "", - [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>; + [(set QPR:$Vd, (v4f32 (ARMvmovFPImm timm:$SIMM)))]>; } // isReMaterializable, isAsCheapAsAMove // Add support for bytes replication feature, so it could be GAS compatible. 
@@ -6144,7 +6210,7 @@ let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in { def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?}, (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane), IIC_VMOVSI, "vmov", "s8", "$R, $V$lane", - [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V), + [(set GPR:$R, (ARMvgetlanes (v8i8 DPR:$V), imm:$lane))]> { let Inst{21} = lane{2}; let Inst{6-5} = lane{1-0}; @@ -6152,7 +6218,7 @@ def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?}, def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1}, (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane), IIC_VMOVSI, "vmov", "s16", "$R, $V$lane", - [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V), + [(set GPR:$R, (ARMvgetlanes (v4i16 DPR:$V), imm:$lane))]> { let Inst{21} = lane{1}; let Inst{6} = lane{0}; @@ -6160,7 +6226,7 @@ def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1}, def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?}, (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane), IIC_VMOVSI, "vmov", "u8", "$R, $V$lane", - [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V), + [(set GPR:$R, (ARMvgetlaneu (v8i8 DPR:$V), imm:$lane))]> { let Inst{21} = lane{2}; let Inst{6-5} = lane{1-0}; @@ -6168,7 +6234,7 @@ def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?}, def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1}, (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane), IIC_VMOVSI, "vmov", "u16", "$R, $V$lane", - [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V), + [(set GPR:$R, (ARMvgetlaneu (v4i16 DPR:$V), imm:$lane))]> { let Inst{21} = lane{1}; let Inst{6} = lane{0}; @@ -6178,26 +6244,28 @@ def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00, IIC_VMOVSI, "vmov", "32", "$R, $V$lane", [(set GPR:$R, (extractelt (v2i32 DPR:$V), imm:$lane))]>, - Requires<[HasVFP2, HasFastVGETLNi32]> { + Requires<[HasFPRegs, HasFastVGETLNi32]> { let Inst{21} = lane{0}; } +let Predicates = [HasNEON] in { // def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td -def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane), +def : Pat<(ARMvgetlanes (v16i8 QPR:$src), imm:$lane), (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src, (DSubReg_i8_reg imm:$lane))), (SubReg_i8_lane imm:$lane))>; -def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane), +def : Pat<(ARMvgetlanes (v8i16 QPR:$src), imm:$lane), (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane))>; -def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane), +def : Pat<(ARMvgetlaneu (v16i8 QPR:$src), imm:$lane), (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src, (DSubReg_i8_reg imm:$lane))), (SubReg_i8_lane imm:$lane))>; -def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane), +def : Pat<(ARMvgetlaneu (v8i16 QPR:$src), imm:$lane), (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane))>; +} def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane), (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src, (DSubReg_i32_reg imm:$lane))), @@ -6211,6 +6279,7 @@ def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane), (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>, Requires<[HasNEON, HasSlowVGETLNi32]>; +let Predicates = [HasNEON] in { def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2), (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)), (SSubReg_f32_reg imm:$src2))>; @@ -6221,7 +6290,36 @@ def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2), // (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>; def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2), (EXTRACT_SUBREG QPR:$src1, 
(DSubReg_f64_reg imm:$src2))>; +} + +def imm_even : ImmLeaf; +def imm_odd : ImmLeaf; + +let Predicates = [HasNEON] in { +def : Pat<(extractelt (v4f16 DPR:$src), imm_even:$lane), + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), + (SSubReg_f16_reg imm_even:$lane))>; +def : Pat<(extractelt (v4f16 DPR:$src), imm_odd:$lane), + (COPY_TO_REGCLASS + (VMOVH (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), + (SSubReg_f16_reg imm_odd:$lane))), + HPR)>; + +def : Pat<(extractelt (v8f16 QPR:$src), imm_even:$lane), + (EXTRACT_SUBREG + (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)), + (SSubReg_f16_reg imm_even:$lane))>; + +def : Pat<(extractelt (v8f16 QPR:$src), imm_odd:$lane), + (COPY_TO_REGCLASS + (VMOVH (EXTRACT_SUBREG + (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)), + (SSubReg_f16_reg imm_odd:$lane))), + HPR)>; +} // VMOV : Vector Set Lane (move ARM core register to scalar) @@ -6254,6 +6352,8 @@ def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V), let isInsertSubreg = 1; } } + +let Predicates = [HasNEON] in { def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane), (v16i8 (INSERT_SUBREG QPR:$src1, (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1, @@ -6280,6 +6380,15 @@ def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)), (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)), SPR:$src2, (SSubReg_f32_reg imm:$src3))>; +def : Pat<(insertelt (v4f16 DPR:$src1), HPR:$src2, imm:$lane), + (v4f16 (VSETLNi16 DPR:$src1, (VMOVRH $src2), imm:$lane))>; +def : Pat<(insertelt (v8f16 QPR:$src1), HPR:$src2, imm:$lane), + (v8f16 (INSERT_SUBREG QPR:$src1, + (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1, + (DSubReg_i16_reg imm:$lane))), + (VMOVRH $src2), (SubReg_i16_lane imm:$lane))), + (DSubReg_i16_reg imm:$lane)))>; + //def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), // (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>; def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), @@ -6311,17 +6420,18 @@ def : Pat<(v4i32 (scalar_to_vector GPR:$src)), (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)), dsub_0)>; +} // VDUP : Vector Duplicate (from ARM core register to all elements) class VDUPD opcod1, bits<2> opcod3, string Dt, ValueType Ty> : NVDup; + [(set DPR:$V, (Ty (ARMvdup (i32 GPR:$R))))]>; class VDUPQ opcod1, bits<2> opcod3, string Dt, ValueType Ty> : NVDup; + [(set QPR:$V, (Ty (ARMvdup (i32 GPR:$R))))]>; def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>; def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>; @@ -6331,15 +6441,16 @@ def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>; def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>; def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>; -// NEONvdup patterns for uarchs with fast VDUP.32. -def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>, +// ARMvdup patterns for uarchs with fast VDUP.32. +def : Pat<(v2f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>, Requires<[HasNEON,HasFastVDUP32]>; -def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>; +def : Pat<(v4f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>, + Requires<[HasNEON]>; -// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead. -def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>, +// ARMvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead. 
+def : Pat<(v2i32 (ARMvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>, Requires<[HasNEON,HasSlowVDUP32]>; -def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>, +def : Pat<(v2f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>, Requires<[HasNEON,HasSlowVDUP32]>; // VDUP : Vector Duplicate Lane (from scalar to all elements) @@ -6348,13 +6459,13 @@ class VDUPLND op19_16, string OpcodeStr, string Dt, ValueType Ty, Operand IdxTy> : NVDupLane; + [(set DPR:$Vd, (Ty (ARMvduplane (Ty DPR:$Vm), imm:$lane)))]>; class VDUPLNQ op19_16, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Operand IdxTy> : NVDupLane; // Inst{19-16} is partially specified depending on the element size. @@ -6384,48 +6495,50 @@ def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> { let Inst{19} = lane{0}; } -def : Pat<(v4f16 (NEONvduplane (v4f16 DPR:$Vm), imm:$lane)), +let Predicates = [HasNEON] in { +def : Pat<(v4f16 (ARMvduplane (v4f16 DPR:$Vm), imm:$lane)), (VDUPLN32d DPR:$Vm, imm:$lane)>; -def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), +def : Pat<(v2f32 (ARMvduplane (v2f32 DPR:$Vm), imm:$lane)), (VDUPLN32d DPR:$Vm, imm:$lane)>; -def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), +def : Pat<(v4f32 (ARMvduplane (v2f32 DPR:$Vm), imm:$lane)), (VDUPLN32q DPR:$Vm, imm:$lane)>; -def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), +def : Pat<(v16i8 (ARMvduplane (v16i8 QPR:$src), imm:$lane)), (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, (DSubReg_i8_reg imm:$lane))), (SubReg_i8_lane imm:$lane)))>; -def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)), +def : Pat<(v8i16 (ARMvduplane (v8i16 QPR:$src), imm:$lane)), (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; -def : Pat<(v8f16 (NEONvduplane (v8f16 QPR:$src), imm:$lane)), +def : Pat<(v8f16 (ARMvduplane (v8f16 QPR:$src), imm:$lane)), (v8f16 (VDUPLN16q (v4f16 (EXTRACT_SUBREG QPR:$src, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; -def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)), +def : Pat<(v4i32 (ARMvduplane (v4i32 QPR:$src), imm:$lane)), (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)), +def : Pat<(v4f32 (ARMvduplane (v4f32 QPR:$src), imm:$lane)), (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def : Pat<(v4f16 (NEONvdup HPR:$src)), +def : Pat<(v4f16 (ARMvdup HPR:$src)), (v4f16 (VDUPLN16d (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$src, ssub_0), (i32 0)))>; -def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))), +def : Pat<(v2f32 (ARMvdup (f32 SPR:$src))), (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0), (i32 0)))>; -def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))), +def : Pat<(v4f32 (ARMvdup (f32 SPR:$src))), (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0), (i32 0)))>; -def : Pat<(v8f16 (NEONvdup HPR:$src)), +def : Pat<(v8f16 (ARMvdup HPR:$src)), (v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$src, ssub_0), (i32 0)))>; +} // VMOVN : Vector Narrowing Move defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN, @@ -6440,9 +6553,12 @@ defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, // VMOVL : Vector Lengthening Move defm VMOVLs : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>; defm VMOVLu : 
N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>; + +let Predicates = [HasNEON] in { def : Pat<(v8i16 (anyext (v8i8 DPR:$Vm))), (VMOVLuv8i16 DPR:$Vm)>; def : Pat<(v4i32 (anyext (v4i16 DPR:$Vm))), (VMOVLuv4i32 DPR:$Vm)>; def : Pat<(v2i64 (anyext (v2i32 DPR:$Vm))), (VMOVLuv2i64 DPR:$Vm)>; +} // Vector Conversions. @@ -6621,24 +6737,29 @@ class VREV64D op19_18, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>; + [(set DPR:$Vd, (Ty (ARMvrev64 (Ty DPR:$Vm))))]>; class VREV64Q op19_18, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>; + [(set QPR:$Vd, (Ty (ARMvrev64 (Ty QPR:$Vm))))]>; def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>; def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>; def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>; -def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>; +let Predicates = [HasNEON] in { +def : Pat<(v2f32 (ARMvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>; +} def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>; def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>; def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>; -def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; -def : Pat<(v8f16 (NEONvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>; -def : Pat<(v4f16 (NEONvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>; + +let Predicates = [HasNEON] in { +def : Pat<(v4f32 (ARMvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; +def : Pat<(v8f16 (ARMvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>; +def : Pat<(v4f16 (ARMvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>; +} // VREV32 : Vector Reverse elements within 32-bit words @@ -6646,12 +6767,12 @@ class VREV32D op19_18, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>; + [(set DPR:$Vd, (Ty (ARMvrev32 (Ty DPR:$Vm))))]>; class VREV32Q op19_18, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>; + [(set QPR:$Vd, (Ty (ARMvrev32 (Ty QPR:$Vm))))]>; def VREV32d8 : VREV32D<0b00, "vrev32", "8", v8i8>; def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>; @@ -6665,12 +6786,12 @@ class VREV16D op19_18, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>; + [(set DPR:$Vd, (Ty (ARMvrev16 (Ty DPR:$Vm))))]>; class VREV16Q op19_18, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>; + [(set QPR:$Vd, (Ty (ARMvrev16 (Ty QPR:$Vm))))]>; def VREV16d8 : VREV16D<0b00, "vrev16", "8", v8i8>; def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; @@ -6681,7 +6802,8 @@ def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; class AlignedVEXTq : Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))), - (EXTRACT_SUBREG (SrcTy QPR:$src), 
(LaneCVT imm:$start))>; + (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>, + Requires<[HasNEON]>; def : AlignedVEXTq; @@ -6693,6 +6815,7 @@ def : AlignedVEXTq; def : AlignedVEXTq; +def : AlignedVEXTq; // v8f16 -> v4f16 // VEXT : Vector Extract @@ -6728,15 +6851,19 @@ def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> { let Inst{10-9} = index{1-0}; let Inst{8} = 0b0; } +let Predicates = [HasNEON] in { def : Pat<(v4f16 (NEONvext (v4f16 DPR:$Vn), (v4f16 DPR:$Vm), (i32 imm:$index))), (VEXTd16 DPR:$Vn, DPR:$Vm, imm:$index)>; +} def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> { let Inst{10} = index{0}; let Inst{9-8} = 0b00; } +let Predicates = [HasNEON] in { def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn), (v2f32 DPR:$Vm), (i32 imm:$index))), (VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>; +} def VEXTq8 : VEXTq<"vext", "8", v16i8, imm0_15> { let Inst{11-8} = index{3-0}; @@ -6745,8 +6872,10 @@ def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> { let Inst{11-9} = index{2-0}; let Inst{8} = 0b0; } +let Predicates = [HasNEON] in { def : Pat<(v8f16 (NEONvext (v8f16 QPR:$Vn), (v8f16 QPR:$Vm), (i32 imm:$index))), (VEXTq16 QPR:$Vn, QPR:$Vm, imm:$index)>; +} def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> { let Inst{11-10} = index{1-0}; @@ -6756,8 +6885,10 @@ def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> { let Inst{11} = index{0}; let Inst{10-8} = 0b000; } +let Predicates = [HasNEON] in { def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn), (v4f32 QPR:$Vm), (i32 imm:$index))), (VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>; +} // VTRN : Vector Transpose @@ -6857,6 +6988,7 @@ def VTBX4Pseudo IIC_VTBX4, "$orig = $dst", []>; } // DecoderMethod = "DecodeTBLInstruction" +let Predicates = [HasNEON] in { def : Pat<(v8i8 (NEONvtbl2 v8i8:$Vn0, v8i8:$Vn1, v8i8:$Vm)), (v8i8 (VTBL2 (REG_SEQUENCE DPair, v8i8:$Vn0, dsub_0, v8i8:$Vn1, dsub_1), @@ -6899,6 +7031,7 @@ def : Pat<(v8i8 (int_arm_neon_vtbx4 v8i8:$orig, v8i8:$Vn0, v8i8:$Vn1, v8i8:$Vn2, dsub_2, v8i8:$Vn3, dsub_3), v8i8:$Vm))>; +} // VRINT : Vector Rounding multiclass VRINT_FPI op9_7, SDPatternOperator Int> { @@ -6989,6 +7122,7 @@ def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>; def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>; def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>; +let Predicates = [HasNEON] in { def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)), (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (SHA1H (SUBREG_TO_REG (i64 0), @@ -7016,6 +7150,7 @@ def : Pat<(v4i32 (int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)), (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)), ssub_0), v4i32:$wk)>; +} //===----------------------------------------------------------------------===// // NEON instructions for single-precision FP math @@ -7123,171 +7258,228 @@ def : Pat<(arm_vmovsr GPR:$a), Requires<[HasNEON, DontUseVMOVSR]>; //===----------------------------------------------------------------------===// -// Non-Instruction Patterns +// Non-Instruction Patterns or Endiness - Revert Patterns //===----------------------------------------------------------------------===// // bit_convert -let Predicates = [IsLE] in { +// 64 bit conversions +let Predicates = [HasNEON] in { +def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>; + +def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; + +def : Pat<(v4i16 (bitconvert (v4f16 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4f16 
(bitconvert (v4i16 DPR:$src))), (v4f16 DPR:$src)>; + +// 128 bit conversions +def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; + +def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; + +def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>; +} + +let Predicates = [IsLE,HasNEON] in { + // 64 bit conversions + def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; + + def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (v1i64 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; -} -def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>; -let Predicates = [IsLE] in { - def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; + + def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; + + def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (v2i32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; - def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; -} -def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; -let Predicates = [IsLE] in { + + def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v1i64 DPR:$src))), (v4f16 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (v4f16 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (v4f16 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (v4f16 DPR:$src)>; + + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; - def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; - def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; + + def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (v8i8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; - def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 
DPR:$src)>; - def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; -} -def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>; -let Predicates = [IsLE] in { - def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; - def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; - def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>; - def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; - def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; - def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; - def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>; - def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; -} -def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>; -let Predicates = [IsLE] in { - def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; - def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; -} -let Predicates = [IsLE] in { + // 128 bit conversions + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; + + def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; -} -def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; -let Predicates = [IsLE] in { - def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; + + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; + + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; -} -def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; -let Predicates = [IsLE] in { + + def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>; + + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert 
(v4f32 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>; + + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; -} -def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; -let Predicates = [IsLE] in { - def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; -} -def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; -let Predicates = [IsLE] in { - def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; } -let Predicates = [IsBE] in { +let Predicates = [IsBE,HasNEON] in { // 64 bit conversions + def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; + + def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; - def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; + + def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; + + def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; - def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + + def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 
DPR:$src)>; + + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>; - def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>; - def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>; + + def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (VREV16d8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>; - def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>; - def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>; - def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; - def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>; - def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; - def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; - def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; - def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; - def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; - def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; - def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; // 128 bit conversions + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; + + def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; + + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; + + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; + + def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8f16 
(bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>; + + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; + + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; } // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian +let Predicates = [IsBE,HasNEON] in { def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)), - (VREV64q8 (VLD1q8 addrmode6:$addr))>, Requires<[IsBE]>; + (VREV64q8 (VLD1q8 addrmode6:$addr))>; def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr), - (VST1q8 addrmode6:$addr, (VREV64q8 QPR:$value))>, Requires<[IsBE]>; + (VST1q8 addrmode6:$addr, (VREV64q8 QPR:$value))>; def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)), - (VREV64q16 (VLD1q16 addrmode6:$addr))>, Requires<[IsBE]>; + (VREV64q16 (VLD1q16 addrmode6:$addr))>; def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), - (VST1q16 addrmode6:$addr, (VREV64q16 QPR:$value))>, Requires<[IsBE]>; + (VST1q16 addrmode6:$addr, (VREV64q16 QPR:$value))>; +} // Fold extracting an element out of a v2i32 into a vfp register. def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))), - (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>; + (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>, + Requires<[HasNEON]>; // Vector lengthening move with load, matching extending loads. 
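// (For example, a zero-extending load of a v8i8 widened to v8i16 selects
// below to a VLD1d8 followed by a VMOVLuv8i16; the Requires<[HasNEON]>
// added throughout these hunks keeps that combination gated on NEON
// actually being present.)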
@@ -7301,17 +7493,20 @@ multiclass Lengthen_Single { def _Any : Pat<(!cast("v" # DestLanes # DestTy) (!cast("extloadvi" # SrcTy) addrmode6:$addr)), (!cast("VMOVLuv" # DestLanes # DestTy) - (!cast("VLD1d" # SrcTy) addrmode6:$addr))>; + (!cast("VLD1d" # SrcTy) addrmode6:$addr))>, + Requires<[HasNEON]>; def _Z : Pat<(!cast("v" # DestLanes # DestTy) (!cast("zextloadvi" # SrcTy) addrmode6:$addr)), (!cast("VMOVLuv" # DestLanes # DestTy) - (!cast("VLD1d" # SrcTy) addrmode6:$addr))>; + (!cast("VLD1d" # SrcTy) addrmode6:$addr))>, + Requires<[HasNEON]>; def _S : Pat<(!cast("v" # DestLanes # DestTy) (!cast("sextloadvi" # SrcTy) addrmode6:$addr)), (!cast("VMOVLsv" # DestLanes # DestTy) - (!cast("VLD1d" # SrcTy) addrmode6:$addr))>; + (!cast("VLD1d" # SrcTy) addrmode6:$addr))>, + Requires<[HasNEON]>; } } @@ -7328,17 +7523,20 @@ multiclass Lengthen_HalfSingle("extloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast("VMOVLuv" # InsnLanes # InsnTy) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; def _Z : Pat<(!cast("v" # DestLanes # DestTy) (!cast("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast("VMOVLuv" # InsnLanes # InsnTy) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; def _S : Pat<(!cast("v" # DestLanes # DestTy) (!cast("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast("VMOVLsv" # InsnLanes # InsnTy) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; } // The following class definition is basically a copy of the @@ -7352,19 +7550,22 @@ multiclass Lengthen_HalfSingle_Big_Endian("VMOVLuv" # InsnLanes # InsnTy) (!cast("VREV32d" # RevLanes) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; def _Z : Pat<(!cast("v" # DestLanes # DestTy) (!cast("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast("VMOVLuv" # InsnLanes # InsnTy) (!cast("VREV32d" # RevLanes) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; def _S : Pat<(!cast("v" # DestLanes # DestTy) (!cast("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast("VMOVLsv" # InsnLanes # InsnTy) (!cast("VREV32d" # RevLanes) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; } // extload, zextload and sextload for a lengthening load followed by another @@ -7386,19 +7587,22 @@ multiclass Lengthen_Double("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), - dsub_0))>; + dsub_0))>, + Requires<[HasNEON]>; def _Z : Pat<(!cast("v" # DestLanes # DestTy) (!cast("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), - dsub_0))>; + dsub_0))>, + Requires<[HasNEON]>; def _S : Pat<(!cast("v" # DestLanes # DestTy) (!cast("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast("VMOVLsv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn1Lanes # Insn1Ty) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), - dsub_0))>; + dsub_0))>, + Requires<[HasNEON]>; } // The following class definition is basically a copy of the @@ -7414,21 +7618,24 
@@ multiclass Lengthen_Double_Big_Endian("VMOVLuv" # Insn1Lanes # Insn1Ty) (!cast("VREV32d" # RevLanes) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), - dsub_0))>; + dsub_0))>, + Requires<[HasNEON]>; def _Z : Pat<(!cast("v" # DestLanes # DestTy) (!cast("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) (!cast("VREV32d" # RevLanes) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), - dsub_0))>; + dsub_0))>, + Requires<[HasNEON]>; def _S : Pat<(!cast("v" # DestLanes # DestTy) (!cast("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast("VMOVLsv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn1Lanes # Insn1Ty) (!cast("VREV32d" # RevLanes) (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), - dsub_0))>; + dsub_0))>, + Requires<[HasNEON]>; } // extload, zextload and sextload for a lengthening load followed by another @@ -7451,21 +7658,24 @@ multiclass Lengthen_HalfDouble("VMOVLuv" # Insn1Lanes # Insn1Ty) (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; def _Z : Pat<(!cast("v" # DestLanes # DestTy) (!cast("zextloadv" # SrcTy) addrmode6:$addr)), (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; def _S : Pat<(!cast("v" # DestLanes # DestTy) (!cast("sextloadv" # SrcTy) addrmode6:$addr)), (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn1Lanes # Insn1Ty) (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; } // The following class definition is basically a copy of the @@ -7482,7 +7692,8 @@ multiclass Lengthen_HalfDouble_Big_Endian("VREV16d8") (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; def _Z : Pat<(!cast("v" # DestLanes # DestTy) (!cast("zextloadv" # SrcTy) addrmode6:$addr)), (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) @@ -7490,7 +7701,8 @@ multiclass Lengthen_HalfDouble_Big_Endian("VREV16d8") (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; def _S : Pat<(!cast("v" # DestLanes # DestTy) (!cast("sextloadv" # SrcTy) addrmode6:$addr)), (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn2Lanes # Insn2Ty) @@ -7498,14 +7710,15 @@ multiclass Lengthen_HalfDouble_Big_Endian("VREV16d8") (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), - dsub_0)>; + dsub_0)>, + Requires<[HasNEON]>; } defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16 defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32 defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64 -let Predicates = [IsLE] in { +let Predicates = [HasNEON,IsLE] in { defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16 defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32 @@ -7517,7 +7730,7 @@ let Predicates = [IsLE] in { defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; } -let Predicates = [IsBE] in { +let Predicates = [HasNEON,IsBE] in { defm : Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8">; // v4i8 -> v4i16 defm : Lengthen_HalfSingle_Big_Endian<"2", "i32", "i16", "4", "i32", "16">; // 
v2i16 -> v2i32 @@ -7530,7 +7743,7 @@ let Predicates = [IsBE] in { } // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64 -let Predicates = [IsLE] in { +let Predicates = [HasNEON,IsLE] in { def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd16 addrmode6:$addr, @@ -7547,7 +7760,7 @@ let Predicates = [IsLE] in { // The following patterns are basically a copy of the patterns above, // however with an additional VREV16d instruction to convert data // loaded by VLD1LN into proper vector format in big endian mode. -let Predicates = [IsBE] in { +let Predicates = [HasNEON,IsBE] in { def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 (!cast("VREV16d8") @@ -7565,6 +7778,7 @@ let Predicates = [IsBE] in { (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; } +let Predicates = [HasNEON] in { def : Pat<(v2i64 (concat_vectors DPR:$Dn, DPR:$Dm)), (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>; def : Pat<(v4i32 (concat_vectors DPR:$Dn, DPR:$Dm)), @@ -7575,6 +7789,9 @@ def : Pat<(v16i8 (concat_vectors DPR:$Dn, DPR:$Dm)), (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>; def : Pat<(v4f32 (concat_vectors DPR:$Dn, DPR:$Dm)), (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>; +def : Pat<(v8f16 (concat_vectors DPR:$Dn, DPR:$Dm)), + (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>; +} //===----------------------------------------------------------------------===// // Assembler aliases diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index b20b34eaa6a9..cfeb13c6acb6 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -1,9 +1,8 @@ //===-- ARMInstrThumb.td - Thumb support for ARM -----------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -188,6 +187,19 @@ def t_addrmode_rr : MemOperand, let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); } +// t_addrmode_rr_sext := reg + reg +// +// This is similar to t_addrmode_rr, but uses different heuristics for +// ldrsb/ldrsh. 
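+// (Thumb1 ldrsb/ldrsh exist only in the reg+reg form, with no immediate
+// offset encoding, so for them it can be profitable to materialise a zero
+// offset register; a separate operand lets the selector apply that
+// heuristic without affecting ldr/ldrb/ldrh.)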
+def t_addrmode_rr_sext : MemOperand, + ComplexPattern { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let PrintMethod = "printThumbAddrModeRROperand"; + let DecoderMethod = "DecodeThumbAddrModeRR"; + let ParserMatchClass = t_addrmode_rr_asm_operand; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} + // t_addrmode_rrs := reg + reg // // We use separate scaled versions because the Select* functions need @@ -651,7 +663,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, "ldr", "\t$Rt, $addr", [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, - T1Encoding<{0,1,0,0,1,?}> { + T1Encoding<{0,1,0,0,1,?}>, Sched<[WriteLd]> { // A6.2 & A8.6.59 bits<3> Rt; bits<8> addr; @@ -665,7 +677,7 @@ let canFoldAsLoad = 1 in def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, "ldr", "\t$Rt, $addr", [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, - T1LdStSP<{1,?,?}> { + T1LdStSP<{1,?,?}>, Sched<[WriteLd]> { bits<3> Rt; bits<8> addr; let Inst{10-8} = Rt; @@ -716,39 +728,39 @@ multiclass thumb_st_rr_ri_enc reg_opc, bits<4> imm_opc, defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iLoad_r, IIC_iLoad_i, "ldr", - load>; + load>, Sched<[WriteLd]>; // A8.6.64 & A8.6.61 defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb", - zextloadi8>; + zextloadi8>, Sched<[WriteLd]>; // A8.6.76 & A8.6.73 defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh", - zextloadi16>; + zextloadi16>, Sched<[WriteLd]>; let AddedComplexity = 10 in def tLDRSB : // A8.6.80 - T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr), + T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr), AddrModeT1_1, IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr", - [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>; + [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>; let AddedComplexity = 10 in def tLDRSH : // A8.6.84 - T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr), + T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr), AddrModeT1_2, IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr", - [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>; + [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>; def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, "str", "\t$Rt, $addr", [(store tGPR:$Rt, t_addrmode_sp:$addr)]>, - T1LdStSP<{0,?,?}> { + T1LdStSP<{0,?,?}>, Sched<[WriteST]> { bits<3> Rt; bits<8> addr; let Inst{10-8} = Rt; @@ -759,19 +771,19 @@ def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iStore_r, IIC_iStore_i, "str", - store>; + store>, Sched<[WriteST]>; // A8.6.197 & A8.6.195 defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iStore_bh_r, IIC_iStore_bh_i, "strb", - truncstorei8>; + truncstorei8>, Sched<[WriteST]>; // A8.6.207 & A8.6.205 defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iStore_bh_r, IIC_iStore_bh_i, "strh", - truncstorei16>; + truncstorei16>, Sched<[WriteST]>; //===----------------------------------------------------------------------===// @@ -799,8 +811,8 @@ def 
tLDMIA_UPD : "$Rn = $wb", IIC_iLoad_mu>, PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> { let Size = 2; - let OutOperandList = (outs GPR:$wb); - let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops); + let OutOperandList = (outs tGPR:$wb); + let InOperandList = (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops); let Pattern = []; let isCodeGenOnly = 1; let isPseudo = 1; @@ -809,7 +821,7 @@ def tLDMIA_UPD : // There is no non-writeback version of STM for Thumb. let mayStore = 1, hasExtraSrcRegAllocReq = 1 in -def tSTMIA_UPD : Thumb1I<(outs GPR:$wb), +def tSTMIA_UPD : Thumb1I<(outs tGPR:$wb), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), AddrModeNone, 2, IIC_iStore_mu, "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>, @@ -831,7 +843,7 @@ let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1, def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), IIC_iPop, "pop${p}\t$regs", []>, - T1Misc<{1,1,0,?,?,?,?}> { + T1Misc<{1,1,0,?,?,?,?}>, Sched<[WriteLd]> { bits<16> regs; let Inst{8} = regs{15}; let Inst{7-0} = regs{7-0}; @@ -841,7 +853,7 @@ let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), IIC_iStore_m, "push${p}\t$regs", []>, - T1Misc<{0,1,0,?,?,?,?}> { + T1Misc<{0,1,0,?,?,?,?}>, Sched<[WriteST]> { bits<16> regs; let Inst{8} = regs{14}; let Inst{7-0} = regs{7-0}; @@ -1202,7 +1214,7 @@ def tMUL : // A8.6.105 T1 Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2, IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd", [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>, - T1DataProcessing<0b1101> { + T1DataProcessing<0b1101>, Sched<[WriteMUL32, ReadMUL, ReadMUL]> { bits<3> Rd; bits<3> Rn; let Inst{5-3} = Rn; @@ -1499,12 +1511,13 @@ def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val), // FIXME: Non-IOS version(s) let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1, Defs = [ R7, LR, SP ] in -def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch), +def tInt_eh_sjlj_longjmp : XI<(outs), (ins tGPR:$src, tGPR:$scratch), AddrModeNone, 0, IndexModeNone, Pseudo, NoItinerary, "", "", - [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, + [(ARMeh_sjlj_longjmp tGPR:$src, tGPR:$scratch)]>, Requires<[IsThumb,IsNotWindows]>; +// (Windows is Thumb2-only) let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1, Defs = [ R11, LR, SP ] in def tInt_WIN_eh_sjlj_longjmp @@ -1599,16 +1612,16 @@ def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>; // and expand it just after ISel. let usesCustomInserter = 1, mayLoad =1, Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in - def tLDR_postidx: tPseudoInst<(outs rGPR:$Rt, rGPR:$Rn_wb), - (ins rGPR:$Rn, pred:$p), + def tLDR_postidx: tPseudoInst<(outs tGPR:$Rt, tGPR:$Rn_wb), + (ins tGPR:$Rn, pred:$p), 4, IIC_iStore_ru, []>; // post-inc STR -> STM r0!, {r1}. The layout of this (because it doesn't def // multiple registers) is the same in ISel as MachineInstr, so there's no need // for a pseudo. -def : T1Pat<(post_store rGPR:$Rt, rGPR:$Rn, 4), - (tSTMIA_UPD rGPR:$Rn, rGPR:$Rt)>; +def : T1Pat<(post_store tGPR:$Rt, tGPR:$Rn, 4), + (tSTMIA_UPD tGPR:$Rn, tGPR:$Rt)>; // If it's impossible to use [r,r] address mode for sextload, select to // ldr{b|h} + sxt{b|h} instead. @@ -1677,9 +1690,9 @@ def : T1Pat<(i32 imm256_510:$src), // be expanded into two instructions late to allow if-conversion and // scheduling. 
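// (The late expansion, done in ARMExpandPseudoInsts, is a tLDRpci of the
// constant pool entry followed by a pc-relative tPICADD.)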
let isReMaterializable = 1 in -def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), +def tLDRpci_pic : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr, pclabel:$cp), NoItinerary, - [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), + [(set tGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), imm:$cp))]>, Requires<[IsThumb, IsThumb1Only]>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 7a6673b49d57..7cbfaba7a8eb 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -1,9 +1,8 @@ //===-- ARMInstrThumb2.td - Thumb2 support for ARM ---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -26,6 +25,7 @@ def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; } def it_mask : Operand { let PrintMethod = "printThumbITMask"; let ParserMatchClass = it_mask_asmoperand; + let EncoderMethod = "getITMaskOpValue"; } // t2_shift_imm: An integer that encodes a shift amount and the type of shift @@ -40,6 +40,16 @@ def t2_shift_imm : Operand { let DecoderMethod = "DecodeT2ShifterImmOperand"; } +def mve_shift_imm : AsmOperandClass { + let Name = "MVELongShift"; + let RenderMethod = "addImmOperands"; + let DiagnosticString = "operand must be an immediate in the range [1,32]"; +} +def long_shift : Operand { + let ParserMatchClass = mve_shift_imm; + let DecoderMethod = "DecodeLongShiftOperand"; +} + // Shifted operands. No register controlled shifts for Thumb2. // Note: We do not support rrx shifted operands yet. def t2_so_reg : Operand, // reg imm @@ -151,6 +161,26 @@ def lo5AllOne : PatLeaf<(i32 imm), [{ // Define Thumb2 specific addressing modes. 
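+// (Alongside the existing imm8/imm12 offset forms, the v8.1-M additions
+// below introduce no-offset operands and 7-bit scaled immediate offsets
+// such as t2addrmode_imm7s4.)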
+// t2_addr_offset_none := reg +def MemNoOffsetT2AsmOperand + : AsmOperandClass { let Name = "MemNoOffsetT2"; } +def t2_addr_offset_none : MemOperand { + let PrintMethod = "printAddrMode7Operand"; + let DecoderMethod = "DecodeGPRnopcRegisterClass"; + let ParserMatchClass = MemNoOffsetT2AsmOperand; + let MIOperandInfo = (ops GPRnopc:$base); +} + +// t2_nosp_addr_offset_none := reg +def MemNoOffsetT2NoSpAsmOperand + : AsmOperandClass { let Name = "MemNoOffsetT2NoSp"; } +def t2_nosp_addr_offset_none : MemOperand { + let PrintMethod = "printAddrMode7Operand"; + let DecoderMethod = "DecoderGPRRegisterClass"; + let ParserMatchClass = MemNoOffsetT2NoSpAsmOperand; + let MIOperandInfo = (ops rGPR:$base); +} + // t2addrmode_imm12 := reg + imm12 def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";} def t2addrmode_imm12 : MemOperand, @@ -182,31 +212,40 @@ def t2adrlabel : Operand { } // t2addrmode_posimm8 := reg + imm8 -def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";} +def MemPosImm8OffsetAsmOperand : AsmOperandClass { + let Name="MemPosImm8Offset"; + let RenderMethod = "addMemImmOffsetOperands"; +} def t2addrmode_posimm8 : MemOperand { let PrintMethod = "printT2AddrModeImm8Operand"; - let EncoderMethod = "getT2AddrModeImm8OpValue"; + let EncoderMethod = "getT2AddrModeImmOpValue<8,0>"; let DecoderMethod = "DecodeT2AddrModeImm8"; let ParserMatchClass = MemPosImm8OffsetAsmOperand; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); } // t2addrmode_negimm8 := reg - imm8 -def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";} +def MemNegImm8OffsetAsmOperand : AsmOperandClass { + let Name="MemNegImm8Offset"; + let RenderMethod = "addMemImmOffsetOperands"; +} def t2addrmode_negimm8 : MemOperand, ComplexPattern { let PrintMethod = "printT2AddrModeImm8Operand"; - let EncoderMethod = "getT2AddrModeImm8OpValue"; + let EncoderMethod = "getT2AddrModeImmOpValue<8,0>"; let DecoderMethod = "DecodeT2AddrModeImm8"; let ParserMatchClass = MemNegImm8OffsetAsmOperand; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); } // t2addrmode_imm8 := reg +/- imm8 -def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; } +def MemImm8OffsetAsmOperand : AsmOperandClass { + let Name = "MemImm8Offset"; + let RenderMethod = "addMemImmOffsetOperands"; +} class T2AddrMode_Imm8 : MemOperand, ComplexPattern { - let EncoderMethod = "getT2AddrModeImm8OpValue"; + let EncoderMethod = "getT2AddrModeImmOpValue<8,0>"; let DecoderMethod = "DecodeT2AddrModeImm8"; let ParserMatchClass = MemImm8OffsetAsmOperand; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); @@ -248,10 +287,38 @@ def t2addrmode_imm8s4_pre : T2AddrMode_Imm8s4 { def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; } def t2am_imm8s4_offset : MemOperand { let PrintMethod = "printT2AddrModeImm8s4OffsetOperand"; - let EncoderMethod = "getT2Imm8s4OpValue"; + let EncoderMethod = "getT2ScaledImmOpValue<8,2>"; let DecoderMethod = "DecodeT2Imm8S4"; } +// t2addrmode_imm7s4 := reg +/- (imm7 << 2) +def MemImm7s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm7s4Offset";} +class T2AddrMode_Imm7s4 : MemOperand { + let EncoderMethod = "getT2AddrModeImm7s4OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm7<2,0>"; + let ParserMatchClass = MemImm7s4OffsetAsmOperand; + let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm); +} + +def t2addrmode_imm7s4 : T2AddrMode_Imm7s4 { + // They are printed the same way as the imm8 version + let PrintMethod = 
"printT2AddrModeImm8s4Operand"; +} + +def t2addrmode_imm7s4_pre : T2AddrMode_Imm7s4 { + // They are printed the same way as the imm8 version + let PrintMethod = "printT2AddrModeImm8s4Operand"; +} + +def t2am_imm7s4_offset_asmoperand : AsmOperandClass { let Name = "Imm7s4"; } +def t2am_imm7s4_offset : MemOperand { + // They are printed the same way as the imm8 version + let PrintMethod = "printT2AddrModeImm8s4OffsetOperand"; + let ParserMatchClass = t2am_imm7s4_offset_asmoperand; + let EncoderMethod = "getT2ScaledImmOpValue<7,2>"; + let DecoderMethod = "DecodeT2Imm7S4"; +} + // t2addrmode_imm0_1020s4 := reg + (imm8 << 2) def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass { let Name = "MemImm0_1020s4Offset"; @@ -290,6 +357,75 @@ def addrmode_tbh : MemOperand { let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm); } +// Define ARMv8.1-M specific addressing modes. + +// Label operands for BF/BFL/WLS/DLS/LE +class BFLabelOp + : Operand { + let EncoderMethod = !strconcat("getBFTargetOpValue<", isNeg, ", ", + fixup, ">"); + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = !strconcat("DecodeBFLabelOperand<", signed, ", ", + isNeg, ", ", zeroPermitted, ", ", size, ">"); +} +def bflabel_u4 : BFLabelOp<"false", "false", "false", "4", "ARM::fixup_bf_branch">; +def bflabel_s12 : BFLabelOp<"true", "false", "true", "12", "ARM::fixup_bfc_target">; +def bflabel_s16 : BFLabelOp<"true", "false", "true", "16", "ARM::fixup_bf_target">; +def bflabel_s18 : BFLabelOp<"true", "false", "true", "18", "ARM::fixup_bfl_target">; + +def wlslabel_u11_asmoperand : AsmOperandClass { + let Name = "WLSLabel"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isUnsignedOffset<11, 1>"; + let DiagnosticString = + "loop end is out of range or not a positive multiple of 2"; +} +def wlslabel_u11 : BFLabelOp<"false", "false", "true", "11", "ARM::fixup_wls"> { + let ParserMatchClass = wlslabel_u11_asmoperand; +} +def lelabel_u11_asmoperand : AsmOperandClass { + let Name = "LELabel"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isLEOffset"; + let DiagnosticString = + "loop start is out of range or not a negative multiple of 2"; +} +def lelabel_u11 : BFLabelOp<"false", "true", "true", "11", "ARM::fixup_le"> { + let ParserMatchClass = lelabel_u11_asmoperand; +} + +def bfafter_target : Operand { + let EncoderMethod = "getBFAfterTargetOpValue"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBFAfterTargetOperand"; +} + +// pred operand excluding AL +def pred_noal_asmoperand : AsmOperandClass { + let Name = "CondCodeNoAL"; + let RenderMethod = "addITCondCodeOperands"; + let PredicateMethod = "isITCondCodeNoAL"; + let ParserMethod = "parseITCondCode"; +} +def pred_noal : Operand { + let PrintMethod = "printMandatoryPredicateOperand"; + let ParserMatchClass = pred_noal_asmoperand; + let DecoderMethod = "DecodePredNoALOperand"; +} + + +// CSEL aliases inverted predicate +def pred_noal_inv_asmoperand : AsmOperandClass { + let Name = "CondCodeNoALInv"; + let RenderMethod = "addITCondCodeInvOperands"; + let PredicateMethod = "isITCondCodeNoAL"; + let ParserMethod = "parseITCondCode"; +} +def pred_noal_inv : Operand { + let PrintMethod = "printMandatoryInvertedPredicateOperand"; + let ParserMatchClass = pred_noal_inv_asmoperand; +} //===----------------------------------------------------------------------===// // Multiclass helpers... 
//

@@ -604,6 +740,17 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
    let Inst{31-27} = 0b11101;
    let Inst{26-25} = 0b01;
    let Inst{24-21} = opcod;
+    let Inst{15} = 0b0;
+    // In most of these instructions, and most versions of the Arm
+    // architecture, bit 15 of this encoding is listed as (0) rather
+    // than 0, i.e. setting it to 1 is UNPREDICTABLE or a soft-fail
+    // rather than a hard failure. In v8.1-M, this requirement is
+    // upgraded to a hard one for ORR, so that the encodings with 1
+    // in this bit can be reused for other instructions (such as
+    // CSEL). Setting Unpredictable{15} = 1 here would reintroduce
+    // that encoding clash in the auto-generated MC decoder, so I
+    // comment it out.
+    let Unpredictable{15} = !if(!eq(opcod, 0b0010), 0b0, 0b1);
    let Inst{14-12} = 0b000; // imm3
    let Inst{7-6} = 0b00;    // imm2
    let Inst{5-4} = 0b00;    // type

@@ -617,6 +764,8 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
    let Inst{31-27} = 0b11101;
    let Inst{26-25} = 0b01;
    let Inst{24-21} = opcod;
+    let Inst{15} = 0;
+    let Unpredictable{15} = !if(!eq(opcod, 0b0010), 0b0, 0b1); // see above
  }
  // Assembly aliases for optional destination operand when it's the same
  // as the source operand.

@@ -880,6 +1029,7 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, SDNode opnode> {
    let Inst{31-27} = 0b11101;
    let Inst{26-21} = 0b010010;
    let Inst{19-16} = 0b1111; // Rn
+    let Inst{15} = 0b0;
    let Inst{5-4} = opcod;
  }
  // register

@@ -923,15 +1073,15 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, SDNode opnode> {
/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
/// patterns. Similar to T2I_bin_irs except the instruction does not produce
/// an explicit result, only implicitly sets CPSR.
-multiclass T2I_cmp_irs<bits<4> opcod, string opc,
+multiclass T2I_cmp_irs<bits<4> opcod, string opc, RegisterClass LHSGPR,
                       InstrItinClass iii, InstrItinClass iir,
                       InstrItinClass iis, SDPatternOperator opnode> {
let isCompare = 1, Defs = [CPSR] in {
   // shifted imm
   def ri : T2OneRegCmpImm<
-             (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
+             (outs), (ins LHSGPR:$Rn, t2_so_imm:$imm), iii,
              opc, ".w\t$Rn, $imm",
-             [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> {
+             [(opnode LHSGPR:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> {
     let Inst{31-27} = 0b11110;
     let Inst{25} = 0;
     let Inst{24-21} = opcod;

@@ -941,9 +1091,9 @@ let isCompare = 1, Defs = [CPSR] in {
   // register
   def rr : T2TwoRegCmp<
-             (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
+             (outs), (ins LHSGPR:$Rn, rGPR:$Rm), iir,
              opc, ".w\t$Rn, $Rm",
-             [(opnode GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> {
+             [(opnode LHSGPR:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;

@@ -955,9 +1105,9 @@ let isCompare = 1, Defs = [CPSR] in {
   // shifted register
   def rs : T2OneRegCmpShiftedReg<
-             (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
+             (outs), (ins LHSGPR:$Rn, t2_so_reg:$ShiftedRm), iis,
              opc, ".w\t$Rn, $ShiftedRm",
-             [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+             [(opnode LHSGPR:$Rn, t2_so_reg:$ShiftedRm)]>,
              Sched<[WriteCMPsi]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;

@@ -971,9 +1121,9 @@ let isCompare = 1, Defs = [CPSR] in {
   // No alias here for 'rr' version as not all instantiations of this
   // multiclass want one (CMP in particular, does not).
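   // (Presumably because CMP's unsuffixed register-register form is already
   // covered by the 16-bit tCMPr/tCMPhir encodings.)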
def : t2InstAlias(NAME#"ri") GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>; + (!cast(NAME#"ri") LHSGPR:$Rn, t2_so_imm:$imm, pred:$p)>; def : t2InstAlias(NAME#"rs") GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>; + (!cast(NAME#"rs") LHSGPR:$Rn, t2_so_reg:$shift, pred:$p)>; } /// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns. @@ -1334,7 +1484,8 @@ def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, - "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>, + Sched<[WriteLd]>; def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), (ins t2addrmode_imm8_pre:$addr), @@ -1872,6 +2023,7 @@ def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rm), IIC_iMOVr, let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0b0; let Inst{14-12} = 0b000; let Inst{7-4} = 0b0000; } @@ -2148,6 +2300,11 @@ def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), def : T2Pat<(add GPR:$src, imm0_65535_neg:$imm), (t2SUBrr GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; +// Do the same for v8m targets since they support movw with a 16-bit value. +def : T1Pat<(add tGPR:$src, imm0_65535_neg:$imm), + (tSUBrr tGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>, + Requires<[HasV8MBaseline]>; + let AddedComplexity = 1 in def : T2Pat<(ARMaddc rGPR:$src, imm1_255_neg:$imm), (t2SUBSri rGPR:$src, imm1_255_neg:$imm)>; @@ -2327,14 +2484,14 @@ class T2SatI def t2SSAT: T2SatI<(ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), "ssat", "\t$Rd, $sat_imm, $Rn$sh">, - Requires<[IsThumb2]> { + Requires<[IsThumb2]>, Sched<[WriteALU]> { let Inst{23-22} = 0b00; let Inst{5} = 0; } def t2SSAT16: T2SatI<(ins imm1_16:$sat_imm, rGPR:$Rn), "ssat16", "\t$Rd, $sat_imm, $Rn">, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]>, Sched<[WriteALU]> { let Inst{23-22} = 0b00; let sh = 0b100000; let Inst{4} = 0; @@ -2342,13 +2499,13 @@ def t2SSAT16: T2SatI<(ins imm1_16:$sat_imm, rGPR:$Rn), def t2USAT: T2SatI<(ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), "usat", "\t$Rd, $sat_imm, $Rn$sh">, - Requires<[IsThumb2]> { + Requires<[IsThumb2]>, Sched<[WriteALU]> { let Inst{23-22} = 0b10; } def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn), "usat16", "\t$Rd, $sat_imm, $Rn">, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]>, Sched<[WriteALU]> { let Inst{23-22} = 0b10; let sh = 0b100000; let Inst{4} = 0; @@ -2395,6 +2552,8 @@ def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0b0; + let Unpredictable{15} = 0b1; let Inst{14-12} = 0b000; let Inst{7-4} = 0b0011; } @@ -2472,7 +2631,7 @@ class T2TwoRegBitFI { + [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]>, Sched<[WriteALU]> { let Inst{31-27} = 0b11110; let Inst{26} = 0; // should be 0. 
let Inst{25} = 1; @@ -2488,7 +2647,7 @@ def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm), def t2SBFX: T2TwoRegBitFI< (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb), - IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> { + IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []>, Sched<[WriteALU]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-20} = 0b10100; @@ -2497,7 +2656,7 @@ def t2SBFX: T2TwoRegBitFI< def t2UBFX: T2TwoRegBitFI< (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb), - IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> { + IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []>, Sched<[WriteALU]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-20} = 0b11100; @@ -2523,7 +2682,7 @@ let Constraints = "$src = $Rd" in { (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm), IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm", [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn, - bf_inv_mask_imm:$imm))]> { + bf_inv_mask_imm:$imm))]>, Sched<[WriteALU]> { let Inst{31-27} = 0b11110; let Inst{26} = 0; // should be 0. let Inst{25} = 1; @@ -2597,7 +2756,8 @@ def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm), // top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise def top16Zero: PatLeaf<(i32 rGPR:$src), [{ - return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); + return !SDValue(N,0)->getValueType(0).isVector() && + CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); }]>; // so_imm_notSext is needed instead of so_imm_not, as the value of imm @@ -3054,7 +3214,7 @@ def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>; //===----------------------------------------------------------------------===// // Comparison Instructions... // -defm t2CMP : T2I_cmp_irs<0b1101, "cmp", +defm t2CMP : T2I_cmp_irs<0b1101, "cmp", GPRnopc, IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, ARMcmp>; def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_imm:$imm), @@ -3122,10 +3282,10 @@ def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm), def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm), (t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>; -defm t2TST : T2I_cmp_irs<0b0000, "tst", +defm t2TST : T2I_cmp_irs<0b0000, "tst", rGPR, IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>; -defm t2TEQ : T2I_cmp_irs<0b0100, "teq", +defm t2TEQ : T2I_cmp_irs<0b0100, "teq", rGPR, IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>; @@ -3277,17 +3437,17 @@ def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldrexb", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]>; + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteLd]>; def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldrexh", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]>; + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteLd]>; def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr), AddrModeT2_ldrex, 4, NoItinerary, "ldrex", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]> { + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteLd]> { bits<4> Rt; bits<12> addr; let Inst{31-27} = 0b11101; @@ -3303,7 +3463,7 @@ def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2), AddrModeNone, 4, NoItinerary, "ldrexd", 
"\t$Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}>, - Requires<[IsThumb2, IsNotMClass]> { + Requires<[IsThumb2, IsNotMClass]>, Sched<[WriteLd]> { bits<4> Rt2; let Inst{11-8} = Rt2; } @@ -3311,17 +3471,17 @@ def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldaexb", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>; + Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, Sched<[WriteLd]>; def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldaexh", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>; + Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, Sched<[WriteLd]>; def t2LDAEX : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldaex", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> { + Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, Sched<[WriteLd]> { bits<4> Rt; bits<4> addr; let Inst{31-27} = 0b11101; @@ -3337,7 +3497,7 @@ def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2), AddrModeNone, 4, NoItinerary, "ldaexd", "\t$Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}>, Requires<[IsThumb, - HasAcquireRelease, HasV7Clrex, IsNotMClass]> { + HasAcquireRelease, HasV7Clrex, IsNotMClass]>, Sched<[WriteLd]> { bits<4> Rt2; let Inst{11-8} = Rt2; @@ -3352,14 +3512,14 @@ def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd), "strexb", "\t$Rd, $Rt, $addr", "", [(set rGPR:$Rd, (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]>; + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteST]>; def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd), (ins rGPR:$Rt, addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "strexh", "\t$Rd, $Rt, $addr", "", [(set rGPR:$Rd, (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]>; + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteST]>; def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_imm0_1020s4:$addr), @@ -3367,7 +3527,7 @@ def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, "strex", "\t$Rd, $Rt, $addr", "", [(set rGPR:$Rd, (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]> { + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteST]> { bits<4> Rd; bits<4> Rt; bits<12> addr; @@ -3384,7 +3544,7 @@ def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd), AddrModeNone, 4, NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}>, - Requires<[IsThumb2, IsNotMClass]> { + Requires<[IsThumb2, IsNotMClass]>, Sched<[WriteST]> { bits<4> Rt2; let Inst{11-8} = Rt2; } @@ -3395,7 +3555,7 @@ def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd), [(set rGPR:$Rd, (stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>, Requires<[IsThumb, HasAcquireRelease, - HasV7Clrex]>; + HasV7Clrex]>, Sched<[WriteST]>; def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd), (ins rGPR:$Rt, addr_offset_none:$addr), @@ -3404,7 +3564,7 @@ def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd), [(set rGPR:$Rd, (stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>, Requires<[IsThumb, HasAcquireRelease, - HasV7Clrex]>; + HasV7Clrex]>, Sched<[WriteST]>; def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, addr_offset_none:$addr), @@ -3412,7 +3572,8 @@ def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, "stlex", "\t$Rd, $Rt, $addr", "", [(set rGPR:$Rd, 
(stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>, - Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> { + Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, + Sched<[WriteST]> { bits<4> Rd; bits<4> Rt; bits<4> addr; @@ -3429,7 +3590,7 @@ def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd), AddrModeNone, 4, NoItinerary, "stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}>, Requires<[IsThumb, HasAcquireRelease, - HasV7Clrex, IsNotMClass]> { + HasV7Clrex, IsNotMClass]>, Sched<[WriteST]> { bits<4> Rt2; let Inst{11-8} = Rt2; } @@ -4547,9 +4708,9 @@ def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm", def : t2InstAlias<"cmn${p} $Rn, $Rm", (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; def : t2InstAlias<"teq${p} $Rn, $Rm", - (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; + (t2TEQrr rGPR:$Rn, rGPR:$Rm, pred:$p)>; def : t2InstAlias<"tst${p} $Rn, $Rm", - (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; + (t2TSTrr rGPR:$Rn, rGPR:$Rm, pred:$p)>; // Memory barriers def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>; @@ -4888,3 +5049,227 @@ def : t2InstAlias<"pld${p} $addr", def : InstAlias<"pli${p} $addr", (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>, Requires<[IsThumb2,HasV7]>; + + +//===----------------------------------------------------------------------===// +// ARMv8.1m instructions +// + +class V8_1MI pattern> + : Thumb2XI, + Requires<[HasV8_1MMainline]>; + +def t2CLRM : V8_1MI<(outs), + (ins pred:$p, reglist_with_apsr:$regs, variable_ops), + AddrModeNone, NoItinerary, "clrm", "${p}\t$regs", "", []> { + bits<16> regs; + + let Inst{31-16} = 0b1110100010011111; + let Inst{15-14} = regs{15-14}; + let Inst{13} = 0b0; + let Inst{12-0} = regs{12-0}; +} + +class t2BF + : V8_1MI<(outs ), iops, AddrModeNone, NoItinerary, asm, ops, "", []> { + + let Inst{31-27} = 0b11110; + let Inst{15-14} = 0b11; + let Inst{12} = 0b0; + let Inst{0} = 0b1; + + let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB]; +} + +def t2BF_LabelPseudo + : t2PseudoInst<(outs ), (ins pclabel:$cp), 0, NoItinerary, []> { + let isTerminator = 1; + let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB]; +} + +def t2BFi : t2BF<(ins bflabel_u4:$b_label, bflabel_s16:$label, pred:$p), + !strconcat("bf", "${p}"), "$b_label, $label"> { + bits<4> b_label; + bits<16> label; + + let Inst{26-23} = b_label{3-0}; + let Inst{22-21} = 0b10; + let Inst{20-16} = label{15-11}; + let Inst{13} = 0b1; + let Inst{11} = label{0}; + let Inst{10-1} = label{10-1}; +} + +def t2BFic : t2BF<(ins bflabel_u4:$b_label, bflabel_s12:$label, + bfafter_target:$ba_label, pred_noal:$bcond), "bfcsel", + "$b_label, $label, $ba_label, $bcond"> { + bits<4> bcond; + bits<12> label; + bits<1> ba_label; + bits<4> b_label; + + let Inst{26-23} = b_label{3-0}; + let Inst{22} = 0b0; + let Inst{21-18} = bcond{3-0}; + let Inst{17} = ba_label{0}; + let Inst{16} = label{11}; + let Inst{13} = 0b1; + let Inst{11} = label{0}; + let Inst{10-1} = label{10-1}; +} + +def t2BFr : t2BF<(ins bflabel_u4:$b_label, rGPR:$Rn, pred:$p), + !strconcat("bfx", "${p}"), "$b_label, $Rn"> { + bits<4> b_label; + bits<4> Rn; + + let Inst{26-23} = b_label{3-0}; + let Inst{22-20} = 0b110; + let Inst{19-16} = Rn{3-0}; + let Inst{13-1} = 0b1000000000000; +} + +def t2BFLi : t2BF<(ins bflabel_u4:$b_label, bflabel_s18:$label, pred:$p), + !strconcat("bfl", "${p}"), "$b_label, $label"> { + bits<4> b_label; + bits<18> label; + + let Inst{26-23} = b_label{3-0}; + let Inst{22-16} = label{17-11}; + let Inst{13} = 0b0; + let Inst{11} = label{0}; + let Inst{10-1} = label{10-1}; +} + +def t2BFLr : t2BF<(ins 
bflabel_u4:$b_label, rGPR:$Rn, pred:$p), + !strconcat("bflx", "${p}"), "$b_label, $Rn"> { + bits<4> b_label; + bits<4> Rn; + + let Inst{26-23} = b_label{3-0}; + let Inst{22-20} = 0b111; + let Inst{19-16} = Rn{3-0}; + let Inst{13-1} = 0b1000000000000; +} + +class t2LOL<dag oops, dag iops, string asm, string ops> + : V8_1MI<oops, iops, AddrModeNone, NoItinerary, asm, ops, "", []> { + let Inst{31-23} = 0b111100000; + let Inst{15-14} = 0b11; + let Inst{0} = 0b1; + let isBranch = 1; + let isTerminator = 1; + let DecoderMethod = "DecodeLOLoop"; + let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB]; +} + +let isNotDuplicable = 1 in { +def t2WLS : t2LOL<(outs GPRlr:$LR), + (ins rGPR:$Rn, wlslabel_u11:$label), + "wls", "$LR, $Rn, $label"> { + bits<4> Rn; + bits<11> label; + let Inst{22-20} = 0b100; + let Inst{19-16} = Rn{3-0}; + let Inst{13-12} = 0b00; + let Inst{11} = label{0}; + let Inst{10-1} = label{10-1}; + let usesCustomInserter = 1; +} + +def t2DLS : t2LOL<(outs GPRlr:$LR), (ins rGPR:$Rn), + "dls", "$LR, $Rn"> { + bits<4> Rn; + let isBranch = 0; + let isTerminator = 0; + let Inst{22-20} = 0b100; + let Inst{19-16} = Rn{3-0}; + let Inst{13-1} = 0b1000000000000; + let usesCustomInserter = 1; +} + +def t2LEUpdate : t2LOL<(outs GPRlr:$LRout), + (ins GPRlr:$LRin, lelabel_u11:$label), + "le", "$LRin, $label"> { + bits<11> label; + let Inst{22-16} = 0b0001111; + let Inst{13-12} = 0b00; + let Inst{11} = label{0}; + let Inst{10-1} = label{10-1}; + let usesCustomInserter = 1; +} + +def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> { + bits<11> label; + let Inst{22-16} = 0b0101111; + let Inst{13-12} = 0b00; + let Inst{11} = label{0}; + let Inst{10-1} = label{10-1}; +} + +def t2DoLoopStart : + t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br, + [(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>; + +def t2LoopDec : + t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), + 4, IIC_Br, []>, Sched<[WriteBr]>; + +let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in { +def t2WhileLoopStart : + t2PseudoInst<(outs), + (ins rGPR:$elts, brtarget:$target), + 4, IIC_Br, []>, + Sched<[WriteBr]>; + +def t2LoopEnd : + t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target), + 8, IIC_Br, []>, Sched<[WriteBr]>; + +} // end isBranch, isTerminator, hasSideEffects + +} // end isNotDuplicable + +class CS<string iname, bits<4> opcode, list<dag> pattern=[]> + : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZR:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond), + AddrModeNone, NoItinerary, iname, "$Rd, $Rn, $Rm, $fcond", "", pattern> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + bits<4> fcond; + + let Inst{31-20} = 0b111010100101; + let Inst{19-16} = Rn{3-0}; + let Inst{15-12} = opcode; + let Inst{11-8} = Rd{3-0}; + let Inst{7-4} = fcond{3-0}; + let Inst{3-0} = Rm{3-0}; + + let Uses = [CPSR]; +} + +def t2CSEL : CS<"csel", 0b1000>; +def t2CSINC : CS<"csinc", 0b1001>; +def t2CSINV : CS<"csinv", 0b1010>; +def t2CSNEG : CS<"csneg", 0b1011>; + + +// CS aliases.
+let Predicates = [HasV8_1MMainline] in { + def : InstAlias<"csetm\t$Rd, $fcond", + (t2CSINV rGPR:$Rd, ZR, ZR, pred_noal_inv:$fcond)>; + + def : InstAlias<"cset\t$Rd, $fcond", + (t2CSINC rGPR:$Rd, ZR, ZR, pred_noal_inv:$fcond)>; + + def : InstAlias<"cinc\t$Rd, $Rn, $fcond", + (t2CSINC rGPR:$Rd, GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rn, pred_noal_inv:$fcond)>; + + def : InstAlias<"cinv\t$Rd, $Rn, $fcond", + (t2CSINV rGPR:$Rd, GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rn, pred_noal_inv:$fcond)>; + + def : InstAlias<"cneg\t$Rd, $Rn, $fcond", + (t2CSNEG rGPR:$Rd, GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rn, pred_noal_inv:$fcond)>; +} diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index b58730c452f7..a0dd25de07ee 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -1,9 +1,8 @@ //===-- ARMInstrVFP.td - VFP support for ARM ---------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -- -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -53,28 +52,50 @@ def vfp_f16imm : Operand<f16>, let ParserMatchClass = FPImmOperand; } -def vfp_f32imm : Operand<f32>, - PatLeaf<(f32 fpimm), [{ - return ARM_AM::getFP32Imm(N->getValueAPF()) != -1; - }], SDNodeXForm<fpimm, [{ +def vfp_f32imm_xform : SDNodeXForm<fpimm, [{ APFloat InVal = N->getValueAPF(); uint32_t enc = ARM_AM::getFP32Imm(InVal); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); - }]>> { + }]>; + +def gi_vfp_f32imm : GICustomOperandRenderer<"renderVFPF32Imm">, + GISDNodeXFormEquiv<vfp_f32imm_xform>; + +def vfp_f32imm : Operand<f32>, + PatLeaf<(f32 fpimm), [{ + return ARM_AM::getFP32Imm(N->getValueAPF()) != -1; + }], vfp_f32imm_xform> { let PrintMethod = "printFPImmOperand"; let ParserMatchClass = FPImmOperand; + let GISelPredicateCode = [{ + const auto &MO = MI.getOperand(1); + if (!MO.isFPImm()) + return false; + return ARM_AM::getFP32Imm(MO.getFPImm()->getValueAPF()) != -1; + }]; } -def vfp_f64imm : Operand<f64>, - PatLeaf<(f64 fpimm), [{ - return ARM_AM::getFP64Imm(N->getValueAPF()) != -1; - }], SDNodeXForm<fpimm, [{ +def vfp_f64imm_xform : SDNodeXForm<fpimm, [{ APFloat InVal = N->getValueAPF(); uint32_t enc = ARM_AM::getFP64Imm(InVal); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); - }]>> { + }]>; + +def gi_vfp_f64imm : GICustomOperandRenderer<"renderVFPF64Imm">, + GISDNodeXFormEquiv<vfp_f64imm_xform>; + +def vfp_f64imm : Operand<f64>, + PatLeaf<(f64 fpimm), [{ + return ARM_AM::getFP64Imm(N->getValueAPF()) != -1; + }], vfp_f64imm_xform> { let PrintMethod = "printFPImmOperand"; let ParserMatchClass = FPImmOperand; + let GISelPredicateCode = [{ + const auto &MO = MI.getOperand(1); + if (!MO.isFPImm()) + return false; + return ARM_AM::getFP64Imm(MO.getFPImm()->getValueAPF()) != -1; + }]; } def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ @@ -120,39 +141,45 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr), IIC_fpLoad64, "vldr", "\t$Dd, $addr", - [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>; + [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>, + Requires<[HasFPRegs]>; def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr), IIC_fpLoad32, "vldr", "\t$Sd, $addr", - [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]> { + [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]>, + Requires<[HasFPRegs]> { // Some single
precision VFP instructions may be executed on both NEON and VFP // pipelines. let D = VFPNeonDomain; } +let isUnpredicable = 1 in def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr), IIC_fpLoad16, "vldr", ".16\t$Sd, $addr", [(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>, - Requires<[HasFullFP16]>; + Requires<[HasFPRegs16]>; } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in' def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr), IIC_fpStore64, "vstr", "\t$Dd, $addr", - [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>; + [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>, + Requires<[HasFPRegs]>; def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr), IIC_fpStore32, "vstr", "\t$Sd, $addr", - [(alignedstore32 SPR:$Sd, addrmode5:$addr)]> { + [(alignedstore32 SPR:$Sd, addrmode5:$addr)]>, + Requires<[HasFPRegs]> { // Some single precision VFP instructions may be executed on both NEON and VFP // pipelines. let D = VFPNeonDomain; } +let isUnpredicable = 1 in def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr), IIC_fpStore16, "vstr", ".16\t$Sd, $addr", [(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>, - Requires<[HasFullFP16]>; + Requires<[HasFPRegs16]>; //===----------------------------------------------------------------------===// // Load / store multiple Instructions. @@ -160,6 +187,7 @@ def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr), multiclass vfp_ldst_mult<string asm, bit L_bit, InstrItinClass itin, InstrItinClass itin_upd> { + let Predicates = [HasFPRegs] in { // Double Precision def DIA : AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), @@ -227,6 +255,7 @@ multiclass vfp_ldst_mult<string asm, bit L_bit, let Inst{21} = 1; // Writeback let Inst{20} = L_bit; } + } } @@ -245,16 +274,16 @@ def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>, - Requires<[HasVFP2]>; + Requires<[HasFPRegs]>; def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r), 0>, - Requires<[HasVFP2]>; + Requires<[HasFPRegs]>; def : InstAlias<"vpop${p} $r", (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r), 0>, - Requires<[HasVFP2]>; + Requires<[HasFPRegs]>; def : InstAlias<"vpop${p} $r", (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r), 0>, - Requires<[HasVFP2]>; + Requires<[HasFPRegs]>; defm : VFPDTAnyInstAlias<"vpush${p}", "$r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>; defm : VFPDTAnyInstAlias<"vpush${p}", "$r", @@ -295,6 +324,7 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r", // However, there is no UAL syntax for them, so we keep them around for // (dis)assembly only.
multiclass vfp_ldstx_mult<string asm, bit L_bit> { + let Predicates = [HasFPRegs] in { // Unknown precision def XIA : AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), @@ -317,6 +347,7 @@ multiclass vfp_ldstx_mult<string asm, bit L_bit> { let Inst{21} = 1; // Writeback let Inst{20} = L_bit; } + } } defm FLDM : vfp_ldstx_mult<"fldm", 1>; @@ -452,7 +483,7 @@ def VNMULH : AHbI<0b11100, 0b10, 1, 0, multiclass vsel_inst<string op, bits<2> opc, int CC> { let DecoderNamespace = "VFPV8", PostEncoderMethod = "", - Uses = [CPSR], AddedComplexity = 4 in { + Uses = [CPSR], AddedComplexity = 4, isUnpredicable = 1 in { def H : AHbInp<0b11100, opc, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"), @@ -480,7 +511,8 @@ defm VSELEQ : vsel_inst<"eq", 0b00, 0>; defm VSELVS : vsel_inst<"vs", 0b01, 6>; multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { - let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in { + let DecoderNamespace = "VFPV8", PostEncoderMethod = "", + isUnpredicable = 1 in { def H : AHbInp<0b11101, 0b00, opc, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"), @@ -501,8 +533,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { } } -defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; -defm VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; +defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; +defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; // Match reassociated forms only if not sign dependent rounding. def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), @@ -571,9 +603,9 @@ def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0, } def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm", - []>; + [(set HPR:$Sd, (fabs (f16 HPR:$Sm)))]>; let Defs = [FPSCR_NZCV] in { def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, @@ -682,8 +714,8 @@ def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FullFP16Pat<(f32 (fpextend HPR:$Sm)), - (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>; +def : FP16Pat<(f32 (fpextend HPR:$Sm)), + (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>; def : FP16Pat<(f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; @@ -693,8 +725,8 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FullFP16Pat<(f16 (fpround SPR:$Sm)), - (COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>; +def : FP16Pat<(f16 (fpround SPR:$Sm)), + (COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>; def : FP16Pat<(fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; @@ -825,7 +857,7 @@ multiclass vcvt_inst<string opc, bits<2> rm, let Inst{17-16} = rm; - // Encode instruction operands + // Encode instruction operands.
let Inst{3-0} = Dm{3-0}; let Inst{5} = Dm{4}; let Inst{8} = 1; @@ -906,9 +938,9 @@ def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0, multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> { def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sm), NoItinerary, !strconcat("vrint", opc), ".f16\t$Sd, $Sm", - []>, + [(set (f16 HPR:$Sd), (node (f16 HPR:$Sm)))]>, Requires<[HasFullFP16]> { let Inst{7} = op2; let Inst{16} = op; @@ -948,11 +980,12 @@ defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>; multiclass vrint_inst_anpm<string opc, bits<2> rm, SDPatternOperator node = null_frag> { - let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in { + let PostEncoderMethod = "", DecoderNamespace = "VFPV8", + isUnpredicable = 1 in { def H : AHuInp<0b11101, 0b11, 0b1000, 0b01, 0, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sm), NoItinerary, !strconcat("vrint", opc, ".f16\t$Sd, $Sm"), - []>, + [(set (f16 HPR:$Sd), (node (f16 HPR:$Sm)))]>, Requires<[HasFullFP16]> { let Inst{17-16} = rm; } @@ -998,22 +1031,24 @@ def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, Sched<[WriteFPSQRT32]>; def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm", - []>; + [(set HPR:$Sd, (fsqrt (f16 HPR:$Sm)))]>; let hasSideEffects = 0 in { let isMoveReg = 1 in { def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs DPR:$Dd), (ins DPR:$Dm), - IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>; + IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>, + Requires<[HasFPRegs64]>; def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), - IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>; + IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>, + Requires<[HasFPRegs]>; } // isMoveReg -let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in { +let PostEncoderMethod = "", DecoderNamespace = "VFPV8", isUnpredicable = 1 in { def VMOVH : ASuInp<0b11101, 0b11, 0b0000, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpUNA16, "vmovx.f16\t$Sd, $Sm", []>, @@ -1035,6 +1070,7 @@ def VMOVRS : AVConv2I<0b11100001, 0b1010, (outs GPR:$Rt), (ins SPR:$Sn), IIC_fpMOVSI, "vmov", "\t$Rt, $Sn", [(set GPR:$Rt, (bitconvert SPR:$Sn))]>, + Requires<[HasFPRegs]>, Sched<[WriteFPMOV]> { // Instruction operands. bits<4> Rt; @@ -1058,7 +1094,7 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010, (outs SPR:$Sn), (ins GPR:$Rt), IIC_fpMOVIS, "vmov", "\t$Sn, $Rt", [(set SPR:$Sn, (bitconvert GPR:$Rt))]>, - Requires<[HasVFP2, UseVMOVSR]>, + Requires<[HasFPRegs, UseVMOVSR]>, Sched<[WriteFPMOV]> { // Instruction operands. bits<5> Sn; @@ -1084,6 +1120,7 @@ def VMOVRRD : AVConv3I<0b11000101, 0b1011, (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm), IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm", [(set GPR:$Rt, GPR:$Rt2, (arm_fmrrd DPR:$Dm))]>, + Requires<[HasFPRegs]>, Sched<[WriteFPMOV]> { // Instruction operands. bits<5> Dm; @@ -1112,6 +1149,7 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010, (outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2), IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2", [/* For disassembly only; pattern left blank */]>, + Requires<[HasFPRegs]>, Sched<[WriteFPMOV]> { bits<5> src1; bits<4> Rt; @@ -1139,6 +1177,7 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011, (outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2), IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2", [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]>, + Requires<[HasFPRegs]>, Sched<[WriteFPMOV]> { // Instruction operands.
bits<5> Dm; @@ -1183,6 +1222,7 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010, (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2), IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2", [/* For disassembly only; pattern left blank */]>, + Requires<[HasFPRegs]>, Sched<[WriteFPMOV]> { // Instruction operands. bits<5> dst1; @@ -1206,10 +1246,10 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010, // Move H->R, clearing top 16 bits def VMOVRH : AVConv2I<0b11100001, 0b1001, - (outs GPR:$Rt), (ins HPR:$Sn), + (outs rGPR:$Rt), (ins HPR:$Sn), IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn", - [(set GPR:$Rt, (arm_vmovrh HPR:$Sn))]>, - Requires<[HasFullFP16]>, + [(set rGPR:$Rt, (arm_vmovrh HPR:$Sn))]>, + Requires<[HasFPRegs16]>, Sched<[WriteFPMOV]> { // Instruction operands. bits<4> Rt; @@ -1222,14 +1262,16 @@ def VMOVRH : AVConv2I<0b11100001, 0b1001, let Inst{6-5} = 0b00; let Inst{3-0} = 0b0000; + + let isUnpredicable = 1; } // Move R->H, clearing top 16 bits def VMOVHR : AVConv4I<0b11100000, 0b1001, - (outs HPR:$Sn), (ins GPR:$Rt), + (outs HPR:$Sn), (ins rGPR:$Rt), IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt", - [(set HPR:$Sn, (arm_vmovhr GPR:$Rt))]>, - Requires<[HasFullFP16]>, + [(set HPR:$Sn, (arm_vmovhr rGPR:$Rt))]>, + Requires<[HasFPRegs16]>, Sched<[WriteFPMOV]> { // Instruction operands. bits<5> Sn; @@ -1242,6 +1284,8 @@ def VMOVHR : AVConv4I<0b11100000, 0b1001, let Inst{6-5} = 0b00; let Inst{3-0} = 0b0000; + + let isUnpredicable = 1; } // FMRDH: SPR -> GPR @@ -1348,6 +1392,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, []>, Sched<[WriteFPCVT]> { let Inst{7} = 1; // s32 + let isUnpredicable = 1; } def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), @@ -1393,6 +1438,7 @@ def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, []>, Sched<[WriteFPCVT]> { let Inst{7} = 0; // u32 + let isUnpredicable = 1; } def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)), @@ -1497,6 +1543,7 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, []>, Sched<[WriteFPCVT]> { let Inst{7} = 1; // Z bit + let isUnpredicable = 1; } def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)), @@ -1543,6 +1590,7 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, []>, Sched<[WriteFPCVT]> { let Inst{7} = 1; // Z bit + let isUnpredicable = 1; } def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)), @@ -1572,6 +1620,7 @@ def VTOSIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, []>, Sched<[WriteFPCVT]> { let Inst{7} = 0; // Z bit + let isUnpredicable = 1; } def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, @@ -1596,6 +1645,7 @@ def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, []>, Sched<[WriteFPCVT]> { let Inst{7} = 0; // Z bit + let isUnpredicable = 1; } } @@ -1643,6 +1693,8 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, let Predicates = [HasVFP2, HasDPVFP]; } +let isUnpredicable = 1 in { + def VTOSHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), IIC_fpCVTHI, "vcvt", ".s16.f16\t$dst, $a, $fbits", []>, @@ -1667,6 +1719,8 @@ def VTOULH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 1, Requires<[HasFullFP16]>, Sched<[WriteFPCVT]>; +} // End of 'let isUnpredicable = 1 in' + def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []>, @@ -1722,6 +1776,8 @@ def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1, // Fixed-Point to FP: +let isUnpredicable = 1 in { + def VSHTOH :
AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), IIC_fpCVTIH, "vcvt", ".f16.s16\t$dst, $a, $fbits", []>, @@ -1746,6 +1802,8 @@ def VULTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 1, Requires<[HasFullFP16]>, Sched<[WriteFPCVT]>; +} // End of 'let isUnpredicable = 1 in' + def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []>, @@ -2030,6 +2088,9 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; +def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, HPR:$Sdin)), + (VFMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, + Requires<[HasFullFP16]>; def VFMSD : ADbI<0b11101, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -2208,13 +2269,13 @@ def VMOVDcc : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p), IIC_fpUNA64, [(set (f64 DPR:$Dd), (ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>, - RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>; + RegConstraint<"$Dn = $Dd">, Requires<[HasFPRegs64]>; def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p), IIC_fpUNA32, [(set (f32 SPR:$Sd), (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>, - RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>; + RegConstraint<"$Sn = $Sd">, Requires<[HasFPRegs]>; } // hasSideEffects //===----------------------------------------------------------------------===// @@ -2238,15 +2299,16 @@ class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm, let Inst{3-0} = 0b0000; } -// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags -// to APSR. -let Defs = [CPSR], Uses = [FPSCR_NZCV], Rt = 0b1111 /* apsr_nzcv */ in -def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins), - "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>; - let DecoderMethod = "DecodeForVMRSandVMSR" in { + // APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags + // to APSR.
+ let Defs = [CPSR], Uses = [FPSCR_NZCV], Predicates = [HasFPRegs], + Rt = 0b1111 /* apsr_nzcv */ in + def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins), + "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>; + // Application level FPSCR -> GPR - let hasSideEffects = 1, Uses = [FPSCR] in + let hasSideEffects = 1, Uses = [FPSCR], Predicates = [HasFPRegs] in def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPRnopc:$Rt), (ins), "vmrs", "\t$Rt, fpscr", [(set GPRnopc:$Rt, (int_arm_get_fpscr))]>; @@ -2269,6 +2331,33 @@ let DecoderMethod = "DecodeForVMRSandVMSR" in { "vmrs", "\t$Rt, fpinst", []>; def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPRnopc:$Rt), (ins), "vmrs", "\t$Rt, fpinst2", []>; + let Predicates = [HasV8_1MMainline, HasFPRegs] in { + // System level FPSCR_NZCVQC -> GPR + def VMRS_FPSCR_NZCVQC + : MovFromVFP<0b0010 /* fpscr_nzcvqc */, + (outs GPR:$Rt), (ins cl_FPSCR_NZCV:$fpscr_in), + "vmrs", "\t$Rt, fpscr_nzcvqc", []>; + } + } + let Predicates = [HasV8_1MMainline, Has8MSecExt] in { + // System level FPSCR -> GPR, with context saving for security extensions + def VMRS_FPCXTNS : MovFromVFP<0b1110 /* fpcxtns */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, fpcxtns", []>; + } + let Predicates = [HasV8_1MMainline, Has8MSecExt] in { + // System level FPSCR -> GPR, with context saving for security extensions + def VMRS_FPCXTS : MovFromVFP<0b1111 /* fpcxts */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, fpcxts", []>; + } + + let Predicates = [HasV8_1MMainline, HasMVEInt] in { + // System level VPR/P0 -> GPR + let Uses = [VPR] in + def VMRS_VPR : MovFromVFP<0b1100 /* vpr */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, vpr", []>; + + def VMRS_P0 : MovFromVFP<0b1101 /* p0 */, (outs GPR:$Rt), (ins VCCR:$cond), + "vmrs", "\t$Rt, p0", []>; } } @@ -2291,10 +2380,12 @@ class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm, let Inst{11-8} = 0b1010; let Inst{7} = 0; let Inst{4} = 1; + let Predicates = [HasVFP2]; } let DecoderMethod = "DecodeForVMRSandVMSR" in { let Defs = [FPSCR] in { + let Predicates = [HasFPRegs] in // Application level GPR -> FPSCR def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$src), "vmsr", "\tfpscr, $src", @@ -2310,6 +2401,33 @@ let DecoderMethod = "DecodeForVMRSandVMSR" in { def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$src), "vmsr", "\tfpinst2, $src", []>; } + let Predicates = [HasV8_1MMainline, Has8MSecExt] in { + // System level GPR -> FPSCR with context saving for security extensions + def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$src), + "vmsr", "\tfpcxtns, $src", []>; + } + let Predicates = [HasV8_1MMainline, Has8MSecExt] in { + // System level GPR -> FPSCR with context saving for security extensions + def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$src), + "vmsr", "\tfpcxts, $src", []>; + } + let Predicates = [HasV8_1MMainline, HasFPRegs] in { + // System level GPR -> FPSCR_NZCVQC + def VMSR_FPSCR_NZCVQC + : MovToVFP<0b0010 /* fpscr_nzcvqc */, + (outs cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$src), + "vmsr", "\tfpscr_nzcvqc, $src", []>; + } + + let Predicates = [HasV8_1MMainline, HasMVEInt] in { + // System level GPR -> VPR/P0 + let Defs = [VPR] in + def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$src), + "vmsr", "\tvpr, $src", []>; + + def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$src), + "vmsr", "\tp0, $src", []>; + } } //===----------------------------------------------------------------------===// @@ -2371,6 +2489,8 @@ def FCONSTH :
VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm), let Inst{11-8} = 0b1001; // Half precision let Inst{7-4} = 0b0000; let Inst{3-0} = imm{3-0}; + + let isUnpredicable = 1; } } @@ -2426,7 +2546,7 @@ def : VFP2DPInstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>; def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>; -def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>; +def : InstAlias<"fmstat${p}", (FMSTAT pred:$p), 0>, Requires<[HasFPRegs]>; def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm", (VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>; def : VFP2DPInstAlias<"faddd${p} $Dd, $Dn, $Dm", @@ -2484,3 +2604,126 @@ def : VFP3InstAlias<"fconstd${p} $Dd, $val", (FCONSTD DPR:$Dd, vfp_f64imm:$val, pred:$p)>; def : VFP3InstAlias<"fconsts${p} $Sd, $val", (FCONSTS SPR:$Sd, vfp_f32imm:$val, pred:$p)>; + +def VSCCLRMD : VFPXI<(outs), (ins pred:$p, fp_dreglist_with_vpr:$regs, variable_ops), + AddrModeNone, 4, IndexModeNone, VFPMiscFrm, NoItinerary, + "vscclrm{$p}\t$regs", "", []>, Sched<[]> { + bits<13> regs; + let Inst{31-23} = 0b111011001; + let Inst{22} = regs{12}; + let Inst{21-16} = 0b011111; + let Inst{15-12} = regs{11-8}; + let Inst{11-8} = 0b1011; + let Inst{7-0} = regs{7-0}; + + let DecoderMethod = "DecodeVSCCLRM"; + + list<Predicate> Predicates = [HasV8_1MMainline, Has8MSecExt]; +} + +def VSCCLRMS : VFPXI<(outs), (ins pred:$p, fp_sreglist_with_vpr:$regs, variable_ops), + AddrModeNone, 4, IndexModeNone, VFPMiscFrm, NoItinerary, + "vscclrm{$p}\t$regs", "", []>, Sched<[]> { + bits<13> regs; + let Inst{31-23} = 0b111011001; + let Inst{22} = regs{8}; + let Inst{21-16} = 0b011111; + let Inst{15-12} = regs{12-9}; + let Inst{11-8} = 0b1010; + let Inst{7-0} = regs{7-0}; + + let DecoderMethod = "DecodeVSCCLRM"; + + list<Predicate> Predicates = [HasV8_1MMainline, Has8MSecExt]; +} + +//===----------------------------------------------------------------------===// +// Store VFP System Register to memory.
+// + +class vfp_vstrldr<bit opc, bit P, bit W, bits<4> SysReg, string sysreg, + dag oops, dag iops, IndexMode im, string Dest, string cstr> + : VFPI<oops, iops, AddrModeT2_i7s4, 4, im, VFPLdStFrm, NoItinerary, + !if(opc,"vldr","vstr"), !strconcat("\t", sysreg, ", ", Dest), cstr, []>, + Sched<[]> { + bits<12> addr; + let Inst{27-25} = 0b110; + let Inst{24} = P; + let Inst{23} = addr{7}; + let Inst{22} = SysReg{3}; + let Inst{21} = W; + let Inst{20} = opc; + let Inst{19-16} = addr{11-8}; + let Inst{15-13} = SysReg{2-0}; + let Inst{12-7} = 0b011111; + let Inst{6-0} = addr{6-0}; + list<Predicate> Predicates = [HasFPRegs, HasV8_1MMainline]; + let mayLoad = opc; + let mayStore = !if(opc, 0b0, 0b1); + let hasSideEffects = 1; +} + +multiclass vfp_vstrldr_sysreg<bit opc, bits<4> SysReg, string sysreg, + dag oops=(outs), dag iops=(ins)> { + def _off : + vfp_vstrldr<opc, 1, 0, SysReg, sysreg, oops, + !con(iops, (ins t2addrmode_imm7s4:$addr)), + IndexModeNone, "$addr", "" > { + let DecoderMethod = "DecodeVSTRVLDR_SYSREG"; + } + + def _pre : + vfp_vstrldr<opc, 1, 1, SysReg, sysreg, + !con(oops, (outs GPRnopc:$wb)), + !con(iops, (ins t2addrmode_imm7s4_pre:$addr)), + IndexModePre, "$addr!", "$addr.base = $wb"> { + let DecoderMethod = "DecodeVSTRVLDR_SYSREG"; + } + + def _post : + vfp_vstrldr<opc, 0, 1, SysReg, sysreg, + !con(oops, (outs GPRnopc:$wb)), + !con(iops, (ins GPRnopc:$Rn, t2am_imm7s4_offset:$addr)), + IndexModePost, "$Rn$addr", "$Rn = $wb"> { + bits<4> Rn; + let Inst{19-16} = Rn{3-0}; + let DecoderMethod = "DecodeVSTRVLDR_SYSREG"; + } +} + +let Defs = [FPSCR] in { + defm VSTR_FPSCR : vfp_vstrldr_sysreg<0b0,0b0001, "fpscr">; + defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc">; + + let Predicates = [HasV8_1MMainline, Has8MSecExt] in { + defm VSTR_FPCXTNS : vfp_vstrldr_sysreg<0b0,0b1110, "fpcxtns">; + defm VSTR_FPCXTS : vfp_vstrldr_sysreg<0b0,0b1111, "fpcxts">; + } +} + +let Predicates = [HasV8_1MMainline, HasMVEInt] in { + let Uses = [VPR] in { + defm VSTR_VPR : vfp_vstrldr_sysreg<0b0,0b1100, "vpr">; + } + defm VSTR_P0 : vfp_vstrldr_sysreg<0b0,0b1101, "p0", + (outs), (ins VCCR:$P0)>; +} + +let Uses = [FPSCR] in { + defm VLDR_FPSCR : vfp_vstrldr_sysreg<0b1,0b0001, "fpscr">; + defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc">; + + let Predicates = [HasV8_1MMainline, Has8MSecExt] in { + defm VLDR_FPCXTNS : vfp_vstrldr_sysreg<0b1,0b1110, "fpcxtns">; + defm VLDR_FPCXTS : vfp_vstrldr_sysreg<0b1,0b1111, "fpcxts">; + } +} + +let Predicates = [HasV8_1MMainline, HasMVEInt] in { + let Defs = [VPR] in { + defm VLDR_VPR : vfp_vstrldr_sysreg<0b1,0b1100, "vpr">; + } + defm VLDR_P0 : vfp_vstrldr_sysreg<0b1,0b1101, "p0", + (outs VCCR:$P0), (ins)>; +} diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 293e734c97cd..4485a474a6df 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -1,9 +1,8 @@ //===- ARMInstructionSelector.cpp ----------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -- -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -76,6 +75,11 @@ private: const ARMRegisterBankInfo &RBI; const ARMSubtarget &STI; + // FIXME: This is necessary because DAGISel uses "Subtarget->" and GlobalISel + // uses "STI." in the code generated by TableGen. If we want to reuse some of + // the custom C++ predicates written for DAGISel, we need to have both around. + const ARMSubtarget *Subtarget = &STI; + // Store the opcodes that we might need, so we don't have to check what kind // of subtarget (ARM vs Thumb) we have all the time.
struct OpcodeCache { @@ -98,6 +102,27 @@ private: unsigned STORE8; unsigned LOAD8; + unsigned ADDrr; + unsigned ADDri; + + // Used for G_ICMP + unsigned CMPrr; + unsigned MOVi; + unsigned MOVCCi; + + // Used for G_SELECT + unsigned MOVCCr; + + unsigned TSTri; + unsigned Bcc; + + // Used for G_GLOBAL_VALUE + unsigned MOVi32imm; + unsigned ConstPoolLoad; + unsigned MOV_ga_pcrel; + unsigned LDRLIT_ga_pcrel; + unsigned LDRLIT_ga_abs; + OpcodeCache(const ARMSubtarget &STI); } const Opcodes; @@ -112,6 +137,9 @@ private: unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, unsigned Size) const; + void renderVFPF32Imm(MachineInstrBuilder &New, const MachineInstr &Old) const; + void renderVFPF64Imm(MachineInstrBuilder &New, const MachineInstr &Old) const; + #define GET_GLOBALISEL_PREDICATES_DECL #include "ARMGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_DECL @@ -204,7 +232,7 @@ static bool selectMergeValues(MachineInstrBuilder &MIB, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - assert(TII.getSubtarget().hasVFP2() && "Can't select merge without VFP"); + assert(TII.getSubtarget().hasVFP2Base() && "Can't select merge without VFP"); // We only support G_MERGE_VALUES as a way to stick together two scalar GPRs // into one DPR. @@ -235,7 +263,8 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - assert(TII.getSubtarget().hasVFP2() && "Can't select unmerge without VFP"); + assert(TII.getSubtarget().hasVFP2Base() && + "Can't select unmerge without VFP"); // We only support G_UNMERGE_VALUES as a way to break up one DPR into two // GPRs. @@ -285,6 +314,24 @@ ARMInstructionSelector::OpcodeCache::OpcodeCache(const ARMSubtarget &STI) { STORE_OPCODE(STORE8, STRBi12); STORE_OPCODE(LOAD8, LDRBi12); + + STORE_OPCODE(ADDrr, ADDrr); + STORE_OPCODE(ADDri, ADDri); + + STORE_OPCODE(CMPrr, CMPrr); + STORE_OPCODE(MOVi, MOVi); + STORE_OPCODE(MOVCCi, MOVCCi); + + STORE_OPCODE(MOVCCr, MOVCCr); + + STORE_OPCODE(TSTri, TSTri); + STORE_OPCODE(Bcc, Bcc); + + STORE_OPCODE(MOVi32imm, MOVi32imm); + ConstPoolLoad = isThumb ? ARM::t2LDRpci : ARM::LDRi12; + STORE_OPCODE(MOV_ga_pcrel, MOV_ga_pcrel); + LDRLIT_ga_pcrel = isThumb ? ARM::tLDRLIT_ga_pcrel : ARM::LDRLIT_ga_pcrel; + LDRLIT_ga_abs = isThumb ? ARM::tLDRLIT_ga_abs : ARM::LDRLIT_ga_abs; #undef MAP_OPCODE } @@ -408,10 +455,11 @@ getComparePreds(CmpInst::Predicate Pred) { } struct ARMInstructionSelector::CmpConstants { - CmpConstants(unsigned CmpOpcode, unsigned FlagsOpcode, unsigned OpRegBank, - unsigned OpSize) + CmpConstants(unsigned CmpOpcode, unsigned FlagsOpcode, unsigned SelectOpcode, + unsigned OpRegBank, unsigned OpSize) : ComparisonOpcode(CmpOpcode), ReadFlagsOpcode(FlagsOpcode), - OperandRegBankID(OpRegBank), OperandSize(OpSize) {} + SelectResultOpcode(SelectOpcode), OperandRegBankID(OpRegBank), + OperandSize(OpSize) {} // The opcode used for performing the comparison. const unsigned ComparisonOpcode; @@ -420,6 +468,9 @@ struct ARMInstructionSelector::CmpConstants { // ARM::INSTRUCTION_LIST_END if we don't need to read the flags. const unsigned ReadFlagsOpcode; + // The opcode used for materializing the result of the comparison. + const unsigned SelectResultOpcode; + // The assumed register bank ID for the operands. 
const unsigned OperandRegBankID; @@ -439,7 +490,7 @@ struct ARMInstructionSelector::InsertInfo { void ARMInstructionSelector::putConstant(InsertInfo I, unsigned DestReg, unsigned Constant) const { - (void)BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(ARM::MOVi)) + (void)BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(Opcodes.MOVi)) .addDef(DestReg) .addImm(Constant) .add(predOps(ARMCC::AL)) @@ -542,7 +593,8 @@ bool ARMInstructionSelector::insertComparison(CmpConstants Helper, InsertInfo I, } // Select either 1 or the previous result based on the value of the flags. - auto Mov1I = BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(ARM::MOVCCi)) + auto Mov1I = BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, + TII.get(Helper.SelectResultOpcode)) .addDef(ResReg) .addUse(PrevRes) .addImm(1) @@ -569,7 +621,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, auto &MBB = *MIB->getParent(); auto &MF = *MBB.getParent(); - bool UseMovt = STI.useMovt(MF); + bool UseMovt = STI.useMovt(); unsigned Size = TM.getPointerSize(0); unsigned Alignment = 4; @@ -577,7 +629,9 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, auto addOpsForConstantPoolLoad = [&MF, Alignment, Size](MachineInstrBuilder &MIB, const GlobalValue *GV, bool IsSBREL) { - assert(MIB->getOpcode() == ARM::LDRi12 && "Unsupported instruction"); + assert((MIB->getOpcode() == ARM::LDRi12 || + MIB->getOpcode() == ARM::t2LDRpci) && + "Unsupported instruction"); auto ConstPool = MF.getConstantPool(); auto CPIndex = // For SB relative entries we need a target-specific constant pool. @@ -587,21 +641,38 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, ARMConstantPoolConstant::Create(GV, ARMCP::SBREL), Alignment) : ConstPool->getConstantPoolIndex(GV, Alignment); MIB.addConstantPoolIndex(CPIndex, /*Offset*/ 0, /*TargetFlags*/ 0) - .addMemOperand( - MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), - MachineMemOperand::MOLoad, Size, Alignment)) - .addImm(0) - .add(predOps(ARMCC::AL)); + .addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, + Size, Alignment)); + if (MIB->getOpcode() == ARM::LDRi12) + MIB.addImm(0); + MIB.add(predOps(ARMCC::AL)); + }; + + auto addGOTMemOperand = [this, &MF, Alignment](MachineInstrBuilder &MIB) { + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getGOT(MF), MachineMemOperand::MOLoad, + TM.getProgramPointerSize(), Alignment)); }; if (TM.isPositionIndependent()) { bool Indirect = STI.isGVIndirectSymbol(GV); + + // For ARM mode, we have different pseudoinstructions for direct accesses + // and indirect accesses, and the ones for indirect accesses include the + // load from GOT. For Thumb mode, we use the same pseudoinstruction for both + // direct and indirect accesses, and we need to manually generate the load + // from GOT. + bool UseOpcodeThatLoads = Indirect && !STI.isThumb(); + // FIXME: Taking advantage of MOVT for ELF is pretty involved, so we don't // support it yet. See PR28229. unsigned Opc = UseMovt && !STI.isTargetELF() - ? (Indirect ? ARM::MOV_ga_pcrel_ldr : ARM::MOV_ga_pcrel) - : (Indirect ? ARM::LDRLIT_ga_pcrel_ldr : ARM::LDRLIT_ga_pcrel); + ? (UseOpcodeThatLoads ? (unsigned)ARM::MOV_ga_pcrel_ldr + : Opcodes.MOV_ga_pcrel) + : (UseOpcodeThatLoads ? 
(unsigned)ARM::LDRLIT_ga_pcrel_ldr + : Opcodes.LDRLIT_ga_pcrel); MIB->setDesc(TII.get(Opc)); int TargetFlags = ARMII::MO_NO_FLAG; @@ -611,17 +682,35 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, TargetFlags |= ARMII::MO_GOT; MIB->getOperand(1).setTargetFlags(TargetFlags); - if (Indirect) - MIB.addMemOperand(MF.getMachineMemOperand( - MachinePointerInfo::getGOT(MF), MachineMemOperand::MOLoad, - TM.getProgramPointerSize(), Alignment)); + if (Indirect) { + if (!UseOpcodeThatLoads) { + auto ResultReg = MIB->getOperand(0).getReg(); + auto AddressReg = MRI.createVirtualRegister(&ARM::GPRRegClass); + + MIB->getOperand(0).setReg(AddressReg); + + auto InsertBefore = std::next(MIB->getIterator()); + auto MIBLoad = BuildMI(MBB, InsertBefore, MIB->getDebugLoc(), + TII.get(Opcodes.LOAD32)) + .addDef(ResultReg) + .addReg(AddressReg) + .addImm(0) + .add(predOps(ARMCC::AL)); + addGOTMemOperand(MIBLoad); + + if (!constrainSelectedInstRegOperands(*MIBLoad, TII, TRI, RBI)) + return false; + } else { + addGOTMemOperand(MIB); + } + } return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } bool isReadOnly = STI.getTargetLowering()->isReadOnly(GV); if (STI.isROPI() && isReadOnly) { - unsigned Opc = UseMovt ? ARM::MOV_ga_pcrel : ARM::LDRLIT_ga_pcrel; + unsigned Opc = UseMovt ? Opcodes.MOV_ga_pcrel : Opcodes.LDRLIT_ga_pcrel; MIB->setDesc(TII.get(Opc)); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } @@ -630,19 +719,19 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, MachineInstrBuilder OffsetMIB; if (UseMovt) { OffsetMIB = BuildMI(MBB, *MIB, MIB->getDebugLoc(), - TII.get(ARM::MOVi32imm), Offset); + TII.get(Opcodes.MOVi32imm), Offset); OffsetMIB.addGlobalAddress(GV, /*Offset*/ 0, ARMII::MO_SBREL); } else { // Load the offset from the constant pool. - OffsetMIB = - BuildMI(MBB, *MIB, MIB->getDebugLoc(), TII.get(ARM::LDRi12), Offset); + OffsetMIB = BuildMI(MBB, *MIB, MIB->getDebugLoc(), + TII.get(Opcodes.ConstPoolLoad), Offset); addOpsForConstantPoolLoad(OffsetMIB, GV, /*IsSBREL*/ true); } if (!constrainSelectedInstRegOperands(*OffsetMIB, TII, TRI, RBI)) return false; // Add the offset to the SB register. - MIB->setDesc(TII.get(ARM::ADDrr)); + MIB->setDesc(TII.get(Opcodes.ADDrr)); MIB->RemoveOperand(1); MIB.addReg(ARM::R9) // FIXME: don't hardcode R9 .addReg(Offset) @@ -654,18 +743,18 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, if (STI.isTargetELF()) { if (UseMovt) { - MIB->setDesc(TII.get(ARM::MOVi32imm)); + MIB->setDesc(TII.get(Opcodes.MOVi32imm)); } else { // Load the global's address from the constant pool. - MIB->setDesc(TII.get(ARM::LDRi12)); + MIB->setDesc(TII.get(Opcodes.ConstPoolLoad)); MIB->RemoveOperand(1); addOpsForConstantPoolLoad(MIB, GV, /*IsSBREL*/ false); } } else if (STI.isTargetMachO()) { if (UseMovt) - MIB->setDesc(TII.get(ARM::MOVi32imm)); + MIB->setDesc(TII.get(Opcodes.MOVi32imm)); else - MIB->setDesc(TII.get(ARM::LDRLIT_ga_abs)); + MIB->setDesc(TII.get(Opcodes.LDRLIT_ga_abs)); } else { LLVM_DEBUG(dbgs() << "Object format not supported yet\n"); return false; @@ -680,13 +769,13 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB, auto InsertBefore = std::next(MIB->getIterator()); auto &DbgLoc = MIB->getDebugLoc(); - // Compare the condition to 0. + // Compare the condition to 1. 
auto CondReg = MIB->getOperand(1).getReg(); assert(validReg(MRI, CondReg, 1, ARM::GPRRegBankID) && "Unsupported types for select operation"); - auto CmpI = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(ARM::CMPri)) + auto CmpI = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(Opcodes.TSTri)) .addUse(CondReg) - .addImm(0) + .addImm(1) .add(predOps(ARMCC::AL)); if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI)) return false; @@ -699,7 +788,7 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB, assert(validOpRegPair(MRI, ResReg, TrueReg, 32, ARM::GPRRegBankID) && validOpRegPair(MRI, TrueReg, FalseReg, 32, ARM::GPRRegBankID) && "Unsupported types for select operation"); - auto Mov1I = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(ARM::MOVCCr)) + auto Mov1I = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(Opcodes.MOVCCr)) .addDef(ResReg) .addUse(TrueReg) .addUse(FalseReg) @@ -713,12 +802,37 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB, bool ARMInstructionSelector::selectShift(unsigned ShiftOpc, MachineInstrBuilder &MIB) const { + assert(!STI.isThumb() && "Unsupported subtarget"); MIB->setDesc(TII.get(ARM::MOVsr)); MIB.addImm(ShiftOpc); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } +void ARMInstructionSelector::renderVFPF32Imm( + MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst) const { + assert(OldInst.getOpcode() == TargetOpcode::G_FCONSTANT && + "Expected G_FCONSTANT"); + + APFloat FPImmValue = OldInst.getOperand(1).getFPImm()->getValueAPF(); + int FPImmEncoding = ARM_AM::getFP32Imm(FPImmValue); + assert(FPImmEncoding != -1 && "Invalid immediate value"); + + NewInstBuilder.addImm(FPImmEncoding); +} + +void ARMInstructionSelector::renderVFPF64Imm( + MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst) const { + assert(OldInst.getOpcode() == TargetOpcode::G_FCONSTANT && + "Expected G_FCONSTANT"); + + APFloat FPImmValue = OldInst.getOperand(1).getFPImm()->getValueAPF(); + int FPImmEncoding = ARM_AM::getFP64Imm(FPImmValue); + assert(FPImmEncoding != -1 && "Invalid immediate value"); + + NewInstBuilder.addImm(FPImmEncoding); +} + bool ARMInstructionSelector::select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { assert(I.getParent() && "Instruction should be in a basic block!"); @@ -748,12 +862,8 @@ bool ARMInstructionSelector::select(MachineInstr &I, isSExt = true; LLVM_FALLTHROUGH; case G_ZEXT: { - LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - // FIXME: Smaller destination sizes coming soon! - if (DstTy.getSizeInBits() != 32) { - LLVM_DEBUG(dbgs() << "Unsupported destination size for extension"); - return false; - } + assert(MRI.getType(I.getOperand(0).getReg()).getSizeInBits() <= 32 && + "Unsupported destination size for extension"); LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); unsigned SrcSize = SrcTy.getSizeInBits(); @@ -869,10 +979,32 @@ bool ARMInstructionSelector::select(MachineInstr &I, } } + assert(!STI.isThumb() && "Unsupported subtarget"); I.setDesc(TII.get(ARM::MOVi)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; } + case G_FCONSTANT: { + // Load from constant pool + unsigned Size = MRI.getType(I.getOperand(0).getReg()).getSizeInBits() / 8; + unsigned Alignment = Size; + + assert((Size == 4 || Size == 8) && "Unsupported FP constant type"); + auto LoadOpcode = Size == 4 ? 
ARM::VLDRS : ARM::VLDRD; + + auto ConstPool = MF.getConstantPool(); + auto CPIndex = + ConstPool->getConstantPoolIndex(I.getOperand(1).getFPImm(), Alignment); + MIB->setDesc(TII.get(LoadOpcode)); + MIB->RemoveOperand(1); + MIB.addConstantPoolIndex(CPIndex, /*Offset*/ 0, /*TargetFlags*/ 0) + .addMemOperand( + MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), + MachineMemOperand::MOLoad, Size, Alignment)) + .addImm(0) + .add(predOps(ARMCC::AL)); + break; + } case G_INTTOPTR: case G_PTRTOINT: { auto SrcReg = I.getOperand(1).getReg(); @@ -900,17 +1032,17 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_SELECT: return selectSelect(MIB, MRI); case G_ICMP: { - CmpConstants Helper(ARM::CMPrr, ARM::INSTRUCTION_LIST_END, - ARM::GPRRegBankID, 32); + CmpConstants Helper(Opcodes.CMPrr, ARM::INSTRUCTION_LIST_END, + Opcodes.MOVCCi, ARM::GPRRegBankID, 32); return selectCmp(Helper, MIB, MRI); } case G_FCMP: { - assert(STI.hasVFP2() && "Can't select fcmp without VFP"); + assert(STI.hasVFP2Base() && "Can't select fcmp without VFP"); unsigned OpReg = I.getOperand(2).getReg(); unsigned Size = MRI.getType(OpReg).getSizeInBits(); - if (Size == 64 && STI.isFPOnlySP()) { + if (Size == 64 && !STI.hasFP64()) { LLVM_DEBUG(dbgs() << "Subtarget only supports single precision"); return false; } @@ -920,7 +1052,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, } CmpConstants Helper(Size == 32 ? ARM::VCMPS : ARM::VCMPD, ARM::FMSTAT, - ARM::FPRRegBankID, Size); + Opcodes.MOVCCi, ARM::FPRRegBankID, Size); return selectCmp(Helper, MIB, MRI); } case G_LSHR: @@ -931,13 +1063,13 @@ bool ARMInstructionSelector::select(MachineInstr &I, return selectShift(ARM_AM::ShiftOpc::lsl, MIB); } case G_GEP: - I.setDesc(TII.get(ARM::ADDrr)); + I.setDesc(TII.get(Opcodes.ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; case G_FRAME_INDEX: // Add 0 to the given frame index and hope it will eventually be folded into // the user(s). - I.setDesc(TII.get(ARM::ADDri)); + I.setDesc(TII.get(Opcodes.ADDri)); MIB.addImm(0).add(predOps(ARMCC::AL)).add(condCodeOp()); break; case G_GLOBAL_VALUE: @@ -956,13 +1088,31 @@ bool ARMInstructionSelector::select(MachineInstr &I, LLT ValTy = MRI.getType(Reg); const auto ValSize = ValTy.getSizeInBits(); - assert((ValSize != 64 || STI.hasVFP2()) && + assert((ValSize != 64 || STI.hasVFP2Base()) && "Don't know how to load/store 64-bit value without VFP"); const auto NewOpc = selectLoadStoreOpCode(I.getOpcode(), RegBank, ValSize); if (NewOpc == G_LOAD || NewOpc == G_STORE) return false; + if (ValSize == 1 && NewOpc == Opcodes.STORE8) { + // Before storing a 1-bit value, make sure to clear out any unneeded bits. + unsigned OriginalValue = I.getOperand(0).getReg(); + + unsigned ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass); + I.getOperand(0).setReg(ValueToStore); + + auto InsertBefore = I.getIterator(); + auto AndI = BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(Opcodes.AND)) + .addDef(ValueToStore) + .addUse(OriginalValue) + .addImm(1) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + if (!constrainSelectedInstRegOperands(*AndI, TII, TRI, RBI)) + return false; + } + I.setDesc(TII.get(NewOpc)); if (NewOpc == ARM::LDRH || NewOpc == ARM::STRH) @@ -988,17 +1138,19 @@ bool ARMInstructionSelector::select(MachineInstr &I, } // Set the flags. 
- auto Test = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::TSTri)) - .addReg(I.getOperand(0).getReg()) - .addImm(1) - .add(predOps(ARMCC::AL)); + auto Test = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcodes.TSTri)) + .addReg(I.getOperand(0).getReg()) + .addImm(1) + .add(predOps(ARMCC::AL)); if (!constrainSelectedInstRegOperands(*Test, TII, TRI, RBI)) return false; // Branch conditionally. - auto Branch = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::Bcc)) - .add(I.getOperand(1)) - .add(predOps(ARMCC::NE, ARM::CPSR)); + auto Branch = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcodes.Bcc)) + .add(I.getOperand(1)) + .add(predOps(ARMCC::NE, ARM::CPSR)); if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI)) return false; I.eraseFromParent(); diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 4a0c24d58474..73a57b297ad6 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -1,9 +1,8 @@ //===- ARMLegalizerInfo.cpp --------------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -83,41 +82,29 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { } getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) - .legalForCartesianProduct({s32}, {s1, s8, s16}); + .legalForCartesianProduct({s8, s16, s32}, {s1, s8, s16}); - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) + getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR}) .legalFor({s32}) .minScalar(0, s32); - getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}}); - getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}}); - - getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({s32, p0}) - .clampScalar(0, s32, s32); - - // We're keeping these builders around because we'll want to add support for - // floating point to them. - auto &LoadStoreBuilder = - getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .legalForTypesWithMemSize({ - {s1, p0, 8}, - {s8, p0, 8}, - {s16, p0, 16}, - {s32, p0, 32}, - {p0, p0, 32}}); - - if (ST.isThumb()) { - // FIXME: merge with the code for non-Thumb. 
- computeTables(); - verify(*ST.getInstrInfo()); - return; - } + if (ST.hasNEON()) + getActionDefinitionsBuilder({G_ADD, G_SUB}) + .legalFor({s32, s64}) + .minScalar(0, s32); + else + getActionDefinitionsBuilder({G_ADD, G_SUB}) + .legalFor({s32}) + .minScalar(0, s32); - getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); - getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}) + .legalFor({{s32, s32}}) + .minScalar(0, s32) + .clampScalar(1, s32, s32); - if (ST.hasDivideInARMMode()) + bool HasHWDivide = (!ST.isThumb() && ST.hasDivideInARMMode()) || + (ST.isThumb() && ST.hasDivideInThumbMode()); + if (HasHWDivide) getActionDefinitionsBuilder({G_SDIV, G_UDIV}) .legalFor({s32}) .clampScalar(0, s32, s32); @@ -128,7 +115,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { for (unsigned Op : {G_SREM, G_UREM}) { setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16); - if (ST.hasDivideInARMMode()) + if (HasHWDivide) setAction({Op, s32}, Lower); else if (AEABI(ST)) setAction({Op, s32}, Custom); @@ -136,46 +123,57 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, s32}, Libcall); } - getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}).legalFor({s32}); - - if (ST.hasV5TOps()) { - getActionDefinitionsBuilder(G_CTLZ) - .legalFor({s32}) - .clampScalar(0, s32, s32); - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) - .lowerFor({s32}) - .clampScalar(0, s32, s32); - } else { - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) - .libcallFor({s32}) - .clampScalar(0, s32, s32); - getActionDefinitionsBuilder(G_CTLZ) - .lowerFor({s32}) - .clampScalar(0, s32, s32); - } - - getActionDefinitionsBuilder(G_GEP).legalFor({{p0, s32}}); - - getActionDefinitionsBuilder(G_SELECT).legalForCartesianProduct({s32, p0}, - {s1}); + getActionDefinitionsBuilder(G_INTTOPTR) + .legalFor({{p0, s32}}) + .minScalar(1, s32); + getActionDefinitionsBuilder(G_PTRTOINT) + .legalFor({{s32, p0}}) + .minScalar(0, s32); - getActionDefinitionsBuilder(G_BRCOND).legalFor({s1}); + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({s32, p0}) + .clampScalar(0, s32, s32); getActionDefinitionsBuilder(G_ICMP) .legalForCartesianProduct({s1}, {s32, p0}) .minScalar(1, s32); + getActionDefinitionsBuilder(G_SELECT) + .legalForCartesianProduct({s32, p0}, {s1}) + .minScalar(0, s32); + // We're keeping these builders around because we'll want to add support for // floating point to them. 
+ auto &LoadStoreBuilder = getActionDefinitionsBuilder({G_LOAD, G_STORE}) + .legalForTypesWithMemDesc({{s1, p0, 8, 8}, + {s8, p0, 8, 8}, + {s16, p0, 16, 8}, + {s32, p0, 32, 8}, + {p0, p0, 32, 8}}) + .unsupportedIfMemSizeNotPow2(); + + getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); + auto &PhiBuilder = - getActionDefinitionsBuilder(G_PHI).legalFor({s32, p0}).minScalar(0, s32); + getActionDefinitionsBuilder(G_PHI) + .legalFor({s32, p0}) + .minScalar(0, s32); + + getActionDefinitionsBuilder(G_GEP) + .legalFor({{p0, s32}}) + .minScalar(1, s32); - if (!ST.useSoftFloat() && ST.hasVFP2()) { + getActionDefinitionsBuilder(G_BRCOND).legalFor({s1}); + + if (!ST.useSoftFloat() && ST.hasVFP2Base()) { getActionDefinitionsBuilder( {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FCONSTANT, G_FNEG}) .legalFor({s32, s64}); - LoadStoreBuilder.legalFor({{s64, p0}}); + LoadStoreBuilder + .legalForTypesWithMemDesc({{s64, p0, 64, 32}}) + .maxScalar(0, s32); PhiBuilder.legalFor({s64}); getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct({s1}, @@ -219,13 +217,33 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { .libcallForCartesianProduct({s32, s64}, {s32}); } - if (!ST.useSoftFloat() && ST.hasVFP4()) + if (!ST.useSoftFloat() && ST.hasVFP4Base()) getActionDefinitionsBuilder(G_FMA).legalFor({s32, s64}); else getActionDefinitionsBuilder(G_FMA).libcallFor({s32, s64}); getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64}); + if (ST.hasV5TOps()) { + getActionDefinitionsBuilder(G_CTLZ) + .legalFor({s32, s32}) + .clampScalar(1, s32, s32) + .clampScalar(0, s32, s32); + getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + .lowerFor({s32, s32}) + .clampScalar(1, s32, s32) + .clampScalar(0, s32, s32); + } else { + getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + .libcallFor({s32, s32}) + .clampScalar(1, s32, s32) + .clampScalar(0, s32, s32); + getActionDefinitionsBuilder(G_CTLZ) + .lowerFor({s32, s32}) + .clampScalar(1, s32, s32) + .clampScalar(0, s32, s32); + } + computeTables(); verify(*ST.getInstrInfo()); } @@ -351,7 +369,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, return false; case G_SREM: case G_UREM: { - unsigned OriginalResult = MI.getOperand(0).getReg(); + Register OriginalResult = MI.getOperand(0).getReg(); auto Size = MRI.getType(OriginalResult).getSizeInBits(); if (Size != 32) return false; @@ -360,24 +378,17 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, MI.getOpcode() == G_SREM ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; // Our divmod libcalls return a struct containing the quotient and the - // remainder. We need to create a virtual register for it. + // remainder. Create a new, unused register for the quotient and use the + // destination of the original instruction for the remainder. Type *ArgTy = Type::getInt32Ty(Ctx); StructType *RetTy = StructType::get(Ctx, {ArgTy, ArgTy}, /* Packed */ true); - auto RetVal = MRI.createGenericVirtualRegister( - getLLTForType(*RetTy, MIRBuilder.getMF().getDataLayout())); - - auto Status = createLibcall(MIRBuilder, Libcall, {RetVal, RetTy}, + Register RetRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)), + OriginalResult}; + auto Status = createLibcall(MIRBuilder, Libcall, {RetRegs, RetTy}, {{MI.getOperand(1).getReg(), ArgTy}, {MI.getOperand(2).getReg(), ArgTy}}); if (Status != LegalizerHelper::Legalized) return false; - - // The remainder is the second result of divmod. 
Split the return value into - a new, unused register for the quotient and the destination of the - original instruction for the remainder. - MIRBuilder.buildUnmerge( - {MRI.createGenericVirtualRegister(LLT::scalar(32)), OriginalResult}, - RetVal); break; } case G_FCMP: { @@ -405,7 +416,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, auto *ArgTy = OpSize == 32 ? Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx); auto *RetTy = Type::getInt32Ty(Ctx); - SmallVector<unsigned, 2> Results; + SmallVector<Register, 2> Results; for (auto Libcall : Libcalls) { auto LibcallResult = MRI.createGenericVirtualRegister(LLT::scalar(32)); auto Status = diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h index 527bf87f1093..e95f8cf76103 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.h +++ b/lib/Target/ARM/ARMLegalizerInfo.h @@ -1,9 +1,8 @@ //===- ARMLegalizerInfo ------------------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -- -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 6da7430a8e51..90a1ce238c3f 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1,9 +1,8 @@ //===- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass -------------===// // -// The LLVM Compiler Infrastructure -- -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -174,12 +173,14 @@ namespace { MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL, - ArrayRef<std::pair<unsigned, bool>> Regs); + ArrayRef<std::pair<unsigned, bool>> Regs, + ArrayRef<MachineInstr*> Instrs); MachineInstr *CreateLoadStoreDouble( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL, - ArrayRef<std::pair<unsigned, bool>> Regs) const; + ArrayRef<std::pair<unsigned, bool>> Regs, + ArrayRef<MachineInstr*> Instrs) const; void FormCandidates(const MemOpQueue &MemOps); MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand); bool FixInvalidRegPairOp(MachineBasicBlock &MBB, @@ -623,7 +624,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL, - ArrayRef<std::pair<unsigned, bool>> Regs) { + ArrayRef<std::pair<unsigned, bool>> Regs, + ArrayRef<MachineInstr*> Instrs) { unsigned NumRegs = Regs.size(); assert(NumRegs > 1); @@ -815,6 +817,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti( for (const std::pair<unsigned, bool> &R : Regs) MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second)); + MIB.cloneMergedMemRefs(Instrs); + return MIB.getInstr(); } @@ -822,7 +826,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL, - ArrayRef<std::pair<unsigned, bool>> Regs) const { + ArrayRef<std::pair<unsigned, bool>> Regs, + ArrayRef<MachineInstr*> Instrs) const { bool IsLoad = isi32Load(Opcode); assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store"); unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8; @@ -838,6 +843,7 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble( .addReg(Regs[1].first, getKillRegState(Regs[1].second)); } MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + MIB.cloneMergedMemRefs(Instrs); return MIB.getInstr(); } @@ -895,10 +901,11 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { MachineInstr *Merged = nullptr; if (Cand.CanMergeToLSDouble) Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill, - Opcode, Pred, PredReg, DL, Regs); + Opcode, Pred, PredReg, DL, Regs, + Cand.Instrs); if (!Merged && Cand.CanMergeToLSMulti) Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill, - Opcode, Pred, PredReg, DL, Regs); + Opcode, Pred, PredReg, DL, Regs, Cand.Instrs); if (!Merged) return nullptr; @@ -1287,7 +1294,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { // can still change to a writeback form as that will save us 2 bytes // of code size. It can create WAW hazards though, so only do it if // we're minimizing code size. - if (!MBB.getParent()->getFunction().optForMinSize() || !BaseKill) + if (!STI->hasMinSize() || !BaseKill) return false; bool HighRegsUsed = false; @@ -1436,14 +1443,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { .addReg(Base, getKillRegState(isLd ? BaseKill : false)) .addImm(Pred).addReg(PredReg) .addReg(MO.getReg(), (isLd ?
getDefRegState(true) : - getKillRegState(MO.isKill()))); + getKillRegState(MO.isKill()))) + .cloneMemRefs(*MI); } else if (isLd) { if (isAM2) { // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg) + .cloneMemRefs(*MI); } else { int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) @@ -1451,7 +1460,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { .addReg(Base) .addReg(0) .addImm(Imm) - .add(predOps(Pred, PredReg)); + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); } } else { // t2LDR_PRE, t2LDR_POST @@ -1459,7 +1469,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { .addReg(Base, RegState::Define) .addReg(Base) .addImm(Offset) - .add(predOps(Pred, PredReg)); + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); } } else { MachineOperand &MO = MI->getOperand(0); @@ -1474,14 +1485,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { .addReg(Base) .addReg(0) .addImm(Imm) - .add(predOps(Pred, PredReg)); + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); } else { // t2STR_PRE, t2STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) .addReg(Base) .addImm(Offset) - .add(predOps(Pred, PredReg)); + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); } } MBB.erase(MBBI); @@ -1541,7 +1554,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { // Transfer implicit operands. for (const MachineOperand &MO : MI.implicit_operands()) MIB.add(MO); - MIB.setMemRefs(MI.memoperands()); + MIB.cloneMemRefs(MI); MBB.erase(MBBI); return true; @@ -1581,7 +1594,9 @@ static bool isMemoryOp(const MachineInstr &MI) { const MachineMemOperand &MMO = **MI.memoperands_begin(); // Don't touch volatile memory accesses - we may be changing their order. - if (MMO.isVolatile()) + // TODO: We could allow unordered and monotonic atomics here, but we need to + // make sure the resulting ldm/stm is correctly marked as atomic. + if (MMO.isVolatile() || MMO.isAtomic()) return false; // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is @@ -1607,19 +1622,26 @@ static void InsertLDR_STR(MachineBasicBlock &MBB, bool isDef, unsigned NewOpc, unsigned Reg, bool RegDeadKill, bool RegUndef, unsigned BaseReg, bool BaseKill, bool BaseUndef, ARMCC::CondCodes Pred, - unsigned PredReg, const TargetInstrInfo *TII) { + unsigned PredReg, const TargetInstrInfo *TII, + MachineInstr *MI) { if (isDef) { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill)) .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + // FIXME: This is overly conservative; the new instruction accesses 4 + // bytes, not 8. + MIB.cloneMemRefs(*MI); } else { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef)) .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + // FIXME: This is overly conservative; the new instruction accesses 4 + // bytes, not 8. 
+ MIB.cloneMemRefs(*MI); } } @@ -1677,7 +1699,8 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, .addReg(BaseReg, getKillRegState(BaseKill)) .addImm(Pred).addReg(PredReg) .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill)) - .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)); + .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)) + .cloneMemRefs(*MI); ++NumLDRD2LDM; } else { BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) @@ -1686,7 +1709,8 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, .addReg(EvenReg, getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef)) .addReg(OddReg, - getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); + getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)) + .cloneMemRefs(*MI); ++NumSTRD2STM; } } else { @@ -1704,9 +1728,10 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, if (isLd && TRI->regsOverlap(EvenReg, BaseReg)) { assert(!TRI->regsOverlap(OddReg, BaseReg)); InsertLDR_STR(MBB, MBBI, OffImm + 4, isLd, NewOpc2, OddReg, OddDeadKill, - false, BaseReg, false, BaseUndef, Pred, PredReg, TII); + false, BaseReg, false, BaseUndef, Pred, PredReg, TII, MI); InsertLDR_STR(MBB, MBBI, OffImm, isLd, NewOpc, EvenReg, EvenDeadKill, - false, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII); + false, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII, + MI); } else { if (OddReg == EvenReg && EvenDeadKill) { // If the two source operands are the same, the kill marker is @@ -1719,9 +1744,11 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, if (EvenReg == BaseReg) EvenDeadKill = false; InsertLDR_STR(MBB, MBBI, OffImm, isLd, NewOpc, EvenReg, EvenDeadKill, - EvenUndef, BaseReg, false, BaseUndef, Pred, PredReg, TII); + EvenUndef, BaseReg, false, BaseUndef, Pred, PredReg, TII, + MI); InsertLDR_STR(MBB, MBBI, OffImm + 4, isLd, NewOpc2, OddReg, OddDeadKill, - OddUndef, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII); + OddUndef, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII, + MI); } if (isLd) ++NumLDRD2LDR; @@ -2048,6 +2075,11 @@ char ARMPreAllocLoadStoreOpt::ID = 0; INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt", ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) +// Limit the number of instructions to be rescheduled. +// FIXME: tune this limit, and/or come up with some better heuristics. +static cl::opt InstReorderLimit("arm-prera-ldst-opt-reorder-limit", + cl::init(8), cl::Hidden); + bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { if (AssumeMisalignedLoadStores || skipFunction(Fn.getFunction())) return false; @@ -2140,7 +2172,8 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, // At the moment, we ignore the memoryoperand's value. // If we want to use AliasAnalysis, we should check it accordingly. if (!Op0->hasOneMemOperand() || - (*Op0->memoperands_begin())->isVolatile()) + (*Op0->memoperands_begin())->isVolatile() || + (*Op0->memoperands_begin())->isAtomic()) return false; unsigned Align = (*Op0->memoperands_begin())->getAlignment(); @@ -2223,7 +2256,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, } // Don't try to reschedule too many instructions. - if (NumMove == 8) // FIXME: Tune this limit. + if (NumMove == InstReorderLimit) break; // Found a mergable instruction; save information about it. 
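(Aside: the InstReorderLimit option introduced above is registered cl::Hidden, so it does not show up in --help, but it can still be set when exercising the backend; a hypothetical invocation, with test.ll standing in for any suitable input, might look like llc -mtriple=thumbv7-- -arm-prera-ldst-opt-reorder-limit=4 test.ll -o -, or -mllvm -arm-prera-ldst-opt-reorder-limit=4 when going through the clang driver.)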
@@ -2351,10 +2384,13 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { bool RetVal = false; DenseMap<MachineInstr*, unsigned> MI2LocMap; - DenseMap<unsigned, SmallVector<MachineInstr *, 4>> Base2LdsMap; - DenseMap<unsigned, SmallVector<MachineInstr *, 4>> Base2StsMap; - SmallVector<unsigned, 4> LdBases; - SmallVector<unsigned, 4> StBases; + using MapIt = DenseMap<unsigned, SmallVector<MachineInstr *, 4>>::iterator; + using Base2InstMap = DenseMap<unsigned, SmallVector<MachineInstr *, 4>>; + using BaseVec = SmallVector<unsigned, 4>; + Base2InstMap Base2LdsMap; + Base2InstMap Base2StsMap; + BaseVec LdBases; + BaseVec StBases; unsigned Loc = 0; MachineBasicBlock::iterator MBBI = MBB->begin(); @@ -2381,41 +2417,28 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { bool isLd = isLoadSingle(Opc); unsigned Base = MI.getOperand(1).getReg(); int Offset = getMemoryOpOffset(MI); bool StopHere = false; - if (isLd) { - DenseMap<unsigned, SmallVector<MachineInstr *, 4>>::iterator BI = - Base2LdsMap.find(Base); - if (BI != Base2LdsMap.end()) { - for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { - if (Offset == getMemoryOpOffset(*BI->second[i])) { - StopHere = true; - break; - } - } - if (!StopHere) - BI->second.push_back(&MI); - } else { - Base2LdsMap[Base].push_back(&MI); - LdBases.push_back(Base); + auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) { + MapIt BI = Base2Ops.find(Base); + if (BI == Base2Ops.end()) { + Base2Ops[Base].push_back(&MI); + Bases.push_back(Base); + return; } - } else { - DenseMap<unsigned, SmallVector<MachineInstr *, 4>>::iterator BI = - Base2StsMap.find(Base); - if (BI != Base2StsMap.end()) { - for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { - if (Offset == getMemoryOpOffset(*BI->second[i])) { - StopHere = true; - break; - } + for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { + if (Offset == getMemoryOpOffset(*BI->second[i])) { + StopHere = true; + break; } - if (!StopHere) - BI->second.push_back(&MI); - } else { - Base2StsMap[Base].push_back(&MI); - StBases.push_back(Base); } - } + if (!StopHere) + BI->second.push_back(&MI); + }; + + if (isLd) + FindBases(Base2LdsMap, LdBases); + else + FindBases(Base2StsMap, StBases); if (StopHere) { // Found a duplicate (a base+offset combination that's seen earlier). diff --git a/lib/Target/ARM/ARMLowOverheadLoops.cpp b/lib/Target/ARM/ARMLowOverheadLoops.cpp new file mode 100644 index 000000000000..cedf3bd3c74e --- /dev/null +++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -0,0 +1,384 @@ +//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// Finalize v8.1-m low-overhead loops by converting the associated pseudo +/// instructions into machine operations. +/// The expectation is that the loop contains three pseudo instructions: +/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop +/// form should be in the preheader, whereas the while form should be in the +/// preheader's only predecessor. TODO: Could DoLoopStart get moved into the +/// pre-preheader? +/// - t2LoopDec - placed within the loop body. +/// - t2LoopEnd - the loop latch terminator. 
+/// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMBasicBlockInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-low-overhead-loops" +#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" + +namespace { + + class ARMLowOverheadLoops : public MachineFunctionPass { + const ARMBaseInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; + + public: + static char ID; + + ARMLowOverheadLoops() : MachineFunctionPass(ID) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + bool ProcessLoop(MachineLoop *ML); + + void RevertWhile(MachineInstr *MI) const; + + void RevertLoopDec(MachineInstr *MI) const; + + void RevertLoopEnd(MachineInstr *MI) const; + + void Expand(MachineLoop *ML, MachineInstr *Start, + MachineInstr *Dec, MachineInstr *End, bool Revert); + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return ARM_LOW_OVERHEAD_LOOPS_NAME; + } + }; +} + +char ARMLowOverheadLoops::ID = 0; + +INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, + false, false) + +bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) { + if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB()) + return false; + + LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n"); + + auto &MLI = getAnalysis<MachineLoopInfo>(); + MRI = &MF.getRegInfo(); + TII = static_cast<const ARMBaseInstrInfo*>( + MF.getSubtarget().getInstrInfo()); + BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF)); + BBUtils->computeAllBlockSizes(); + BBUtils->adjustBBOffsetsAfter(&MF.front()); + + bool Changed = false; + for (auto ML : MLI) { + if (!ML->getParentLoop()) + Changed |= ProcessLoop(ML); + } + return Changed; +} + +bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { + + bool Changed = false; + + // Process inner loops first. + for (auto I = ML->begin(), E = ML->end(); I != E; ++I) + Changed |= ProcessLoop(*I); + + LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); + + auto IsLoopStart = [](MachineInstr &MI) { + return MI.getOpcode() == ARM::t2DoLoopStart || + MI.getOpcode() == ARM::t2WhileLoopStart; + }; + + // Search the given block for a loop start instruction. If one isn't found, + // and there's only one predecessor block, search that one too. + std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = + [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { + for (auto &MI : *MBB) { + if (IsLoopStart(MI)) + return &MI; + } + if (MBB->pred_size() == 1) + return SearchForStart(*MBB->pred_begin()); + return nullptr; + }; + + MachineInstr *Start = nullptr; + MachineInstr *Dec = nullptr; + MachineInstr *End = nullptr; + bool Revert = false; + + // Search the preheader for the start intrinsic, or look through the + // predecessors of the header to find exactly one set.iterations intrinsic. + // FIXME: I don't see why we shouldn't be supporting multiple predecessors + // with potentially multiple set.loop.iterations, so we need to enable this. 
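+ // For orientation, a sketch of the rewrite this pass performs on the
+ // non-revert path (register choices and block names below are illustrative
+ // only, not taken from a real compilation):
+ //   preheader: $lr = t2DoLoopStart $r0     ; pseudo, trip count in r0
+ //   body:      $lr = t2LoopDec $lr, 1      ; pseudo
+ //   latch:     t2LoopEnd $lr, %body        ; pseudo
+ // becomes:
+ //   preheader: $lr = t2DLS $r0             ; emits "dls lr, r0"
+ //   latch:     $lr = t2LEUpdate $lr, %body ; emits "le lr, body"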
+ if (auto *Preheader = ML->getLoopPreheader()) { + Start = SearchForStart(Preheader); + } else { + LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" + << " - Performing manual predecessor search.\n"); + MachineBasicBlock *Pred = nullptr; + for (auto *MBB : ML->getHeader()->predecessors()) { + if (!ML->contains(MBB)) { + if (Pred) { + LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); + Start = nullptr; + break; + } + Pred = MBB; + Start = SearchForStart(MBB); + } + } + } + + // Find the low-overhead loop components and decide whether or not to fall + // back to a normal loop. + for (auto *MBB : reverse(ML->getBlocks())) { + for (auto &MI : *MBB) { + if (MI.getOpcode() == ARM::t2LoopDec) + Dec = &MI; + else if (MI.getOpcode() == ARM::t2LoopEnd) + End = &MI; + else if (MI.getDesc().isCall()) + // TODO: Though the call will require LE to execute again, does this + // mean we should revert? Always executing LE hopefully should be + // faster than performing a sub,cmp,br or even subs,br. + Revert = true; + + if (!Dec) + continue; + + // If we find that we load/store LR between LoopDec and LoopEnd, expect + // that the decremented value has been spilled to the stack. Because + // this value isn't actually going to be produced until the latch, by LE, + // we would need to generate a real sub. The value is also likely to be + // reloaded for use of LoopEnd - in which case we'd need to perform + // an add because it gets negated again by LE! The other option is to + // then generate the other form of LE which doesn't perform the sub. + if (MI.mayLoad() || MI.mayStore()) + Revert = + MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR; + } + + if (Dec && End && Revert) + break; + } + + if (!Start && !Dec && !End) { + LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); + return Changed; + } if (!(Start && Dec && End)) { + report_fatal_error("Failed to find all loop components"); + } + + if (!End->getOperand(1).isMBB() || + End->getOperand(1).getMBB() != ML->getHeader()) + report_fatal_error("Expected LoopEnd to target Loop Header"); + + // The WLS and LE instructions have 12 bits for the label offset. WLS + // requires a positive offset, while LE uses negative. + if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || + !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { + LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); + Revert = true; + } + if (Start->getOpcode() == ARM::t2WhileLoopStart && + (BBUtils->getOffsetOf(Start) > + BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || + !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { + LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); + Revert = true; + } + + LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start + << " - Found Loop Dec: " << *Dec + << " - Found Loop End: " << *End); + + Expand(ML, Start, Dec, End, Revert); + return true; +} + +// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a +// beq that branches to the exit block. +// FIXME: Need to check that we're not trashing the CPSR when generating the +// cmp. We could also try to generate a cbz if the value in LR is also in +// another low register. 
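+ // As a sketch, reverting "t2WhileLoopStart $lr, %exit" replaces the single
+ // "wls lr, rN, %exit" that would otherwise be emitted with the explicit
+ // guard sequence:
+ //   cmp lr, #0
+ //   beq %exit
+ // (the operand names here are illustrative; the real ones are taken from
+ // the pseudo being rewritten, as in the function below).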
+void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); + MachineBasicBlock *MBB = MI->getParent(); + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(ARM::t2CMPri)); + MIB.addReg(ARM::LR); + MIB.addImm(0); + MIB.addImm(ARMCC::AL); + MIB.addReg(ARM::CPSR); + + // TODO: Try to use tBcc instead + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); + MIB.add(MI->getOperand(1)); // branch target + MIB.addImm(ARMCC::EQ); // condition code + MIB.addReg(ARM::CPSR); + MI->eraseFromParent(); +} + +// TODO: Check flags so that we can possibly generate a tSubs or tSub. +void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); + MachineBasicBlock *MBB = MI->getParent(); + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(ARM::t2SUBri)); + MIB.addDef(ARM::LR); + MIB.add(MI->getOperand(1)); + MIB.add(MI->getOperand(2)); + MIB.addImm(ARMCC::AL); + MIB.addReg(0); + MIB.addReg(0); + MI->eraseFromParent(); +} + +// Generate a subs, or sub and cmp, and a branch instead of an LE. +// FIXME: Need to check that we're not trashing the CPSR when generating +// the cmp. +void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); + + // Create cmp + MachineBasicBlock *MBB = MI->getParent(); + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(ARM::t2CMPri)); + MIB.addReg(ARM::LR); + MIB.addImm(0); + MIB.addImm(ARMCC::AL); + MIB.addReg(ARM::CPSR); + + // TODO: Try to use tBcc instead. + // Create bne + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); + MIB.add(MI->getOperand(1)); // branch target + MIB.addImm(ARMCC::NE); // condition code + MIB.addReg(ARM::CPSR); + MI->eraseFromParent(); +} + +void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, + MachineInstr *Dec, MachineInstr *End, + bool Revert) { + + auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) { + // The trip count should already be held in LR since the instructions + // within the loop can only read and write to LR. So, there should be a + // mov to set up the count. WLS/DLS perform this move, so find the original + // and delete it - inserting WLS/DLS in its place. + MachineBasicBlock *MBB = Start->getParent(); + MachineInstr *InsertPt = Start; + for (auto &I : MRI->def_instructions(ARM::LR)) { + if (I.getParent() != MBB) + continue; + + // Always execute. + if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL) + continue; + + // Only handle a move reg; if the trip count were not already in a + // register, it would need moving into one before the setup instruction + // anyway. + if (!I.getDesc().isMoveReg() || + !I.getOperand(1).isIdenticalTo(Start->getOperand(0))) + continue; + InsertPt = &I; + break; + } + + unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? + ARM::t2DLS : ARM::t2WLS; + MachineInstrBuilder MIB = + BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); + + MIB.addDef(ARM::LR); + MIB.add(Start->getOperand(0)); + if (Opc == ARM::t2WLS) + MIB.add(Start->getOperand(1)); + + if (InsertPt != Start) + InsertPt->eraseFromParent(); + Start->eraseFromParent(); + LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); + return &*MIB; + }; + + // Combine the LoopDec and LoopEnd instructions into LE(TP). 
+ auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec, + MachineInstr *End) { + MachineBasicBlock *MBB = End->getParent(); + MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), + TII->get(ARM::t2LEUpdate)); + MIB.addDef(ARM::LR); + MIB.add(End->getOperand(0)); + MIB.add(End->getOperand(1)); + LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); + + End->eraseFromParent(); + Dec->eraseFromParent(); + return &*MIB; + }; + + // TODO: We should be able to automatically remove these branches before we + // get here - probably by teaching analyzeBranch about the pseudo + // instructions. + // If there is an unconditional branch, after I, that just branches to the + // next block, remove it. + auto RemoveDeadBranch = [](MachineInstr *I) { + MachineBasicBlock *BB = I->getParent(); + MachineInstr *Terminator = &BB->instr_back(); + if (Terminator->isUnconditionalBranch() && I != Terminator) { + MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB(); + if (BB->isLayoutSuccessor(Succ)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); + Terminator->eraseFromParent(); + } + } + }; + + if (Revert) { + if (Start->getOpcode() == ARM::t2WhileLoopStart) + RevertWhile(Start); + else + Start->eraseFromParent(); + RevertLoopDec(Dec); + RevertLoopEnd(End); + } else { + Start = ExpandLoopStart(ML, Start); + RemoveDeadBranch(Start); + End = ExpandLoopEnd(ML, Dec, End); + RemoveDeadBranch(End); + } +} + +FunctionPass *llvm::createARMLowOverheadLoopsPass() { + return new ARMLowOverheadLoops(); +} diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 48b02d40b246..90c5ad025e56 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -1,9 +1,8 @@ //===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp index e25d36b57616..3b676ca4c883 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- ARMMachineFunctionInfo.cpp - ARM machine function info ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index 91310e81e398..90d794cd27b1 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -1,9 +1,8 @@ //===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -62,6 +61,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// enable far jump. bool LRSpilledForFarJump = false; + /// LRSpilled - True if the LR register has been spilled for + /// any reason, so it's legal to emit an ARM::tBfar (i.e. "bl"). + bool LRSpilled = false; + /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer /// spill stack offset. unsigned FramePtrSpillOffset = 0; @@ -151,6 +154,9 @@ public: bool shouldRestoreSPFromFP() const { return RestoreSPFromFP; } void setShouldRestoreSPFromFP(bool s) { RestoreSPFromFP = s; } + bool isLRSpilled() const { return LRSpilled; } + void setLRIsSpilled(bool s) { LRSpilled = s; } + bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; } void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; } @@ -239,6 +245,8 @@ public: void setPromotedConstpoolIncrease(int Sz) { PromotedGlobalsIncrease = Sz; } + + DenseMap<unsigned, unsigned> EHPrologueRemappedRegs; }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp index df1da9d8e474..38bf28ba8219 100644 --- a/lib/Target/ARM/ARMMacroFusion.cpp +++ b/lib/Target/ARM/ARMMacroFusion.cpp @@ -1,9 +1,8 @@ //===- ARMMacroFusion.cpp - ARM Macro Fusion ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMMacroFusion.h b/lib/Target/ARM/ARMMacroFusion.h index b3abd7b593a1..4896a4a2544d 100644 --- a/lib/Target/ARM/ARMMacroFusion.h +++ b/lib/Target/ARM/ARMMacroFusion.h @@ -1,9 +1,8 @@ //===- ARMMacroFusion.h - ARM Macro Fusion ------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp index cff4a256100d..348895da713f 100644 --- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp +++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp @@ -1,10 +1,9 @@ //===-- ARMOptimizeBarriersPass - two DMBs without a memory access in between, //removed one -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===------------------------------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp index fc3258914f92..5389d09bf7d7 100644 --- a/lib/Target/ARM/ARMParallelDSP.cpp +++ b/lib/Target/ARM/ARMParallelDSP.cpp @@ -1,9 +1,8 @@ //===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -49,12 +48,12 @@ DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), namespace { struct OpChain; struct BinOpChain; - struct Reduction; + class Reduction; using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>; using ReductionList = SmallVector<Reduction, 8>; using ValueList = SmallVector<Value*, 8>; - using MemInstList = SmallVector<Instruction*, 8>; + using MemInstList = SmallVector<LoadInst*, 8>; using PMACPair = std::pair<BinOpChain*, BinOpChain*>; using PMACPairList = SmallVector<PMACPair, 8>; using Instructions = SmallVector<Instruction*, 16>; @@ -64,31 +63,24 @@ namespace { Instruction *Root; ValueList AllValues; MemInstList VecLd; // List of all load instructions. - MemLocList MemLocs; // All memory locations read by this tree. + MemInstList Loads; bool ReadOnly = true; OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { } virtual ~OpChain() = default; - void SetMemoryLocations() { - const auto Size = LocationSize::unknown(); + void PopulateLoads() { for (auto *V : AllValues) { - if (auto *I = dyn_cast<Instruction>(V)) { - if (I->mayWriteToMemory()) - ReadOnly = false; - if (auto *Ld = dyn_cast<LoadInst>(V)) - MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size)); - } + if (auto *Ld = dyn_cast<LoadInst>(V)) + Loads.push_back(Ld); } } unsigned size() const { return AllValues.size(); } }; - // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures. - // 'Reduction' contains the phi-node and accumulator statement from where we - // start pattern matching, and 'BinOpChain' the multiplication - // instructions that are candidates for parallel execution. + // 'BinOpChain' holds the multiplication instructions that are candidates + // for parallel execution. struct BinOpChain : public OpChain { ValueList LHS; // List of all (narrow) left hand operands. ValueList RHS; // List of all (narrow) right hand operands. @@ -103,15 +95,85 @@ namespace { bool AreSymmetrical(BinOpChain *Other); }; - struct Reduction { - PHINode *Phi; // The Phi-node from where we start - // pattern matching. - Instruction *AccIntAdd; // The accumulating integer add statement, - // i.e, the reduction statement. - OpChainList MACCandidates; // The MAC candidates associated with - // this reduction statement. - PMACPairList PMACPairs; - Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { }; + /// Represent a sequence of multiply-accumulate operations with the aim to + /// perform the multiplications in parallel. 
+ class Reduction { + Instruction *Root = nullptr; + Value *Acc = nullptr; + OpChainList Muls; + PMACPairList MulPairs; + SmallPtrSet<Instruction*, 4> Adds; + + public: + Reduction() = delete; + + Reduction (Instruction *Add) : Root(Add) { } + + /// Record an Add instruction that is a part of this reduction. + void InsertAdd(Instruction *I) { Adds.insert(I); } + + /// Record a BinOpChain, rooted at a Mul instruction, that is a part of + /// this reduction. + void InsertMul(Instruction *I, ValueList &LHS, ValueList &RHS) { + Muls.push_back(make_unique<BinOpChain>(I, LHS, RHS)); + } + + /// Add the incoming accumulator value; returns true if a value had not + /// already been added. Returning false signals to the user that this + /// reduction already has a value to initialise the accumulator. + bool InsertAcc(Value *V) { + if (Acc) + return false; + Acc = V; + return true; + } + + /// Set two BinOpChains, rooted at muls, that can be executed as a single + /// parallel operation. + void AddMulPair(BinOpChain *Mul0, BinOpChain *Mul1) { + MulPairs.push_back(std::make_pair(Mul0, Mul1)); + } + + /// Return true if enough mul operations are found that can be executed in + /// parallel. + bool CreateParallelPairs(); + + /// Return the add instruction which is the root of the reduction. + Instruction *getRoot() { return Root; } + + /// Return the incoming value to be accumulated. This may be null. + Value *getAccumulator() { return Acc; } + + /// Return the set of adds that comprise the reduction. + SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; } + + /// Return the BinOpChains, rooted at mul instructions, that comprise + /// the reduction. + OpChainList &getMuls() { return Muls; } + + /// Return the BinOpChains, rooted at mul instructions, that have been + /// paired for parallel execution. + PMACPairList &getMulPairs() { return MulPairs; } + + /// To finalise, replace the uses of the root with the intrinsic call. + void UpdateRoot(Instruction *SMLAD) { + Root->replaceAllUsesWith(SMLAD); + } + }; + + class WidenedLoad { + LoadInst *NewLd = nullptr; + SmallVector<LoadInst*, 4> Loads; + + public: + WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide) + : NewLd(Wide) { + for (auto *I : Lds) + Loads.push_back(I); + } + LoadInst *getLoad() { + return NewLd; + } }; class ARMParallelDSP : public LoopPass { @@ -124,28 +186,37 @@ namespace { const DataLayout *DL; Module *M; std::map<LoadInst*, LoadInst*> LoadPairs; - std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads; + SmallPtrSet<LoadInst*, 4> OffsetLoads; + std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads; + + template<unsigned MaxBitWidth> + bool IsNarrowSequence(Value *V, ValueList &VL); - bool RecordSequentialLoads(BasicBlock *Header); - bool InsertParallelMACs(Reduction &Reduction); + bool RecordMemoryOps(BasicBlock *BB); + void InsertParallelMACs(Reduction &Reduction); bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem); - void CreateParallelMACPairs(Reduction &R); - Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, - Instruction *Acc, bool Exchange, - Instruction *InsertAfter); + LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, + IntegerType *LoadTy); + bool CreateParallelPairs(Reduction &R); /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate /// Dual performs two signed 16x16-bit multiplications. It adds the /// products to a 32-bit accumulate operand. Optionally, the instruction can /// exchange the halfwords of the second operand before performing the /// arithmetic. 
- bool MatchSMLAD(Function &F); + bool MatchSMLAD(Loop *L); public: static char ID; ARMParallelDSP() : LoopPass(ID) { } + bool doInitialization(Loop *L, LPPassManager &LPM) override { + LoadPairs.clear(); + WideLoads.clear(); + return true; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { LoopPass::getAnalysisUsage(AU); AU.addRequired(); @@ -183,6 +254,9 @@ namespace { return false; } + if (!TheLoop->getLoopPreheader()) + InsertPreheaderForLoop(L, DT, LI, nullptr, true); + Function &F = *Header->getParent(); M = F.getParent(); DL = &M->getDataLayout(); @@ -202,31 +276,62 @@ namespace { return false; } + if (!ST->isLittle()) { + LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass " + << "ARMParallelDSP\n"); + return false; + } + LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); - bool Changes = false; LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); - if (!RecordSequentialLoads(Header)) { + if (!RecordMemoryOps(Header)) { LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); return false; } - Changes = MatchSMLAD(F); + bool Changes = MatchSMLAD(L); return Changes; } }; } +template<typename MemInst> +static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1, + const DataLayout &DL, ScalarEvolution &SE) { + if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) + return true; + return false; +} + +bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, + MemInstList &VecMem) { + if (!Ld0 || !Ld1) + return false; + + if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1) + return false; + + LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n"; + dbgs() << "Ld0:"; Ld0->dump(); + dbgs() << "Ld1:"; Ld1->dump(); + ); + + VecMem.clear(); + VecMem.push_back(Ld0); + VecMem.push_back(Ld1); + return true; +} + // MaxBitwidth: the maximum supported bitwidth of the elements in the DSP // instructions, which is set to 16. So here we should collect all i8 and i16 // narrow operations. // TODO: we currently only collect i16, and will support i8 later, so that's // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth. template<unsigned MaxBitWidth> -static bool IsNarrowSequence(Value *V, ValueList &VL) { - LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump()); +bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) { ConstantInt *CInt; if (match(V, m_ConstantInt(CInt))) { @@ -236,7 +341,7 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) { auto *I = dyn_cast<Instruction>(V); if (!I) - return false; + return false; Value *Val, *LHS, *RHS; if (match(V, m_Trunc(m_Value(Val)))) { @@ -245,108 +350,253 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) { } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) { // TODO: we need to implement sadd16/sadd8 for this, which enables to // also do the rewrite for smlad8.ll, but it is unsupported for now. - LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump()); return false; } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) { - if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) { - LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " << - cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n"); + if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) return false; - } if (match(Val, m_Load(m_Value()))) { - LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump()); + auto *Ld = cast<LoadInst>(Val); + + // Check that these loads could be paired. 
+ if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld)) + return false; + VL.push_back(Val); VL.push_back(I); return true; } } - LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump()); return false; } -template<typename MemInst> -static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1, - const DataLayout &DL, ScalarEvolution &SE) { - if (!MemOp0->isSimple() || !MemOp1->isSimple()) { - LLVM_DEBUG(dbgs() << "No, not touching volatile access\n"); - return false; - } - if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) { - LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n"); - return true; + } - LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n"); - return false; -} -bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, - MemInstList &VecMem) { - if (!Ld0 || !Ld1) - return false; + using InstSet = std::set<Instruction*>; + using DepMap = std::map<Instruction*, InstSet>; + DepMap RAWDeps; - LLVM_DEBUG(dbgs() << "Are consecutive loads:\n"; - dbgs() << "Ld0:"; Ld0->dump(); - dbgs() << "Ld1:"; Ld1->dump(); - ); + // Record any writes that may alias a load. + const auto Size = LocationSize::unknown(); + for (auto Read : Loads) { + for (auto Write : Writes) { + MemoryLocation ReadLoc = + MemoryLocation(Read->getPointerOperand(), Size); - if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) { - LLVM_DEBUG(dbgs() << "No, load has more than one use.\n"); - return false; + if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc), + ModRefInfo::ModRef))) + continue; + if (DT->dominates(Write, Read)) + RAWDeps[Read].insert(Write); + } } - if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1) - return false; + // Check that there is not a write between the two loads which would + // prevent them from being safely merged. + auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) { + LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset; + LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base; - VecMem.clear(); - VecMem.push_back(Ld0); - VecMem.push_back(Ld1); - return true; -} + if (RAWDeps.count(Dominated)) { + InstSet &WritesBefore = RAWDeps[Dominated]; -/// Iterate through the block and record base, offset pairs of loads as well as -/// maximal sequences of sequential loads. -bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) { - SmallVector<LoadInst*, 8> Loads; - for (auto &I : *Header) { - auto *Ld = dyn_cast<LoadInst>(&I); - if (!Ld) - continue; - Loads.push_back(Ld); - } + for (auto Before : WritesBefore) { - std::map<LoadInst*, LoadInst*> BaseLoads; + // We can't move the second load backward, past a write, to merge + // with the first load. + if (DT->dominates(Dominator, Before)) + return false; + } + } + return true; + }; - for (auto *Ld0 : Loads) { - for (auto *Ld1 : Loads) { - if (Ld0 == Ld1) + /// Iterate through the block and record base, offset pairs of loads which can +/// be widened into a single load. +bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { + SmallVector<LoadInst*, 8> Loads; + SmallVector<Instruction*, 8> Writes; + + // Collect loads and instructions that may write to memory. For now we only + // record loads which are simple, sign-extended and have a single user. + // TODO: Allow zero-extended loads. + for (auto &I : *BB) { + if (I.mayWriteToMemory()) + Writes.push_back(&I); + auto *Ld = dyn_cast<LoadInst>(&I); + if (!Ld || !Ld->isSimple() || + !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back())) + continue; + Loads.push_back(Ld); } + + // Record base, offset load pairs. 
+ for (auto *Base : Loads) { + for (auto *Offset : Loads) { + if (Base == Offset) continue; - if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) { - LoadPairs[Ld0] = Ld1; - if (BaseLoads.count(Ld0)) { - LoadInst *Base = BaseLoads[Ld0]; - BaseLoads[Ld1] = Base; - SequentialLoads[Base].push_back(Ld1); - } else { - BaseLoads[Ld1] = Ld0; - SequentialLoads[Ld0].push_back(Ld1); - } + if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) && + SafeToPair(Base, Offset)) { + LoadPairs[Base] = Offset; + OffsetLoads.insert(Offset); + break; } } } + + LLVM_DEBUG(if (!LoadPairs.empty()) { + dbgs() << "Consecutive load pairs:\n"; + for (auto &MapIt : LoadPairs) { + LLVM_DEBUG(dbgs() << *MapIt.first << ", " + << *MapIt.second << "\n"); + } + }); return LoadPairs.size() > 1; } -void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) { - OpChainList &Candidates = R.MACCandidates; - PMACPairList &PMACPairs = R.PMACPairs; - const unsigned Elems = Candidates.size(); +// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector +// multiplications. +// To use SMLAD: +// 1) we first need to find integer add then look for this pattern: +// +// acc0 = ... +// ld0 = load i16 +// sext0 = sext i16 %ld0 to i32 +// ld1 = load i16 +// sext1 = sext i16 %ld1 to i32 +// mul0 = mul %sext0, %sext1 +// ld2 = load i16 +// sext2 = sext i16 %ld2 to i32 +// ld3 = load i16 +// sext3 = sext i16 %ld3 to i32 +// mul1 = mul i32 %sext2, %sext3 +// add0 = add i32 %mul0, %acc0 +// acc1 = add i32 %add0, %mul1 +// +// Which can be selected to: +// +// ldr r0 +// ldr r1 +// smlad r2, r0, r1, r2 +// +// If constants are used instead of loads, these will need to be hoisted +// out and into a register. +// +// If loop invariants are used instead of loads, these need to be packed +// before the loop begins. +// +bool ARMParallelDSP::MatchSMLAD(Loop *L) { + // Search recursively back through the operands to find a tree of values that + // form a multiply-accumulate chain. The search records the Add and Mul + // instructions that form the reduction and allows us to find a single value + // to be used as the initial input to the accumulator. + std::function<bool(Value*, Reduction&)> Search = [&] + (Value *V, Reduction &R) -> bool { + + // If we find a non-instruction, try to use it as the initial accumulator + // value. This may have already been found during the search in which case + // this function will return false, signaling a search fail. + auto *I = dyn_cast<Instruction>(V); + if (!I) + return R.InsertAcc(V); + + switch (I->getOpcode()) { + default: + break; + case Instruction::PHI: + // Could be the accumulator value. + return R.InsertAcc(V); + case Instruction::Add: { + // Adds should be adding together two muls, or another add and a mul to + // be within the mac chain. One of the operands may also be the + // accumulator value at which point we should stop searching. 
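+ // (Aside: at the source level, the IR pattern sketched in the comment
+ // before MatchSMLAD typically comes from a halfword dot product; the names
+ // in this illustrative-only fragment are made up:
+ //   int32_t acc = 0;
+ //   for (int i = 0; i < n; i += 2)
+ //     acc += (int32_t)a[i] * (int32_t)b[i] +
+ //            (int32_t)a[i + 1] * (int32_t)b[i + 1];  // a, b: const int16_t*
+ // so one smlad per iteration performs both multiplies and the accumulate.)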
+ bool ValidLHS = Search(I->getOperand(0), R); + bool ValidRHS = Search(I->getOperand(1), R); + if (!ValidLHS && !ValidRHS) + return false; + else if (ValidLHS && ValidRHS) { + R.InsertAdd(I); + return true; + } else { + R.InsertAdd(I); + return R.InsertAcc(I); + } + } + case Instruction::Mul: { + Value *MulOp0 = I->getOperand(0); + Value *MulOp1 = I->getOperand(1); + if (isa<Instruction>(MulOp0) && isa<Instruction>(MulOp1)) { + ValueList LHS; + ValueList RHS; + if (IsNarrowSequence<16>(MulOp0, LHS) && + IsNarrowSequence<16>(MulOp1, RHS)) { + R.InsertMul(I, LHS, RHS); + return true; + } + } + return false; + } + case Instruction::SExt: + return Search(I->getOperand(0), R); + } + return false; + }; + + bool Changed = false; + SmallPtrSet<Instruction*, 4> AllAdds; + BasicBlock *Latch = L->getLoopLatch(); + + for (Instruction &I : reverse(*Latch)) { + if (I.getOpcode() != Instruction::Add) + continue; + + if (AllAdds.count(&I)) + continue; + + const auto *Ty = I.getType(); + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) + continue; + + Reduction R(&I); + if (!Search(&I, R)) + continue; + + if (!CreateParallelPairs(R)) + continue; + + InsertParallelMACs(R); + Changed = true; + AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + } + + return Changed; +} + +bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { + + // Not enough mul operations to make a pair. + if (R.getMuls().size() < 2) + return false; - if (Elems < 2) - return; + // Check that the muls operate directly upon sign extended loads. + for (auto &MulChain : R.getMuls()) { + // A mul has 2 operands, and a narrow op consists of a sext and a load; + // thus we expect at least 4 items in this operand value list. + if (MulChain->size() < 4) { + LLVM_DEBUG(dbgs() << "Operand list too short.\n"); + return false; + } + MulChain->PopulateLoads(); + ValueList &LHS = static_cast<BinOpChain*>(MulChain.get())->LHS; + ValueList &RHS = static_cast<BinOpChain*>(MulChain.get())->RHS; + + // Use +=2 to skip over the expected extend instructions. 
+ for (unsigned i = 0, e = LHS.size(); i < e; i += 2) { + if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i])) + return false; + } + } - auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) { + auto CanPair = [&](Reduction &R, BinOpChain *PMul0, BinOpChain *PMul1) { if (!PMul0->AreSymmetrical(PMul1)) return false; @@ -363,23 +613,22 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) { if (!Ld0 || !Ld1 || !Ld2 || !Ld3) return false; - LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n" - << "\t Ld0: " << *Ld0 << "\n" - << "\t Ld1: " << *Ld1 << "\n" - << "and operands " << x + 2 << ":\n" - << "\t Ld2: " << *Ld2 << "\n" - << "\t Ld3: " << *Ld3 << "\n"); + LLVM_DEBUG(dbgs() << "Loads:\n" + << " - " << *Ld0 << "\n" + << " - " << *Ld1 << "\n" + << " - " << *Ld2 << "\n" + << " - " << *Ld3 << "\n"); if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - PMACPairs.push_back(std::make_pair(PMul0, PMul1)); + R.AddMulPair(PMul0, PMul1); return true; } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); PMul1->Exchange = true; - PMACPairs.push_back(std::make_pair(PMul0, PMul1)); + R.AddMulPair(PMul0, PMul1); return true; } } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && @@ -389,16 +638,18 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) { LLVM_DEBUG(dbgs() << " and swapping muls\n"); PMul0->Exchange = true; // Only the second operand can be exchanged, so swap the muls. - PMACPairs.push_back(std::make_pair(PMul1, PMul0)); + R.AddMulPair(PMul1, PMul0); return true; } } return false; }; + OpChainList &Muls = R.getMuls(); + const unsigned Elems = Muls.size(); SmallPtrSet<const Instruction*, 4> Paired; for (unsigned i = 0; i < Elems; ++i) { - BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get()); + BinOpChain *PMul0 = static_cast<BinOpChain*>(Muls[i].get()); if (Paired.count(PMul0->Root)) continue; @@ -406,7 +657,7 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) { if (i == j) continue; - BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[j].get()); + BinOpChain *PMul1 = static_cast<BinOpChain*>(Muls[j].get()); if (Paired.count(PMul1->Root)) continue; @@ -417,315 +668,133 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) { assert(PMul0 != PMul1 && "expected different chains"); - LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n"; - dbgs() << "- "; Mul0->dump(); - dbgs() << "- "; Mul1->dump()); - - LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n"); - if (CanPair(PMul0, PMul1)) { + if (CanPair(R, PMul0, PMul1)) { Paired.insert(Mul0); Paired.insert(Mul1); break; } } } + return !R.getMulPairs().empty(); } -bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) { - Instruction *Acc = Reduction.Phi; - Instruction *InsertAfter = Reduction.AccIntAdd; - - for (auto &Pair : Reduction.PMACPairs) { - BinOpChain *PMul0 = Pair.first; - BinOpChain *PMul1 = Pair.second; - LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n"; - dbgs() << "- "; PMul0->Root->dump(); - dbgs() << "- "; PMul1->Root->dump()); - - auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]); - auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]); - Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter); - InsertAfter = Acc; - } - - if (Acc != Reduction.Phi) { - LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump()); - Reduction.AccIntAdd->replaceAllUsesWith(Acc); - return true; - } - return false; -} - -static void MatchReductions(Function &F, 
Loop *TheLoop, BasicBlock *Header, - ReductionList &Reductions) { - RecurrenceDescriptor RecDesc; - const bool HasFnNoNaNAttr = - F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; - const BasicBlock *Latch = TheLoop->getLoopLatch(); - - // We need a preheader as getIncomingValueForBlock assumes there is one. - if (!TheLoop->getLoopPreheader()) { - LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n"); - return; - } - - for (PHINode &Phi : Header->phis()) { - const auto *Ty = Phi.getType(); - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) - continue; - - const bool IsReduction = - RecurrenceDescriptor::AddReductionVar(&Phi, - RecurrenceDescriptor::RK_IntegerAdd, - TheLoop, HasFnNoNaNAttr, RecDesc); - if (!IsReduction) - continue; - - Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch)); - if (!Acc) - continue; - - Reductions.push_back(Reduction(&Phi, Acc)); - } - - LLVM_DEBUG( - dbgs() << "\nAccumulating integer additions (reductions) found:\n"; - for (auto &R : Reductions) { - dbgs() << "- "; R.Phi->dump(); - dbgs() << "-> "; R.AccIntAdd->dump(); - } - ); -} - -static void AddMACCandidate(OpChainList &Candidates, - Instruction *Mul, - Value *MulOp0, Value *MulOp1) { - LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump()); - assert(Mul->getOpcode() == Instruction::Mul && - "expected mul instruction"); - ValueList LHS; - ValueList RHS; - if (IsNarrowSequence<16>(MulOp0, LHS) && - IsNarrowSequence<16>(MulOp1, RHS)) { - LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump()); - Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS)); - } -} - -static void MatchParallelMACSequences(Reduction &R, - OpChainList &Candidates) { - Instruction *Acc = R.AccIntAdd; - LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc); - - // Returns false to signal the search should be stopped. - std::function<bool(Value*)> Match = - [&Candidates, &Match](Value *V) -> bool { - auto *I = dyn_cast<Instruction>(V); - if (!I) - return false; - - switch (I->getOpcode()) { - case Instruction::Add: - if (Match(I->getOperand(0)) || (Match(I->getOperand(1)))) - return true; - break; - case Instruction::Mul: { - Value *MulOp0 = I->getOperand(0); - Value *MulOp1 = I->getOperand(1); - if (isa<Instruction>(MulOp0) && isa<Instruction>(MulOp1)) - AddMACCandidate(Candidates, I, MulOp0, MulOp1); - return false; - } - case Instruction::SExt: - return Match(I->getOperand(0)); - } - return false; +void ARMParallelDSP::InsertParallelMACs(Reduction &R) { + + auto CreateSMLADCall = [&](SmallVectorImpl<LoadInst*> &VecLd0, + SmallVectorImpl<LoadInst*> &VecLd1, + Value *Acc, bool Exchange, + Instruction *InsertAfter) { + // Replace the reduction chain with an intrinsic call + IntegerType *Ty = IntegerType::get(M->getContext(), 32); + LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ? + WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty); + LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ? + WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty); + + Value* Args[] = { WideLd0, WideLd1, Acc }; + Function *SMLAD = nullptr; + if (Exchange) + SMLAD = Acc->getType()->isIntegerTy(32) ? + Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) : + Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx) + else + SMLAD = Acc->getType()->isIntegerTy(32) ? 
+ Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) : + Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); + + IRBuilder<NoFolder> Builder(InsertAfter->getParent(), + ++BasicBlock::iterator(InsertAfter)); + Instruction *Call = Builder.CreateCall(SMLAD, Args); + NumSMLAD++; + return Call; }; - while (Match (Acc)); - LLVM_DEBUG(dbgs() << "Finished matching MAC sequences, found " - << Candidates.size() << " candidates.\n"); -} - -// Collects all instructions that are not part of the MAC chains, which is the -// set of instructions that can potentially alias with the MAC operands. -static void AliasCandidates(BasicBlock *Header, Instructions &Reads, - Instructions &Writes) { - for (auto &I : *Header) { - if (I.mayReadFromMemory()) - Reads.push_back(&I); - if (I.mayWriteToMemory()) - Writes.push_back(&I); - } -} - -// Check whether statements in the basic block that write to memory alias with -// the memory locations accessed by the MAC-chains. -// TODO: we need the read statements when we accept more complicated chains. -static bool AreAliased(AliasAnalysis *AA, Instructions &Reads, - Instructions &Writes, OpChainList &MACCandidates) { - LLVM_DEBUG(dbgs() << "Alias checks:\n"); - for (auto &MAC : MACCandidates) { - LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump()); - - // At the moment, we allow only simple chains that only consist of reads, - // accumulate their result with an integer add, and thus that don't write - // memory, and simply bail if they do. - if (!MAC->ReadOnly) - return true; - - // Now for all writes in the basic block, check that they don't alias with - // the memory locations accessed by our MAC-chain: - for (auto *I : Writes) { - LLVM_DEBUG(dbgs() << "- "; I->dump()); - assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs"); - for (auto &MemLoc : MAC->MemLocs) { - if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc), - ModRefInfo::ModRef))) { - LLVM_DEBUG(dbgs() << "Yes, aliases found\n"); - return true; - } - } - } - } - - LLVM_DEBUG(dbgs() << "OK: no aliases found!\n"); - return false; -} + Instruction *InsertAfter = R.getRoot(); + Value *Acc = R.getAccumulator(); + if (!Acc) + Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); -static bool CheckMACMemory(OpChainList &Candidates) { - for (auto &C : Candidates) { - // A mul has 2 operands, and a narrow op consist of sext and a load; thus - // we expect at least 4 items in this operand value list. - if (C->size() < 4) { - LLVM_DEBUG(dbgs() << "Operand list too short.\n"); - return false; - } - C->SetMemoryLocations(); - ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS; - ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS; + LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n" + << "Acc: " << *Acc << "\n"); + for (auto &Pair : R.getMulPairs()) { + BinOpChain *PMul0 = Pair.first; + BinOpChain *PMul1 = Pair.second; + LLVM_DEBUG(dbgs() << "Muls:\n" + << "- " << *PMul0->Root << "\n" + << "- " << *PMul1->Root << "\n"); - // Use +=2 to skip over the expected extend instructions. - for (unsigned i = 0, e = LHS.size(); i < e; i += 2) { - if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i])) - return false; - } + Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange, + InsertAfter); + InsertAfter = cast<Instruction>(Acc); } - return true; + R.UpdateRoot(cast<Instruction>(Acc)); } -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector -// multiplications. 
-// To use SMLAD: -// 1) we first need to find integer add reduction PHIs, -// 2) then from the PHI, look for this pattern: -// -// acc0 = phi i32 [0, %entry], [%acc1, %loop.body] -// ld0 = load i16 -// sext0 = sext i16 %ld0 to i32 -// ld1 = load i16 -// sext1 = sext i16 %ld1 to i32 -// mul0 = mul %sext0, %sext1 -// ld2 = load i16 -// sext2 = sext i16 %ld2 to i32 -// ld3 = load i16 -// sext3 = sext i16 %ld3 to i32 -// mul1 = mul i32 %sext2, %sext3 -// add0 = add i32 %mul0, %acc0 -// acc1 = add i32 %add0, %mul1 -// -// Which can be selected to: -// -// ldr.h r0 -// ldr.h r1 -// smlad r2, r0, r1, r2 -// -// If constants are used instead of loads, these will need to be hoisted -// out and into a register. -// -// If loop invariants are used instead of loads, these need to be packed -// before the loop begins. -// -bool ARMParallelDSP::MatchSMLAD(Function &F) { - BasicBlock *Header = L->getHeader(); - LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n"; - dbgs() << "Header block:\n"; Header->dump(); - dbgs() << "Loop info:\n\n"; L->dump()); +LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, + IntegerType *LoadTy) { + assert(Loads.size() == 2 && "currently only support widening two loads"); - bool Changed = false; - ReductionList Reductions; - MatchReductions(F, L, Header, Reductions); + LoadInst *Base = Loads[0]; + LoadInst *Offset = Loads[1]; - for (auto &R : Reductions) { - OpChainList MACCandidates; - MatchParallelMACSequences(R, MACCandidates); - if (!CheckMACMemory(MACCandidates)) - continue; + Instruction *BaseSExt = dyn_cast<Instruction>(Base->user_back()); + Instruction *OffsetSExt = dyn_cast<Instruction>(Offset->user_back()); - R.MACCandidates = std::move(MACCandidates); + assert((BaseSExt && OffsetSExt) + && "Loads should have a single, extending, user"); - LLVM_DEBUG(dbgs() << "MAC candidates:\n"; - for (auto &M : R.MACCandidates) - M->Root->dump(); - dbgs() << "\n";); - } + std::function<void(Value*, Value*)> MoveBefore = + [&](Value *A, Value *B) -> void { + if (!isa<Instruction>(A) || !isa<Instruction>(B)) + return; - // Collect all instructions that may read or write memory. Our alias - // analysis checks bail out if any of these instructions aliases with an - // instruction from the MAC-chain. - Instructions Reads, Writes; - AliasCandidates(Header, Reads, Writes); + auto *Source = cast<Instruction>(A); + auto *Sink = cast<Instruction>(B); + + if (DT->dominates(Source, Sink) || + Source->getParent() != Sink->getParent() || + isa<PHINode>(Source) || isa<PHINode>(Sink)) + return; - for (auto &R : Reductions) { - if (AreAliased(AA, Reads, Writes, R.MACCandidates)) - return false; - CreateParallelMACPairs(R); - Changed |= InsertParallelMACs(R); - } + Source->moveBefore(Sink); + for (auto &U : Source->uses()) + MoveBefore(Source, U.getUser()); + }; - LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump();); - return Changed; -} + // Insert the load at the point of the original dominating load. + LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset; + IRBuilder<NoFolder> IRB(DomLoad->getParent(), + ++BasicBlock::iterator(DomLoad)); -static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad, - const Type *LoadTy) { - const unsigned AddrSpace = BaseLoad.getPointerAddressSpace(); + // Bitcast the pointer to a wider type and create the wide load, while making + // sure to maintain the original alignment as this prevents ldrd from being + // generated when it could be illegal due to memory alignment. 
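+ // As a sketch, for two consecutive i16 loads feeding sign-extends, the
+ // rewrite produces IR roughly of this shape (value names illustrative):
+ //   %wide   = load i32, i32* %cast, align 2
+ //   %bottom = trunc i32 %wide to i16    ; stands in for the base load
+ //   %shr    = lshr i32 %wide, 16
+ //   %top    = trunc i32 %shr to i16     ; stands in for the offset load
+ // with the original sext users re-pointed at %bottom and %top below.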
+ const unsigned AddrSpace = DomLoad->getPointerAddressSpace(); + Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(), LoadTy->getPointerTo(AddrSpace)); - return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment()); -} - -Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, - Instruction *Acc, bool Exchange, - Instruction *InsertAfter) { - LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n" - << "- " << *VecLd0 << "\n" - << "- " << *VecLd1 << "\n" - << "- " << *Acc << "\n" - << "Exchange: " << Exchange << "\n"); - - IRBuilder Builder(InsertAfter->getParent(), - ++BasicBlock::iterator(InsertAfter)); - - // Replace the reduction chain with an intrinsic call - const Type *Ty = IntegerType::get(M->getContext(), 32); - LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty); - LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty); - Value* Args[] = { NewLd0, NewLd1, Acc }; - Function *SMLAD = nullptr; - if (Exchange) - SMLAD = Acc->getType()->isIntegerTy(32) ? - Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) : - Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx); - else - SMLAD = Acc->getType()->isIntegerTy(32) ? - Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) : - Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); - CallInst *Call = Builder.CreateCall(SMLAD, Args); - NumSMLAD++; - return Call; + LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, + Base->getAlignment()); + + // Make sure everything is in the correct order in the basic block. + MoveBefore(Base->getPointerOperand(), VecPtr); + MoveBefore(VecPtr, WideLoad); + + // From the wide load, create two values that equal the original two loads. + // Loads[0] needs trunc while Loads[1] needs a lshr and trunc. + // TODO: Support big-endian as well. + Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType()); + BaseSExt->setOperand(0, Bottom); + + IntegerType *OffsetTy = cast(Offset->getType()); + Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth()); + Value *Top = IRB.CreateLShr(WideLoad, ShiftVal); + Value *Trunc = IRB.CreateTrunc(Top, OffsetTy); + OffsetSExt->setOperand(0, Trunc); + + WideLoads.emplace(std::make_pair(Base, + make_unique(Loads, WideLoad))); + return WideLoad; } // Compare the value lists in Other to this chain. 
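The CreateWideLoad and InsertParallelMACs changes above are easier to follow against a scalar model of what one arm.smlad call computes. The sketch below is plain C++ rather than LLVM IR; smlad_reference is a hypothetical name, little-endian layout is assumed (matching the big-endian TODO above), and it is an illustration, not part of the patch:

    #include <cstdint>
    #include <cstring>

    // One SMLAD step: a single aligned 32-bit load stands in for two adjacent
    // 16-bit loads; the low half is recovered with a truncate, the high half
    // with a logical shift right followed by a truncate, as in CreateWideLoad.
    int32_t smlad_reference(const int16_t *a, const int16_t *b, int32_t acc) {
      uint32_t wa, wb;
      std::memcpy(&wa, a, sizeof wa);      // the "wide load" of a[0] and a[1]
      std::memcpy(&wb, b, sizeof wb);
      int16_t a0 = (int16_t)(wa & 0xFFFF); // trunc        -> a[0]
      int16_t a1 = (int16_t)(wa >> 16);    // lshr + trunc -> a[1]
      int16_t b0 = (int16_t)(wb & 0xFFFF);
      int16_t b1 = (int16_t)(wb >> 16);
      return acc + a0 * b0 + a1 * b1;      // dual 16x16 multiply-accumulate
    }

The Exchange variant selected above (arm.smladx) swaps b0 and b1 before the multiplies, and the 64-bit accumulator forms map to arm.smlald/arm.smlaldx.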
@@ -741,7 +810,6 @@ bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
   }
 
   const unsigned Pairs = VL0.size();
-  LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
 
   for (unsigned i = 0; i < Pairs; ++i) {
     const Value *V0 = VL0[i];
@@ -749,24 +817,17 @@ bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
     const auto *Inst0 = dyn_cast<Instruction>(V0);
     const auto *Inst1 = dyn_cast<Instruction>(V1);
 
-    LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
-               dbgs() << "mul1: "; V0->dump();
-               dbgs() << "mul2: "; V1->dump());
-
     if (!Inst0 || !Inst1)
       return false;
 
-    if (Inst0->isSameOperationAs(Inst1)) {
-      LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+    if (Inst0->isSameOperationAs(Inst1))
       continue;
-    }
 
     const APInt *C0, *C1;
     if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
       return false;
   }
 
-  LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
   return true;
 };
diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h
index 3ff0bee7e5bf..d519490c9c57 100644
--- a/lib/Target/ARM/ARMPerfectShuffle.h
+++ b/lib/Target/ARM/ARMPerfectShuffle.h
@@ -1,9 +1,8 @@
 //===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table --------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/ARM/ARMPredicates.td b/lib/Target/ARM/ARMPredicates.td
new file mode 100644
index 000000000000..0b6b40de80dd
--- /dev/null
+++ b/lib/Target/ARM/ARMPredicates.td
@@ -0,0 +1,211 @@
+//===-- ARMPredicates.td - ARM Instruction Predicates ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+def HasV4T           : Predicate<"Subtarget->hasV4TOps()">,
+                       AssemblerPredicate<"HasV4TOps", "armv4t">;
+def NoV4T            : Predicate<"!Subtarget->hasV4TOps()">;
+def HasV5T           : Predicate<"Subtarget->hasV5TOps()">,
+                       AssemblerPredicate<"HasV5TOps", "armv5t">;
+def NoV5T            : Predicate<"!Subtarget->hasV5TOps()">;
+def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">,
+                       AssemblerPredicate<"HasV5TEOps", "armv5te">;
+def HasV6            : Predicate<"Subtarget->hasV6Ops()">,
+                       AssemblerPredicate<"HasV6Ops", "armv6">;
+def NoV6             : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6M           : Predicate<"Subtarget->hasV6MOps()">,
+                       AssemblerPredicate<"HasV6MOps",
+                                          "armv6m or armv6t2">;
+def HasV8MBaseline   : Predicate<"Subtarget->hasV8MBaselineOps()">,
+                       AssemblerPredicate<"HasV8MBaselineOps",
+                                          "armv8m.base">;
+def HasV8MMainline   : Predicate<"Subtarget->hasV8MMainlineOps()">,
+                       AssemblerPredicate<"HasV8MMainlineOps",
+                                          "armv8m.main">;
+def HasV8_1MMainline : Predicate<"Subtarget->hasV8_1MMainlineOps()">,
+                       AssemblerPredicate<"HasV8_1MMainlineOps",
+                                          "armv8.1m.main">;
+def HasMVEInt        : Predicate<"Subtarget->hasMVEIntegerOps()">,
+                       AssemblerPredicate<"HasMVEIntegerOps",
+                                          "mve">;
+def HasMVEFloat      : Predicate<"Subtarget->hasMVEFloatOps()">,
+                       AssemblerPredicate<"HasMVEFloatOps",
+                                          "mve.fp">;
+def HasFPRegs        : Predicate<"Subtarget->hasFPRegs()">,
+                       AssemblerPredicate<"FeatureFPRegs",
+                                          "fp registers">;
+def HasFPRegs16      : Predicate<"Subtarget->hasFPRegs16()">,
+                       AssemblerPredicate<"FeatureFPRegs16",
+                                          "16-bit fp registers">;
+def HasFPRegs64      : Predicate<"Subtarget->hasFPRegs64()">,
+                       AssemblerPredicate<"FeatureFPRegs64",
+                                          "64-bit fp registers">;
+def HasFPRegsV8_1M   : Predicate<"Subtarget->hasFPRegs() && Subtarget->hasV8_1MMainlineOps()">,
+                       AssemblerPredicate<"FeatureFPRegs,HasV8_1MMainlineOps",
+                                          "armv8.1m.main with FP or MVE">;
+def HasV6T2          : Predicate<"Subtarget->hasV6T2Ops()">,
+                       AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
+def NoV6T2           : Predicate<"!Subtarget->hasV6T2Ops()">;
+def HasV6K           : Predicate<"Subtarget->hasV6KOps()">,
+                       AssemblerPredicate<"HasV6KOps", "armv6k">;
+def NoV6K            : Predicate<"!Subtarget->hasV6KOps()">;
+def HasV7            : Predicate<"Subtarget->hasV7Ops()">,
+                       AssemblerPredicate<"HasV7Ops", "armv7">;
+def HasV8            : Predicate<"Subtarget->hasV8Ops()">,
+                       AssemblerPredicate<"HasV8Ops", "armv8">;
+def PreV8            : Predicate<"!Subtarget->hasV8Ops()">,
+                       AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
+def HasV8_1a         : Predicate<"Subtarget->hasV8_1aOps()">,
+                       AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a         : Predicate<"Subtarget->hasV8_2aOps()">,
+                       AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+def HasV8_3a         : Predicate<"Subtarget->hasV8_3aOps()">,
+                       AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+def HasV8_4a         : Predicate<"Subtarget->hasV8_4aOps()">,
+                       AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
+def HasV8_5a         : Predicate<"Subtarget->hasV8_5aOps()">,
+                       AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
+def NoVFP            : Predicate<"!Subtarget->hasVFP2Base()">;
+def HasVFP2          : Predicate<"Subtarget->hasVFP2Base()">,
+                       AssemblerPredicate<"FeatureVFP2_D16_SP", "VFP2">;
+def HasVFP3          : Predicate<"Subtarget->hasVFP3Base()">,
+                       AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">;
+def HasVFP4          : Predicate<"Subtarget->hasVFP4Base()">,
+                       AssemblerPredicate<"FeatureVFP4_D16_SP", "VFP4">;
+def HasDPVFP         : Predicate<"Subtarget->hasFP64()">,
+                       AssemblerPredicate<"FeatureFP64",
+                                          "double precision VFP">;
+def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8Base()">,
+                       AssemblerPredicate<"FeatureFPARMv8_D16_SP", "FPARMv8">;
+def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+                       AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasSHA2          : Predicate<"Subtarget->hasSHA2()">,
+                       AssemblerPredicate<"FeatureSHA2", "sha2">;
+def HasAES           : Predicate<"Subtarget->hasAES()">,
+                       AssemblerPredicate<"FeatureAES", "aes">;
+def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
+                       AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasDotProd       : Predicate<"Subtarget->hasDotProd()">,
+                       AssemblerPredicate<"FeatureDotProd", "dotprod">;
+def HasCRC           : Predicate<"Subtarget->hasCRC()">,
+                       AssemblerPredicate<"FeatureCRC", "crc">;
+def HasRAS           : Predicate<"Subtarget->hasRAS()">,
+                       AssemblerPredicate<"FeatureRAS", "ras">;
+def HasLOB           : Predicate<"Subtarget->hasLOB()">,
+                       AssemblerPredicate<"FeatureLOB", "lob">;
+def HasFP16          : Predicate<"Subtarget->hasFP16()">,
+                       AssemblerPredicate<"FeatureFP16","half-float conversions">;
+def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
+                       AssemblerPredicate<"FeatureFullFP16","full half-float">;
+def HasFP16FML       : Predicate<"Subtarget->hasFP16FML()">,
+                       AssemblerPredicate<"FeatureFP16FML","full half-float fml">;
+def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">,
+                       AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">;
+def HasDivideInARM   : Predicate<"Subtarget->hasDivideInARMMode()">,
+                       AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
+def HasDSP           : Predicate<"Subtarget->hasDSP()">,
+                       AssemblerPredicate<"FeatureDSP", "dsp">;
+def HasDB            : Predicate<"Subtarget->hasDataBarrier()">,
+                       AssemblerPredicate<"FeatureDB",
+                                          "data-barriers">;
+def HasDFB           : Predicate<"Subtarget->hasFullDataBarrier()">,
+                       AssemblerPredicate<"FeatureDFB",
+                                          "full-data-barrier">;
+def HasV7Clrex       : Predicate<"Subtarget->hasV7Clrex()">,
+                       AssemblerPredicate<"FeatureV7Clrex",
+                                          "v7 clrex">;
+def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
+                        AssemblerPredicate<"FeatureAcquireRelease",
+                                           "acquire/release">;
+def HasMP            : Predicate<"Subtarget->hasMPExtension()">,
+                       AssemblerPredicate<"FeatureMP",
+                                          "mp-extensions">;
+def HasVirtualization: Predicate<"false">,
+                       AssemblerPredicate<"FeatureVirtualization",
+                                          "virtualization-extensions">;
+def HasTrustZone     : Predicate<"Subtarget->hasTrustZone()">,
+                       AssemblerPredicate<"FeatureTrustZone",
+                                          "TrustZone">;
+def Has8MSecExt      : Predicate<"Subtarget->has8MSecExt()">,
+                       AssemblerPredicate<"Feature8MSecExt",
+                                          "ARMv8-M Security Extensions">;
+def HasZCZ           : Predicate<"Subtarget->hasZeroCycleZeroing()">;
+def UseNEONForFP     : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
+def IsThumb          : Predicate<"Subtarget->isThumb()">,
+                       AssemblerPredicate<"ModeThumb", "thumb">;
+def IsThumb1Only     : Predicate<"Subtarget->isThumb1Only()">;
+def IsThumb2         : Predicate<"Subtarget->isThumb2()">,
+                       AssemblerPredicate<"ModeThumb,FeatureThumb2",
+                                          "thumb2">;
+def IsMClass         : Predicate<"Subtarget->isMClass()">,
+                       AssemblerPredicate<"FeatureMClass", "armv*m">;
+def IsNotMClass      : Predicate<"!Subtarget->isMClass()">,
+                       AssemblerPredicate<"!FeatureMClass",
+                                          "!armv*m">;
+def IsARM            : Predicate<"!Subtarget->isThumb()">,
+                       AssemblerPredicate<"!ModeThumb", "arm-mode">;
+def IsMachO          : Predicate<"Subtarget->isTargetMachO()">;
+def IsNotMachO       : Predicate<"!Subtarget->isTargetMachO()">;
+def IsNaCl           : Predicate<"Subtarget->isTargetNaCl()">;
+def IsWindows        : Predicate<"Subtarget->isTargetWindows()">;
+def IsNotWindows     : Predicate<"!Subtarget->isTargetWindows()">;
+def IsReadTPHard     : Predicate<"Subtarget->isReadTPHard()">;
+def IsReadTPSoft     : Predicate<"!Subtarget->isReadTPHard()">;
+def UseNaClTrap      : Predicate<"Subtarget->useNaClTrap()">,
+                       AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
+def DontUseNaClTrap  : Predicate<"!Subtarget->useNaClTrap()">;
+
+def UseNegativeImmediates :
+  Predicate<"false">,
+  AssemblerPredicate<"!FeatureNoNegativeImmediates",
+                     "NegativeImmediates">;
+
+// FIXME: Eventually this will be just "hasV6T2Ops".
+let RecomputePerFunction = 1 in {
+  def UseMovt          : Predicate<"Subtarget->useMovt()">;
+  def DontUseMovt      : Predicate<"!Subtarget->useMovt()">;
+  def UseMovtInPic     : Predicate<"Subtarget->useMovt() && Subtarget->allowPositionIndependentMovt()">;
+  def DontUseMovtInPic : Predicate<"!Subtarget->useMovt() || !Subtarget->allowPositionIndependentMovt()">;
+
+  def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&"
+                           " TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||"
+                           "Subtarget->hasMinSize())">;
+}
+def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
+
+// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
+// But only select them if more precision in FP computation is allowed, and when
+// they are not slower than a mul + add sequence.
+// Do not use them for Darwin platforms.
+def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast && "
+                                 " Subtarget->hasVFP4Base()) && "
+                                 "!Subtarget->isTargetDarwin() &&"
+                                 "Subtarget->useFPVMLx()">;
+
+def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
+
+def HasFastVDUP32    : Predicate<"!Subtarget->hasSlowVDUP32()">;
+def HasSlowVDUP32    : Predicate<"Subtarget->hasSlowVDUP32()">;
+
+def UseVMOVSR        : Predicate<"Subtarget->preferVMOVSR() ||"
+                                 "!Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR    : Predicate<"!Subtarget->preferVMOVSR() &&"
+                                 "Subtarget->useNEONForSinglePrecisionFP()">;
+
+let RecomputePerFunction = 1 in {
+  def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
+  def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
+}
+
+def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
+
+// Armv8.5-A extensions
+def HasSB            : Predicate<"Subtarget->hasSB()">,
+                       AssemblerPredicate<"FeatureSB", "sb">;
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 4f28f2dafc70..b100150175fc 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -1,9 +1,8 @@
 //===- ARMRegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
@@ -161,6 +160,10 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
          "Subclass not added?");
   assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
          "Subclass not added?");
+  assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPREven_and_tGPR_and_tcGPRRegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) &&
+         "Subclass not added?");
   assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
 
 #ifndef NDEBUG
@@ -182,6 +185,13 @@ const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
   case tGPR_and_tcGPRRegClassID:
   case tcGPRRegClassID:
   case tGPRRegClassID:
+  case tGPREvenRegClassID:
+  case tGPROddRegClassID:
+  case tGPR_and_tGPREvenRegClassID:
+  case tGPR_and_tGPROddRegClassID:
+  case tGPREven_and_tcGPRRegClassID:
+  case tGPREven_and_tGPR_and_tcGPRRegClassID:
+  case tGPROdd_and_tcGPRRegClassID:
     return getRegBank(ARM::GPRRegBankID);
   case HPRRegClassID:
   case SPR_8RegClassID:
@@ -218,7 +228,15 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
 
   switch (Opc) {
   case G_ADD:
-  case G_SUB:
+  case G_SUB: {
+    // Integer operations where the source and destination are in the
+    // same register class.
+    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+    OperandsMapping = Ty.getSizeInBits() == 64
+                          ? &ARM::ValueMappings[ARM::DPR3OpsIdx]
+                          : &ARM::ValueMappings[ARM::GPR3OpsIdx];
+    break;
+  }
   case G_MUL:
   case G_AND:
   case G_OR:
@@ -337,6 +355,14 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
                             &ARM::ValueMappings[ARM::GPR3OpsIdx]});
     break;
   }
+  case G_FCONSTANT: {
+    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+    OperandsMapping = getOperandsMapping(
+        {Ty.getSizeInBits() == 64 ? &ARM::ValueMappings[ARM::DPR3OpsIdx]
+                                  : &ARM::ValueMappings[ARM::SPR3OpsIdx],
+         nullptr});
+    break;
+  }
   case G_CONSTANT:
   case G_FRAME_INDEX:
   case G_GLOBAL_VALUE:
@@ -424,6 +450,19 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OperandsMapping =
         getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
     break;
+  case DBG_VALUE: {
+    SmallVector<const ValueMapping *, 4> OperandBanks(NumOperands);
+    const MachineOperand &MaybeReg = MI.getOperand(0);
+    if (MaybeReg.isReg() && MaybeReg.getReg()) {
+      unsigned Size = MRI.getType(MaybeReg.getReg()).getSizeInBits();
+      if (Size > 32 && Size != 64)
+        return getInvalidInstructionMapping();
+      OperandBanks[0] = Size == 64 ? &ARM::ValueMappings[ARM::DPR3OpsIdx]
+                                   : &ARM::ValueMappings[ARM::GPR3OpsIdx];
+    }
+    OperandsMapping = getOperandsMapping(OperandBanks);
+    break;
+  }
   default:
     return getInvalidInstructionMapping();
   }
@@ -433,7 +472,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     for (const auto &Mapping : OperandsMapping[i]) {
       assert(
           (Mapping.RegBank->getID() != ARM::FPRRegBankID ||
-           MF.getSubtarget<ARMSubtarget>().hasVFP2()) &&
+           MF.getSubtarget<ARMSubtarget>().hasVFP2Base()) &&
           "Trying to use floating point register bank on target without vfp");
     }
   }
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.h b/lib/Target/ARM/ARMRegisterBankInfo.h
index 9650b358f319..1961f7af49bb 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.h
+++ b/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -1,9 +1,8 @@
 //===- ARMRegisterBankInfo ---------------------------------------*- C++ -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
diff --git a/lib/Target/ARM/ARMRegisterBanks.td b/lib/Target/ARM/ARMRegisterBanks.td
index 6e3834da3bb5..e4ebf793f9b0 100644
--- a/lib/Target/ARM/ARMRegisterBanks.td
+++ b/lib/Target/ARM/ARMRegisterBanks.td
@@ -1,9 +1,8 @@
 //=- ARMRegisterBank.td - Describe the AArch64 Banks ---------*- tablegen -*-=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
index e6e8cdf965e2..6649750bb388 100644
--- a/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -1,9 +1,8 @@
 //===-- ARMRegisterInfo.cpp - ARM Register Information --------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
index e2e650e4af93..87c0f322d3b3 100644
--- a/lib/Target/ARM/ARMRegisterInfo.h
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -1,9 +1,8 @@
 //===-- ARMRegisterInfo.h - ARM Register Information Impl -------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index dc56186cb54a..92ae26b3729d 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -1,9 +1,8 @@
 //===-- ARMRegisterInfo.td - ARM Register defs -------------*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -14,7 +13,8 @@ include "ARMSystemRegister.td"
 //===----------------------------------------------------------------------===//
 // Registers are identified with 4-bit ID numbers.
-class ARMReg<bits<16> Enc, string n, list<Register> subregs = []> : Register<n> {
+class ARMReg<bits<16> Enc, string n, list<Register> subregs = [],
+             list<string> altNames = []> : Register<n, altNames> {
   let HWEncoding = Enc;
   let Namespace = "ARM";
   let SubRegs = subregs;
@@ -27,6 +27,11 @@ class ARMFReg<bits<6> Enc, string n> : Register<n> {
   let Namespace = "ARM";
 }
 
+let Namespace = "ARM",
+    FallbackRegAltNameIndex = NoRegAltName in {
+  def RegNamesRaw : RegAltNameIndex;
+}
+
 // Subregister indices.
 let Namespace = "ARM" in {
 def qqsub_0 : SubRegIndex<256>;
@@ -84,9 +89,11 @@ def R9  : ARMReg< 9, "r9">,  DwarfRegNum<[9]>;
 def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
 def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
 def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
-def SP  : ARMReg<13, "sp">,  DwarfRegNum<[13]>;
-def LR  : ARMReg<14, "lr">,  DwarfRegNum<[14]>;
-def PC  : ARMReg<15, "pc">,  DwarfRegNum<[15]>;
+let RegAltNameIndices = [RegNamesRaw] in {
+def SP  : ARMReg<13, "sp", [], ["r13"]>,  DwarfRegNum<[13]>;
+def LR  : ARMReg<14, "lr", [], ["r14"]>,  DwarfRegNum<[14]>;
+def PC  : ARMReg<15, "pc", [], ["r15"]>,  DwarfRegNum<[15]>;
+}
 }
 
 // Float registers
@@ -190,6 +197,17 @@ def MVFR0 : ARMReg<7, "mvfr0">;
 def FPEXC   : ARMReg<8, "fpexc">;
 def FPINST  : ARMReg<9, "fpinst">;
 def FPINST2 : ARMReg<10, "fpinst2">;
+// These encodings aren't actual instruction encodings, their encoding depends
+// on the instruction they are used in and for VPR 32 was chosen such that it
+// always comes last in spr_reglist_with_vpr.
+def VPR : ARMReg<32, "vpr">;
+def FPSCR_NZCVQC
+    : ARMReg<2, "fpscr_nzcvqc">;
+def P0 : ARMReg<13, "p0">;
+def FPCXTNS : ARMReg<14, "fpcxtns">;
+def FPCXTS : ARMReg<15, "fpcxts">;
+
+def ZR : ARMReg<15, "zr">, DwarfRegNum<[15]>;
 
 // Register classes.
 //
@@ -209,9 +227,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
 // know how to spill them.  If we make our prologue/epilogue code smarter at
 // some point, we can go back to using the above allocation orders for the
 // Thumb1 instructions that know how to use hi regs.
-  let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+  let AltOrders = [(add LR, GPR), (trunc GPR, 8),
+                   (add (trunc GPR, 8), R12, LR, (shl GPR, 8))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticString = "operand must be a register in range [r0, r15]";
 }
@@ -220,9 +239,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
 // certain operand slots, particularly as the destination. Primarily
 // useful for disassembly.
 def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
-  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8),
+                   (add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticString = "operand must be a register in range [r0, r14]";
 }
@@ -238,6 +258,27 @@ def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)
   let DiagnosticString = "operand must be a register in range [r0, r14] or apsr_nzcv";
 }
 
+// GPRs without the PC and SP registers but with APSR. Used by CLRM instruction.
+def GPRwithAPSRnosp : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), LR, APSR)> {
+  let isAllocatable = 0;
+}
+
+def GPRwithZR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), ZR)> {
+  let AltOrders = [(add LR, GPRwithZR), (trunc GPRwithZR, 8)];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+  let DiagnosticString = "operand must be a register in range [r0, r14] or zr";
+}
+
+def GPRwithZRnosp : RegisterClass<"ARM", [i32], 32, (sub GPRwithZR, SP)> {
+  let AltOrders = [(add LR, GPRwithZRnosp), (trunc GPRwithZRnosp, 8)];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+  let DiagnosticString = "operand must be a register in range [r0, r12] or r14 or zr";
+}
+
 // GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
 // implied SP argument list.
 // FIXME: It would be better to not use this at all and refactor the
@@ -247,14 +288,19 @@ def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)> {
   let DiagnosticString = "operand must be a register sp";
 }
 
+// GPRlr - Only LR is legal. Used by ARMv8.1-M Low Overhead Loop instructions
+// where LR is the only legal loop counter register.
+def GPRlr : RegisterClass<"ARM", [i32], 32, (add LR)>;
+
 // restricted GPR register class. Many Thumb2 instructions allow the full
 // register range for operands, but have undefined behaviours when PC
 // or SP (R13 or R15) are used. The ARM ISA refers to these operands
 // via the BadReg() pseudo-code description.
 def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
-  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8),
+                   (add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
  }];
   let DiagnosticType = "rGPR";
 }
@@ -285,12 +331,38 @@ def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
   }];
 }
 
+def tGPROdd : RegisterClass<"ARM", [i32], 32, (add R1, R3, R5, R7, R9, R11)> {
+  let AltOrders = [(and tGPROdd, tGPR)];
+  let AltOrderSelect = [{
+    return MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+  let DiagnosticString =
+    "operand must be an odd-numbered register in range [r1,r11]";
+}
+
+def tGPREven : RegisterClass<"ARM", [i32], 32, (add R0, R2, R4, R6, R8, R10, R12, LR)> {
+  let AltOrders = [(and tGPREven, tGPR)];
+  let AltOrderSelect = [{
+    return MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+  let DiagnosticString = "operand must be an even-numbered register";
+}
+
 // Condition code registers.
 def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
   let CopyCost = -1;  // Don't allow copying of status registers.
   let isAllocatable = 0;
 }
 
+// MVE Condition code register.
+def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1], 32, (add VPR)> {
+//  let CopyCost = -1;  // Don't allow copying of status registers.
+}
+
+// FPSCR, when the flags at the top of it are used as the input or
+// output to an instruction such as MVE VADC.
+def cl_FPSCR_NZCV : RegisterClass<"ARM", [i32], 32, (add FPSCR_NZCV)>;
+
 // Scalar single precision floating point register class..
 // FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
 // to avoid partial-write dependencies on D or Q (depending on platform)
@@ -302,7 +374,7 @@ def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
                    (decimate (rotl SPR, 1), 4),
                    (decimate (rotl SPR, 1), 2))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs();
   }];
   let DiagnosticString = "operand must be a register in range [s0, s31]";
 }
@@ -314,7 +386,7 @@ def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
                    (decimate (rotl HPR, 1), 4),
                    (decimate (rotl HPR, 1), 2))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs();
  }];
   let DiagnosticString = "operand must be a register in range [s0, s31]";
 }
@@ -336,11 +408,18 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
   let AltOrders = [(rotl DPR, 16),
                    (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs();
  }];
   let DiagnosticType = "DPR";
 }
 
+// Scalar single and double precision floating point and VPR register class,
+// this is only used for parsing, don't use it anywhere else as the size and
+// types don't match!
+def FPWithVPR : RegisterClass<"ARM", [f32], 32, (add SPR, DPR, VPR)> {
+  let isAllocatable = 0;
+}
+
 // Subset of DPR that are accessible with VFP2 (and so that also have
 // 32-bit SPR subregs).
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -95,6 +94,9 @@ def CortexA57Model : SchedMachineModel { // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; + + let UnsupportedFeatures = [HasV8_1MMainline, HasMVEInt, HasMVEFloat, + HasFPRegsV8_1M]; } //===----------------------------------------------------------------------===// @@ -1175,7 +1177,8 @@ def : InstRW<[A57Write_8cyc_1V], (instregex // ASIMD FP max/min def : InstRW<[A57Write_5cyc_1V], (instregex - "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>; + "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "(NEON|VFP)_VMAXNM", + "(NEON|VFP)_VMINNM")>; // ASIMD FP multiply def A57WriteVMUL_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 5; } diff --git a/lib/Target/ARM/ARMScheduleA57WriteRes.td b/lib/Target/ARM/ARMScheduleA57WriteRes.td index 670717dc7c13..5ba61503686e 100644 --- a/lib/Target/ARM/ARMScheduleA57WriteRes.td +++ b/lib/Target/ARM/ARMScheduleA57WriteRes.td @@ -1,9 +1,8 @@ //=- ARMScheduleA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index ba380cba100f..1be0ee4334a8 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -1,9 +1,8 @@ //=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index fc301c589269..21d32bde4710 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -1,9 +1,8 @@ //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMScheduleM3.td b/lib/Target/ARM/ARMScheduleM3.td deleted file mode 100644 index 93f8299f9bd0..000000000000 --- a/lib/Target/ARM/ARMScheduleM3.td +++ /dev/null @@ -1,21 +0,0 @@ -//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for the ARM Cortex-M3 processor. -// -//===----------------------------------------------------------------------===// - -def CortexM3Model : SchedMachineModel { - let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue - let MicroOpBufferSize = 0; // In-order - let LoadLatency = 2; // Latency when not pipelined, not pc-relative - let MispredictPenalty = 2; // Best case branch taken cost - - let CompleteModel = 0; -} diff --git a/lib/Target/ARM/ARMScheduleM4.td b/lib/Target/ARM/ARMScheduleM4.td new file mode 100644 index 000000000000..38c8ea2b4f35 --- /dev/null +++ b/lib/Target/ARM/ARMScheduleM4.td @@ -0,0 +1,119 @@ +//==- ARMScheduleM4.td - Cortex-M4 Scheduling Definitions -*- tablegen -*-====// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the SchedRead/Write data for the ARM Cortex-M4 processor. +// +//===----------------------------------------------------------------------===// + +def CortexM4Model : SchedMachineModel { + let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue + let MicroOpBufferSize = 0; // In-order + let LoadLatency = 2; // Latency when not pipelined, not pc-relative + let MispredictPenalty = 2; // Best case branch taken cost + let PostRAScheduler = 1; + + let CompleteModel = 0; +} + + +// We model the entire cpu as a single pipeline with a BufferSize = 0 since +// Cortex-M4 is in-order. + +def M4Unit : ProcResource<1> { let BufferSize = 0; } + + +let SchedModel = CortexM4Model in { + +// Some definitions of latencies we apply to different instructions + +class M4UnitL1 : WriteRes { let Latency = 1; } +class M4UnitL2 : WriteRes { let Latency = 2; } +class M4UnitL3 : WriteRes { let Latency = 3; } +class M4UnitL14 : WriteRes { let Latency = 14; } +def M4UnitL1_wr : SchedWriteRes<[M4Unit]> { let Latency = 1; } +def M4UnitL2_wr : SchedWriteRes<[M4Unit]> { let Latency = 2; } +class M4UnitL1I : InstRW<[M4UnitL1_wr], instr>; +class M4UnitL2I : InstRW<[M4UnitL2_wr], instr>; + + +// Loads, MAC's and DIV all get a higher latency of 2 +def : M4UnitL2; +def : M4UnitL2; +def : M4UnitL2; +def : M4UnitL2; +def : M4UnitL2; +def : M4UnitL2; + +def : M4UnitL2I<(instregex "(t|t2)LDM")>; + + +// Stores we use a latency of 1 as they have no outputs + +def : M4UnitL1; +def : M4UnitL1I<(instregex "(t|t2)STM")>; + + +// Everything else has a Latency of 1 + +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1I<(instregex "(t|t2)MOV")>; +def : M4UnitL1I<(instrs COPY)>; +def : M4UnitL1I<(instregex "t2IT")>; +def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", + "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>; + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +// Most FP instructions are single-cycle latency, except MAC's, Div's and Sqrt's. +// Loads still take 2 cycles. 
+ +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL2I<(instregex "VLD")>; +def : M4UnitL1I<(instregex "VST")>; +def : M4UnitL3; +def : M4UnitL3; +def : M4UnitL14; +def : M4UnitL14; +def : M4UnitL14; +def : M4UnitL14; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; + +def : ReadAdvance; +def : ReadAdvance; + +} diff --git a/lib/Target/ARM/ARMScheduleR52.td b/lib/Target/ARM/ARMScheduleR52.td index 11bce45161b3..d1cbf754b5a1 100644 --- a/lib/Target/ARM/ARMScheduleR52.td +++ b/lib/Target/ARM/ARMScheduleR52.td @@ -1,9 +1,8 @@ //==- ARMScheduleR52.td - Cortex-R52 Scheduling Definitions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td index 87984648139b..00a44599b1b2 100644 --- a/lib/Target/ARM/ARMScheduleSwift.td +++ b/lib/Target/ARM/ARMScheduleSwift.td @@ -1,9 +1,8 @@ //=- ARMScheduleSwift.td - Swift Scheduling Definitions -*- tablegen -*----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td index 57d0bfb65049..9b86097329c0 100644 --- a/lib/Target/ARM/ARMScheduleV6.td +++ b/lib/Target/ARM/ARMScheduleV6.td @@ -1,9 +1,8 @@ //===-- ARMScheduleV6.td - ARM v6 Scheduling Definitions ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 4d685158e258..cade06e8c109 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -1,9 +1,8 @@ //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -171,7 +170,7 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( // Code size optimisation: do not inline memcpy if expansion results in // more instructions than the libary call. - if (NumMEMCPYs > 1 && DAG.getMachineFunction().getFunction().optForMinSize()) { + if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) { return SDValue(); } diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h index 2ddb42c95397..b8a86ae7310f 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -1,9 +1,8 @@ //===-- ARMSelectionDAGInfo.h - ARM SelectionDAG Info -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index b1d0761e3231..978faed776b0 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -1,9 +1,8 @@ //===-- ARMSubtarget.cpp - ARM Subtarget Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -93,10 +92,12 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU, ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, - const ARMBaseTargetMachine &TM, bool IsLittle) + const ARMBaseTargetMachine &TM, bool IsLittle, + bool MinSize) : ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps), - CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), - TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)), + CPUString(CPU), OptMinSize(MinSize), IsLittle(IsLittle), + TargetTriple(TT), Options(TM.Options), TM(TM), + FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. InstrInfo(isThumb1Only() @@ -283,6 +284,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { case CortexA72: case CortexA73: case CortexA75: + case CortexA76: case CortexR4: case CortexR4F: case CortexR5: @@ -359,6 +361,13 @@ unsigned ARMSubtarget::getMispredictionPenalty() const { } bool ARMSubtarget::enableMachineScheduler() const { + // The MachineScheduler can increase register usage, so we use more high + // registers and end up with more T2 instructions that cannot be converted to + // T1 instructions. At least until we do better at converting to thumb1 + // instructions, on cortex-m at Oz where we are size-paranoid, don't use the + // Machine scheduler, relying on the DAG register pressure scheduler instead. 
+ if (isMClass() && hasMinSize()) + return false; // Enable the MachineScheduler before register allocation for subtargets // with the use-misched feature. return useMachineScheduler(); @@ -374,20 +383,20 @@ bool ARMSubtarget::enablePostRAScheduler() const { bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier(); } -bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const { +bool ARMSubtarget::useStride4VFPs() const { // For general targets, the prologue can grow when VFPs are allocated with // stride 4 (more vpush instructions). But WatchOS uses a compact unwind // format which it's more important to get right. return isTargetWatchABI() || - (useWideStrideVFP() && !MF.getFunction().optForMinSize()); + (useWideStrideVFP() && !OptMinSize); } -bool ARMSubtarget::useMovt(const MachineFunction &MF) const { +bool ARMSubtarget::useMovt() const { // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit // immediates as it is inherently position independent, and may be out of // range otherwise. return !NoMovt && hasV8MBaselineOps() && - (isTargetWindows() || !MF.getFunction().optForMinSize() || genExecuteOnly()); + (isTargetWindows() || !OptMinSize || genExecuteOnly()); } bool ARMSubtarget::useFastISel() const { @@ -404,3 +413,45 @@ bool ARMSubtarget::useFastISel() const { ((isTargetMachO() && !isThumb1Only()) || (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb())); } + +unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const { + // The GPR register class has multiple possible allocation orders, with + // tradeoffs preferred by different sub-architectures and optimisation goals. + // The allocation orders are: + // 0: (the default tablegen order, not used) + // 1: r14, r0-r13 + // 2: r0-r7 + // 3: r0-r7, r12, lr, r8-r11 + // Note that the register allocator will change this order so that + // callee-saved registers are used later, as they require extra work in the + // prologue/epilogue (though we sometimes override that). + + // For thumb1-only targets, only the low registers are allocatable. + if (isThumb1Only()) + return 2; + + // Allocate low registers first, so we can select more 16-bit instructions. + // We also (in ignoreCSRForAllocationOrder) override the default behaviour + // with regards to callee-saved registers, because pushing extra registers is + // much cheaper (in terms of code size) than using high registers. After + // that, we allocate r12 (doesn't need to be saved), lr (saving it means we + // can return with the pop, don't need an extra "bx lr") and then the rest of + // the high registers. + if (isThumb2() && MF.getFunction().hasMinSize()) + return 3; + + // Otherwise, allocate in the default order, using LR first because saving it + // allows a shorter epilogue sequence. + return 1; +} + +bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF, + unsigned PhysReg) const { + // To minimize code size in Thumb2, we prefer the usage of low regs (lower + // cost per use) so we can use narrow encoding. By default, caller-saved + // registers (e.g. lr, r12) are always allocated first, regardless of + // their cost per use. When optForMinSize, we prefer the low regs even if + // they are CSR because usually push/pop can be folded into existing ones. 
+ return isThumb2() && MF.getFunction().hasMinSize() && + ARM::GPRRegClass.contains(PhysReg); +} diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 11841b4467a2..c2b0f052b843 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -1,9 +1,8 @@ //===-- ARMSubtarget.h - Define Subtarget for the ARM ----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -60,6 +59,7 @@ protected: CortexA72, CortexA73, CortexA75, + CortexA76, CortexA8, CortexA9, CortexM3, @@ -110,7 +110,8 @@ protected: ARMv8a, ARMv8mBaseline, ARMv8mMainline, - ARMv8r + ARMv8r, + ARMv81mMainline, }; public: @@ -157,6 +158,9 @@ protected: bool HasV8_5aOps = false; bool HasV8MBaselineOps = false; bool HasV8MMainlineOps = false; + bool HasV8_1MMainlineOps = false; + bool HasMVEIntegerOps = false; + bool HasMVEFloatOps = false; /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what /// floating point ISAs are supported. @@ -165,6 +169,24 @@ protected: bool HasVFPv4 = false; bool HasFPARMv8 = false; bool HasNEON = false; + bool HasFPRegs = false; + bool HasFPRegs16 = false; + bool HasFPRegs64 = false; + + /// Versions of the VFP flags restricted to single precision, or to + /// 16 d-registers, or both. + bool HasVFPv2SP = false; + bool HasVFPv3SP = false; + bool HasVFPv4SP = false; + bool HasFPARMv8SP = false; + bool HasVFPv2D16 = false; + bool HasVFPv3D16 = false; + bool HasVFPv4D16 = false; + bool HasFPARMv8D16 = false; + bool HasVFPv2D16SP = false; + bool HasVFPv3D16SP = false; + bool HasVFPv4D16SP = false; + bool HasFPARMv8D16SP = false; /// HasDotProd - True if the ARMv8.2A dot product instructions are supported. bool HasDotProd = false; @@ -232,9 +254,9 @@ protected: /// HasFP16FML - True if subtarget supports half-precision FP fml operations bool HasFP16FML = false; - /// HasD16 - True if subtarget is limited to 16 double precision + /// HasD32 - True if subtarget has the full 32 double precision /// FP registers for VFPv3. - bool HasD16 = false; + bool HasD32 = false; /// HasHardwareDivide - True if subtarget supports [su]div in Thumb mode bool HasHardwareDivideInThumb = false; @@ -291,9 +313,9 @@ protected: /// extension. bool HasVirtualization = false; - /// FPOnlySP - If true, the floating point unit only supports single + /// HasFP64 - If true, the floating point unit supports double /// precision. - bool FPOnlySP = false; + bool HasFP64 = false; /// If true, the processor supports the Performance Monitor Extensions. These /// include a generic cycle-counter as well as more fine-grained (often @@ -321,6 +343,9 @@ protected: /// HasRAS - if true, the processor supports RAS extensions bool HasRAS = false; + /// HasLOB - if true, the processor supports the Low Overhead Branch extension + bool HasLOB = false; + /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are /// particularly effective at zeroing a VFP register. bool HasZeroCycleZeroing = false; @@ -446,6 +471,10 @@ protected: /// What alignment is preferred for loop bodies, in log2(bytes). 
unsigned PrefLoopAlignment = 0; + /// OptMinSize - True if we're optimising for minimum code size, equal to + /// the function attribute. + bool OptMinSize = false; + /// IsLittle - The target is Little Endian bool IsLittle; @@ -468,7 +497,8 @@ public: /// of the specified triple. /// ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, - const ARMBaseTargetMachine &TM, bool IsLittle); + const ARMBaseTargetMachine &TM, bool IsLittle, + bool MinSize = false); /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -546,6 +576,12 @@ public: bool hasV8_5aOps() const { return HasV8_5aOps; } bool hasV8MBaselineOps() const { return HasV8MBaselineOps; } bool hasV8MMainlineOps() const { return HasV8MMainlineOps; } + bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; } + bool hasMVEIntegerOps() const { return HasMVEIntegerOps; } + bool hasMVEFloatOps() const { return HasMVEFloatOps; } + bool hasFPRegs() const { return HasFPRegs; } + bool hasFPRegs16() const { return HasFPRegs16; } + bool hasFPRegs64() const { return HasFPRegs64; } /// @{ /// These functions are obsolete, please consider adding subtarget features @@ -564,10 +600,10 @@ public: bool hasARMOps() const { return !NoARM; } - bool hasVFP2() const { return HasVFPv2; } - bool hasVFP3() const { return HasVFPv3; } - bool hasVFP4() const { return HasVFPv4; } - bool hasFPARMv8() const { return HasFPARMv8; } + bool hasVFP2Base() const { return HasVFPv2D16SP; } + bool hasVFP3Base() const { return HasVFPv3D16SP; } + bool hasVFP4Base() const { return HasVFPv4D16SP; } + bool hasFPARMv8Base() const { return HasFPARMv8D16SP; } bool hasNEON() const { return HasNEON; } bool hasSHA2() const { return HasSHA2; } bool hasAES() const { return HasAES; } @@ -575,6 +611,7 @@ public: bool hasDotProd() const { return HasDotProd; } bool hasCRC() const { return HasCRC; } bool hasRAS() const { return HasRAS; } + bool hasLOB() const { return HasLOB; } bool hasVirtualization() const { return HasVirtualization; } bool useNEONForSinglePrecisionFP() const { @@ -596,7 +633,7 @@ public: bool useFPVMLx() const { return !SlowFPVMLx; } bool hasVMLxForwarding() const { return HasVMLxForwarding; } bool isFPBrccSlow() const { return SlowFPBrcc; } - bool isFPOnlySP() const { return FPOnlySP; } + bool hasFP64() const { return HasFP64; } bool hasPerfMon() const { return HasPerfMon; } bool hasTrustZone() const { return HasTrustZone; } bool has8MSecExt() const { return Has8MSecExt; } @@ -633,7 +670,7 @@ public: bool genExecuteOnly() const { return GenExecuteOnly; } bool hasFP16() const { return HasFP16; } - bool hasD16() const { return HasD16; } + bool hasD32() const { return HasD32; } bool hasFullFP16() const { return HasFullFP16; } bool hasFP16FML() const { return HasFP16FML; } @@ -710,6 +747,7 @@ public: bool disablePostRAScheduler() const { return DisablePostRAScheduler; } bool useSoftFloat() const { return UseSoftFloat; } bool isThumb() const { return InThumbMode; } + bool hasMinSize() const { return OptMinSize; } bool isThumb1Only() const { return InThumbMode && !HasThumb2; } bool isThumb2() const { return InThumbMode && HasThumb2; } bool hasThumb2() const { return HasThumb2; } @@ -736,9 +774,9 @@ public: isThumb1Only(); } - bool useStride4VFPs(const MachineFunction &MF) const; + bool useStride4VFPs() const; - bool useMovt(const MachineFunction &MF) const; + bool useMovt() const; bool supportsTailCall() const { return SupportsTailCall; } @@ -818,6 +856,10 @@ public: 
unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + + bool ignoreCSRForAllocationOrder(const MachineFunction &MF, + unsigned PhysReg) const override; + unsigned getGPRAllocationOrder(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMSystemRegister.td b/lib/Target/ARM/ARMSystemRegister.td index ad1d37168e08..f21c7f0246f9 100644 --- a/lib/Target/ARM/ARMSystemRegister.td +++ b/lib/Target/ARM/ARMSystemRegister.td @@ -1,9 +1,8 @@ //===-- ARMSystemRegister.td - ARM Register defs -------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index ec02c840d5e1..7f0aae1739b3 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "ARMTargetObjectFile.h" #include "ARMTargetTransformInfo.h" #include "MCTargetDesc/ARMMCTargetDesc.h" +#include "TargetInfo/ARMTargetInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -95,6 +95,8 @@ extern "C" void LLVMInitializeARMTarget() { initializeARMExecutionDomainFixPass(Registry); initializeARMExpandPseudoPass(Registry); initializeThumb2SizeReducePass(Registry); + initializeMVEVPTBlockPass(Registry); + initializeARMLowOverheadLoopsPass(Registry); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -142,6 +144,10 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, // Pointers are 32 bits and aligned to 32 bits. Ret += "-p:32:32"; + // Function pointers are aligned to 8 bits (because the LSB stores the + // ARM/Thumb state). + Ret += "-Fi8"; + // ABIs other than APCS have 64 bit integers with natural alignment. if (ABI != ARMBaseTargetMachine::ARM_ABI_APCS) Ret += "-i64:64"; @@ -264,13 +270,20 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { if (SoftFloat) FS += FS.empty() ? "+soft-float" : ",+soft-float"; - auto &I = SubtargetMap[CPU + FS]; + // Use the optminsize to identify the subtarget, but don't use it in the + // feature string. + std::string Key = CPU + FS; + if (F.hasMinSize()) + Key += "+minsize"; + + auto &I = SubtargetMap[Key]; if (!I) { // This needs to be done before we create a new subtarget since any // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique(TargetTriple, CPU, FS, *this, isLittle); + I = llvm::make_unique(TargetTriple, CPU, FS, *this, isLittle, + F.hasMinSize()); if (!I->isThumb() && !I->hasARMOps()) F.getContext().emitError("Function '" + F.getName() + "' uses ARM " @@ -351,6 +364,8 @@ public: void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + + std::unique_ptr getCSEConfig() const override; }; class ARMExecutionDomainFix : public ExecutionDomainFix { @@ -375,6 +390,10 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { return new ARMPassConfig(*this, PM); } +std::unique_ptr ARMPassConfig::getCSEConfig() const { + return getStandardCSEConfigForOpt(TM->getOptLevel()); +} + void ARMPassConfig::addIRPasses() { if (TM->Options.ThreadModel == ThreadModel::Single) addPass(createLowerAtomicPass()); @@ -393,6 +412,10 @@ void ARMPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); + // Run the parallel DSP pass. + if (getOptLevel() == CodeGenOpt::Aggressive) + addPass(createARMParallelDSPPass()); + // Match interleaved memory accesses to ldN/stN intrinsics. if (TM->getOptLevel() != CodeGenOpt::None) addPass(createInterleavedAccessPass()); @@ -405,9 +428,6 @@ void ARMPassConfig::addCodeGenPrepare() { } bool ARMPassConfig::addPreISel() { - if (getOptLevel() != CodeGenOpt::None) - addPass(createARMParallelDSPPass()); - if ((TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge == cl::BOU_UNSET) || EnableGlobalMerge == cl::BOU_TRUE) { @@ -427,6 +447,9 @@ bool ARMPassConfig::addPreISel() { MergeExternalByDefault)); } + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createHardwareLoopsPass()); + return false; } @@ -490,6 +513,7 @@ void ARMPassConfig::addPreSched2() { return !MF.getSubtarget().isThumb1Only(); })); } + addPass(createMVEVPTBlockPass()); addPass(createThumb2ITBlockPass()); } @@ -506,4 +530,5 @@ void ARMPassConfig::addPreEmitPass() { addPass(createARMOptimizeBarriersPass()); addPass(createARMConstantIslandPass()); + addPass(createARMLowOverheadLoopsPass()); } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 2c791998e702..cb8650d8139b 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -1,9 +1,8 @@ //===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index 9c13359cba71..891329d3f297 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- llvm/Target/ARMTargetObjectFile.cpp - ARM Object Info Impl --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index 0dc0882809c0..7b15dcc61f56 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -1,9 +1,8 @@
 //===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index f72bb8632eb7..2a8ec734a05f 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -22,6 +21,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Support/Casting.h"
@@ -36,6 +36,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "armtti"
 
+static cl::opt<bool> DisableLowOverheadLoops(
+    "disable-arm-loloops", cl::Hidden, cl::init(true),
+    cl::desc("Disable the generation of low-overhead loops"));
+
 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -107,9 +111,13 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
       Idx == 1)
     return 0;
 
-  if (Opcode == Instruction::And)
-    // Conversion to BIC is free, and means we can use ~Imm instead.
-    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+  if (Opcode == Instruction::And) {
+    // UXTB/UXTH
+    if (Imm == 255 || Imm == 65535)
+      return 0;
+    // Conversion to BIC is free, and means we can use ~Imm instead.
+    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+  }
 
   if (Opcode == Instruction::Add)
     // Conversion to SUB is free, and means we can use -Imm instead.
@@ -398,6 +406,40 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
   return 1;
 }
 
+int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
+  assert(MI && "MemcpyInst expected");
+  ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
+
+  // To model the cost of a library call, we assume 1 for the call, and
+  // 3 for the argument setup.
+  const unsigned LibCallCost = 4;
+
+  // If 'size' is not a constant, a library call will be generated.
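  // Worked example (illustrative; the exact memop types are chosen by
  // findOptimalMemOpLowering): if a 16-byte, 4-byte-aligned copy is lowered
  // as four i32 memops, MemOps.size() == 4 and the modelled cost is
  // 4 * 2 = 8 (four loads plus four stores), compared with LibCallCost == 4
  // when the length is unknown and a call to memcpy must be emitted.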
+  if (!C)
+    return LibCallCost;
+
+  const unsigned Size = C->getValue().getZExtValue();
+  const unsigned DstAlign = MI->getDestAlignment();
+  const unsigned SrcAlign = MI->getSourceAlignment();
+  const Function *F = I->getParent()->getParent();
+  const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
+  std::vector<EVT> MemOps;
+
+  // MemOps will be populated with a list of data types that need to be
+  // loaded and stored. That's why we multiply the number of elements by 2 to
+  // get the cost for this memcpy.
+  if (getTLI()->findOptimalMemOpLowering(
+          MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/,
+          false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/,
+          MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
+          F->getAttributes()))
+    return MemOps.size() * 2;
+
+  // If we can't find an optimal memop lowering, return the default cost.
+  return LibCallCost;
+}
+
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
   if (Kind == TTI::SK_Broadcast) {
@@ -590,6 +632,222 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            UseMaskForCond, UseMaskForGaps);
 }
 
+bool ARMTTIImpl::isLoweredToCall(const Function *F) {
+  if (!F->isIntrinsic())
+    return BaseT::isLoweredToCall(F);
+
+  // Assume all Arm-specific intrinsics map to an instruction.
+  if (F->getName().startswith("llvm.arm"))
+    return false;
+
+  switch (F->getIntrinsicID()) {
+  default: break;
+  case Intrinsic::powi:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::pow:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2:
+  case Intrinsic::exp:
+  case Intrinsic::exp2:
+    return true;
+  case Intrinsic::sqrt:
+  case Intrinsic::fabs:
+  case Intrinsic::copysign:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint:
+  case Intrinsic::round:
+  case Intrinsic::canonicalize:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
+  case Intrinsic::lrint:
+  case Intrinsic::llrint:
+    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+    // Some operations can be handled by vector instructions and assume
+    // unsupported vectors will be expanded into supported scalar ones.
+    // TODO Handle scalar operations properly.
+    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
+  case Intrinsic::masked_store:
+  case Intrinsic::masked_load:
+  case Intrinsic::masked_gather:
+  case Intrinsic::masked_scatter:
+    return !ST->hasMVEIntegerOps();
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::uadd_sat:
+  case Intrinsic::ssub_sat:
+  case Intrinsic::usub_sat:
+    return false;
+  }
+
+  return BaseT::isLoweredToCall(F);
+}
+
+bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          HardwareLoopInfo &HWLoopInfo) {
+  // Low-overhead branches are only supported in the 'low-overhead branch'
+  // extension of v8.1-m.
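  // Illustration of the trip-count check below: a loop "for (i = 0; i != n;
  // ++i)" has backedge-taken count n-1, so the value that must fit in LR is
  // (n-1)+1 == n; SE.getAddExpr computes exactly that sum, and the
  // unsigned-range test rejects any count wider than the 32-bit LR.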
+  if (!ST->hasLOB() || DisableLowOverheadLoops)
+    return false;
+
+  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
+    return false;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return false;
+
+  const SCEV *TripCountSCEV =
+    SE.getAddExpr(BackedgeTakenCount,
+                  SE.getOne(BackedgeTakenCount->getType()));
+
+  // We need to store the trip count in LR, a 32-bit register.
+  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+    return false;
+
+  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
+  // point in generating a hardware loop if that's going to happen.
+  auto MaybeCall = [this](Instruction &I) {
+    const ARMTargetLowering *TLI = getTLI();
+    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+    EVT VT = TLI->getValueType(DL, I.getType(), true);
+    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+      return true;
+
+    // Check if an intrinsic will be lowered to a call and assume that any
+    // other CallInst will generate a bl.
+    if (auto *Call = dyn_cast<CallInst>(&I)) {
+      if (isa<IntrinsicInst>(Call)) {
+        if (const Function *F = Call->getCalledFunction())
+          return isLoweredToCall(F);
+      }
+      return true;
+    }
+
+    // FPv5 provides conversions between integer, double-precision,
+    // single-precision, and half-precision formats.
+    switch (I.getOpcode()) {
+    default:
+      break;
+    case Instruction::FPToSI:
+    case Instruction::FPToUI:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+      return !ST->hasFPARMv8Base();
+    }
+
+    // FIXME: Unfortunately the approach of checking the Operation Action does
+    // not catch all cases of Legalization that use library calls. Our
+    // Legalization step categorizes some transformations into library calls
+    // as Custom, Expand or even Legal when doing type legalization. So for
+    // now we have to special-case, for instance, the SDIV of 64-bit integers
+    // and the use of floating point emulation.
+    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+      switch (ISD) {
+      default:
+        break;
+      case ISD::SDIV:
+      case ISD::UDIV:
+      case ISD::SREM:
+      case ISD::UREM:
+      case ISD::SDIVREM:
+      case ISD::UDIVREM:
+        return true;
+      }
+    }
+
+    // Assume all other non-float operations are supported.
+    if (!VT.isFloatingPoint())
+      return false;
+
+    // We'll need a library call to handle most floats when using soft float.
+    if (TLI->useSoftFloat()) {
+      switch (I.getOpcode()) {
+      default:
+        return true;
+      case Instruction::Alloca:
+      case Instruction::Load:
+      case Instruction::Store:
+      case Instruction::Select:
+      case Instruction::PHI:
+        return false;
+      }
+    }
+
+    // We'll need a libcall to perform double-precision operations on a
+    // single-precision-only FPU.
+    if (I.getType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+
+    // Likewise for half-precision arithmetic.
+    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+
+    return false;
+  };
+
+  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
+    if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
+      switch (Call->getIntrinsicID()) {
+      default:
+        break;
+      case Intrinsic::set_loop_iterations:
+      case Intrinsic::test_set_loop_iterations:
+      case Intrinsic::loop_decrement:
+      case Intrinsic::loop_decrement_reg:
+        return true;
+      }
+    }
+    return false;
+  };
+
+  // Scan the instructions to see if there are any that we know will turn
+  // into a call or if this loop is already a low-overhead loop.
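  // Illustration of a body that fails the scan: with ST->hasFP64() false, a
  // double-precision fadd makes MaybeCall return true (it would become a
  // libcall and clobber LR), so the loop is left as a normal loop.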
+ auto ScanLoop = [&](Loop *L) { + for (auto *BB : L->getBlocks()) { + for (auto &I : *BB) { + if (MaybeCall(I) || IsHardwareLoopIntrinsic(I)) + return false; + } + } + return true; + }; + + // Visit inner loops. + for (auto Inner : *L) + if (!ScanLoop(Inner)) + return false; + + if (!ScanLoop(L)) + return false; + + // TODO: Check whether the trip count calculation is expensive. If L is the + // inner loop but we know it has a low trip count, calculating that trip + // count (in the parent loop) may be detrimental. + + LLVMContext &C = L->getHeader()->getContext(); + HWLoopInfo.CounterInReg = true; + HWLoopInfo.IsNestingLegal = false; + HWLoopInfo.PerformEntryTest = true; + HWLoopInfo.CountType = Type::getInt32Ty(C); + HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1); + return true; +} + void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { // Only currently enable these preferences for M-Class cores. @@ -599,7 +857,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, // Disable loop unrolling for Oz and Os. UP.OptSizeThreshold = 0; UP.PartialOptSizeThreshold = 0; - if (L->getHeader()->getParent()->optForSize()) + if (L->getHeader()->getParent()->hasOptSize()) return; // Only enable on Thumb-2 targets. @@ -645,6 +903,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, UP.Partial = true; UP.Runtime = true; + UP.UpperBound = true; UP.UnrollRemainder = true; UP.DefaultUnrollRuntimeCount = 4; UP.UnrollAndJam = true; diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 2dd143d48a15..52f6ea4a6e2f 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -1,9 +1,8 @@ //===- ARMTargetTransformInfo.h - ARM specific TTI --------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -49,7 +48,7 @@ class ARMTTIImpl : public BasicTTIImplBase { const ARMTargetLowering *TLI; // Currently the following features are excluded from InlineFeatureWhitelist. - // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureVFPOnlySP, FeatureD16 + // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32 // Depending on whether they are set or unset, different // instructions/registers are available. For example, inlining a callee with // -thumb-mode in a caller with +thumb-mode, may cause the assembler to @@ -94,6 +93,12 @@ public: bool enableInterleavedAccessVectorization() { return true; } + bool shouldFavorBackedgeIndex(const Loop *L) const { + if (L->getHeader()->getParent()->hasOptSize()) + return false; + return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1; + } + /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD /// is IEEE-754 compliant, but it's not covered in this target. 
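These TTI hooks are driven by the generic HardwareLoops IR pass, which ARMPassConfig::addPreISel schedules through createHardwareLoopsPass() in the ARMTargetMachine.cpp hunk above. A minimal sketch of that handshake, assuming only the LLVM 9 TargetTransformInfo interface visible in this patch; the wrapper name shouldUseHardwareLoop is invented for illustration:

    #include "llvm/Analysis/TargetTransformInfo.h"
    using namespace llvm;

    // Sketch only; the real driver lives in lib/CodeGen/HardwareLoops.cpp.
    static bool shouldUseHardwareLoop(Loop *L, ScalarEvolution &SE,
                                      AssumptionCache &AC,
                                      TargetLibraryInfo *LibInfo,
                                      const TargetTransformInfo &TTI) {
      HardwareLoopInfo HWLoopInfo(L);
      // For ARM this lands in ARMTTIImpl::isHardwareLoopProfitable, which
      // fills in CountType, LoopDecrement, CounterInReg, IsNestingLegal and
      // PerformEntryTest; on success the pass emits the
      // llvm.set.loop.iterations / llvm.loop.decrement intrinsics that
      // IsHardwareLoopIntrinsic above recognises.
      return TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
    }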
@@ -143,6 +148,8 @@ public:
     return ST->getMaxInterleaveFactor();
   }
 
+  int getMemcpyCost(const Instruction *I);
+
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -173,6 +180,12 @@ public:
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
 
+  bool isLoweredToCall(const Function *F);
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                HardwareLoopInfo &HWLoopInfo);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 3832b0112b87..1da9452f1d22 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -1,19 +1,20 @@
 //===- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions -------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
 #include "ARMFeatures.h"
-#include "InstPrinter/ARMInstPrinter.h"
+#include "ARMBaseInstrInfo.h"
 #include "Utils/ARMBaseInfo.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMInstPrinter.h"
 #include "MCTargetDesc/ARMMCExpr.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "TargetInfo/ARMTargetInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/None.h"
@@ -69,6 +70,10 @@
 
 using namespace llvm;
 
+namespace llvm {
+extern const MCInstrDesc ARMInsts[];
+} // end namespace llvm
+
 namespace {
 
 enum class ImplicitItModeTy { Always, Never, ARMOnly, ThumbOnly };
@@ -90,6 +95,16 @@ static cl::opt<bool> AddBuildAttributes("arm-add-build-attributes",
 
 enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
 
+static inline unsigned extractITMaskBit(unsigned Mask, unsigned Position) {
+  // Position==0 means we're not in an IT block at all. Position==1
+  // means we want the first state bit, which is always 0 (Then).
+  // Position==2 means we want the second state bit, stored at bit 3
+  // of Mask, and so on downwards. So (5 - Position) will shift the
+  // right bit down to bit 0, including the always-0 bit at bit 4 for
+  // the mandatory initial Then.
+  return (Mask >> (5 - Position) & 1);
+}
+
 class UnwindContext {
   using Locs = SmallVector<SMLoc, 4>;
 
@@ -165,6 +180,7 @@ public:
   }
 };
 
+
 class ARMAsmParser : public MCTargetAsmParser {
   const MCRegisterInfo *MRI;
   UnwindContext UC;
@@ -225,11 +241,10 @@ class ARMAsmParser : public MCTargetAsmParser {
     }
 
     // Emit the IT instruction
-    unsigned Mask = getITMaskEncoding();
     MCInst ITInst;
     ITInst.setOpcode(ARM::t2IT);
     ITInst.addOperand(MCOperand::createImm(ITState.Cond));
-    ITInst.addOperand(MCOperand::createImm(Mask));
+    ITInst.addOperand(MCOperand::createImm(ITState.Mask));
     Out.EmitInstruction(ITInst, getSTI());
 
     // Emit the conditional instructions
@@ -287,27 +302,10 @@ class ARMAsmParser : public MCTargetAsmParser {
     return MRI->getSubReg(QReg, ARM::dsub_0);
   }
 
-  // Get the encoding of the IT mask, as it will appear in an IT instruction.
- unsigned getITMaskEncoding() { - assert(inITBlock()); - unsigned Mask = ITState.Mask; - unsigned TZ = countTrailingZeros(Mask); - if ((ITState.Cond & 1) == 0) { - assert(Mask && TZ <= 3 && "illegal IT mask value!"); - Mask ^= (0xE << TZ) & 0xF; - } - return Mask; - } - // Get the condition code corresponding to the current IT block slot. ARMCC::CondCodes currentITCond() { - unsigned MaskBit; - if (ITState.CurPosition == 1) - MaskBit = 1; - else - MaskBit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1; - - return MaskBit ? ITState.Cond : ARMCC::getOppositeCondition(ITState.Cond); + unsigned MaskBit = extractITMaskBit(ITState.Mask, ITState.CurPosition); + return MaskBit ? ARMCC::getOppositeCondition(ITState.Cond) : ITState.Cond; } // Invert the condition of the current IT block slot without changing any @@ -337,7 +335,7 @@ class ARMAsmParser : public MCTargetAsmParser { // Keep any existing condition bits. NewMask |= ITState.Mask & (0xE << TZ); // Insert the new condition bit. - NewMask |= (Cond == ITState.Cond) << TZ; + NewMask |= (Cond != ITState.Cond) << TZ; // Move the trailing 1 down one bit. NewMask |= 1 << (TZ - 1); ITState.Mask = NewMask; @@ -352,9 +350,10 @@ class ARMAsmParser : public MCTargetAsmParser { ITState.IsExplicit = false; } - // Create a new explicit IT block with the given condition and mask. The mask - // should be in the parsed format, with a 1 implying 't', regardless of the - // low bit of the condition. + // Create a new explicit IT block with the given condition and mask. + // The mask should be in the format used in ARMOperand and + // MCOperand, with a 1 implying 'e', regardless of the low bit of + // the condition. void startExplicitITBlock(ARMCC::CondCodes Cond, unsigned Mask) { assert(!inITBlock()); ITState.Cond = Cond; @@ -363,6 +362,18 @@ class ARMAsmParser : public MCTargetAsmParser { ITState.IsExplicit = true; } + struct { + unsigned Mask : 4; + unsigned CurPosition; + } VPTState; + bool inVPTBlock() { return VPTState.CurPosition != ~0U; } + void forwardVPTPosition() { + if (!inVPTBlock()) return; + unsigned TZ = countTrailingZeros(VPTState.Mask); + if (++VPTState.CurPosition == 5 - TZ) + VPTState.CurPosition = ~0U; + } + void Note(SMLoc L, const Twine &Msg, SMRange Range = None) { return getParser().Note(L, Msg, Range); } @@ -383,7 +394,7 @@ class ARMAsmParser : public MCTargetAsmParser { int tryParseRegister(); bool tryParseRegisterWithWriteBack(OperandVector &); int tryParseShiftRegister(OperandVector &); - bool parseRegisterList(OperandVector &); + bool parseRegisterList(OperandVector &, bool EnforceOrder = true); bool parseMemory(OperandVector &); bool parseOperand(OperandVector &, StringRef Mnemonic); bool parsePrefix(ARMMCExpr::VariantKind &RefKind); @@ -421,12 +432,15 @@ class ARMAsmParser : public MCTargetAsmParser { bool parseDirectiveAlign(SMLoc L); bool parseDirectiveThumbSet(SMLoc L); - StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode, - bool &CarrySetting, unsigned &ProcessorIMod, - StringRef &ITMask); - void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst, - bool &CanAcceptCarrySet, - bool &CanAcceptPredicationCode); + bool isMnemonicVPTPredicable(StringRef Mnemonic, StringRef ExtraToken); + StringRef splitMnemonic(StringRef Mnemonic, StringRef ExtraToken, + unsigned &PredicationCode, + unsigned &VPTPredicationCode, bool &CarrySetting, + unsigned &ProcessorIMod, StringRef &ITMask); + void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef ExtraToken, + StringRef FullInst, bool &CanAcceptCarrySet, + 
bool &CanAcceptPredicationCode, + bool &CanAcceptVPTPredicationCode); void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting, OperandVector &Operands); @@ -478,7 +492,15 @@ class ARMAsmParser : public MCTargetAsmParser { bool hasV8MMainline() const { return getSTI().getFeatureBits()[ARM::HasV8MMainlineOps]; } - + bool hasV8_1MMainline() const { + return getSTI().getFeatureBits()[ARM::HasV8_1MMainlineOps]; + } + bool hasMVE() const { + return getSTI().getFeatureBits()[ARM::HasMVEIntegerOps]; + } + bool hasMVEFloat() const { + return getSTI().getFeatureBits()[ARM::HasMVEFloatOps]; + } bool has8MSecExt() const { return getSTI().getFeatureBits()[ARM::Feature8MSecExt]; } @@ -491,8 +513,8 @@ class ARMAsmParser : public MCTargetAsmParser { return getSTI().getFeatureBits()[ARM::FeatureDSP]; } - bool hasD16() const { - return getSTI().getFeatureBits()[ARM::FeatureD16]; + bool hasD32() const { + return getSTI().getFeatureBits()[ARM::FeatureD32]; } bool hasV8_1aOps() const { @@ -505,7 +527,7 @@ class ARMAsmParser : public MCTargetAsmParser { void SwitchMode() { MCSubtargetInfo &STI = copySTI(); - uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb)); + auto FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb)); setAvailableFeatures(FB); } @@ -556,11 +578,13 @@ class ARMAsmParser : public MCTargetAsmParser { // Asm Match Converter Methods void cvtThumbMultiply(MCInst &Inst, const OperandVector &); void cvtThumbBranches(MCInst &Inst, const OperandVector &); + void cvtMVEVMOVQtoDReg(MCInst &Inst, const OperandVector &); bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops, MCStreamer &Out); bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands); bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands); + bool shouldOmitVectorPredicateOperand(StringRef Mnemonic, OperandVector &Operands); bool isITBlockTerminator(MCInst &Inst) const; void fixupGNULDRDAlias(StringRef Mnemonic, OperandVector &Operands); bool validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands, @@ -597,6 +621,8 @@ public: // Not in an ITBlock to start with. 
ITState.CurPosition = ~0U; + VPTState.CurPosition = ~0U; + NextSymbolIsThumb = false; } @@ -642,6 +668,7 @@ public: class ARMOperand : public MCParsedAsmOperand { enum KindTy { k_CondCode, + k_VPTPred, k_CCOut, k_ITCondMask, k_CoprocNum, @@ -659,8 +686,11 @@ class ARMOperand : public MCParsedAsmOperand { k_VectorIndex, k_Register, k_RegisterList, + k_RegisterListWithAPSR, k_DPRRegisterList, k_SPRRegisterList, + k_FPSRegisterListWithVPR, + k_FPDRegisterListWithVPR, k_VectorList, k_VectorListAllLanes, k_VectorListIndexed, @@ -681,6 +711,10 @@ class ARMOperand : public MCParsedAsmOperand { ARMCC::CondCodes Val; }; + struct VCCOp { + ARMVCC::VPTCodes Val; + }; + struct CopOp { unsigned Val; }; @@ -797,6 +831,7 @@ class ARMOperand : public MCParsedAsmOperand { union { struct CCOp CC; + struct VCCOp VCC; struct CopOp Cop; struct CoprocOptionOp CoprocOption; struct MBOptOp MBOpt; @@ -845,6 +880,11 @@ public: return CC.Val; } + ARMVCC::VPTCodes getVPTPred() const { + assert(isVPTPred() && "Invalid access!"); + return VCC.Val; + } + unsigned getCoproc() const { assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!"); return Cop.Val; @@ -861,8 +901,11 @@ public: } const SmallVectorImpl &getRegList() const { - assert((Kind == k_RegisterList || Kind == k_DPRRegisterList || - Kind == k_SPRRegisterList) && "Invalid access!"); + assert((Kind == k_RegisterList || Kind == k_RegisterListWithAPSR || + Kind == k_DPRRegisterList || Kind == k_SPRRegisterList || + Kind == k_FPSRegisterListWithVPR || + Kind == k_FPDRegisterListWithVPR) && + "Invalid access!"); return Registers; } @@ -915,6 +958,7 @@ public: bool isCoprocReg() const { return Kind == k_CoprocReg; } bool isCoprocOption() const { return Kind == k_CoprocOption; } bool isCondCode() const { return Kind == k_CondCode; } + bool isVPTPred() const { return Kind == k_VPTPred; } bool isCCOut() const { return Kind == k_CCOut; } bool isITMask() const { return Kind == k_ITCondMask; } bool isITCondCode() const { return Kind == k_CondCode; } @@ -970,6 +1014,18 @@ public: return false; } + // checks whether this operand is an offset suitable for the LE / + // LETP instructions in Arm v8.1M + bool isLEOffset() const { + if (!isImm()) return false; + if (isa(Imm.Val)) return true; + if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { + int64_t Val = CE->getValue(); + return Val < 0 && Val >= -4094 && (Val & 1) == 0; + } + return false; + } + // checks whether this operand is a memory operand computed as an offset // applied to PC. the offset may have 8 bits of magnitude and is represented // with two bits of shift. 
textually it may be either [pc, #imm], #imm or @@ -982,7 +1038,7 @@ public: if (!CE) return false; Val = CE->getValue(); } - else if (isMem()) { + else if (isGPRMem()) { if(!Memory.OffsetImm || Memory.OffsetRegNum) return false; if(Memory.BaseRegNum != ARM::PC) return false; Val = Memory.OffsetImm->getValue(); @@ -1016,7 +1072,14 @@ public: int64_t Value = CE->getValue(); return ((Value & 3) == 0) && Value >= N && Value <= M; } - + template + bool isImmediateS2() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ((Value & 1) == 0) && Value >= N && Value <= M; + } bool isFBits16() const { return isImmediate<0, 17>(); } @@ -1026,6 +1089,21 @@ public: bool isImm8s4() const { return isImmediateS4<-1020, 1020>(); } + bool isImm7s4() const { + return isImmediateS4<-508, 508>(); + } + bool isImm7Shift0() const { + return isImmediate<-127, 127>(); + } + bool isImm7Shift1() const { + return isImmediateS2<-255, 255>(); + } + bool isImm7Shift2() const { + return isImmediateS4<-511, 511>(); + } + bool isImm7() const { + return isImmediate<-127, 127>(); + } bool isImm0_1020s4() const { return isImmediateS4<0, 1020>(); } @@ -1098,6 +1176,34 @@ public: return isImmediate<1, 33>(); } + template + bool isExpImmValue(uint64_t Value) const { + uint64_t mask = (1 << shift) - 1; + if ((Value & mask) != 0 || (Value >> shift) > 0xff) + return false; + return true; + } + + template + bool isExpImm() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + + return isExpImmValue(CE->getValue()); + } + + template + bool isInvertedExpImm() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + + uint64_t OriginalValue = CE->getValue(); + uint64_t InvertedValue = OriginalValue ^ (((uint64_t)1 << size) - 1); + return isExpImmValue(InvertedValue); + } + bool isPKHLSLImm() const { return isImmediate<0, 32>(); } @@ -1167,13 +1273,34 @@ public: bool isReg() const override { return Kind == k_Register; } bool isRegList() const { return Kind == k_RegisterList; } + bool isRegListWithAPSR() const { + return Kind == k_RegisterListWithAPSR || Kind == k_RegisterList; + } bool isDPRRegList() const { return Kind == k_DPRRegisterList; } bool isSPRRegList() const { return Kind == k_SPRRegisterList; } + bool isFPSRegListWithVPR() const { return Kind == k_FPSRegisterListWithVPR; } + bool isFPDRegListWithVPR() const { return Kind == k_FPDRegisterListWithVPR; } bool isToken() const override { return Kind == k_Token; } bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; } bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; } bool isTraceSyncBarrierOpt() const { return Kind == k_TraceSyncBarrierOpt; } bool isMem() const override { + return isGPRMem() || isMVEMem(); + } + bool isMVEMem() const { + if (Kind != k_Memory) + return false; + if (Memory.BaseRegNum && + !ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Memory.BaseRegNum) && + !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(Memory.BaseRegNum)) + return false; + if (Memory.OffsetRegNum && + !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains( + Memory.OffsetRegNum)) + return false; + return true; + } + bool isGPRMem() const { if (Kind != k_Memory) return false; if (Memory.BaseRegNum && @@ -1198,6 +1325,16 @@ public: RegShiftedImm.SrcReg); } bool isRotImm() const { return Kind == k_RotateImmediate; } + + template + bool 
isPowerTwoInRange() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && countPopulation((uint64_t)Value) == 1 && + Value >= Min && Value <= Max; + } bool isModImm() const { return Kind == k_ModifiedImmediate; } bool isModImmNot() const { @@ -1243,14 +1380,50 @@ public: return isPostIdxRegShifted() && PostIdxReg.ShiftTy == ARM_AM::no_shift; } bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const { - if (!isMem()) + if (!isGPRMem()) + return false; + // No offset of any kind. + return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr && + (alignOK || Memory.Alignment == Alignment); + } + bool isMemNoOffsetT2(bool alignOK = false, unsigned Alignment = 0) const { + if (!isGPRMem()) + return false; + + if (!ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains( + Memory.BaseRegNum)) + return false; + + // No offset of any kind. + return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr && + (alignOK || Memory.Alignment == Alignment); + } + bool isMemNoOffsetT2NoSp(bool alignOK = false, unsigned Alignment = 0) const { + if (!isGPRMem()) + return false; + + if (!ARMMCRegisterClasses[ARM::rGPRRegClassID].contains( + Memory.BaseRegNum)) return false; + + // No offset of any kind. + return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr && + (alignOK || Memory.Alignment == Alignment); + } + bool isMemNoOffsetT(bool alignOK = false, unsigned Alignment = 0) const { + if (!isGPRMem()) + return false; + + if (!ARMMCRegisterClasses[ARM::tGPRRegClassID].contains( + Memory.BaseRegNum)) + return false; + // No offset of any kind. return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr && (alignOK || Memory.Alignment == Alignment); } bool isMemPCRelImm12() const { - if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) return false; // Base register must be PC. if (Memory.BaseRegNum != ARM::PC) @@ -1337,7 +1510,7 @@ public: } bool isAddrMode2() const { - if (!isMem() || Memory.Alignment != 0) return false; + if (!isGPRMem() || Memory.Alignment != 0) return false; // Check for register offset. if (Memory.OffsetRegNum) return true; // Immediate offset in range [-4095, 4095]. @@ -1362,7 +1535,7 @@ public: // and we reject it. if (isImm() && !isa(getImm())) return true; - if (!isMem() || Memory.Alignment != 0) return false; + if (!isGPRMem() || Memory.Alignment != 0) return false; // No shifts are legal for AM3. if (Memory.ShiftType != ARM_AM::no_shift) return false; // Check for register offset. @@ -1396,7 +1569,7 @@ public: // and we reject it. if (isImm() && !isa(getImm())) return true; - if (!isMem() || Memory.Alignment != 0) return false; + if (!isGPRMem() || Memory.Alignment != 0) return false; // Check for register offset. if (Memory.OffsetRegNum) return false; // Immediate offset in range [-1020, 1020] and a multiple of 4. @@ -1412,7 +1585,7 @@ public: // and we reject it. if (isImm() && !isa(getImm())) return true; - if (!isMem() || Memory.Alignment != 0) return false; + if (!isGPRMem() || Memory.Alignment != 0) return false; // Check for register offset. if (Memory.OffsetRegNum) return false; // Immediate offset in range [-510, 510] and a multiple of 2. 
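The AddrMode5 predicates above all reduce to the same shape: a signed magnitude that must be a multiple of the access size (4 bytes here, 2 for the FP16 variant) and fit in 8 bits after scaling, plus an add/subtract flag. A self-contained sketch of the word-sized mapping; encodeAM5Offset is an invented name for illustration:

    #include <cstdint>

    // Mirrors the range/alignment tests in isAddrMode5 (word-sized case).
    static bool encodeAM5Offset(int64_t Val, unsigned &Imm8, bool &IsAdd) {
      if (Val < -1020 || Val > 1020 || (Val & 3) != 0)
        return false;                 // out of range or not word-aligned
      IsAdd = Val >= 0;
      Imm8 = static_cast<unsigned>(IsAdd ? Val : -Val) / 4;  // 0..255
      return true;
    }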
@@ -1423,14 +1596,14 @@ public: } bool isMemTBB() const { - if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative || + if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative || Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0) return false; return true; } bool isMemTBH() const { - if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative || + if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative || Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 || Memory.Alignment != 0 ) return false; @@ -1438,13 +1611,13 @@ public: } bool isMemRegOffset() const { - if (!isMem() || !Memory.OffsetRegNum || Memory.Alignment != 0) + if (!isGPRMem() || !Memory.OffsetRegNum || Memory.Alignment != 0) return false; return true; } bool isT2MemRegOffset() const { - if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative || + if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative || Memory.Alignment != 0 || Memory.BaseRegNum == ARM::PC) return false; // Only lsl #{0, 1, 2, 3} allowed. @@ -1458,7 +1631,7 @@ public: bool isMemThumbRR() const { // Thumb reg+reg addressing is simple. Just two registers, a base and // an offset. No shifts, negations or any other complicating factors. - if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative || + if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative || Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0) return false; return isARMLowRegister(Memory.BaseRegNum) && @@ -1466,7 +1639,7 @@ public: } bool isMemThumbRIs4() const { - if (!isMem() || Memory.OffsetRegNum != 0 || + if (!isGPRMem() || Memory.OffsetRegNum != 0 || !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0) return false; // Immediate offset, multiple of 4 in range [0, 124]. @@ -1476,7 +1649,7 @@ public: } bool isMemThumbRIs2() const { - if (!isMem() || Memory.OffsetRegNum != 0 || + if (!isGPRMem() || Memory.OffsetRegNum != 0 || !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0) return false; // Immediate offset, multiple of 4 in range [0, 62]. @@ -1486,7 +1659,7 @@ public: } bool isMemThumbRIs1() const { - if (!isMem() || Memory.OffsetRegNum != 0 || + if (!isGPRMem() || Memory.OffsetRegNum != 0 || !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0) return false; // Immediate offset in range [0, 31]. @@ -1496,7 +1669,7 @@ public: } bool isMemThumbSPI() const { - if (!isMem() || Memory.OffsetRegNum != 0 || + if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0) return false; // Immediate offset, multiple of 4 in range [0, 1020]. @@ -1511,7 +1684,7 @@ public: // and we reject it. if (isImm() && !isa(getImm())) return true; - if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) return false; // Immediate offset a multiple of 4 in range [-1020, 1020]. if (!Memory.OffsetImm) return true; @@ -1520,9 +1693,24 @@ public: return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) || Val == std::numeric_limits::min(); } - + bool isMemImm7s4Offset() const { + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. + if (isImm() && !isa(getImm())) + return true; + if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0 || + !ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains( + Memory.BaseRegNum)) + return false; + // Immediate offset a multiple of 4 in range [-508, 508]. 
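    // Worked example (illustration): -508 is the most negative encodable
    // offset; the instruction encoding stores it as the scaled 7-bit value
    // 508/4 == 127 with the subtract bit set, while #-0 uses the INT32_MIN
    // sentinel handled just below.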
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    // Special case, #-0 is INT32_MIN.
+    return (Val >= -508 && Val <= 508 && (Val & 3) == 0) || Val == INT32_MIN;
+  }
   bool isMemImm0_1020s4Offset() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
       return false;
     // Immediate offset a multiple of 4 in range [0, 1020].
     if (!Memory.OffsetImm) return true;
@@ -1531,7 +1719,7 @@
   }
 
   bool isMemImm8Offset() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
       return false;
     // Base reg of PC isn't allowed for these encodings.
     if (Memory.BaseRegNum == ARM::PC) return false;
@@ -1542,8 +1730,81 @@
            (Val > -256 && Val < 256);
   }
 
+  template <unsigned Bits, unsigned RegClassID>
+  bool isMemImm7ShiftedOffset() const {
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
+        !ARMMCRegisterClasses[RegClassID].contains(Memory.BaseRegNum))
+      return false;
+
+    // Expect an immediate offset equal to an element of the range
+    // [-127, 127], shifted left by Bits.
+
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+
+    // INT32_MIN is a special-case value (indicating the encoding with
+    // zero offset and the subtract bit set).
+    if (Val == INT32_MIN)
+      return true;
+
+    unsigned Divisor = 1U << Bits;
+
+    // Check that the low bits are zero.
+    if (Val % Divisor != 0)
+      return false;
+
+    // Check that the remaining offset is within range.
+    Val /= Divisor;
+    return (Val >= -127 && Val <= 127);
+  }
+
+  template <int shift> bool isMemRegRQOffset() const {
+    if (!isMVEMem() || Memory.OffsetImm != 0 || Memory.Alignment != 0)
+      return false;
+
+    if (!ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains(
+            Memory.BaseRegNum))
+      return false;
+    if (!ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+            Memory.OffsetRegNum))
+      return false;
+
+    if (shift == 0 && Memory.ShiftType != ARM_AM::no_shift)
+      return false;
+
+    if (shift > 0 &&
+        (Memory.ShiftType != ARM_AM::uxtw || Memory.ShiftImm != shift))
+      return false;
+
+    return true;
+  }
+
+  template <int shift> bool isMemRegQOffset() const {
+    if (!isMVEMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+
+    if (!ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+            Memory.BaseRegNum))
+      return false;
+
+    if (!Memory.OffsetImm) return true;
+    static_assert(shift < 56,
+                  "Such that we don't shift by a value higher than 62");
+    int64_t Val = Memory.OffsetImm->getValue();
+
+    // The value must be a multiple of (1 << shift).
+    if ((Val & ((1U << shift) - 1)) != 0)
+      return false;
+
+    // And be in the right range, depending on the amount it is shifted by.
+    // Shift 0 is equal to 7 unsigned bits; the sign bit is set separately.
+    int64_t Range = (1U << (7+shift)) - 1;
+    return (Val == INT32_MIN) || (Val > -Range && Val < Range);
+  }
+
   bool isMemPosImm8Offset() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
       return false;
     // Immediate offset in range [0, 255].
     if (!Memory.OffsetImm) return true;
@@ -1552,7 +1813,7 @@
   }
 
   bool isMemNegImm8Offset() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
       return false;
     // Base reg of PC isn't allowed for these encodings.
if (Memory.BaseRegNum == ARM::PC) return false; @@ -1564,7 +1825,7 @@ public: } bool isMemUImm12Offset() const { - if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) return false; // Immediate offset in range [0, 4095]. if (!Memory.OffsetImm) return true; @@ -1580,7 +1841,7 @@ public: if (isImm() && !isa(getImm())) return true; - if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) return false; // Immediate offset in range [-4095, 4095]. if (!Memory.OffsetImm) return true; @@ -1631,6 +1892,12 @@ public: return VectorList.Count == 1; } + bool isVecListTwoMQ() const { + return isSingleSpacedVectorList() && VectorList.Count == 2 && + ARMMCRegisterClasses[ARM::MQPRRegClassID].contains( + VectorList.RegNum); + } + bool isVecListDPair() const { if (!isSingleSpacedVectorList()) return false; return (ARMMCRegisterClasses[ARM::DPairRegClassID] @@ -1664,6 +1931,12 @@ public: return VectorList.Count == 4; } + bool isVecListFourMQ() const { + return isSingleSpacedVectorList() && VectorList.Count == 4 && + ARMMCRegisterClasses[ARM::MQPRRegClassID].contains( + VectorList.RegNum); + } + bool isSingleSpacedVectorAllLanes() const { return Kind == k_VectorListAllLanes && !VectorList.isDoubleSpaced; } @@ -1806,23 +2079,24 @@ public: return VectorList.Count == 4 && VectorList.LaneIndex <= 1; } - bool isVectorIndex8() const { - if (Kind != k_VectorIndex) return false; - return VectorIndex.Val < 8; - } + bool isVectorIndex() const { return Kind == k_VectorIndex; } - bool isVectorIndex16() const { + template + bool isVectorIndexInRange() const { if (Kind != k_VectorIndex) return false; - return VectorIndex.Val < 4; + return VectorIndex.Val < NumLanes; } - bool isVectorIndex32() const { - if (Kind != k_VectorIndex) return false; - return VectorIndex.Val < 2; - } - bool isVectorIndex64() const { + bool isVectorIndex8() const { return isVectorIndexInRange<8>(); } + bool isVectorIndex16() const { return isVectorIndexInRange<4>(); } + bool isVectorIndex32() const { return isVectorIndexInRange<2>(); } + bool isVectorIndex64() const { return isVectorIndexInRange<1>(); } + + template + bool isMVEPairVectorIndex() const { if (Kind != k_VectorIndex) return false; - return VectorIndex.Val < 1; + return VectorIndex.Val == PermittedValue || + VectorIndex.Val == OtherPermittedValue; } bool isNEONi8splat() const { @@ -1992,6 +2266,51 @@ public: return (Value % Angle == Remainder && Value <= 270); } + bool isMVELongShift() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast(getImm()); + // Must be a constant. 
+ if (!CE) return false; + uint64_t Value = CE->getValue(); + return Value >= 1 && Value <= 32; + } + + bool isITCondCodeNoAL() const { + if (!isITCondCode()) return false; + ARMCC::CondCodes CC = getCondCode(); + return CC != ARMCC::AL; + } + + bool isITCondCodeRestrictedI() const { + if (!isITCondCode()) + return false; + ARMCC::CondCodes CC = getCondCode(); + return CC == ARMCC::EQ || CC == ARMCC::NE; + } + + bool isITCondCodeRestrictedS() const { + if (!isITCondCode()) + return false; + ARMCC::CondCodes CC = getCondCode(); + return CC == ARMCC::LT || CC == ARMCC::GT || CC == ARMCC::LE || + CC == ARMCC::GE; + } + + bool isITCondCodeRestrictedU() const { + if (!isITCondCode()) + return false; + ARMCC::CondCodes CC = getCondCode(); + return CC == ARMCC::HS || CC == ARMCC::HI; + } + + bool isITCondCodeRestrictedFP() const { + if (!isITCondCode()) + return false; + ARMCC::CondCodes CC = getCondCode(); + return CC == ARMCC::EQ || CC == ARMCC::NE || CC == ARMCC::LT || + CC == ARMCC::GT || CC == ARMCC::LE || CC == ARMCC::GE; + } + void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. Null MCExpr = 0. if (!Expr) @@ -2019,6 +2338,30 @@ public: Inst.addOperand(MCOperand::createReg(RegNum)); } + void addVPTPredNOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(unsigned(getVPTPred()))); + unsigned RegNum = getVPTPred() == ARMVCC::None ? 0: ARM::P0; + Inst.addOperand(MCOperand::createReg(RegNum)); + } + + void addVPTPredROperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + addVPTPredNOperands(Inst, N-1); + unsigned RegNum; + if (getVPTPred() == ARMVCC::None) { + RegNum = 0; + } else { + unsigned NextOpIndex = Inst.getNumOperands(); + const MCInstrDesc &MCID = ARMInsts[Inst.getOpcode()]; + int TiedOp = MCID.getOperandConstraint(NextOpIndex, MCOI::TIED_TO); + assert(TiedOp >= 0 && + "Inactive register in vpred_r is not tied to an output!"); + RegNum = Inst.getOperand(TiedOp).getReg(); + } + Inst.addOperand(MCOperand::createReg(RegNum)); + } + void addCoprocNumOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(getCoproc())); @@ -2044,6 +2387,11 @@ public: Inst.addOperand(MCOperand::createImm(unsigned(getCondCode()))); } + void addITCondCodeInvOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(unsigned(ARMCC::getOppositeCondition(getCondCode())))); + } + void addCCOutOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getReg())); @@ -2089,6 +2437,14 @@ public: Inst.addOperand(MCOperand::createReg(*I)); } + void addRegListWithAPSROperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const SmallVectorImpl &RegList = getRegList(); + for (SmallVectorImpl::const_iterator + I = RegList.begin(), E = RegList.end(); I != E; ++I) + Inst.addOperand(MCOperand::createReg(*I)); + } + void addDPRRegListOperands(MCInst &Inst, unsigned N) const { addRegListOperands(Inst, N); } @@ -2097,6 +2453,14 @@ public: addRegListOperands(Inst, N); } + void addFPSRegListWithVPROperands(MCInst &Inst, unsigned N) const { + addRegListOperands(Inst, N); + } + + void addFPDRegListWithVPROperands(MCInst &Inst, unsigned N) const { + addRegListOperands(Inst, N); + } + void addRotImmOperands(MCInst &Inst, 
unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // Encoded as val>>3. The printer handles display as 8, 16, 24. @@ -2184,6 +2548,42 @@ public: Inst.addOperand(MCOperand::createImm(CE->getValue())); } + void addImm7s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // FIXME: We really want to scale the value here, but the VSTR/VLDR_VSYSR + // instruction don't encode operands that way yet. + const MCConstantExpr *CE = dyn_cast(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue())); + } + + void addImm7Shift0Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast(getImm()); + assert(CE != nullptr && "Invalid operand type!"); + Inst.addOperand(MCOperand::createImm(CE->getValue())); + } + + void addImm7Shift1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast(getImm()); + assert(CE != nullptr && "Invalid operand type!"); + Inst.addOperand(MCOperand::createImm(CE->getValue())); + } + + void addImm7Shift2Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast(getImm()); + assert(CE != nullptr && "Invalid operand type!"); + Inst.addOperand(MCOperand::createImm(CE->getValue())); + } + + void addImm7Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast(getImm()); + assert(CE != nullptr && "Invalid operand type!"); + Inst.addOperand(MCOperand::createImm(CE->getValue())); + } + void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored @@ -2293,7 +2693,7 @@ public: return; } - assert(isMem() && "Unknown value type!"); + assert(isGPRMem() && "Unknown value type!"); assert(isa(Memory.OffsetImm) && "Unknown value type!"); Inst.addOperand(MCOperand::createImm(Memory.OffsetImm->getValue())); } @@ -2318,6 +2718,21 @@ public: Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); } + void addMemNoOffsetT2Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + } + + void addMemNoOffsetT2NoSpOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + } + + void addMemNoOffsetTOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + } + void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); int32_t Imm = Memory.OffsetImm->getValue(); @@ -2535,6 +2950,22 @@ public: Inst.addOperand(MCOperand::createImm(Val)); } + void addMemImm7s4OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. + if (isImm()) { + Inst.addOperand(MCOperand::createExpr(getImm())); + Inst.addOperand(MCOperand::createImm(0)); + return; + } + + int64_t Val = Memory.OffsetImm ? 
Memory.OffsetImm->getValue() : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); // The lower two bits are always zero and as such are not encoded. @@ -2543,19 +2974,17 @@ public: Inst.addOperand(MCOperand::createImm(Val)); } - void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const { + void addMemImmOffsetOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); Inst.addOperand(MCOperand::createImm(Val)); } - void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const { - addMemImm8OffsetOperands(Inst, N); - } - - void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const { - addMemImm8OffsetOperands(Inst, N); + void addMemRegRQOffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum)); } void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const { @@ -2699,6 +3128,12 @@ public: Inst.addOperand(MCOperand::createImm(Imm)); } + void addPowerTwoOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue())); + } + void addMSRMaskOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask()))); @@ -2719,6 +3154,37 @@ public: Inst.addOperand(MCOperand::createReg(VectorList.RegNum)); } + void addMVEVecListOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + // When we come here, the VectorList field will identify a range + // of q-registers by its base register and length, and it will + // have already been error-checked to be the expected length of + // range and contain only q-regs in the range q0-q7. So we can + // count on the base register being in the range q0-q6 (for 2 + // regs) or q0-q4 (for 4) + // + // The MVE instructions taking a register range of this kind will + // need an operand in the QQPR or QQQQPR class, representing the + // entire range as a unit. So we must translate into that class, + // by finding the index of the base register in the MQPR reg + // class, and returning the super-register at the corresponding + // index in the target class. + + const MCRegisterClass *RC_in = &ARMMCRegisterClasses[ARM::MQPRRegClassID]; + const MCRegisterClass *RC_out = (VectorList.Count == 2) ? 
+ &ARMMCRegisterClasses[ARM::QQPRRegClassID] : + &ARMMCRegisterClasses[ARM::QQQQPRRegClassID]; + + unsigned I, E = RC_out->getNumRegs(); + for (I = 0; I < E; I++) + if (RC_in->getRegister(I) == VectorList.RegNum) + break; + assert(I < E && "Invalid vector list start register!"); + + Inst.addOperand(MCOperand::createReg(RC_out->getRegister(I))); + } + void addVecListIndexedOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(VectorList.RegNum)); @@ -2745,6 +3211,16 @@ public: Inst.addOperand(MCOperand::createImm(getVectorIndex())); } + void addMVEVectorIndexOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + + void addMVEPairVectorIndexOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + void addNEONi8splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. @@ -2913,6 +3389,15 @@ public: return Op; } + static std::unique_ptr<ARMOperand> CreateVPTPred(ARMVCC::VPTCodes CC, + SMLoc S) { + auto Op = make_unique<ARMOperand>(k_VPTPred); + Op->VCC.Val = CC; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) { auto Op = make_unique<ARMOperand>(k_CoprocNum); Op->Cop.Val = CopVal; @@ -3044,19 +3529,31 @@ public: assert(Regs.size() > 0 && "RegList contains no registers?"); KindTy Kind = k_RegisterList; - if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().second)) - Kind = k_DPRRegisterList; - else if (ARMMCRegisterClasses[ARM::SPRRegClassID]. - contains(Regs.front().second)) - Kind = k_SPRRegisterList; + if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains( + Regs.front().second)) { + if (Regs.back().second == ARM::VPR) + Kind = k_FPDRegisterListWithVPR; + else + Kind = k_DPRRegisterList; + } else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains( + Regs.front().second)) { + if (Regs.back().second == ARM::VPR) + Kind = k_FPSRegisterListWithVPR; + else + Kind = k_SPRRegisterList; + } // Sort based on the register encoding values.
array_pod_sort(Regs.begin(), Regs.end()); + if (Kind == k_RegisterList && Regs.back().second == ARM::APSR) + Kind = k_RegisterListWithAPSR; + auto Op = make_unique<ARMOperand>(Kind); for (SmallVectorImpl<std::pair<unsigned, unsigned>>::const_iterator I = Regs.begin(), E = Regs.end(); I != E; ++I) Op->Registers.push_back(I->second); + Op->StartLoc = StartLoc; Op->EndLoc = EndLoc; return Op; @@ -3217,15 +3714,18 @@ void ARMOperand::print(raw_ostream &OS) const { case k_CondCode: OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">"; break; + case k_VPTPred: + OS << "<ARMVCC::" << ARMVPTPredToString(getVPTPred()) << ">"; + break; case k_CCOut: OS << "<ccout " << getReg() << ">"; break; case k_ITCondMask: { static const char *const MaskStr[] = { - "(invalid)", "(teee)", "(tee)", "(teet)", - "(te)", "(tete)", "(tet)", "(tett)", - "(t)", "(ttee)", "(tte)", "(ttet)", - "(tt)", "(ttte)", "(ttt)", "(tttt)" + "(invalid)", "(tttt)", "(ttt)", "(ttte)", + "(tt)", "(ttet)", "(tte)", "(ttee)", + "(t)", "(tett)", "(tet)", "(tete)", + "(te)", "(teet)", "(tee)", "(teee)", }; assert((ITMask.Mask & 0xf) == ITMask.Mask); OS << "<it-mask " << MaskStr[ITMask.Mask] << ">"; @@ -3324,8 +3824,11 @@ void ARMOperand::print(raw_ostream &OS) const { << ", width: " << Bitfield.Width << ">"; break; case k_RegisterList: + case k_RegisterListWithAPSR: case k_DPRRegisterList: - case k_SPRRegisterList: { + case k_SPRRegisterList: + case k_FPSRegisterListWithVPR: + case k_FPDRegisterListWithVPR: { OS << "<register_list "; const SmallVectorImpl<unsigned> &RegList = getRegList(); @@ -3423,7 +3926,7 @@ int ARMAsmParser::tryParseRegister() { } // Some FPUs only have 16 D registers, so D16-D31 are invalid - if (hasD16() && RegNum >= ARM::D16 && RegNum <= ARM::D31) + if (!hasD32() && RegNum >= ARM::D16 && RegNum <= ARM::D31) return -1; Parser.Lex(); // Eat identifier token. @@ -3662,11 +4165,10 @@ ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) { if (Tok.isNot(AsmToken::Identifier)) return MatchOperand_NoMatch; - int Num = MatchCoprocessorOperandName(Tok.getString(), 'p'); + int Num = MatchCoprocessorOperandName(Tok.getString().lower(), 'p'); if (Num == -1) return MatchOperand_NoMatch; - // ARMv7 and v8 don't allow cp10/cp11 due to VFP/NEON specific instructions - if ((hasV7Ops() || hasV8Ops()) && (Num == 10 || Num == 11)) + if (!isValidCoprocessorNumber(Num, getSTI().getFeatureBits())) return MatchOperand_NoMatch; Parser.Lex(); // Eat identifier token. @@ -3685,7 +4187,7 @@ ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) { if (Tok.isNot(AsmToken::Identifier)) return MatchOperand_NoMatch; - int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c'); + int Reg = MatchCoprocessorOperandName(Tok.getString().lower(), 'c'); if (Reg == -1) return MatchOperand_NoMatch; @@ -3752,7 +4254,8 @@ static unsigned getNextRegister(unsigned Reg) { } /// Parse a register list.
-bool ARMAsmParser::parseRegisterList(OperandVector &Operands) { +bool ARMAsmParser::parseRegisterList(OperandVector &Operands, + bool EnforceOrder) { MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::LCurly)) return TokError("Token is not a Left Curly Brace"); @@ -3785,6 +4288,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands) { RC = &ARMMCRegisterClasses[ARM::DPRRegClassID]; else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg)) RC = &ARMMCRegisterClasses[ARM::SPRRegClassID]; + else if (ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) + RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID]; else return Error(RegLoc, "invalid register in register list"); @@ -3838,14 +4343,32 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands) { Reg = getDRegFromQReg(Reg); isQReg = true; } + if (!RC->contains(Reg) && + RC->getID() == ARMMCRegisterClasses[ARM::GPRRegClassID].getID() && + ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) { + // switch the register classes, as GPRwithAPSRnospRegClassID is a partial + // subset of GPRRegClassId except it contains APSR as well. + RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID]; + } + if (Reg == ARM::VPR && (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] || + RC == &ARMMCRegisterClasses[ARM::DPRRegClassID])) { + RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID]; + EReg = MRI->getEncodingValue(Reg); + Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + continue; + } // The register must be in the same register class as the first. if (!RC->contains(Reg)) return Error(RegLoc, "invalid register in register list"); - // List must be monotonically increasing. - if (MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) { + // In most cases, the list must be monotonically increasing. An + // exception is CLRM, which is order-independent anyway, so + // there's no potential for confusion if you write clrm {r2,r1} + // instead of clrm {r1,r2}. + if (EnforceOrder && + MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) { if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) Warning(RegLoc, "register list not in ascending order"); - else + else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) return Error(RegLoc, "register list not in ascending order"); } if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) { @@ -3855,6 +4378,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands) { } // VFP register lists must also be contiguous. if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] && + RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] && Reg != OldReg + 1) return Error(RegLoc, "non-contiguous register range"); EReg = MRI->getEncodingValue(Reg); @@ -3944,7 +4468,7 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) { // As an extension (to match gas), support a plain D register or Q register // (without enclosing curly braces) as a single or double entry list, // respectively.
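// ---- Editorial aside (not part of the patch) -----------------------------
// The parseRegisterList hunks above add the EnforceOrder flag so that CLRM
// can accept register lists in any order. A minimal standalone sketch of
// that rule, with made-up encoding values standing in for
// MCRegisterInfo::getEncodingValue; note the real parser only warns (rather
// than errors) for out-of-order GPR lists, which this sketch collapses.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    static bool listOrderAcceptable(const std::vector<unsigned> &Encodings,
                                    bool EnforceOrder) {
      for (std::size_t I = 1; I < Encodings.size(); ++I)
        if (EnforceOrder && Encodings[I] < Encodings[I - 1])
          return false; // "register list not in ascending order"
      return true;
    }

    int main() {
      assert(listOrderAcceptable({1, 2}, /*EnforceOrder=*/true));  // {r1, r2}
      assert(!listOrderAcceptable({2, 1}, /*EnforceOrder=*/true)); // {r2, r1}
      assert(listOrderAcceptable({2, 1}, /*EnforceOrder=*/false)); // clrm {r2, r1}
    }
// ---------------------------------------------------------------------------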
- if (Parser.getTok().is(AsmToken::Identifier)) { + if (!hasMVE() && Parser.getTok().is(AsmToken::Identifier)) { SMLoc E = Parser.getTok().getEndLoc(); int Reg = tryParseRegister(); if (Reg == -1) @@ -4012,9 +4536,14 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) { unsigned Count = 1; int Spacing = 0; unsigned FirstReg = Reg; + + if (hasMVE() && !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(Reg)) { + Error(Parser.getTok().getLoc(), "vector register in range Q0-Q7 expected"); + return MatchOperand_ParseFail; + } // The list is of D registers, but we also allow Q regs and just interpret // them as the two D sub-registers. - if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { + else if (!hasMVE() && ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { FirstReg = Reg = getDRegFromQReg(Reg); Spacing = 1; // double-spacing requires explicit D registers, otherwise // it's ambiguous with four-register single spaced. @@ -4044,14 +4573,17 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) { return MatchOperand_ParseFail; } // Allow Q regs and just interpret them as the two D sub-registers. - if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg)) + if (!hasMVE() && ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg)) EndReg = getDRegFromQReg(EndReg) + 1; // If the register is the same as the start reg, there's nothing // more to do. if (Reg == EndReg) continue; // The register must be in the same register class as the first. - if (!ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg)) { + if ((hasMVE() && + !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(EndReg)) || + (!hasMVE() && + !ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg))) { Error(AfterMinusLoc, "invalid register in register list"); return MatchOperand_ParseFail; } @@ -4084,13 +4616,21 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) { Error(RegLoc, "register expected"); return MatchOperand_ParseFail; } + + if (hasMVE()) { + if (!ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(Reg)) { + Error(RegLoc, "vector register in range Q0-Q7 expected"); + return MatchOperand_ParseFail; + } + Spacing = 1; + } // vector register lists must be contiguous. // It's OK to use the enumeration values directly here rather, as the // VFP register classes have the enum sorted properly. // // The list is of D registers, but we also allow Q regs and just interpret // them as the two D sub-registers. - if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { + else if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { if (!Spacing) Spacing = 1; // Register range implies a single spaced list. else if (Spacing == 2) { @@ -4151,30 +4691,20 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) { switch (LaneKind) { case NoLanes: + case AllLanes: { // Two-register operands have been converted to the // composite register classes. - if (Count == 2) { - const MCRegisterClass *RC = (Spacing == 1) ? - &ARMMCRegisterClasses[ARM::DPairRegClassID] : - &ARMMCRegisterClasses[ARM::DPairSpcRegClassID]; - FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC); - } - Operands.push_back(ARMOperand::CreateVectorList(FirstReg, Count, - (Spacing == 2), S, E)); - break; - case AllLanes: - // Two-register operands have been converted to the - // composite register classes. - if (Count == 2) { + if (Count == 2 && !hasMVE()) { const MCRegisterClass *RC = (Spacing == 1) ? 
&ARMMCRegisterClasses[ARM::DPairRegClassID] : &ARMMCRegisterClasses[ARM::DPairSpcRegClassID]; FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC); } - Operands.push_back(ARMOperand::CreateVectorListAllLanes(FirstReg, Count, - (Spacing == 2), - S, E)); + auto Create = (LaneKind == NoLanes ? ARMOperand::CreateVectorList : + ARMOperand::CreateVectorListAllLanes); + Operands.push_back(Create(FirstReg, Count, (Spacing == 2), S, E)); break; + } case IndexedLane: Operands.push_back(ARMOperand::CreateVectorListIndexed(FirstReg, Count, LaneIndex, @@ -5061,6 +5591,21 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst, ((ARMOperand &)*Operands[CondOp]).addCondCodeOperands(Inst, 2); } +void ARMAsmParser::cvtMVEVMOVQtoDReg( + MCInst &Inst, const OperandVector &Operands) { + + // mnemonic, condition code, Rt, Rt2, Qd, idx, Qd again, idx2 + assert(Operands.size() == 8); + + ((ARMOperand &)*Operands[2]).addRegOperands(Inst, 1); // Rt + ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1); // Rt2 + ((ARMOperand &)*Operands[4]).addRegOperands(Inst, 1); // Qd + ((ARMOperand &)*Operands[5]).addMVEPairVectorIndexOperands(Inst, 1); // idx + // skip second copy of Qd in Operands[6] + ((ARMOperand &)*Operands[7]).addMVEPairVectorIndexOperands(Inst, 1); // idx2 + ((ARMOperand &)*Operands[1]).addCondCodeOperands(Inst, 2); // condition code +} + /// Parse an ARM memory expression, return false if successful else return true /// or an error. The first token must be a '[' when called. bool ARMAsmParser::parseMemory(OperandVector &Operands) { @@ -5275,6 +5820,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St, St = ARM_AM::ror; else if (ShiftName == "rrx" || ShiftName == "RRX") St = ARM_AM::rrx; + else if (ShiftName == "uxtw" || ShiftName == "UXTW") + St = ARM_AM::uxtw; else return Error(Loc, "illegal shift operator"); Parser.Lex(); // Eat shift type token. @@ -5463,7 +6010,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { case AsmToken::LBrac: return parseMemory(Operands); case AsmToken::LCurly: - return parseRegisterList(Operands); + return parseRegisterList(Operands, !Mnemonic.startswith("clr")); case AsmToken::Dollar: case AsmToken::Hash: // #42 -> immediate. @@ -5595,6 +6142,9 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) { case MCObjectFileInfo::IsWasm: CurrentFormat = WASM; break; + case MCObjectFileInfo::IsXCOFF: + llvm_unreachable("unexpected object format"); + break; } if (~Prefix->SupportedFormats & CurrentFormat) { @@ -5621,11 +6171,14 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) { // FIXME: Would be nice to autogen this. // FIXME: This is a bit of a maze of special cases. 
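// ---- Editorial aside (not part of the patch) -----------------------------
// splitMnemonic, whose signature follows, peels an optional predication code
// and a carry-setting 's' off the end of a mnemonic. A simplified,
// self-contained model of that shape; it recognises only "eq"/"ne" and
// ignores the long exception lists the real function consults.

    #include <cassert>
    #include <string>

    struct Split { std::string Base, Pred; bool CarrySetting; };

    // Hypothetical helper: strip a trailing condition code, then a trailing 's'.
    static Split splitMnemonicSketch(std::string M) {
      Split R{"", "", false};
      std::string Tail = M.size() > 2 ? M.substr(M.size() - 2) : "";
      if (Tail == "eq" || Tail == "ne") {
        R.Pred = Tail;
        M.resize(M.size() - 2);
      }
      if (!M.empty() && M.back() == 's') {
        R.CarrySetting = true;
        M.pop_back();
      }
      R.Base = M;
      return R;
    }

    int main() {
      Split S = splitMnemonicSketch("addseq"); // -> "add", carry set, pred "eq"
      assert(S.Base == "add" && S.CarrySetting && S.Pred == "eq");
    }
// ---------------------------------------------------------------------------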
StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, + StringRef ExtraToken, unsigned &PredicationCode, + unsigned &VPTPredicationCode, bool &CarrySetting, unsigned &ProcessorIMod, StringRef &ITMask) { PredicationCode = ARMCC::AL; + VPTPredicationCode = ARMVCC::None; CarrySetting = false; ProcessorIMod = 0; @@ -5649,7 +6202,12 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "bxns" || Mnemonic == "blxns" || Mnemonic == "vudot" || Mnemonic == "vsdot" || Mnemonic == "vcmla" || Mnemonic == "vcadd" || - Mnemonic == "vfmal" || Mnemonic == "vfmsl") + Mnemonic == "vfmal" || Mnemonic == "vfmsl" || + Mnemonic == "wls" || Mnemonic == "le" || Mnemonic == "dls" || + Mnemonic == "csel" || Mnemonic == "csinc" || + Mnemonic == "csinv" || Mnemonic == "csneg" || Mnemonic == "cinc" || + Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || + Mnemonic == "csetm") return Mnemonic; // First, split out any predication code. Ignore mnemonics we know aren't @@ -5657,7 +6215,18 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" && Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" && Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" && - Mnemonic != "sbcs" && Mnemonic != "rscs") { + Mnemonic != "sbcs" && Mnemonic != "rscs" && + !(hasMVE() && + (Mnemonic == "vmine" || + Mnemonic == "vshle" || Mnemonic == "vshlt" || Mnemonic == "vshllt" || + Mnemonic == "vrshle" || Mnemonic == "vrshlt" || + Mnemonic == "vmvne" || Mnemonic == "vorne" || + Mnemonic == "vnege" || Mnemonic == "vnegt" || + Mnemonic == "vmule" || Mnemonic == "vmult" || + Mnemonic == "vrintne" || + Mnemonic == "vcmult" || Mnemonic == "vcmule" || + Mnemonic == "vpsele" || Mnemonic == "vpselt" || + Mnemonic.startswith("vq")))) { unsigned CC = ARMCondCodeFromString(Mnemonic.substr(Mnemonic.size()-2)); if (CC != ~0U) { Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2); @@ -5677,7 +6246,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" || Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" || Mnemonic == "vfms" || Mnemonic == "vfnms" || Mnemonic == "fconsts" || - Mnemonic == "bxns" || Mnemonic == "blxns" || + Mnemonic == "bxns" || Mnemonic == "blxns" || Mnemonic == "vfmas" || + Mnemonic == "vmlas" || (Mnemonic == "movs" && isThumb()))) { Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1); CarrySetting = true; @@ -5698,12 +6268,36 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, } } + if (isMnemonicVPTPredicable(Mnemonic, ExtraToken) && Mnemonic != "vmovlt" && + Mnemonic != "vshllt" && Mnemonic != "vrshrnt" && Mnemonic != "vshrnt" && + Mnemonic != "vqrshrunt" && Mnemonic != "vqshrunt" && + Mnemonic != "vqrshrnt" && Mnemonic != "vqshrnt" && Mnemonic != "vmullt" && + Mnemonic != "vqmovnt" && Mnemonic != "vqmovunt" && + Mnemonic != "vqmovnt" && Mnemonic != "vmovnt" && Mnemonic != "vqdmullt" && + Mnemonic != "vpnot" && Mnemonic != "vcvtt" && Mnemonic != "vcvt") { + unsigned CC = ARMVectorCondCodeFromString(Mnemonic.substr(Mnemonic.size()-1)); + if (CC != ~0U) { + Mnemonic = Mnemonic.slice(0, Mnemonic.size()-1); + VPTPredicationCode = CC; + } + return Mnemonic; + } + // The "it" instruction has the condition mask on the end of the mnemonic. 
if (Mnemonic.startswith("it")) { ITMask = Mnemonic.slice(2, Mnemonic.size()); Mnemonic = Mnemonic.slice(0, 2); } + if (Mnemonic.startswith("vpst")) { + ITMask = Mnemonic.slice(4, Mnemonic.size()); + Mnemonic = Mnemonic.slice(0, 4); + } + else if (Mnemonic.startswith("vpt")) { + ITMask = Mnemonic.slice(3, Mnemonic.size()); + Mnemonic = Mnemonic.slice(0, 3); + } + return Mnemonic; } @@ -5711,9 +6305,14 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, /// inclusion of carry set or predication code operands. // // FIXME: It would be nice to autogen this. -void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst, +void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, + StringRef ExtraToken, + StringRef FullInst, bool &CanAcceptCarrySet, - bool &CanAcceptPredicationCode) { + bool &CanAcceptPredicationCode, + bool &CanAcceptVPTPredicationCode) { + CanAcceptVPTPredicationCode = isMnemonicVPTPredicable(Mnemonic, ExtraToken); + CanAcceptCarrySet = Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" || Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" || @@ -5742,7 +6341,18 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst, Mnemonic == "vcmla" || Mnemonic == "vcadd" || Mnemonic == "vfmal" || Mnemonic == "vfmsl" || Mnemonic == "sb" || Mnemonic == "ssbb" || - Mnemonic == "pssbb") { + Mnemonic == "pssbb" || + Mnemonic == "bfcsel" || Mnemonic == "wls" || + Mnemonic == "dls" || Mnemonic == "le" || Mnemonic == "csel" || + Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || + Mnemonic == "cinc" || Mnemonic == "cinv" || Mnemonic == "cneg" || + Mnemonic == "cset" || Mnemonic == "csetm" || + Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst") || + (hasMVE() && + (Mnemonic.startswith("vst2") || Mnemonic.startswith("vld2") || + Mnemonic.startswith("vst4") || Mnemonic.startswith("vld4") || + Mnemonic.startswith("wlstp") || Mnemonic.startswith("dlstp") || + Mnemonic.startswith("letp")))) { // These mnemonics are never predicable CanAcceptPredicationCode = false; } else if (!isThumb()) { @@ -5976,7 +6586,8 @@ bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands) { // VRINT{Z, X} have a predicate operand in VFP, but not in NEON unsigned RegIdx = 3; - if ((Mnemonic == "vrintz" || Mnemonic == "vrintx") && + if ((((Mnemonic == "vrintz" || Mnemonic == "vrintx") && !hasMVE()) || + Mnemonic == "vrintr") && (static_cast(*Operands[2]).getToken() == ".f32" || static_cast(*Operands[2]).getToken() == ".f16")) { if (static_cast(*Operands[3]).isToken() && @@ -5994,6 +6605,47 @@ bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic, return false; } +bool ARMAsmParser::shouldOmitVectorPredicateOperand(StringRef Mnemonic, + OperandVector &Operands) { + if (!hasMVE() || Operands.size() < 3) + return true; + + if (Mnemonic.startswith("vld2") || Mnemonic.startswith("vld4") || + Mnemonic.startswith("vst2") || Mnemonic.startswith("vst4")) + return true; + + if (Mnemonic.startswith("vctp") || Mnemonic.startswith("vpnot")) + return false; + + if (Mnemonic.startswith("vmov") && + !(Mnemonic.startswith("vmovl") || Mnemonic.startswith("vmovn") || + Mnemonic.startswith("vmovx"))) { + for (auto &Operand : Operands) { + if (static_cast(*Operand).isVectorIndex() || + ((*Operand).isReg() && + (ARMMCRegisterClasses[ARM::SPRRegClassID].contains( + (*Operand).getReg()) || + ARMMCRegisterClasses[ARM::DPRRegClassID].contains( + (*Operand).getReg())))) { + return true; + } + } + return 
false; + } else { + for (auto &Operand : Operands) { + // We check the larger class QPR instead of just the legal class + // MQPR, to more accurately report errors when using Q registers + // outside of the allowed range. + if (static_cast<ARMOperand &>(*Operand).isVectorIndex() || + (Operand->isReg() && + (ARMMCRegisterClasses[ARM::QPRRegClassID].contains( + Operand->getReg())))) + return false; + } + return true; + } +} + static bool isDataTypeToken(StringRef Tok) { return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" || Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" || @@ -6010,7 +6662,8 @@ static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) { return Mnemonic.startswith("vldm") || Mnemonic.startswith("vstm"); } -static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features, +static void applyMnemonicAliases(StringRef &Mnemonic, + const FeatureBitset &Features, unsigned VariantID); // The GNU assembler has aliases of ldrd and strd with the second register @@ -6033,7 +6686,7 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic, if (!Op2.isReg()) return; - if (!Op3.isMem()) + if (!Op3.isGPRMem()) return; const MCRegisterClass &GPR = MRI->getRegClass(ARM::GPRRegClassID); @@ -6068,7 +6721,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // The generic tblgen'erated code does this later, at the start of // MatchInstructionImpl(), but that's too late for aliases that include // any sort of suffix. - uint64_t AvailableFeatures = getAvailableFeatures(); + const FeatureBitset &AvailableFeatures = getAvailableFeatures(); unsigned AssemblerDialect = getParser().getAssemblerDialect(); applyMnemonicAliases(Name, AvailableFeatures, AssemblerDialect); @@ -6084,14 +6737,16 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Create the leading tokens for the mnemonic, split by '.' characters. size_t Start = 0, Next = Name.find('.'); StringRef Mnemonic = Name.slice(Start, Next); + StringRef ExtraToken = Name.slice(Next, Name.find(' ', Next + 1)); // Split out the predication code and carry setting flag from the mnemonic. unsigned PredicationCode; + unsigned VPTPredicationCode; unsigned ProcessorIMod; bool CarrySetting; StringRef ITMask; - Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting, - ProcessorIMod, ITMask); + Mnemonic = splitMnemonic(Mnemonic, ExtraToken, PredicationCode, VPTPredicationCode, + CarrySetting, ProcessorIMod, ITMask); // In Thumb1, only the branch (B) instruction can be predicated. if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") { @@ -6100,15 +6755,24 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc)); - // Handle the IT instruction ITMask. Convert it to a bitmask. This - // is the mask as it will be for the IT encoding if the conditional - // encoding has a '1' as it's bit0 (i.e. 't' ==> '1'). In the case - // where the conditional bit0 is zero, the instruction post-processing - // will adjust the mask accordingly. - if (Mnemonic == "it") { - SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2); + // Handle the mask for IT and VPT instructions. In ARMOperand and + // MCOperand, this is stored in a format independent of the + // condition code: the lowest set bit indicates the end of the + // encoding, and above that, a 1 bit indicates 'else', and a 0 + // indicates 'then'. E.g.
+ // IT -> 1000 + // ITx -> x100 (ITT -> 0100, ITE -> 1100) + // ITxy -> xy10 (e.g. ITET -> 1010) + // ITxyz -> xyz1 (e.g. ITEET -> 1101) + if (Mnemonic == "it" || Mnemonic.startswith("vpt") || + Mnemonic.startswith("vpst")) { + SMLoc Loc = Mnemonic == "it" ? SMLoc::getFromPointer(NameLoc.getPointer() + 2) : + Mnemonic == "vpt" ? SMLoc::getFromPointer(NameLoc.getPointer() + 3) : + SMLoc::getFromPointer(NameLoc.getPointer() + 4); if (ITMask.size() > 3) { - return Error(Loc, "too many conditions on IT instruction"); + if (Mnemonic == "it") + return Error(Loc, "too many conditions on IT instruction"); + return Error(Loc, "too many conditions on VPT instruction"); } unsigned Mask = 8; for (unsigned i = ITMask.size(); i != 0; --i) { @@ -6117,7 +6781,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return Error(Loc, "illegal IT block condition mask '" + ITMask + "'"); } Mask >>= 1; - if (ITMask[i - 1] == 't') + if (ITMask[i - 1] == 'e') Mask |= 8; } Operands.push_back(ARMOperand::CreateITMask(Mask, Loc)); @@ -6133,8 +6797,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // ConditionCode operands to match the mnemonic "as written" and then we let // the matcher deal with finding the right instruction or generating an // appropriate error. - bool CanAcceptCarrySet, CanAcceptPredicationCode; - getMnemonicAcceptInfo(Mnemonic, Name, CanAcceptCarrySet, CanAcceptPredicationCode); + bool CanAcceptCarrySet, CanAcceptPredicationCode, CanAcceptVPTPredicationCode; + getMnemonicAcceptInfo(Mnemonic, ExtraToken, Name, CanAcceptCarrySet, + CanAcceptPredicationCode, CanAcceptVPTPredicationCode); // If we had a carry-set on an instruction that can't do that, issue an // error. @@ -6149,6 +6814,13 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, "' is not predicable, but condition code specified"); } + // If we had a VPT predication code on an instruction that can't do that, issue an + // error. + if (!CanAcceptVPTPredicationCode && VPTPredicationCode != ARMVCC::None) { + return Error(NameLoc, "instruction '" + Mnemonic + + "' is not VPT predicable, but VPT code T/E is specified"); + } + // Add the carry setting operand, if necessary. if (CanAcceptCarrySet) { SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size()); @@ -6161,7 +6833,24 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() + CarrySetting); Operands.push_back(ARMOperand::CreateCondCode( - ARMCC::CondCodes(PredicationCode), Loc)); + ARMCC::CondCodes(PredicationCode), Loc)); + } + + // Add the VPT predication code operand, if necessary. + // FIXME: We don't add them for the instructions filtered below as these can + // have custom operands which need special parsing. This parsing requires + // the operand to be in the same place in the OperandVector as their + // definition in tblgen. Since these instructions may also have the + // scalar predication operand we do not add the vector one and leave until + // now to fix it up. 
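// ---- Editorial aside (not part of the patch) -----------------------------
// The IT/VPT mask convention documented above can be reproduced with the
// same loop the parser uses, which is handy when checking encodings by
// hand. Note that the all-'t' masks (8, 4, 2, 1) are exactly the powers of
// two that the t2IT validation later relies on via countPopulation.

    #include <cassert>
    #include <cstring>

    // Build the condition-code-independent mask from a t/e suffix string,
    // exactly as the loop in ParseInstruction does: start at 8, shift right
    // once per character (last to first), OR in 8 for each 'e'.
    static unsigned maskFromSuffix(const char *Suffix) {
      unsigned Mask = 8;
      for (unsigned i = std::strlen(Suffix); i != 0; --i) {
        Mask >>= 1;
        if (Suffix[i - 1] == 'e')
          Mask |= 8;
      }
      return Mask;
    }

    int main() {
      assert(maskFromSuffix("") == 0x8);    // IT    -> 1000
      assert(maskFromSuffix("t") == 0x4);   // ITT   -> 0100
      assert(maskFromSuffix("e") == 0xC);   // ITE   -> 1100
      assert(maskFromSuffix("et") == 0xA);  // ITET  -> 1010
      assert(maskFromSuffix("eet") == 0xD); // ITEET -> 1101
    }
// ---------------------------------------------------------------------------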
+ if (CanAcceptVPTPredicationCode && Mnemonic != "vmov" && + !Mnemonic.startswith("vcmp") && + !(Mnemonic.startswith("vcvt") && Mnemonic != "vcvta" && + Mnemonic != "vcvtn" && Mnemonic != "vcvtp" && Mnemonic != "vcvtm")) { + SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() + + CarrySetting); + Operands.push_back(ARMOperand::CreateVPTPred( + ARMVCC::VPTCodes(VPTPredicationCode), Loc)); } // Add the processor imod operand, if necessary. @@ -6177,7 +6866,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, while (Next != StringRef::npos) { Start = Next; Next = Name.find('.', Start + 1); - StringRef ExtraToken = Name.slice(Start, Next); + ExtraToken = Name.slice(Start, Next); // Some NEON instructions have an optional datatype suffix that is // completely ignored. Check for that. @@ -6233,57 +6922,173 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Some instructions have the same mnemonic, but don't always // have a predicate. Distinguish them here and delete the - // predicate if needed. + // appropriate predicate if needed. This could be either the scalar + // predication code or the vector predication code. if (PredicationCode == ARMCC::AL && shouldOmitPredicateOperand(Mnemonic, Operands)) Operands.erase(Operands.begin() + 1); - // ARM mode 'blx' need special handling, as the register operand version - // is predicable, but the label operand version is not. So, we can't rely - // on the Mnemonic based checking to correctly figure out when to put - // a k_CondCode operand in the list. If we're trying to match the label - // version, remove the k_CondCode operand here. - if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 && - static_cast<ARMOperand &>(*Operands[2]).isImm()) - Operands.erase(Operands.begin() + 1); - // Adjust operands of ldrexd/strexd to MCK_GPRPair. - // ldrexd/strexd require even/odd GPR pair. To enforce this constraint, - // a single GPRPair reg operand is used in the .td file to replace the two - // GPRs. However, when parsing from asm, the two GRPs cannot be automatically - // expressed as a GPRPair, so we have to manually merge them. - // FIXME: We would really like to be able to tablegen'erate this. - if (!isThumb() && Operands.size() > 4 && - (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" || - Mnemonic == "stlexd")) { - bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd"); - unsigned Idx = isLoad ? 2 : 3; - ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]); - ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]); - - const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID); - // Adjust only if Op1 and Op2 are GPRs. - if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) && - MRC.contains(Op2.getReg())) { - unsigned Reg1 = Op1.getReg(); - unsigned Reg2 = Op2.getReg(); - unsigned Rt = MRI->getEncodingValue(Reg1); - unsigned Rt2 = MRI->getEncodingValue(Reg2); - - // Rt2 must be Rt + 1 and Rt must be even. - if (Rt + 1 != Rt2 || (Rt & 1)) { - return Error(Op2.getStartLoc(), - isLoad ? "destination operands must be sequential" - : "source operands must be sequential"); + if (hasMVE()) { + if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands) && + Mnemonic == "vmov" && PredicationCode == ARMCC::LT) { + // Very nasty hack to deal with the vector predicated variant of vmovlt + // and the scalar predicated vmov with condition 'lt'. We cannot tell them + // apart until we have parsed their operands.
+ Operands.erase(Operands.begin() + 1); + Operands.erase(Operands.begin()); + SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer()); + SMLoc PLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + Mnemonic.size() - 1 + CarrySetting); + Operands.insert(Operands.begin(), + ARMOperand::CreateVPTPred(ARMVCC::None, PLoc)); + Operands.insert(Operands.begin(), + ARMOperand::CreateToken(StringRef("vmovlt"), MLoc)); + } else if (Mnemonic == "vcvt" && PredicationCode == ARMCC::NE && + !shouldOmitVectorPredicateOperand(Mnemonic, Operands)) { + // Another nasty hack to deal with the ambiguity between vcvt with scalar + // predication 'ne' and vcvtn with vector predication 'e'. As above we + // can only distinguish between the two after we have parsed their + // operands. + Operands.erase(Operands.begin() + 1); + Operands.erase(Operands.begin()); + SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer()); + SMLoc PLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + Mnemonic.size() - 1 + CarrySetting); + Operands.insert(Operands.begin(), + ARMOperand::CreateVPTPred(ARMVCC::Else, PLoc)); + Operands.insert(Operands.begin(), + ARMOperand::CreateToken(StringRef("vcvtn"), MLoc)); + } else if (Mnemonic == "vmul" && PredicationCode == ARMCC::LT && + !shouldOmitVectorPredicateOperand(Mnemonic, Operands)) { + // Another hack, this time to distinguish between scalar predicated vmul + // with 'lt' predication code and the vector instruction vmullt with + // vector predication code "none" + Operands.erase(Operands.begin() + 1); + Operands.erase(Operands.begin()); + SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer()); + Operands.insert(Operands.begin(), + ARMOperand::CreateToken(StringRef("vmullt"), MLoc)); + } + // For vmov and vcmp, as mentioned earlier, we did not add the vector + // predication code, since these may contain operands that require + // special parsing. So now we have to see if they require vector + // predication and replace the scalar one with the vector predication + // operand if that is the case. + else if (Mnemonic == "vmov" || Mnemonic.startswith("vcmp") || + (Mnemonic.startswith("vcvt") && !Mnemonic.startswith("vcvta") && + !Mnemonic.startswith("vcvtn") && !Mnemonic.startswith("vcvtp") && + !Mnemonic.startswith("vcvtm"))) { + if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands)) { + // We could not split the vector predicate off vcvt because it might + // have been the scalar vcvtt instruction. Now we know it's a vector + // instruction, we still need to check whether it's the vector + // predicated vcvt with 'Then' predication or the vector vcvtt. We can + // distinguish the two based on the suffixes, if it is any of + // ".f16.f32", ".f32.f16", ".f16.f64" or ".f64.f16" then it is the vcvtt.
+ if (Mnemonic.startswith("vcvtt") && Operands.size() >= 4) { + auto Sz1 = static_cast(*Operands[2]); + auto Sz2 = static_cast(*Operands[3]); + if (!(Sz1.isToken() && Sz1.getToken().startswith(".f") && + Sz2.isToken() && Sz2.getToken().startswith(".f"))) { + Operands.erase(Operands.begin()); + SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer()); + VPTPredicationCode = ARMVCC::Then; + + Mnemonic = Mnemonic.substr(0, 4); + Operands.insert(Operands.begin(), + ARMOperand::CreateToken(Mnemonic, MLoc)); + } + } + Operands.erase(Operands.begin() + 1); + SMLoc PLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + Mnemonic.size() + CarrySetting); + Operands.insert(Operands.begin() + 1, + ARMOperand::CreateVPTPred( + ARMVCC::VPTCodes(VPTPredicationCode), PLoc)); + } + } else if (CanAcceptVPTPredicationCode) { + // For all other instructions, make sure only one of the two + // predication operands is left behind, depending on whether we should + // use the vector predication. + if (shouldOmitVectorPredicateOperand(Mnemonic, Operands)) { + if (CanAcceptPredicationCode) + Operands.erase(Operands.begin() + 2); + else + Operands.erase(Operands.begin() + 1); + } else if (CanAcceptPredicationCode && PredicationCode == ARMCC::AL) { + Operands.erase(Operands.begin() + 1); } - unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0, - &(MRI->getRegClass(ARM::GPRPairRegClassID))); - Operands[Idx] = - ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc()); - Operands.erase(Operands.begin() + Idx + 1); } } + if (VPTPredicationCode != ARMVCC::None) { + bool usedVPTPredicationCode = false; + for (unsigned I = 1; I < Operands.size(); ++I) + if (static_cast(*Operands[I]).isVPTPred()) + usedVPTPredicationCode = true; + if (!usedVPTPredicationCode) { + // If we have a VPT predication code and we haven't just turned it + // into an operand, then it was a mistake for splitMnemonic to + // separate it from the rest of the mnemonic in the first place, + // and this may lead to wrong disassembly (e.g. scalar floating + // point VCMPE is actually a different instruction from VCMP, so + // we mustn't treat them the same). In that situation, glue it + // back on. + Mnemonic = Name.slice(0, Mnemonic.size() + 1); + Operands.erase(Operands.begin()); + Operands.insert(Operands.begin(), + ARMOperand::CreateToken(Mnemonic, NameLoc)); + } + } + + // ARM mode 'blx' need special handling, as the register operand version + // is predicable, but the label operand version is not. So, we can't rely + // on the Mnemonic based checking to correctly figure out when to put + // a k_CondCode operand in the list. If we're trying to match the label + // version, remove the k_CondCode operand here. + if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 && + static_cast(*Operands[2]).isImm()) + Operands.erase(Operands.begin() + 1); + + // Adjust operands of ldrexd/strexd to MCK_GPRPair. + // ldrexd/strexd require even/odd GPR pair. To enforce this constraint, + // a single GPRPair reg operand is used in the .td file to replace the two + // GPRs. However, when parsing from asm, the two GRPs cannot be + // automatically + // expressed as a GPRPair, so we have to manually merge them. + // FIXME: We would really like to be able to tablegen'erate this. + if (!isThumb() && Operands.size() > 4 && + (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" || + Mnemonic == "stlexd")) { + bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd"); + unsigned Idx = isLoad ? 
2 : 3; + ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]); + ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]); + + const MCRegisterClass &MRC = MRI->getRegClass(ARM::GPRRegClassID); + // Adjust only if Op1 and Op2 are GPRs. + if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) && + MRC.contains(Op2.getReg())) { + unsigned Reg1 = Op1.getReg(); + unsigned Reg2 = Op2.getReg(); + unsigned Rt = MRI->getEncodingValue(Reg1); + unsigned Rt2 = MRI->getEncodingValue(Reg2); + + // Rt2 must be Rt + 1 and Rt must be even. + if (Rt + 1 != Rt2 || (Rt & 1)) { + return Error(Op2.getStartLoc(), + isLoad ? "destination operands must be sequential" + : "source operands must be sequential"); + } + unsigned NewReg = MRI->getMatchingSuperReg( + Reg1, ARM::gsub_0, &(MRI->getRegClass(ARM::GPRPairRegClassID))); + Operands[Idx] = + ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc()); + Operands.erase(Operands.begin() + Idx + 1); + } + } + // GNU Assembler extension (compatibility). fixupGNULDRDAlias(Mnemonic, Operands); @@ -6442,6 +7247,17 @@ bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst, return false; } +static int findFirstVectorPredOperandIdx(const MCInstrDesc &MCID) { + for (unsigned i = 0; i < MCID.NumOperands; ++i) { + if (ARM::isVpred(MCID.OpInfo[i].OperandType)) + return i; + } + return -1; +} + +static bool isVectorPredicable(const MCInstrDesc &MCID) { + return findFirstVectorPredOperandIdx(MCID) != -1; +} // FIXME: We would really like to be able to tablegen'erate this. bool ARMAsmParser::validateInstruction(MCInst &Inst, @@ -6473,12 +7289,25 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, } else if (isThumbTwo() && MCID.isPredicable() && Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() != ARMCC::AL && Inst.getOpcode() != ARM::tBcc && - Inst.getOpcode() != ARM::t2Bcc) { + Inst.getOpcode() != ARM::t2Bcc && + Inst.getOpcode() != ARM::t2BFic) { return Error(Loc, "predicated instructions must be in IT block"); } else if (!isThumb() && !useImplicitITARM() && MCID.isPredicable() && Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() != ARMCC::AL) { return Warning(Loc, "predicated instructions should be in IT block"); + } else if (!MCID.isPredicable()) { + // Check the instruction doesn't have a predicate operand anyway + // that it's not allowed to use. Sometimes this happens in order + // to keep instructions the same shape even though one cannot + // legally be predicated, e.g. vmul.f16 vs vmul.f32. + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) { + if (MCID.OpInfo[i].isPredicate()) { + if (Inst.getOperand(i).getImm() != ARMCC::AL) + return Error(Loc, "instruction is not predicable"); + break; + } + } } // PC-setting instructions in an IT block, but not the last instruction of @@ -6487,6 +7316,28 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, return Error(Loc, "instruction must be outside of IT block or the last instruction in an IT block"); } + if (inVPTBlock() && !instIsBreakpoint(Inst)) { + unsigned Bit = extractITMaskBit(VPTState.Mask, VPTState.CurPosition); + if (!isVectorPredicable(MCID)) + return Error(Loc, "instruction in VPT block must be predicable"); + unsigned Pred = Inst.getOperand(findFirstVectorPredOperandIdx(MCID)).getImm(); + unsigned VPTPred = Bit ?
ARMVCC::Else : ARMVCC::Then; + if (Pred != VPTPred) { + SMLoc PredLoc; + for (unsigned I = 1; I < Operands.size(); ++I) + if (static_cast<ARMOperand &>(*Operands[I]).isVPTPred()) + PredLoc = Operands[I]->getStartLoc(); + return Error(PredLoc, "incorrect predication in VPT block; got '" + + StringRef(ARMVPTPredToString(ARMVCC::VPTCodes(Pred))) + + "', but expected '" + + ARMVPTPredToString(ARMVCC::VPTCodes(VPTPred)) + "'"); + } + } + else if (isVectorPredicable(MCID) && + Inst.getOperand(findFirstVectorPredOperandIdx(MCID)).getImm() != + ARMVCC::None) + return Error(Loc, "VPT predicated instructions must be in VPT block"); + const unsigned Opcode = Inst.getOpcode(); switch (Opcode) { case ARM::t2IT: { @@ -6496,11 +7347,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, unsigned Cond = Inst.getOperand(0).getImm(); unsigned Mask = Inst.getOperand(1).getImm(); - // Mask hasn't been modified to the IT instruction encoding yet so - // conditions only allowing a 't' are a block of 1s starting at bit 3 - // followed by all 0s. Easiest way is to just list the 4 possibilities. - if (Cond == ARMCC::AL && Mask != 8 && Mask != 12 && Mask != 14 && - Mask != 15) + // Conditions only allowing a 't' are those with no set bit except + // the lowest-order one that indicates the end of the sequence. In + // other words, powers of 2. + if (Cond == ARMCC::AL && countPopulation(Mask) != 1) return Error(Loc, "unpredictable IT predicate sequence"); break; } @@ -6609,6 +7459,54 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, "destination register and base register can't be identical"); return false; } + + case ARM::MVE_VLDRBU8_rq: + case ARM::MVE_VLDRBU16_rq: + case ARM::MVE_VLDRBS16_rq: + case ARM::MVE_VLDRBU32_rq: + case ARM::MVE_VLDRBS32_rq: + case ARM::MVE_VLDRHU16_rq: + case ARM::MVE_VLDRHU16_rq_u: + case ARM::MVE_VLDRHU32_rq: + case ARM::MVE_VLDRHU32_rq_u: + case ARM::MVE_VLDRHS32_rq: + case ARM::MVE_VLDRHS32_rq_u: + case ARM::MVE_VLDRWU32_rq: + case ARM::MVE_VLDRWU32_rq_u: + case ARM::MVE_VLDRDU64_rq: + case ARM::MVE_VLDRDU64_rq_u: + case ARM::MVE_VLDRWU32_qi: + case ARM::MVE_VLDRWU32_qi_pre: + case ARM::MVE_VLDRDU64_qi: + case ARM::MVE_VLDRDU64_qi_pre: { + // Qd must be different from Qm. + unsigned QdIdx = 0, QmIdx = 2; + bool QmIsPointer = false; + switch (Opcode) { + case ARM::MVE_VLDRWU32_qi: + case ARM::MVE_VLDRDU64_qi: + QmIdx = 1; + QmIsPointer = true; + break; + case ARM::MVE_VLDRWU32_qi_pre: + case ARM::MVE_VLDRDU64_qi_pre: + QdIdx = 1; + QmIsPointer = true; + break; + } + + const unsigned Qd = MRI->getEncodingValue(Inst.getOperand(QdIdx).getReg()); + const unsigned Qm = MRI->getEncodingValue(Inst.getOperand(QmIdx).getReg()); + + if (Qd == Qm) { + return Error(Operands[3]->getStartLoc(), + Twine("destination vector register and vector ") + + (QmIsPointer ? "pointer" : "offset") + + " register can't be identical"); + } + return false; + } + case ARM::SBFX: case ARM::t2SBFX: case ARM::UBFX: @@ -6776,6 +7674,20 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, } break; + case ARM::t2ADDri: + case ARM::t2ADDri12: + case ARM::t2ADDrr: + case ARM::t2ADDrs: + case ARM::t2SUBri: + case ARM::t2SUBri12: + case ARM::t2SUBrr: + case ARM::t2SUBrs: + if (Inst.getOperand(0).getReg() == ARM::SP && + Inst.getOperand(1).getReg() != ARM::SP) + return Error(Operands[4]->getStartLoc(), + "source register must be sp if destination is sp"); + break; + // Final range checking for Thumb unconditional branch instructions.
case ARM::tB: if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>()) @@ -6845,6 +7757,61 @@ "code specified"); break; } + case ARM::t2BFi: + case ARM::t2BFr: + case ARM::t2BFLi: + case ARM::t2BFLr: { + if (!static_cast<ARMOperand &>(*Operands[2]).isUnsignedOffset<4, 1>() || + (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0)) + return Error(Operands[2]->getStartLoc(), + "branch location out of range or not a multiple of 2"); + + if (Opcode == ARM::t2BFi) { + if (!static_cast<ARMOperand &>(*Operands[3]).isSignedOffset<16, 1>()) + return Error(Operands[3]->getStartLoc(), + "branch target out of range or not a multiple of 2"); + } else if (Opcode == ARM::t2BFLi) { + if (!static_cast<ARMOperand &>(*Operands[3]).isSignedOffset<18, 1>()) + return Error(Operands[3]->getStartLoc(), + "branch target out of range or not a multiple of 2"); + } + break; + } + case ARM::t2BFic: { + if (!static_cast<ARMOperand &>(*Operands[1]).isUnsignedOffset<4, 1>() || + (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0)) + return Error(Operands[1]->getStartLoc(), + "branch location out of range or not a multiple of 2"); + + if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<16, 1>()) + return Error(Operands[2]->getStartLoc(), + "branch target out of range or not a multiple of 2"); + + assert(Inst.getOperand(0).isImm() == Inst.getOperand(2).isImm() && + "branch location and else branch target should either both be " + "immediates or both labels"); + + if (Inst.getOperand(0).isImm() && Inst.getOperand(2).isImm()) { + int Diff = Inst.getOperand(2).getImm() - Inst.getOperand(0).getImm(); + if (Diff != 4 && Diff != 2) + return Error( + Operands[3]->getStartLoc(), + "else branch target must be 2 or 4 greater than the branch location"); + } + break; + } + case ARM::t2CLRM: { + for (unsigned i = 2; i < Inst.getNumOperands(); i++) { + if (Inst.getOperand(i).isReg() && + !ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains( + Inst.getOperand(i).getReg())) { + return Error(Operands[2]->getStartLoc(), + "invalid register in register list.
Valid registers are " + "r0-r12, lr/r14 and APSR."); + } + } + break; + } case ARM::DSB: case ARM::t2DSB: { @@ -6892,6 +7859,39 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, "list of registers must be at least 1 and at most 16"); break; } + case ARM::MVE_VQDMULLs32bh: + case ARM::MVE_VQDMULLs32th: + case ARM::MVE_VCMULf32: + case ARM::MVE_VMULLs32bh: + case ARM::MVE_VMULLs32th: + case ARM::MVE_VMULLu32bh: + case ARM::MVE_VMULLu32th: { + if (Operands[3]->getReg() == Operands[4]->getReg()) { + return Error (Operands[3]->getStartLoc(), + "Qd register and Qn register can't be identical"); + } + if (Operands[3]->getReg() == Operands[5]->getReg()) { + return Error (Operands[3]->getStartLoc(), + "Qd register and Qm register can't be identical"); + } + break; + } + case ARM::MVE_VMOV_rr_q: { + if (Operands[4]->getReg() != Operands[6]->getReg()) + return Error (Operands[4]->getStartLoc(), "Q-registers must be the same"); + if (static_cast(*Operands[5]).getVectorIndex() != + static_cast(*Operands[7]).getVectorIndex() + 2) + return Error (Operands[5]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1"); + break; + } + case ARM::MVE_VMOV_q_rr: { + if (Operands[2]->getReg() != Operands[4]->getReg()) + return Error (Operands[2]->getStartLoc(), "Q-registers must be the same"); + if (static_cast(*Operands[3]).getVectorIndex() != + static_cast(*Operands[5]).getVectorIndex() + 2) + return Error (Operands[3]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1"); + break; + } } return false; @@ -7168,6 +8168,50 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, } switch (Inst.getOpcode()) { + case ARM::MVE_VORNIZ0v4i32: + case ARM::MVE_VORNIZ0v8i16: + case ARM::MVE_VORNIZ8v4i32: + case ARM::MVE_VORNIZ8v8i16: + case ARM::MVE_VORNIZ16v4i32: + case ARM::MVE_VORNIZ24v4i32: + case ARM::MVE_VANDIZ0v4i32: + case ARM::MVE_VANDIZ0v8i16: + case ARM::MVE_VANDIZ8v4i32: + case ARM::MVE_VANDIZ8v8i16: + case ARM::MVE_VANDIZ16v4i32: + case ARM::MVE_VANDIZ24v4i32: { + unsigned Opcode; + bool imm16 = false; + switch(Inst.getOpcode()) { + case ARM::MVE_VORNIZ0v4i32: Opcode = ARM::MVE_VORRIZ0v4i32; break; + case ARM::MVE_VORNIZ0v8i16: Opcode = ARM::MVE_VORRIZ0v8i16; imm16 = true; break; + case ARM::MVE_VORNIZ8v4i32: Opcode = ARM::MVE_VORRIZ8v4i32; break; + case ARM::MVE_VORNIZ8v8i16: Opcode = ARM::MVE_VORRIZ8v8i16; imm16 = true; break; + case ARM::MVE_VORNIZ16v4i32: Opcode = ARM::MVE_VORRIZ16v4i32; break; + case ARM::MVE_VORNIZ24v4i32: Opcode = ARM::MVE_VORRIZ24v4i32; break; + case ARM::MVE_VANDIZ0v4i32: Opcode = ARM::MVE_VBICIZ0v4i32; break; + case ARM::MVE_VANDIZ0v8i16: Opcode = ARM::MVE_VBICIZ0v8i16; imm16 = true; break; + case ARM::MVE_VANDIZ8v4i32: Opcode = ARM::MVE_VBICIZ8v4i32; break; + case ARM::MVE_VANDIZ8v8i16: Opcode = ARM::MVE_VBICIZ8v8i16; imm16 = true; break; + case ARM::MVE_VANDIZ16v4i32: Opcode = ARM::MVE_VBICIZ16v4i32; break; + case ARM::MVE_VANDIZ24v4i32: Opcode = ARM::MVE_VBICIZ24v4i32; break; + default: llvm_unreachable("unexpected opcode"); + } + + MCInst TmpInst; + TmpInst.setOpcode(Opcode); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + + // invert immediate + unsigned imm = ~Inst.getOperand(2).getImm() & (imm16 ? 0xffff : 0xffffffff); + TmpInst.addOperand(MCOperand::createImm(imm)); + + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction. 
case ARM::LDRT_POST: case ARM::LDRBT_POST: { @@ -8990,15 +10034,11 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, } case ARM::ITasm: case ARM::t2IT: { - MCOperand &MO = Inst.getOperand(1); - unsigned Mask = MO.getImm(); - ARMCC::CondCodes Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm()); - // Set up the IT block state according to the IT instruction we just // matched. assert(!inITBlock() && "nested IT blocks?!"); - startExplicitITBlock(Cond, Mask); - MO.setImm(getITMaskEncoding()); + startExplicitITBlock(ARMCC::CondCodes(Inst.getOperand(0).getImm()), + Inst.getOperand(1).getImm()); break; } case ARM::t2LSLrr: @@ -9074,6 +10114,35 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, return true; } return false; + case ARM::MVE_VPST: + case ARM::MVE_VPTv16i8: + case ARM::MVE_VPTv8i16: + case ARM::MVE_VPTv4i32: + case ARM::MVE_VPTv16u8: + case ARM::MVE_VPTv8u16: + case ARM::MVE_VPTv4u32: + case ARM::MVE_VPTv16s8: + case ARM::MVE_VPTv8s16: + case ARM::MVE_VPTv4s32: + case ARM::MVE_VPTv4f32: + case ARM::MVE_VPTv8f16: + case ARM::MVE_VPTv16i8r: + case ARM::MVE_VPTv8i16r: + case ARM::MVE_VPTv4i32r: + case ARM::MVE_VPTv16u8r: + case ARM::MVE_VPTv8u16r: + case ARM::MVE_VPTv4u32r: + case ARM::MVE_VPTv16s8r: + case ARM::MVE_VPTv8s16r: + case ARM::MVE_VPTv4s32r: + case ARM::MVE_VPTv4f32r: + case ARM::MVE_VPTv8f16r: { + assert(!inVPTBlock() && "Nested VPT blocks are not allowed"); + MCOperand &MO = Inst.getOperand(0); + VPTState.Mask = MO.getImm(); + VPTState.CurPosition = 0; + break; + } } return false; } @@ -9138,18 +10207,50 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_RequiresV8; } - // Use of SP for VMRS/VMSR is only allowed in ARM mode with the exception of - // ARMv8-A. - if ((Inst.getOpcode() == ARM::VMRS || Inst.getOpcode() == ARM::VMSR) && - Inst.getOperand(0).getReg() == ARM::SP && (isThumb() && !hasV8Ops())) - return Match_InvalidOperand; + switch (Inst.getOpcode()) { + case ARM::VMRS: + case ARM::VMSR: + case ARM::VMRS_FPCXTS: + case ARM::VMRS_FPCXTNS: + case ARM::VMSR_FPCXTS: + case ARM::VMSR_FPCXTNS: + case ARM::VMRS_FPSCR_NZCVQC: + case ARM::VMSR_FPSCR_NZCVQC: + case ARM::FMSTAT: + case ARM::VMRS_VPR: + case ARM::VMRS_P0: + case ARM::VMSR_VPR: + case ARM::VMSR_P0: + // Use of SP for VMRS/VMSR is only allowed in ARM mode with the exception of + // ARMv8-A. + if (Inst.getOperand(0).isReg() && Inst.getOperand(0).getReg() == ARM::SP && + (isThumb() && !hasV8Ops())) + return Match_InvalidOperand; + break; + default: + break; + } for (unsigned I = 0; I < MCID.NumOperands; ++I) if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) { // rGPRRegClass excludes PC, and also excluded SP before ARMv8 - if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops()) + const auto &Op = Inst.getOperand(I); + if (!Op.isReg()) { + // This can happen in awkward cases with tied operands, e.g. a + // writeback load/store with a complex addressing mode in + // which there's an output operand corresponding to the + // updated written-back base register: the Tablegen-generated + // AsmMatcher will have written a placeholder operand to that + // slot in the form of an immediate 0, because it can't + // generate the register part of the complex addressing-mode + // operand ahead of time. 
+ continue; + } + + unsigned Reg = Op.getReg(); + if ((Reg == ARM::SP) && !hasV8Ops()) return Match_RequiresV8; - else if (Inst.getOperand(I).getReg() == ARM::PC) + else if (Reg == ARM::PC) return Match_InvalidOperand; } @@ -9268,7 +10369,7 @@ unsigned ARMAsmParser::MatchInstruction(OperandVector &Operands, MCInst &Inst, return PlainMatchResult; } -static std::string ARMMnemonicSpellCheck(StringRef S, uint64_t FBS, +static std::string ARMMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS, unsigned VariantID = 0); static const char *getSubtargetFeatureName(uint64_t Val); @@ -9296,6 +10397,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // Still progress the IT block, otherwise one wrong condition causes // nasty cascading errors. forwardITPosition(); + forwardVPTPosition(); return true; } @@ -9322,6 +10424,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // and process gets a consistent answer about whether we're in an IT // block. forwardITPosition(); + forwardVPTPosition(); // ITasm is an ARM mode pseudo-instruction that just sets the ITblock and // doesn't actually encode. @@ -9341,7 +10444,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ReportNearMisses(NearMisses, IDLoc, Operands); return true; case Match_MnemonicFail: { - uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); std::string Suggestion = ARMMnemonicSpellCheck( ((ARMOperand &)*Operands[0]).getToken(), FBS); return Error(IDLoc, "invalid instruction" + Suggestion, @@ -10384,11 +11487,11 @@ ARMAsmParser::getCustomOperandDiag(ARMMatchResultTy MatchError) { : "operand must be a register in range [r0, r12] or r14"; // DPR contains 16 registers for some FPUs, and 32 for others. case Match_DPR: - return hasD16() ? "operand must be a register in range [d0, d15]" - : "operand must be a register in range [d0, d31]"; + return hasD32() ? "operand must be a register in range [d0, d31]" + : "operand must be a register in range [d0, d15]"; case Match_DPR_RegList: - return hasD16() ? "operand must be a list of registers in range [d0, d15]" - : "operand must be a list of registers in range [d0, d31]"; + return hasD32() ? "operand must be a list of registers in range [d0, d31]" + : "operand must be a list of registers in range [d0, d15]"; // For all other diags, use the static string from tablegen. default: @@ -10416,7 +11519,7 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn, // variants of an instruction that take 8- and 16-bit immediates, we want // to only report the widest one. std::multimap<unsigned, unsigned> OperandMissesSeen; - SmallSet<uint64_t, 4> FeatureMissesSeen; + SmallSet<FeatureBitset, 4> FeatureMissesSeen; bool ReportedTooFewOperands = false; // Process the near-misses in reverse order, so that we see more general ones @@ -10467,7 +11570,7 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn, break; } case NearMissInfo::NearMissFeature: { - uint64_t MissingFeatures = I.getFeatures(); + const FeatureBitset &MissingFeatures = I.getFeatures(); // Don't report the same set of features twice. if (FeatureMissesSeen.count(MissingFeatures)) break; + FeatureMissesSeen.insert(MissingFeatures); // Special case: don't report a feature set which includes arm-mode for // targets that don't have ARM mode.
- if ((MissingFeatures & Feature_IsARM) && !hasARM()) + if (MissingFeatures.test(Feature_IsARMBit) && !hasARM()) break; // Don't report any near-misses that both require switching instruction // set, and adding other subtarget features. - if (isThumb() && (MissingFeatures & Feature_IsARM) && - (MissingFeatures & ~Feature_IsARM)) + if (isThumb() && MissingFeatures.test(Feature_IsARMBit) && + MissingFeatures.count() > 1) break; - if (!isThumb() && (MissingFeatures & Feature_IsThumb) && - (MissingFeatures & ~Feature_IsThumb)) + if (!isThumb() && MissingFeatures.test(Feature_IsThumbBit) && + MissingFeatures.count() > 1) break; - if (!isThumb() && (MissingFeatures & Feature_IsThumb2) && - (MissingFeatures & ~(Feature_IsThumb2 | Feature_IsThumb))) + if (!isThumb() && MissingFeatures.test(Feature_IsThumb2Bit) && + (MissingFeatures & ~FeatureBitset({Feature_IsThumb2Bit, + Feature_IsThumbBit})).any()) break; - if (isMClass() && (MissingFeatures & Feature_HasNEON)) + if (isMClass() && MissingFeatures.test(Feature_HasNEONBit)) break; NearMissMessage Message; @@ -10496,14 +11600,10 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn, raw_svector_ostream OS(Message.Message); OS << "instruction requires:"; - uint64_t Mask = 1; - for (unsigned MaskPos = 0; MaskPos < (sizeof(MissingFeatures) * 8 - 1); - ++MaskPos) { - if (MissingFeatures & Mask) { - OS << " " << getSubtargetFeatureName(MissingFeatures & Mask); - } - Mask <<= 1; - } + for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) + if (MissingFeatures.test(i)) + OS << ' ' << getSubtargetFeatureName(i); + NearMissesOut.emplace_back(Message); break; @@ -10579,38 +11679,44 @@ void ARMAsmParser::ReportNearMisses(SmallVectorImpl<NearMissInfo> &NearMisses, } } -// FIXME: This structure should be moved inside ARMTargetParser -// when we start to table-generate them, and we can use the ARM -// flags below, that were generated by table-gen. -static const struct { - const unsigned Kind; - const uint64_t ArchCheck; - const FeatureBitset Features; -} Extensions[] = { - { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} }, - { ARM::AEK_CRYPTO, Feature_HasV8, - {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, - { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} }, - { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass, - {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} }, - { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} }, - { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} }, - { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} }, - // FIXME: Only available in A-class, isel not predicated - { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} }, - { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} }, - { ARM::AEK_RAS, Feature_HasV8, {ARM::FeatureRAS} }, - // FIXME: Unsupported extensions. - { ARM::AEK_OS, Feature_None, {} }, - { ARM::AEK_IWMMXT, Feature_None, {} }, - { ARM::AEK_IWMMXT2, Feature_None, {} }, - { ARM::AEK_MAVERICK, Feature_None, {} }, - { ARM::AEK_XSCALE, Feature_None, {} }, -}; - /// parseDirectiveArchExtension /// ::= .arch_extension [no]feature bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { + // FIXME: This structure should be moved inside ARMTargetParser + // when we start to table-generate them, and we can use the ARM + // flags below, that were generated by table-gen.
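// ---- Editorial aside (not part of the patch) -----------------------------
// A sketch of the lookup shape that parseDirectiveArchExtension performs
// over the Extensions table that follows. The names and feature masks here
// are invented stand-ins, not real LLVM values, and the subtarget is
// modelled as a plain bitmask.

    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <vector>

    struct ExtEntry { std::string Name; uint64_t Features; };

    static const std::vector<ExtEntry> Table = {
        {"crc", 0x1}, {"mp", 0x2}, // hypothetical feature masks
    };

    // ".arch_extension [no]name": strip an optional "no" prefix, find the
    // entry, then set or clear its feature bits.
    static bool applyArchExtension(std::string Arg, uint64_t &STI) {
      bool Enable = true;
      if (Arg.rfind("no", 0) == 0) {
        Enable = false;
        Arg = Arg.substr(2);
      }
      for (const ExtEntry &E : Table) {
        if (E.Name == Arg) {
          STI = Enable ? (STI | E.Features) : (STI & ~E.Features);
          return true;
        }
      }
      return false; // the real parser reports an unknown-extension error here
    }

    int main() {
      uint64_t STI = 0;
      assert(applyArchExtension("crc", STI) && (STI & 0x1));
      assert(applyArchExtension("nocrc", STI) && !(STI & 0x1));
    }
// ---------------------------------------------------------------------------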
+ static const struct { + const unsigned Kind; + const FeatureBitset ArchCheck; + const FeatureBitset Features; + } Extensions[] = { + { ARM::AEK_CRC, {Feature_HasV8Bit}, {ARM::FeatureCRC} }, + { ARM::AEK_CRYPTO, {Feature_HasV8Bit}, + {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, + { ARM::AEK_FP, {Feature_HasV8Bit}, + {ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} }, + { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM), + {Feature_HasV7Bit, Feature_IsNotMClassBit}, + {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} }, + { ARM::AEK_MP, {Feature_HasV7Bit, Feature_IsNotMClassBit}, + {ARM::FeatureMP} }, + { ARM::AEK_SIMD, {Feature_HasV8Bit}, + {ARM::FeatureNEON, ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} }, + { ARM::AEK_SEC, {Feature_HasV6KBit}, {ARM::FeatureTrustZone} }, + // FIXME: Only available in A-class, isel not predicated + { ARM::AEK_VIRT, {Feature_HasV7Bit}, {ARM::FeatureVirtualization} }, + { ARM::AEK_FP16, {Feature_HasV8_2aBit}, + {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} }, + { ARM::AEK_RAS, {Feature_HasV8Bit}, {ARM::FeatureRAS} }, + { ARM::AEK_LOB, {Feature_HasV8_1MMainlineBit}, {ARM::FeatureLOB} }, + // FIXME: Unsupported extensions. + { ARM::AEK_OS, {}, {} }, + { ARM::AEK_IWMMXT, {}, {} }, + { ARM::AEK_IWMMXT2, {}, {} }, + { ARM::AEK_MAVERICK, {}, {} }, + { ARM::AEK_XSCALE, {}, {} }, + }; + MCAsmParser &Parser = getParser(); if (getLexer().isNot(AsmToken::Identifier)) @@ -10646,12 +11752,12 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { "allowed for the current base architecture"); MCSubtargetInfo &STI = copySTI(); - FeatureBitset ToggleFeatures = EnableFeature - ? (~STI.getFeatureBits() & Extension.Features) - : ( STI.getFeatureBits() & Extension.Features); - - uint64_t Features = - ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); + if (EnableFeature) { + STI.SetFeatureBitsTransitively(Extension.Features); + } else { + STI.ClearFeatureBitsTransitively(Extension.Features); + } + FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); setAvailableFeatures(Features); return false; } @@ -10675,6 +11781,18 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, if (CE->getValue() == 0) return Match_Success; break; + case MCK__35_8: + if (Op.isImm()) + if (const MCConstantExpr *CE = dyn_cast(Op.getImm())) + if (CE->getValue() == 8) + return Match_Success; + break; + case MCK__35_16: + if (Op.isImm()) + if (const MCConstantExpr *CE = dyn_cast(Op.getImm())) + if (CE->getValue() == 16) + return Match_Success; + break; case MCK_ModImm: if (Op.isImm()) { const MCExpr *SOExpr = Op.getImm(); @@ -10698,3 +11816,76 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, } return Match_InvalidOperand; } + +bool ARMAsmParser::isMnemonicVPTPredicable(StringRef Mnemonic, + StringRef ExtraToken) { + if (!hasMVE()) + return false; + + return Mnemonic.startswith("vabav") || Mnemonic.startswith("vaddv") || + Mnemonic.startswith("vaddlv") || Mnemonic.startswith("vminnmv") || + Mnemonic.startswith("vminnmav") || Mnemonic.startswith("vminv") || + Mnemonic.startswith("vminav") || Mnemonic.startswith("vmaxnmv") || + Mnemonic.startswith("vmaxnmav") || Mnemonic.startswith("vmaxv") || + Mnemonic.startswith("vmaxav") || Mnemonic.startswith("vmladav") || + Mnemonic.startswith("vrmlaldavh") || Mnemonic.startswith("vrmlalvh") || + Mnemonic.startswith("vmlsdav") || Mnemonic.startswith("vmlav") || + Mnemonic.startswith("vmlaldav") || Mnemonic.startswith("vmlalv") || + Mnemonic.startswith("vmaxnm") || 
Mnemonic.startswith("vminnm") || + Mnemonic.startswith("vmax") || Mnemonic.startswith("vmin") || + Mnemonic.startswith("vshlc") || Mnemonic.startswith("vmovlt") || + Mnemonic.startswith("vmovlb") || Mnemonic.startswith("vshll") || + Mnemonic.startswith("vrshrn") || Mnemonic.startswith("vshrn") || + Mnemonic.startswith("vqrshrun") || Mnemonic.startswith("vqshrun") || + Mnemonic.startswith("vqrshrn") || Mnemonic.startswith("vqshrn") || + Mnemonic.startswith("vbic") || Mnemonic.startswith("vrev64") || + Mnemonic.startswith("vrev32") || Mnemonic.startswith("vrev16") || + Mnemonic.startswith("vmvn") || Mnemonic.startswith("veor") || + Mnemonic.startswith("vorn") || Mnemonic.startswith("vorr") || + Mnemonic.startswith("vand") || Mnemonic.startswith("vmul") || + Mnemonic.startswith("vqrdmulh") || Mnemonic.startswith("vqdmulh") || + Mnemonic.startswith("vsub") || Mnemonic.startswith("vadd") || + Mnemonic.startswith("vqsub") || Mnemonic.startswith("vqadd") || + Mnemonic.startswith("vabd") || Mnemonic.startswith("vrhadd") || + Mnemonic.startswith("vhsub") || Mnemonic.startswith("vhadd") || + Mnemonic.startswith("vdup") || Mnemonic.startswith("vcls") || + Mnemonic.startswith("vclz") || Mnemonic.startswith("vneg") || + Mnemonic.startswith("vabs") || Mnemonic.startswith("vqneg") || + Mnemonic.startswith("vqabs") || + (Mnemonic.startswith("vrint") && Mnemonic != "vrintr") || + Mnemonic.startswith("vcmla") || Mnemonic.startswith("vfma") || + Mnemonic.startswith("vfms") || Mnemonic.startswith("vcadd") || + Mnemonic.startswith("vadd") || Mnemonic.startswith("vsub") || + Mnemonic.startswith("vshl") || Mnemonic.startswith("vqshl") || + Mnemonic.startswith("vqrshl") || Mnemonic.startswith("vrshl") || + Mnemonic.startswith("vsri") || Mnemonic.startswith("vsli") || + Mnemonic.startswith("vrshr") || Mnemonic.startswith("vshr") || + Mnemonic.startswith("vpsel") || Mnemonic.startswith("vcmp") || + Mnemonic.startswith("vqdmladh") || Mnemonic.startswith("vqrdmladh") || + Mnemonic.startswith("vqdmlsdh") || Mnemonic.startswith("vqrdmlsdh") || + Mnemonic.startswith("vcmul") || Mnemonic.startswith("vrmulh") || + Mnemonic.startswith("vqmovn") || Mnemonic.startswith("vqmovun") || + Mnemonic.startswith("vmovnt") || Mnemonic.startswith("vmovnb") || + Mnemonic.startswith("vmaxa") || Mnemonic.startswith("vmaxnma") || + Mnemonic.startswith("vhcadd") || Mnemonic.startswith("vadc") || + Mnemonic.startswith("vsbc") || Mnemonic.startswith("vrshr") || + Mnemonic.startswith("vshr") || Mnemonic.startswith("vstrb") || + Mnemonic.startswith("vldrb") || + (Mnemonic.startswith("vstrh") && Mnemonic != "vstrhi") || + (Mnemonic.startswith("vldrh") && Mnemonic != "vldrhi") || + Mnemonic.startswith("vstrw") || Mnemonic.startswith("vldrw") || + Mnemonic.startswith("vldrd") || Mnemonic.startswith("vstrd") || + Mnemonic.startswith("vqdmull") || Mnemonic.startswith("vbrsr") || + Mnemonic.startswith("vfmas") || Mnemonic.startswith("vmlas") || + Mnemonic.startswith("vmla") || Mnemonic.startswith("vqdmlash") || + Mnemonic.startswith("vqdmlah") || Mnemonic.startswith("vqrdmlash") || + Mnemonic.startswith("vqrdmlah") || Mnemonic.startswith("viwdup") || + Mnemonic.startswith("vdwdup") || Mnemonic.startswith("vidup") || + Mnemonic.startswith("vddup") || Mnemonic.startswith("vctp") || + Mnemonic.startswith("vpnot") || Mnemonic.startswith("vbic") || + Mnemonic.startswith("vrmlsldavh") || Mnemonic.startswith("vmlsldav") || + Mnemonic.startswith("vcvt") || + (Mnemonic.startswith("vmov") && + !(ExtraToken == ".f16" || ExtraToken == ".32" || + ExtraToken == 
".16" || ExtraToken == ".8")); +} diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 61bec04678dd..673691ebd93e 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -1,15 +1,16 @@ //===- ARMDisassembler.cpp - Disassembler for ARM/Thumb ISA ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#include "ARMBaseInstrInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" #include "MCTargetDesc/ARMMCTargetDesc.h" +#include "TargetInfo/ARMTargetInfo.h" #include "Utils/ARMBaseInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" @@ -63,22 +64,19 @@ namespace { return ITStates.size() == 1; } - // Called when decoding an IT instruction. Sets the IT state for the following - // instructions that for the IT block. Firstcond and Mask correspond to the - // fields in the IT instruction encoding. + // Called when decoding an IT instruction. Sets the IT state for + // the following instructions that for the IT block. Firstcond + // corresponds to the field in the IT instruction encoding; Mask + // is in the MCOperand format in which 1 means 'else' and 0 'then'. void setITState(char Firstcond, char Mask) { // (3 - the number of trailing zeros) is the number of then / else. - unsigned CondBit0 = Firstcond & 1; unsigned NumTZ = countTrailingZeros(Mask); unsigned char CCBits = static_cast(Firstcond & 0xf); assert(NumTZ <= 3 && "Invalid IT mask!"); // push condition codes onto the stack the correct order for the pops for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) { - bool T = ((Mask >> Pos) & 1) == CondBit0; - if (T) - ITStates.push_back(CCBits); - else - ITStates.push_back(CCBits ^ 1); + unsigned Else = (Mask >> Pos) & 1; + ITStates.push_back(CCBits ^ Else); } ITStates.push_back(CCBits); } @@ -87,6 +85,47 @@ namespace { std::vector ITStates; }; + class VPTStatus + { + public: + unsigned getVPTPred() { + unsigned Pred = ARMVCC::None; + if (instrInVPTBlock()) + Pred = VPTStates.back(); + return Pred; + } + + void advanceVPTState() { + VPTStates.pop_back(); + } + + bool instrInVPTBlock() { + return !VPTStates.empty(); + } + + bool instrLastInVPTBlock() { + return VPTStates.size() == 1; + } + + void setVPTState(char Mask) { + // (3 - the number of trailing zeros) is the number of then / else. + unsigned NumTZ = countTrailingZeros(Mask); + assert(NumTZ <= 3 && "Invalid VPT mask!"); + // push predicates onto the stack the correct order for the pops + for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) { + bool T = ((Mask >> Pos) & 1) == 0; + if (T) + VPTStates.push_back(ARMVCC::Then); + else + VPTStates.push_back(ARMVCC::Else); + } + VPTStates.push_back(ARMVCC::Then); + } + + private: + SmallVector VPTStates; + }; + /// ARM disassembler for all ARM platforms. class ARMDisassembler : public MCDisassembler { public: @@ -100,27 +139,23 @@ public: ArrayRef Bytes, uint64_t Address, raw_ostream &VStream, raw_ostream &CStream) const override; -}; - -/// Thumb disassembler for all Thumb platforms. 
-class ThumbDisassembler : public MCDisassembler { -public: - ThumbDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : - MCDisassembler(STI, Ctx) { - } - ~ThumbDisassembler() override = default; +private: + DecodeStatus getARMInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef Bytes, uint64_t Address, + raw_ostream &VStream, + raw_ostream &CStream) const; - DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, - ArrayRef Bytes, uint64_t Address, - raw_ostream &VStream, - raw_ostream &CStream) const override; + DecodeStatus getThumbInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef Bytes, uint64_t Address, + raw_ostream &VStream, + raw_ostream &CStream) const; -private: mutable ITStatus ITBlock; + mutable VPTStatus VPTBlock; DecodeStatus AddThumbPredicate(MCInst&) const; - void UpdateThumbVFPPredicate(MCInst&) const; + void UpdateThumbVFPPredicate(DecodeStatus &, MCInst&) const; }; } // end anonymous namespace @@ -144,12 +179,23 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) { // Definitions are further down. static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeGPRwithZRRegisterClass(MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPRwithZRnospRegisterClass( + MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, @@ -166,12 +212,20 @@ static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSPR_8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst, @@ -262,6 +316,10 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val, uint64_t 
Address, const void *Decoder); +static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst,unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val, @@ -276,6 +334,11 @@ static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +template +static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn, @@ -324,6 +387,8 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVCVTImmOperand(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeNEONComplexLane64Instruction(MCInst &Inst, unsigned Val, uint64_t Address, @@ -359,14 +424,28 @@ static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, uint64_t Address, const void* Decoder); static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); +template +static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); +template +static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +template +static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, @@ -409,6 +488,82 @@ static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val, static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); +template +static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus 
DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeRestrictedSPredicateOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeRestrictedUPredicateOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeRestrictedFPPredicateOperand(MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder); +template +static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); +template +static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +template +static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +template +static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +template +static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +template +static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +template +static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +typedef DecodeStatus OperandDecoder(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +template +static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); #include "ARMGenDisassemblerTables.inc" static MCDisassembler *createARMDisassembler(const Target &T, @@ -417,12 +572,6 @@ static MCDisassembler *createARMDisassembler(const Target &T, return new ARMDisassembler(STI, Ctx); } -static MCDisassembler *createThumbDisassembler(const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new ThumbDisassembler(STI, Ctx); -} - // Post-decoding checks static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size, uint64_t Address, raw_ostream &OS, @@ -440,6 +589,18 @@ static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size, return MCDisassembler::SoftFail; return Result; } + case ARM::t2ADDri: + case ARM::t2ADDri12: + case ARM::t2ADDrr: + case ARM::t2ADDrs: + case ARM::t2SUBri: + case ARM::t2SUBri12: + case ARM::t2SUBrr: + case ARM::t2SUBrs: + if 
(MI.getOperand(0).getReg() == ARM::SP && + MI.getOperand(1).getReg() != ARM::SP) + return MCDisassembler::SoftFail; + return Result; default: return Result; } } @@ -448,6 +609,16 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes, uint64_t Address, raw_ostream &OS, raw_ostream &CS) const { + if (STI.getFeatureBits()[ARM::ModeThumb]) + return getThumbInstruction(MI, Size, Bytes, Address, OS, CS); + return getARMInstruction(MI, Size, Bytes, Address, OS, CS); +} + +DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size, + ArrayRef Bytes, + uint64_t Address, + raw_ostream &OS, + raw_ostream &CS) const { CommentStream = &CS; assert(!STI.getFeatureBits()[ARM::ModeThumb] && @@ -569,12 +740,22 @@ static void AddThumb1SBit(MCInst &MI, bool InITBlock) { MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR)); } +static bool isVectorPredicable(unsigned Opcode) { + const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + unsigned short NumOps = ARMInsts[Opcode].NumOperands; + for (unsigned i = 0; i < NumOps; ++i) { + if (ARM::isVpred(OpInfo[i].OperandType)) + return true; + } + return false; +} + // Most Thumb instructions don't have explicit predicates in the // encoding, but rather get their predicates from IT context. We need // to fix up the predicate operands using this context information as a // post-pass. MCDisassembler::DecodeStatus -ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { +ARMDisassembler::AddThumbPredicate(MCInst &MI) const { MCDisassembler::DecodeStatus S = Success; const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits(); @@ -590,6 +771,10 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { case ARM::t2CPS3p: case ARM::t2CPS2p: case ARM::t2CPS1p: + case ARM::t2CSEL: + case ARM::t2CSINC: + case ARM::t2CSINV: + case ARM::t2CSNEG: case ARM::tMOVSr: case ARM::tSETEND: // Some instructions (mostly conditional branches) are not @@ -616,37 +801,66 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { break; } - // If we're in an IT block, base the predicate on that. Otherwise, + // Warn on non-VPT predicable instruction in a VPT block and a VPT + // predicable instruction in an IT block + if ((!isVectorPredicable(MI.getOpcode()) && VPTBlock.instrInVPTBlock()) || + (isVectorPredicable(MI.getOpcode()) && ITBlock.instrInITBlock())) + S = SoftFail; + + // If we're in an IT/VPT block, base the predicate on that. Otherwise, // assume a predicate of AL. 
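The isVectorPredicable helper above is what lets this function tell MVE lanes from IT lanes: an opcode is vector-predicable iff its static operand list carries a vpred operand. A reduced sketch with stand-in operand kinds (the in-tree check reads the tablegen'd ARMInsts/MCOperandInfo arrays and ARM::isVpred):

#include <cstddef>

enum OperandType { OP_REG, OP_IMM, OP_VPRED_R, OP_VPRED_N };  // stand-ins

static bool isVpred(OperandType T) {
  return T == OP_VPRED_R || T == OP_VPRED_N;
}

static bool isVectorPredicable(const OperandType *OpInfo, size_t NumOps) {
  for (size_t I = 0; I < NumOps; ++I)
    if (isVpred(OpInfo[I]))  // one vpred operand suffices
      return true;
  return false;
}

int main() {
  const OperandType VAddLike[] = {OP_REG, OP_REG, OP_REG, OP_VPRED_R};
  const OperandType ScalarLike[] = {OP_REG, OP_REG, OP_IMM};
  return (isVectorPredicable(VAddLike, 4) &&
          !isVectorPredicable(ScalarLike, 3)) ? 0 : 1;
}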
- unsigned CC; - CC = ITBlock.getITCC(); - if (CC == 0xF) - CC = ARMCC::AL; - if (ITBlock.instrInITBlock()) + unsigned CC = ARMCC::AL; + unsigned VCC = ARMVCC::None; + if (ITBlock.instrInITBlock()) { + CC = ITBlock.getITCC(); ITBlock.advanceITState(); + } else if (VPTBlock.instrInVPTBlock()) { + VCC = VPTBlock.getVPTPred(); + VPTBlock.advanceVPTState(); + } const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; - MCInst::iterator I = MI.begin(); - for (unsigned i = 0; i < NumOps; ++i, ++I) { - if (I == MI.end()) break; - if (OpInfo[i].isPredicate()) { - I = MI.insert(I, MCOperand::createImm(CC)); - ++I; - if (CC == ARMCC::AL) - MI.insert(I, MCOperand::createReg(0)); - else - MI.insert(I, MCOperand::createReg(ARM::CPSR)); - return S; - } + + MCInst::iterator CCI = MI.begin(); + for (unsigned i = 0; i < NumOps; ++i, ++CCI) { + if (OpInfo[i].isPredicate() || CCI == MI.end()) break; } - I = MI.insert(I, MCOperand::createImm(CC)); - ++I; - if (CC == ARMCC::AL) - MI.insert(I, MCOperand::createReg(0)); - else - MI.insert(I, MCOperand::createReg(ARM::CPSR)); + if (ARMInsts[MI.getOpcode()].isPredicable()) { + CCI = MI.insert(CCI, MCOperand::createImm(CC)); + ++CCI; + if (CC == ARMCC::AL) + MI.insert(CCI, MCOperand::createReg(0)); + else + MI.insert(CCI, MCOperand::createReg(ARM::CPSR)); + } else if (CC != ARMCC::AL) { + Check(S, SoftFail); + } + + MCInst::iterator VCCI = MI.begin(); + unsigned VCCPos; + for (VCCPos = 0; VCCPos < NumOps; ++VCCPos, ++VCCI) { + if (ARM::isVpred(OpInfo[VCCPos].OperandType) || VCCI == MI.end()) break; + } + + if (isVectorPredicable(MI.getOpcode())) { + VCCI = MI.insert(VCCI, MCOperand::createImm(VCC)); + ++VCCI; + if (VCC == ARMVCC::None) + MI.insert(VCCI, MCOperand::createReg(0)); + else + MI.insert(VCCI, MCOperand::createReg(ARM::P0)); + if (OpInfo[VCCPos].OperandType == ARM::OPERAND_VPRED_R) { + int TiedOp = ARMInsts[MI.getOpcode()].getOperandConstraint( + VCCPos + 2, MCOI::TIED_TO); + assert(TiedOp >= 0 && + "Inactive register in vpred_r is not tied to an output!"); + MI.insert(VCCI, MI.getOperand(TiedOp)); + } + } else if (VCC != ARMVCC::None) { + Check(S, SoftFail); + } return S; } @@ -656,19 +870,26 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { // mode, the auto-generated decoder will give them an (incorrect) // predicate operand. We need to rewrite these operands based on the IT // context as a post-pass. 
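A compact sketch of the predicate-splicing walk AddThumbPredicate now performs, with stand-in types; the real code works on MCInst and the tablegen'd MCOperandInfo, and handles the vpred/TIED_TO case separately:

#include <cassert>
#include <vector>

struct OpDesc { bool IsPred; };            // stand-in for MCOperandInfo
enum { AL = 14, NoReg = 0, CPSR = 999 };   // invented register ids

// Find the predicate slot in the static operand list and splice in the
// condition immediate plus its carrier register (no register for AL).
static void insertPredicate(std::vector<int> &Operands,
                            const std::vector<OpDesc> &Info, int CC) {
  size_t Pos = 0;
  while (Pos < Info.size() && Pos < Operands.size() && !Info[Pos].IsPred)
    ++Pos;
  Operands.insert(Operands.begin() + Pos, {CC, CC == AL ? NoReg : CPSR});
}

int main() {
  std::vector<int> MI = {100, 200};  // two already-decoded operands
  insertPredicate(MI, {{false}, {false}, {true}}, AL);
  assert(MI.size() == 4 && MI[2] == AL && MI[3] == NoReg);
  return 0;
}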
-void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const { +void ARMDisassembler::UpdateThumbVFPPredicate( + DecodeStatus &S, MCInst &MI) const { unsigned CC; CC = ITBlock.getITCC(); if (CC == 0xF) CC = ARMCC::AL; if (ITBlock.instrInITBlock()) ITBlock.advanceITState(); + else if (VPTBlock.instrInVPTBlock()) { + CC = VPTBlock.getVPTPred(); + VPTBlock.advanceVPTState(); + } const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; MCInst::iterator I = MI.begin(); unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; for (unsigned i = 0; i < NumOps; ++i, ++I) { if (OpInfo[i].isPredicate() ) { + if (CC != ARMCC::AL && !ARMInsts[MI.getOpcode()].isPredicable()) + Check(S, SoftFail); I->setImm(CC); ++I; if (CC == ARMCC::AL) @@ -680,11 +901,11 @@ void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const { } } -DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, - ArrayRef Bytes, - uint64_t Address, - raw_ostream &OS, - raw_ostream &CS) const { +DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, + ArrayRef Bytes, + uint64_t Address, + raw_ostream &OS, + raw_ostream &CS) const { CommentStream = &CS; assert(STI.getFeatureBits()[ARM::ModeThumb] && @@ -751,6 +972,27 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, uint32_t Insn32 = (Bytes[3] << 8) | (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16); + + Result = + decodeInstruction(DecoderTableMVE32, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + + // Nested VPT blocks are UNPREDICTABLE. Must be checked before we add + // the VPT predicate. + if (isVPTOpcode(MI.getOpcode()) && VPTBlock.instrInVPTBlock()) + Result = MCDisassembler::SoftFail; + + Check(Result, AddThumbPredicate(MI)); + + if (isVPTOpcode(MI.getOpcode())) { + unsigned Mask = MI.getOperand(0).getImm(); + VPTBlock.setVPTState(Mask); + } + + return Result; + } + Result = decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -766,7 +1008,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Result != MCDisassembler::Fail) { Size = 4; Check(Result, AddThumbPredicate(MI)); - return Result; + return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn32, Result); } if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { @@ -774,7 +1016,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - UpdateThumbVFPPredicate(MI); + UpdateThumbVFPPredicate(Result, MI); return Result; } } @@ -861,9 +1103,9 @@ extern "C" void LLVMInitializeARMDisassembler() { TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(), createARMDisassembler); TargetRegistry::RegisterMCDisassembler(getTheThumbLETarget(), - createThumbDisassembler); + createARMDisassembler); TargetRegistry::RegisterMCDisassembler(getTheThumbBETarget(), - createThumbDisassembler); + createARMDisassembler); } static const uint16_t GPRDecoderTable[] = { @@ -873,6 +1115,13 @@ static const uint16_t GPRDecoderTable[] = { ARM::R12, ARM::SP, ARM::LR, ARM::PC }; +static const uint16_t CLRMGPRDecoderTable[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + ARM::R8, ARM::R9, ARM::R10, ARM::R11, + ARM::R12, 0, ARM::LR, ARM::APSR +}; + static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if 
(RegNo > 15) @@ -883,6 +1132,20 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + + unsigned Register = CLRMGPRDecoderTable[RegNo]; + if (Register == 0) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { @@ -911,6 +1174,34 @@ DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, return S; } +static DecodeStatus +DecodeGPRwithZRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (RegNo == 15) + { + Inst.addOperand(MCOperand::createReg(ARM::ZR)); + return MCDisassembler::Success; + } + + if (RegNo == 13) + Check(S, MCDisassembler::SoftFail); + + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); + return S; +} + +static DecodeStatus +DecodeGPRwithZRnospRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + if (RegNo == 13) + return MCDisassembler::Fail; + Check(S, DecodeGPRwithZRRegisterClass(Inst, RegNo, Address, Decoder)); + return S; +} + static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo > 7) @@ -1024,9 +1315,9 @@ static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, const FeatureBitset &featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); - bool hasD16 = featureBits[ARM::FeatureD16]; + bool hasD32 = featureBits[ARM::FeatureD32]; - if (RegNo > 31 || (hasD16 && RegNo > 15)) + if (RegNo > 31 || (!hasD32 && RegNo > 15)) return MCDisassembler::Fail; unsigned Register = DPRDecoderTable[RegNo]; @@ -1041,6 +1332,13 @@ static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo, return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder); } +static DecodeStatus DecodeSPR_8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder); +} + static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { @@ -1111,16 +1409,19 @@ static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst, static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; if (Val == 0xF) return MCDisassembler::Fail; // AL predicate is not allowed on Thumb1 branches. 
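The CLRM register class decoded above leans on a table with two quirks: encoding 13 (sp) maps to a 0 sentinel and is rejected outright, and encoding 15 yields APSR rather than pc. A sketch with invented register ids:

#include <cstdint>

enum : uint16_t { NoReg = 0, APSR = 99 };  // invented ids

static const uint16_t CLRMTable[16] = {
    10, 11, 12, 13, 14, 15, 16, 17,        // r0-r7, stand-in ids
    18, 19, 20, 21, 22, NoReg, 23, APSR};  // r8-r12, sp, lr, apsr

static bool decodeCLRMReg(unsigned RegNo, uint16_t &Out) {
  if (RegNo > 15 || CLRMTable[RegNo] == NoReg)
    return false;  // hard decode failure, as in the patch
  Out = CLRMTable[RegNo];
  return true;
}

int main() {
  uint16_t R;
  return (!decodeCLRMReg(13, R) &&            // sp rejected
          decodeCLRMReg(15, R) && R == APSR)  // pc slot becomes APSR
             ? 0 : 1;
}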
if (Inst.getOpcode() == ARM::tBcc && Val == 0xE) return MCDisassembler::Fail; + if (Val != ARMCC::AL && !ARMInsts[Inst.getOpcode()].isPredicable()) + Check(S, MCDisassembler::SoftFail); Inst.addOperand(MCOperand::createImm(Val)); if (Val == ARMCC::AL) { Inst.addOperand(MCOperand::createReg(0)); } else Inst.addOperand(MCOperand::createReg(ARM::CPSR)); - return MCDisassembler::Success; + return S; } static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, @@ -1210,6 +1511,7 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, bool NeedDisjointWriteback = false; unsigned WritebackReg = 0; + bool CLRM = false; switch (Inst.getOpcode()) { default: break; @@ -1224,17 +1526,26 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, NeedDisjointWriteback = true; WritebackReg = Inst.getOperand(0).getReg(); break; + case ARM::t2CLRM: + CLRM = true; + break; } // Empty register lists are not allowed. if (Val == 0) return MCDisassembler::Fail; for (unsigned i = 0; i < 16; ++i) { if (Val & (1 << i)) { - if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder))) - return MCDisassembler::Fail; - // Writeback not allowed if Rn is in the target list. - if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg()) - Check(S, MCDisassembler::SoftFail); + if (CLRM) { + if (!Check(S, DecodeCLRMGPRRegisterClass(Inst, i, Address, Decoder))) { + return MCDisassembler::Fail; + } + } else { + if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder))) + return MCDisassembler::Fail; + // Writeback not allowed if Rn is in the target list. + if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg()) + Check(S, MCDisassembler::SoftFail); + } } } @@ -1327,6 +1638,8 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, unsigned imm = fieldFromInstruction(Insn, 0, 8); unsigned Rn = fieldFromInstruction(Insn, 16, 4); unsigned U = fieldFromInstruction(Insn, 23, 1); + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); switch (Inst.getOpcode()) { case ARM::LDC_OFFSET: @@ -1361,15 +1674,42 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, case ARM::t2STCL_PRE: case ARM::t2STCL_POST: case ARM::t2STCL_OPTION: - if (coproc == 0xA || coproc == 0xB) + case ARM::t2LDC2_OFFSET: + case ARM::t2LDC2L_OFFSET: + case ARM::t2LDC2_PRE: + case ARM::t2LDC2L_PRE: + case ARM::t2STC2_OFFSET: + case ARM::t2STC2L_OFFSET: + case ARM::t2STC2_PRE: + case ARM::t2STC2L_PRE: + case ARM::LDC2_OFFSET: + case ARM::LDC2L_OFFSET: + case ARM::LDC2_PRE: + case ARM::LDC2L_PRE: + case ARM::STC2_OFFSET: + case ARM::STC2L_OFFSET: + case ARM::STC2_PRE: + case ARM::STC2L_PRE: + case ARM::t2LDC2_OPTION: + case ARM::t2STC2_OPTION: + case ARM::t2LDC2_POST: + case ARM::t2LDC2L_POST: + case ARM::t2STC2_POST: + case ARM::t2STC2L_POST: + case ARM::LDC2_POST: + case ARM::LDC2L_POST: + case ARM::STC2_POST: + case ARM::STC2L_POST: + if (coproc == 0xA || coproc == 0xB || + (featureBits[ARM::HasV8_1MMainlineOps] && + (coproc == 0x8 || coproc == 0x9 || coproc == 0xA || coproc == 0xB || + coproc == 0xE || coproc == 0xF))) return MCDisassembler::Fail; break; default: break; } - const FeatureBitset &featureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); if (featureBits[ARM::HasV8Ops] && (coproc != 14)) return MCDisassembler::Fail; @@ -3150,6 +3490,60 @@ DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus 
+DecodeMVEModImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) | + fieldFromInstruction(Insn, 13, 3)); + unsigned cmode = fieldFromInstruction(Insn, 8, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 4); + imm |= fieldFromInstruction(Insn, 16, 3) << 4; + imm |= fieldFromInstruction(Insn, 28, 1) << 7; + imm |= cmode << 8; + imm |= fieldFromInstruction(Insn, 5, 1) << 12; + + if (cmode == 0xF && Inst.getOpcode() == ARM::MVE_VMVNimmi32) + return MCDisassembler::Fail; + + if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder))) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createImm(imm)); + + Inst.addOperand(MCOperand::createImm(ARMVCC::None)); + Inst.addOperand(MCOperand::createReg(0)); + Inst.addOperand(MCOperand::createImm(0)); + + return S; +} + +static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Qd = fieldFromInstruction(Insn, 13, 3); + Qd |= fieldFromInstruction(Insn, 22, 1) << 3; + if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV)); + + unsigned Qn = fieldFromInstruction(Insn, 17, 3); + Qn |= fieldFromInstruction(Insn, 7, 1) << 3; + if (!Check(S, DecodeMQPRRegisterClass(Inst, Qn, Address, Decoder))) + return MCDisassembler::Fail; + unsigned Qm = fieldFromInstruction(Insn, 1, 3); + Qm |= fieldFromInstruction(Insn, 5, 1) << 3; + if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder))) + return MCDisassembler::Fail; + if (!fieldFromInstruction(Insn, 12, 1)) // I bit clear => need input FPSCR + Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV)); + Inst.addOperand(MCOperand::createImm(Qd)); + + return S; +} + static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3706,6 +4100,21 @@ static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, return MCDisassembler::Success; } +static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, uint64_t Address, + const void *Decoder) { + if (Val == 0) + Inst.addOperand(MCOperand::createImm(INT32_MIN)); + else { + int imm = Val & 0x7F; + + if (!(Val & 0x80)) + imm *= -1; + Inst.addOperand(MCOperand::createImm(imm * 4)); + } + + return MCDisassembler::Success; +} + static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3721,6 +4130,22 @@ static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, return S; } +static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 8, 4); + unsigned imm = fieldFromInstruction(Val, 0, 8); + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeT2Imm7S4(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3748,8 +4173,23 @@ static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, return 
MCDisassembler::Success; } -static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { +template +static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + int imm = Val & 0x7F; + if (Val == 0) + imm = INT32_MIN; + else if (!(Val & 0x80)) + imm *= -1; + if (imm != INT32_MIN) + imm *= (1U << shift); + Inst.addOperand(MCOperand::createImm(imm)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Val, 9, 4); @@ -3794,6 +4234,42 @@ static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, return S; } +template +static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 8, 3); + unsigned imm = fieldFromInstruction(Val, 0, 8); + + if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeT2Imm7(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +template +static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 8, 4); + unsigned imm = fieldFromInstruction(Val, 0, 8); + if (WriteBack) { + if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } else if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeT2Imm7(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3941,6 +4417,43 @@ static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + unsigned Rn = fieldFromInstruction(Insn, 3, 4); + unsigned Qm = fieldFromInstruction(Insn, 0, 3); + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +template +static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + unsigned Qm = fieldFromInstruction(Insn, 8, 3); + int imm = fieldFromInstruction(Insn, 0, 7); + + if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder))) + return MCDisassembler::Fail; + + if(!fieldFromInstruction(Insn, 7, 1)) { + if (imm == 0) + imm = INT32_MIN; // indicate -0 + else + imm *= -1; + } + if (imm != INT32_MIN) + imm *= (1U << shift); + Inst.addOperand(MCOperand::createImm(imm)); + + return S; +} + static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { // Val is passed in as S:J1:J2:imm10H:imm10L:'0' @@ -3973,7 +4486,7 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val, const FeatureBitset &featureBits = ((const 
MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); - if (featureBits[ARM::HasV8Ops] && !(Val == 14 || Val == 15)) + if (!isValidCoprocessorNumber(Val, featureBits)) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(Val)); @@ -4981,6 +5494,16 @@ static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn, if (mask == 0x0) return MCDisassembler::Fail; + // IT masks are encoded as a sequence of replacement low-order bits + // for the condition code. So if the low bit of the starting + // condition code is 1, then we have to flip all the bits above the + // terminating bit (which is the lowest 1 bit). + if (pred & 1) { + unsigned LowBit = mask & -mask; + unsigned BitsAboveLowBit = 0xF & (-LowBit << 1); + mask ^= BitsAboveLowBit; + } + Inst.addOperand(MCOperand::createImm(pred)); Inst.addOperand(MCOperand::createImm(mask)); return S; @@ -5341,14 +5864,37 @@ static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val, ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); DecodeStatus S = MCDisassembler::Success; - unsigned Rt = fieldFromInstruction(Val, 12, 4); + // Add explicit operand for the destination sysreg, for cases where + // we have to model it for code generation purposes. + switch (Inst.getOpcode()) { + case ARM::VMSR_FPSCR_NZCVQC: + Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV)); + break; + case ARM::VMSR_P0: + Inst.addOperand(MCOperand::createReg(ARM::VPR)); + break; + } - if (featureBits[ARM::ModeThumb] && !featureBits[ARM::HasV8Ops]) { - if (Rt == 13 || Rt == 15) - S = MCDisassembler::SoftFail; - Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)); - } else - Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)); + if (Inst.getOpcode() != ARM::FMSTAT) { + unsigned Rt = fieldFromInstruction(Val, 12, 4); + + if (featureBits[ARM::ModeThumb] && !featureBits[ARM::HasV8Ops]) { + if (Rt == 13 || Rt == 15) + S = MCDisassembler::SoftFail; + Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)); + } else + Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)); + } + + // Add explicit operand for the source sysreg, similarly to above. + switch (Inst.getOpcode()) { + case ARM::VMRS_FPSCR_NZCVQC: + Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV)); + break; + case ARM::VMRS_P0: + Inst.addOperand(MCOperand::createReg(ARM::VPR)); + break; + } if (featureBits[ARM::ModeThumb]) { Inst.addOperand(MCOperand::createImm(ARMCC::AL)); @@ -5361,3 +5907,668 @@ static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val, return S; } + +template +static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + if (Val == 0 && !zeroPermitted) + S = MCDisassembler::Fail; + + uint64_t DecVal; + if (isSigned) + DecVal = SignExtend32(Val << 1); + else + DecVal = (Val << 1); + + if (!tryAddingSymbolicOperand(Address, Address + DecVal + 4, true, 4, Inst, + Decoder)) + Inst.addOperand(MCOperand::createImm(isNeg ? 
-DecVal : DecVal)); + return S; +} + +static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + + uint64_t LocImm = Inst.getOperand(0).getImm(); + Val = LocImm + (2 << Val); + if (!tryAddingSymbolicOperand(Address, Address + Val + 4, true, 4, Inst, + Decoder)) + Inst.addOperand(MCOperand::createImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + if (Val >= ARMCC::AL) // also exclude the non-condition NV + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (Inst.getOpcode() == ARM::MVE_LCTP) + return S; + + unsigned Imm = fieldFromInstruction(Insn, 11, 1) | + fieldFromInstruction(Insn, 1, 10) << 1; + switch (Inst.getOpcode()) { + case ARM::t2LEUpdate: + case ARM::MVE_LETP: + Inst.addOperand(MCOperand::createReg(ARM::LR)); + Inst.addOperand(MCOperand::createReg(ARM::LR)); + LLVM_FALLTHROUGH; + case ARM::t2LE: + if (!Check(S, DecodeBFLabelOperand( + Inst, Imm, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::t2WLS: + case ARM::MVE_WLSTP_8: + case ARM::MVE_WLSTP_16: + case ARM::MVE_WLSTP_32: + case ARM::MVE_WLSTP_64: + Inst.addOperand(MCOperand::createReg(ARM::LR)); + if (!Check(S, + DecoderGPRRegisterClass(Inst, fieldFromInstruction(Insn, 16, 4), + Address, Decoder)) || + !Check(S, DecodeBFLabelOperand( + Inst, Imm, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::t2DLS: + case ARM::MVE_DLSTP_8: + case ARM::MVE_DLSTP_16: + case ARM::MVE_DLSTP_32: + case ARM::MVE_DLSTP_64: + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + if (Rn == 0xF) { + // Enforce all the rest of the instruction bits in LCTP, which + // won't have been reliably checked based on LCTP's own tablegen + // record, because we came to this decode by a roundabout route. 
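A self-contained sketch of the hard-fail/soft-fail split the constants just below implement: anything outside the should-be-zero mask is a mandatory opcode bit and earns a hard Fail, while a stray SBZ bit only downgrades to SoftFail. The two constants are copied from the hunk; everything else is a stand-in:

#include <cassert>
#include <cstdint>

enum Status { OK, Soft, Hard };

static Status checkLCTPEncoding(uint32_t Insn) {
  const uint32_t CanonicalLCTP = 0xF00FE001, SBZMask = 0x00300FFE;
  if ((Insn & ~SBZMask) != CanonicalLCTP)
    return Hard;  // a mandatory opcode bit is wrong: not LCTP at all
  if (Insn != CanonicalLCTP)
    return Soft;  // only should-be-zero bits differ: UNPREDICTABLE
  return OK;
}

int main() {
  assert(checkLCTPEncoding(0xF00FE001) == OK);
  assert(checkLCTPEncoding(0xF00FE003) == Soft);  // SBZ bit 1 set
  assert(checkLCTPEncoding(0xF00FE000) == Hard);  // mandatory bit 0 clear
  return 0;
}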
+ uint32_t CanonicalLCTP = 0xF00FE001, SBZMask = 0x00300FFE; + if ((Insn & ~SBZMask) != CanonicalLCTP) + return MCDisassembler::Fail; // a mandatory bit is wrong: hard fail + if (Insn != CanonicalLCTP) + Check(S, MCDisassembler::SoftFail); // an SBZ bit is wrong: soft fail + + Inst.setOpcode(ARM::MVE_LCTP); + } else { + Inst.addOperand(MCOperand::createReg(ARM::LR)); + if (!Check(S, DecoderGPRRegisterClass(Inst, + fieldFromInstruction(Insn, 16, 4), + Address, Decoder))) + return MCDisassembler::Fail; + } + break; + } + return S; +} + +static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (Val == 0) + Val = 32; + + Inst.addOperand(MCOperand::createImm(Val)); + + return S; +} + +static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if ((RegNo) + 1 > 11) + return MCDisassembler::Fail; + + unsigned Register = GPRDecoderTable[(RegNo) + 1]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if ((RegNo) > 14) + return MCDisassembler::Fail; + + unsigned Register = GPRDecoderTable[(RegNo)]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + Inst.addOperand(MCOperand::createImm(ARMCC::AL)); + Inst.addOperand(MCOperand::createReg(0)); + if (Inst.getOpcode() == ARM::VSCCLRMD) { + unsigned reglist = (fieldFromInstruction(Insn, 1, 7) << 1) | + (fieldFromInstruction(Insn, 12, 4) << 8) | + (fieldFromInstruction(Insn, 22, 1) << 12); + if (!Check(S, DecodeDPRRegListOperand(Inst, reglist, Address, Decoder))) { + return MCDisassembler::Fail; + } + } else { + unsigned reglist = fieldFromInstruction(Insn, 0, 8) | + (fieldFromInstruction(Insn, 22, 1) << 8) | + (fieldFromInstruction(Insn, 12, 4) << 9); + if (!Check(S, DecodeSPRRegListOperand(Inst, reglist, Address, Decoder))) { + return MCDisassembler::Fail; + } + } + Inst.addOperand(MCOperand::createReg(ARM::VPR)); + + return S; +} + +static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 7) + return MCDisassembler::Fail; + + unsigned Register = QPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static const uint16_t QQPRDecoderTable[] = { + ARM::Q0_Q1, ARM::Q1_Q2, ARM::Q2_Q3, ARM::Q3_Q4, + ARM::Q4_Q5, ARM::Q5_Q6, ARM::Q6_Q7 +}; + +static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 6) + return MCDisassembler::Fail; + + unsigned Register = QQPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static const uint16_t QQQQPRDecoderTable[] = { + ARM::Q0_Q1_Q2_Q3, ARM::Q1_Q2_Q3_Q4, ARM::Q2_Q3_Q4_Q5, + ARM::Q3_Q4_Q5_Q6, ARM::Q4_Q5_Q6_Q7 +}; + +static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 4) + return MCDisassembler::Fail; + + unsigned Register = QQQQPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static 
DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + // Parse VPT mask and encode it in the MCInst as an immediate with the same + // format as the it_mask. That is, from the second 'e|t' encode 'e' as 1 and + // 't' as 0 and finish with a 1. + unsigned Imm = 0; + // We always start with a 't'. + unsigned CurBit = 0; + for (int i = 3; i >= 0; --i) { + // If the bit we are looking at is not the same as last one, invert the + // CurBit, if it is the same leave it as is. + CurBit ^= (Val >> i) & 1U; + + // Encode the CurBit at the right place in the immediate. + Imm |= (CurBit << i); + + // If we are done, finish the encoding with a 1. + if ((Val & ~(~0U << i)) == 0) { + Imm |= 1U << i; + break; + } + } + + Inst.addOperand(MCOperand::createImm(Imm)); + + return S; +} + +static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + // The vpred_r operand type includes an MQPR register field derived + // from the encoding. But we don't actually want to add an operand + // to the MCInst at this stage, because AddThumbPredicate will do it + // later, and will infer the register number from the TIED_TO + // constraint. So this is a deliberately empty decoder method that + // will inhibit the auto-generated disassembly code from adding an + // operand at all. + return MCDisassembler::Success; +} + +static DecodeStatus DecodeRestrictedIPredicateOperand(MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::createImm((Val & 0x1) == 0 ? ARMCC::EQ : ARMCC::NE)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeRestrictedSPredicateOperand(MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder) { + unsigned Code; + switch (Val & 0x3) { + case 0: + Code = ARMCC::GE; + break; + case 1: + Code = ARMCC::LT; + break; + case 2: + Code = ARMCC::GT; + break; + case 3: + Code = ARMCC::LE; + break; + } + Inst.addOperand(MCOperand::createImm(Code)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeRestrictedUPredicateOperand(MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::createImm((Val & 0x1) == 0 ? 
ARMCC::HS : ARMCC::HI)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeRestrictedFPPredicateOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + unsigned Code; + switch (Val) { + default: + return MCDisassembler::Fail; + case 0: + Code = ARMCC::EQ; + break; + case 1: + Code = ARMCC::NE; + break; + case 4: + Code = ARMCC::GE; + break; + case 5: + Code = ARMCC::LT; + break; + case 6: + Code = ARMCC::GT; + break; + case 7: + Code = ARMCC::LE; + break; + } + + Inst.addOperand(MCOperand::createImm(Code)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeVCVTImmOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned DecodedVal = 64 - Val; + + switch (Inst.getOpcode()) { + case ARM::MVE_VCVTf16s16_fix: + case ARM::MVE_VCVTs16f16_fix: + case ARM::MVE_VCVTf16u16_fix: + case ARM::MVE_VCVTu16f16_fix: + if (DecodedVal > 16) + return MCDisassembler::Fail; + break; + case ARM::MVE_VCVTf32s32_fix: + case ARM::MVE_VCVTs32f32_fix: + case ARM::MVE_VCVTf32u32_fix: + case ARM::MVE_VCVTu32f32_fix: + if (DecodedVal > 32) + return MCDisassembler::Fail; + break; + } + + Inst.addOperand(MCOperand::createImm(64 - Val)); + + return S; +} + +static unsigned FixedRegForVSTRVLDR_SYSREG(unsigned Opcode) { + switch (Opcode) { + case ARM::VSTR_P0_off: + case ARM::VSTR_P0_pre: + case ARM::VSTR_P0_post: + case ARM::VLDR_P0_off: + case ARM::VLDR_P0_pre: + case ARM::VLDR_P0_post: + return ARM::P0; + default: + return 0; + } +} + +template +static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + switch (Inst.getOpcode()) { + case ARM::VSTR_FPSCR_pre: + case ARM::VSTR_FPSCR_NZCVQC_pre: + case ARM::VLDR_FPSCR_pre: + case ARM::VLDR_FPSCR_NZCVQC_pre: + case ARM::VSTR_FPSCR_off: + case ARM::VSTR_FPSCR_NZCVQC_off: + case ARM::VLDR_FPSCR_off: + case ARM::VLDR_FPSCR_NZCVQC_off: + case ARM::VSTR_FPSCR_post: + case ARM::VSTR_FPSCR_NZCVQC_post: + case ARM::VLDR_FPSCR_post: + case ARM::VLDR_FPSCR_NZCVQC_post: + const FeatureBitset &featureBits = + ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + + if (!featureBits[ARM::HasMVEIntegerOps] && !featureBits[ARM::FeatureVFP2]) + return MCDisassembler::Fail; + } + + DecodeStatus S = MCDisassembler::Success; + if (unsigned Sysreg = FixedRegForVSTRVLDR_SYSREG(Inst.getOpcode())) + Inst.addOperand(MCOperand::createReg(Sysreg)); + unsigned Rn = fieldFromInstruction(Val, 16, 4); + unsigned addr = fieldFromInstruction(Val, 0, 7) | + (fieldFromInstruction(Val, 23, 1) << 7) | (Rn << 8); + + if (Writeback) { + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + if (!Check(S, DecodeT2AddrModeImm7s4(Inst, addr, Address, Decoder))) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createImm(ARMCC::AL)); + Inst.addOperand(MCOperand::createReg(0)); + + return S; +} + +static inline DecodeStatus DecodeMVE_MEM_pre( + MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder, + unsigned Rn, OperandDecoder RnDecoder, OperandDecoder AddrDecoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Qd = fieldFromInstruction(Val, 13, 3); + unsigned addr = fieldFromInstruction(Val, 0, 7) | + (fieldFromInstruction(Val, 23, 1) << 7) | (Rn << 8); + + if (!Check(S, RnDecoder(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder))) + return 
+
+static inline DecodeStatus DecodeMVE_MEM_pre(
+  MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder,
+  unsigned Rn, OperandDecoder RnDecoder, OperandDecoder AddrDecoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Qd = fieldFromInstruction(Val, 13, 3);
+  unsigned addr = fieldFromInstruction(Val, 0, 7) |
+                  (fieldFromInstruction(Val, 23, 1) << 7) | (Rn << 8);
+
+  if (!Check(S, RnDecoder(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, AddrDecoder(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+template <int shift>
+static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder,
+                           fieldFromInstruction(Val, 16, 3),
+                           DecodetGPRRegisterClass,
+                           DecodeTAddrModeImm7<shift>);
+}
+
+template <int shift>
+static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder,
+                           fieldFromInstruction(Val, 16, 4),
+                           DecoderGPRRegisterClass,
+                           DecodeT2AddrModeImm7<shift,1>);
+}
+
+template <int shift>
+static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder,
+                           fieldFromInstruction(Val, 17, 3),
+                           DecodeMQPRRegisterClass,
+                           DecodeMveAddrModeQ<shift>);
+}
+
+template <unsigned MinLog, unsigned MaxLog>
+static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (Val < MinLog || Val > MaxLog)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(1LL << Val));
+  return S;
+}
+
+template <int shift>
+static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  Val <<= shift;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
+template <unsigned start>
+static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val,
+                                                    uint64_t Address,
+                                                    const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  Inst.addOperand(MCOperand::createImm(start + Val));
+
+  return S;
+}
+
+static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) |
+                 fieldFromInstruction(Insn, 13, 3));
+  unsigned index = fieldFromInstruction(Insn, 4, 1);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMVEPairVectorIndexOperand<2>(Inst, index, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMVEPairVectorIndexOperand<0>(Inst, index, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) |
+                 fieldFromInstruction(Insn, 13, 3));
+  unsigned index = fieldFromInstruction(Insn, 4, 1);
+
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMVEPairVectorIndexOperand<2>(Inst, index, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMVEPairVectorIndexOperand<0>(Inst, index, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
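The two DecodeMVEPairVectorIndexOperand instantiations used by the VMOV decoders above expand the single encoded index bit into two lane-number operands. A small standalone sketch of that expansion, written independently of the LLVM sources and only assuming the decoder's start+Val behaviour shown above:

    #include <cstdio>

    // Mirrors DecodeMVEPairVectorIndexOperand<start> as used above: the
    // decoder simply emits start + Val; the VMOV decoders instantiate it
    // with start = 2 and then start = 0.
    template <unsigned start> unsigned pairVectorIndex(unsigned Val) {
      return start + Val;
    }

    int main() {
      for (unsigned idx = 0; idx <= 1; ++idx)
        std::printf("encoded idx %u -> lanes %u and %u\n", idx,
                    pairVectorIndex<2>(idx), pairVectorIndex<0>(idx));
      // encoded idx 0 -> lanes 2 and 0; encoded idx 1 -> lanes 3 and 1
      return 0;
    }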
+
+static DecodeStatus DecodeMVEOverlappingLongShift(
+  MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned RdaLo = fieldFromInstruction(Insn, 17, 3) << 1;
+  unsigned RdaHi = fieldFromInstruction(Insn, 9, 3) << 1;
+  unsigned Rm = fieldFromInstruction(Insn, 12, 4);
+
+  if (RdaHi == 14) {
+    // This value of RdaHi (really indicating pc, because RdaHi has to
+    // be an odd-numbered register, so the low bit will be set by the
+    // decode function below) indicates that we must decode as SQRSHR
+    // or UQRSHL, which both have a single Rda register field with all
+    // four bits.
+    unsigned Rda = fieldFromInstruction(Insn, 16, 4);
+
+    switch (Inst.getOpcode()) {
+    case ARM::MVE_ASRLr:
+    case ARM::MVE_SQRSHRL:
+      Inst.setOpcode(ARM::MVE_SQRSHR);
+      break;
+    case ARM::MVE_LSLLr:
+    case ARM::MVE_UQRSHLL:
+      Inst.setOpcode(ARM::MVE_UQRSHL);
+      break;
+    default:
+      llvm_unreachable("Unexpected starting opcode!");
+    }
+
+    // Rda as output parameter
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rda, Address, Decoder)))
+      return MCDisassembler::Fail;
+
+    // Rda again as input parameter
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rda, Address, Decoder)))
+      return MCDisassembler::Fail;
+
+    // Rm, the amount to shift by
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+
+    return S;
+  }
+
+  // Otherwise, we decode as whichever opcode our caller has already
+  // put into Inst. Those all look the same:
+
+  // RdaLo,RdaHi as output parameters
+  if (!Check(S, DecodetGPREvenRegisterClass(Inst, RdaLo, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodetGPROddRegisterClass(Inst, RdaHi, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // RdaLo,RdaHi again as input parameters
+  if (!Check(S, DecodetGPREvenRegisterClass(Inst, RdaLo, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodetGPROddRegisterClass(Inst, RdaHi, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // Rm, the amount to shift by
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                      const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) |
+                 fieldFromInstruction(Insn, 13, 3));
+  unsigned Qm = ((fieldFromInstruction(Insn, 5, 1) << 3) |
+                 fieldFromInstruction(Insn, 1, 3));
+  unsigned imm6 = fieldFromInstruction(Insn, 16, 6);
+
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeVCVTImmOperand(Inst, imm6, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+template <bool scalar, OperandDecoder predicate_decoder>
+static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                  const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  Inst.addOperand(MCOperand::createReg(ARM::VPR));
+  unsigned Qn = fieldFromInstruction(Insn, 17, 3);
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  unsigned fc;
+
+  if (scalar) {
+    fc = fieldFromInstruction(Insn, 12, 1) << 2 |
+         fieldFromInstruction(Insn, 7, 1) |
+         fieldFromInstruction(Insn, 5, 1) << 1;
+    unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+    if (!Check(S, DecodeGPRwithZRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  } else {
+    fc = fieldFromInstruction(Insn, 12, 1) << 2 |
+         fieldFromInstruction(Insn, 7, 1) |
+         fieldFromInstruction(Insn, 0, 1) << 1;
+    unsigned Qm = fieldFromInstruction(Insn, 5, 1) << 4 |
+                  fieldFromInstruction(Insn, 1, 3);
+    if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, predicate_decoder(Inst, fc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(ARMVCC::None));
+  Inst.addOperand(MCOperand::createReg(0));
+  Inst.addOperand(MCOperand::createImm(0));
+
+  return S;
+}
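The scalar branch of DecodeMVEVCMP gathers the 3-bit condition code fc from bits 12, 7 and 5 of the instruction word; the table in DecodeRestrictedFPPredicateOperand above then maps the result onto ARMCC values. A standalone sketch of that gather, using a made-up encoding and a local bitfield helper (both assumptions for illustration, not LLVM code):

    #include <cstdint>
    #include <cstdio>

    // Local bitfield helper, standing in for fieldFromInstruction.
    static uint32_t field(uint32_t insn, unsigned lo, unsigned n) {
      return (insn >> lo) & ((1u << n) - 1);
    }

    int main() {
      // Bits 12 and 5 set: fc = 0b110 = 6, which the restricted FP
      // predicate table above decodes as ARMCC::GT.
      uint32_t Insn = (1u << 12) | (1u << 5); // fabricated encoding
      unsigned fc = field(Insn, 12, 1) << 2 | field(Insn, 7, 1) |
                    field(Insn, 5, 1) << 1;
      std::printf("fc = %u\n", fc); // prints: fc = 6
      return 0;
    }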
+
+static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                  const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  Inst.addOperand(MCOperand::createReg(ARM::VPR));
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
deleted file mode 100644
index 2f84719c4c4f..000000000000
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ /dev/null
@@ -1,1571 +0,0 @@
-//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an ARM MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARMInstPrinter.h"
-#include "Utils/ARMBaseInfo.h"
-#include "MCTargetDesc/ARMAddressingModes.h"
-#include "MCTargetDesc/ARMBaseInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "asm-printer"
-
-#define PRINT_ALIAS_INSTR
-#include "ARMGenAsmWriter.inc"
-
-/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
-///
-/// getSORegOffset returns an integer from 0-31, representing '32' as 0.
-static unsigned translateShiftImm(unsigned imm) {
-  // lsr #32 and asr #32 exist, but should be encoded as a 0.
-  assert((imm & ~0x1f) == 0 && "Invalid shift encoding");
-
-  if (imm == 0)
-    return 32;
-  return imm;
-}
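translateShiftImm exists because the A32 encodings store lsr/asr #32 as a zero shift amount. A tiny self-contained check of that mapping, outside the patch and assuming only the behaviour shown above:

    #include <cassert>
    #include <cstdio>

    // Same 0 -> 32 mapping as translateShiftImm in the deleted printer.
    static unsigned translateShiftImm(unsigned imm) {
      assert((imm & ~0x1fu) == 0 && "Invalid shift encoding");
      return imm == 0 ? 32 : imm;
    }

    int main() {
      std::printf("%u %u %u\n", translateShiftImm(0), translateShiftImm(1),
                  translateShiftImm(31)); // prints: 32 1 31
      return 0;
    }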
-
-/// Prints the shift value with an immediate value.
-static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc,
-                             unsigned ShImm, bool UseMarkup) {
-  if (ShOpc == ARM_AM::no_shift || (ShOpc == ARM_AM::lsl && !ShImm))
-    return;
-  O << ", ";
-
-  assert(!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0");
-  O << getShiftOpcStr(ShOpc);
-
-  if (ShOpc != ARM_AM::rrx) {
-    O << " ";
-    if (UseMarkup)
-      O << "<imm:";
-    O << "#" << translateShiftImm(ShImm);
-    if (UseMarkup)
-      O << ">";
-  }
-}
-
-ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                               const MCRegisterInfo &MRI)
-    : MCInstPrinter(MAI, MII, MRI) {}
-
-void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
-  OS << markup("<reg:") << getRegisterName(RegNo) << markup(">");
-}
-
-void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                               StringRef Annot, const MCSubtargetInfo &STI) {
-  unsigned Opcode = MI->getOpcode();
-
-  switch (Opcode) {
-  // Check for MOVs and print canonical forms, instead.
-  case ARM::MOVsr: {
-    // FIXME: Thumb variants?
-    const MCOperand &Dst = MI->getOperand(0);
-    const MCOperand &MO1 = MI->getOperand(1);
-    const MCOperand &MO2 = MI->getOperand(2);
-    const MCOperand &MO3 = MI->getOperand(3);
-
-    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()));
-    printSBitModifierOperand(MI, 6, STI, O);
-    printPredicateOperand(MI, 4, STI, O);
-
-    O << '\t';
-    printRegName(O, Dst.getReg());
-    O << ", ";
-    printRegName(O, MO1.getReg());
-
-    O << ", ";
-    printRegName(O, MO2.getReg());
-    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
-    printAnnotation(O, Annot);
-    return;
-  }
-
-  case ARM::MOVsi: {
-    // FIXME: Thumb variants?
-    const MCOperand &Dst = MI->getOperand(0);
-    const MCOperand &MO1 = MI->getOperand(1);
-    const MCOperand &MO2 = MI->getOperand(2);
-
-    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm()));
-    printSBitModifierOperand(MI, 5, STI, O);
-    printPredicateOperand(MI, 3, STI, O);
-
-    O << '\t';
-    printRegName(O, Dst.getReg());
-    O << ", ";
-    printRegName(O, MO1.getReg());
-
-    if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) {
-      printAnnotation(O, Annot);
-      return;
-    }
-
-    O << ", " << markup("<imm:") << "#"
-      << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm())) << markup(">");
-    printAnnotation(O, Annot);
-    return;
-  }
-
-  // A8.6.123 PUSH
-  case ARM::STMDB_UPD:
-  case ARM::t2STMDB_UPD:
-    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
-      // Should only print PUSH if there are at least two registers in the list.
-      O << '\t' << "push";
-      printPredicateOperand(MI, 2, STI, O);
-      if (Opcode == ARM::t2STMDB_UPD)
-        O << ".w";
-      O << '\t';
-      printRegisterList(MI, 4, STI, O);
-      printAnnotation(O, Annot);
-      return;
-    } else
-      break;
-
-  case ARM::STR_PRE_IMM:
-    if (MI->getOperand(2).getReg() == ARM::SP &&
-        MI->getOperand(3).getImm() == -4) {
-      O << '\t' << "push";
-      printPredicateOperand(MI, 4, STI, O);
-      O << "\t{";
-      printRegName(O, MI->getOperand(1).getReg());
-      O << "}";
-      printAnnotation(O, Annot);
-      return;
-    } else
-      break;
-
-  // A8.6.122 POP
-  case ARM::LDMIA_UPD:
-  case ARM::t2LDMIA_UPD:
-    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
-      // Should only print POP if there are at least two registers in the list.
-      O << '\t' << "pop";
-      printPredicateOperand(MI, 2, STI, O);
-      if (Opcode == ARM::t2LDMIA_UPD)
-        O << ".w";
-      O << '\t';
-      printRegisterList(MI, 4, STI, O);
-      printAnnotation(O, Annot);
-      return;
-    } else
-      break;
-
-  case ARM::LDR_POST_IMM:
-    if (MI->getOperand(2).getReg() == ARM::SP &&
-        MI->getOperand(4).getImm() == 4) {
-      O << '\t' << "pop";
-      printPredicateOperand(MI, 5, STI, O);
-      O << "\t{";
-      printRegName(O, MI->getOperand(0).getReg());
-      O << "}";
-      printAnnotation(O, Annot);
-      return;
-    } else
-      break;
-
-  // A8.6.355 VPUSH
-  case ARM::VSTMSDB_UPD:
-  case ARM::VSTMDDB_UPD:
-    if (MI->getOperand(0).getReg() == ARM::SP) {
-      O << '\t' << "vpush";
-      printPredicateOperand(MI, 2, STI, O);
-      O << '\t';
-      printRegisterList(MI, 4, STI, O);
-      printAnnotation(O, Annot);
-      return;
-    } else
-      break;
-
-  // A8.6.354 VPOP
-  case ARM::VLDMSIA_UPD:
-  case ARM::VLDMDIA_UPD:
-    if (MI->getOperand(0).getReg() == ARM::SP) {
-      O << '\t' << "vpop";
-      printPredicateOperand(MI, 2, STI, O);
-      O << '\t';
-      printRegisterList(MI, 4, STI, O);
-      printAnnotation(O, Annot);
-      return;
-    } else
-      break;
-
-  case ARM::tLDMIA: {
-    bool Writeback = true;
-    unsigned BaseReg = MI->getOperand(0).getReg();
-    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
-      if (MI->getOperand(i).getReg() == BaseReg)
-        Writeback = false;
-    }
-
-    O << "\tldm";
-
-    printPredicateOperand(MI, 1, STI, O);
-    O << '\t';
-    printRegName(O, BaseReg);
-    if (Writeback)
-      O << "!";
-    O << ", ";
-    printRegisterList(MI, 3, STI, O);
-    printAnnotation(O, Annot);
-    return;
-  }
-
-  // Combine 2 GPRs from disassember into a GPRPair to match with instr def.
-  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
-  // a single GPRPair reg operand is used in the .td file to replace the two
-  // GPRs. However, when decoding them, the two GRPs cannot be automatically
-  // expressed as a GPRPair, so we have to manually merge them.
-  // FIXME: We would really like to be able to tablegen'erate this.
-  case ARM::LDREXD:
-  case ARM::STREXD:
-  case ARM::LDAEXD:
-  case ARM::STLEXD: {
-    const MCRegisterClass &MRC = MRI.getRegClass(ARM::GPRRegClassID);
-    bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
-    unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
-    if (MRC.contains(Reg)) {
-      MCInst NewMI;
-      MCOperand NewReg;
-      NewMI.setOpcode(Opcode);
-
-      if (isStore)
-        NewMI.addOperand(MI->getOperand(0));
-      NewReg = MCOperand::createReg(MRI.getMatchingSuperReg(
-          Reg, ARM::gsub_0, &MRI.getRegClass(ARM::GPRPairRegClassID)));
-      NewMI.addOperand(NewReg);
-
-      // Copy the rest operands into NewMI.
-      for (unsigned i = isStore ?
3 : 2; i < MI->getNumOperands(); ++i) - NewMI.addOperand(MI->getOperand(i)); - printInstruction(&NewMI, STI, O); - return; - } - break; - } - case ARM::TSB: - case ARM::t2TSB: - O << "\ttsb\tcsync"; - return; - case ARM::t2DSB: - switch (MI->getOperand(0).getImm()) { - default: - if (!printAliasInstr(MI, STI, O)) - printInstruction(MI, STI, O); - break; - case 0: - O << "\tssbb"; - break; - case 4: - O << "\tpssbb"; - break; - } - printAnnotation(O, Annot); - return; - } - - if (!printAliasInstr(MI, STI, O)) - printInstruction(MI, STI, O); - - printAnnotation(O, Annot); -} - -void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - printRegName(O, Reg); - } else if (Op.isImm()) { - O << markup(""); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - const MCExpr *Expr = Op.getExpr(); - switch (Expr->getKind()) { - case MCExpr::Binary: - O << '#'; - Expr->print(O, &MAI); - break; - case MCExpr::Constant: { - // If a symbolic branch target was added as a constant expression then - // print that address in hex. And only print 32 unsigned bits for the - // address. - const MCConstantExpr *Constant = cast(Expr); - int64_t TargetAddress; - if (!Constant->evaluateAsAbsolute(TargetAddress)) { - O << '#'; - Expr->print(O, &MAI); - } else { - O << "0x"; - O.write_hex(static_cast(TargetAddress)); - } - break; - } - default: - // FIXME: Should we always treat this as if it is a constant literal and - // prefix it with '#'? - Expr->print(O, &MAI); - break; - } - } -} - -void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - if (MO1.isExpr()) { - MO1.getExpr()->print(O, &MAI); - return; - } - - O << markup(""); - } else { - O << markup(""); - } - O << "]" << markup(">"); -} - -// so_reg is a 4-operand unit corresponding to register forms of the A5.1 -// "Addressing Mode 1 - Data-processing operands" forms. This includes: -// REG 0 0 - e.g. R5 -// REG REG 0,SH_OPC - e.g. R5, ROR R3 -// REG 0 IMM,SH_OPC - e.g. R5, LSL #3 -void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - const MCOperand &MO3 = MI->getOperand(OpNum + 2); - - printRegName(O, MO1.getReg()); - - // Print the shift opc. - ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm()); - O << ", " << ARM_AM::getShiftOpcStr(ShOpc); - if (ShOpc == ARM_AM::rrx) - return; - - O << ' '; - printRegName(O, MO2.getReg()); - assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); -} - -void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - printRegName(O, MO1.getReg()); - - // Print the shift opc. 
- printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()), - ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup); -} - -//===--------------------------------------------------------------------===// -// Addressing Mode #2 -//===--------------------------------------------------------------------===// - -void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(Op); - const MCOperand &MO2 = MI->getOperand(Op + 1); - const MCOperand &MO3 = MI->getOperand(Op + 2); - - O << markup(""); - } - O << "]" << markup(">"); - return; - } - - O << ", "; - O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())); - printRegName(O, MO2.getReg()); - - printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO3.getImm()), - ARM_AM::getAM2Offset(MO3.getImm()), UseMarkup); - O << "]" << markup(">"); -} - -void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(Op); - const MCOperand &MO2 = MI->getOperand(Op + 1); - O << markup(""); -} - -void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(Op); - const MCOperand &MO2 = MI->getOperand(Op + 1); - O << markup("") << "]" << markup(">"); -} - -void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(Op); - - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. - printOperand(MI, Op, STI, O); - return; - } - -#ifndef NDEBUG - const MCOperand &MO3 = MI->getOperand(Op + 2); - unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm()); - assert(IdxMode != ARMII::IndexModePost && "Should be pre or offset index op"); -#endif - - printAM2PreOrOffsetIndexOp(MI, Op, STI, O); -} - -void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - if (!MO1.getReg()) { - unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm()); - O << markup(""); - return; - } - - O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())); - printRegName(O, MO1.getReg()); - - printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO2.getImm()), - ARM_AM::getAM2Offset(MO2.getImm()), UseMarkup); -} - -//===--------------------------------------------------------------------===// -// Addressing Mode #3 -//===--------------------------------------------------------------------===// - -void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, - raw_ostream &O, - bool AlwaysPrintImm0) { - const MCOperand &MO1 = MI->getOperand(Op); - const MCOperand &MO2 = MI->getOperand(Op + 1); - const MCOperand &MO3 = MI->getOperand(Op + 2); - - O << markup(""); - return; - } - - // If the op is sub we have to print the immediate even if it is 0 - unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()); - ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm()); - - if (AlwaysPrintImm0 || ImmOffs || (op == ARM_AM::sub)) { - O << ", " << markup(""); - } - O << ']' << markup(">"); -} - -template -void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(Op); - if (!MO1.isReg()) { // For label symbolic references. 
- printOperand(MI, Op, STI, O); - return; - } - - assert(ARM_AM::getAM3IdxMode(MI->getOperand(Op + 2).getImm()) != - ARMII::IndexModePost && - "unexpected idxmode"); - printAM3PreOrOffsetIndexOp(MI, Op, O, AlwaysPrintImm0); -} - -void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - if (MO1.getReg()) { - O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())); - printRegName(O, MO1.getReg()); - return; - } - - unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm()); - O << markup(""); -} - -void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - unsigned Imm = MO.getImm(); - O << markup(""); -} - -void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - O << (MO2.getImm() ? "" : "-"); - printRegName(O, MO1.getReg()); -} - -void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - unsigned Imm = MO.getImm(); - O << markup(""); -} - -void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - ARM_AM::AMSubMode Mode = - ARM_AM::getAM4SubMode(MI->getOperand(OpNum).getImm()); - O << ARM_AM::getAMSubModeStr(Mode); -} - -template -void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. - printOperand(MI, OpNum, STI, O); - return; - } - - O << markup(""); - } - O << "]" << markup(">"); -} - -template -void ARMInstPrinter::printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum+1); - - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. 
- printOperand(MI, OpNum, STI, O); - return; - } - - O << markup(""); - } - O << "]" << markup(">"); -} - -void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - O << markup(""); -} - -void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - O << markup(""); -} - -void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - if (MO.getReg() == 0) - O << "!"; - else { - O << ", "; - printRegName(O, MO.getReg()); - } -} - -void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - uint32_t v = ~MO.getImm(); - int32_t lsb = countTrailingZeros(v); - int32_t width = (32 - countLeadingZeros(v)) - lsb; - assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!"); - O << markup("") << ", " << markup(""); -} - -void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned val = MI->getOperand(OpNum).getImm(); - O << ARM_MB::MemBOptToString(val, STI.getFeatureBits()[ARM::HasV8Ops]); -} - -void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned val = MI->getOperand(OpNum).getImm(); - O << ARM_ISB::InstSyncBOptToString(val); -} - -void ARMInstPrinter::printTraceSyncBOption(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned val = MI->getOperand(OpNum).getImm(); - O << ARM_TSB::TraceSyncBOptToString(val); -} - -void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned ShiftOp = MI->getOperand(OpNum).getImm(); - bool isASR = (ShiftOp & (1 << 5)) != 0; - unsigned Amt = ShiftOp & 0x1f; - if (isASR) { - O << ", asr " << markup(""); - } else if (Amt) { - O << ", lsl " << markup(""); - } -} - -void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNum).getImm(); - if (Imm == 0) - return; - assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!"); - O << ", lsl " << markup(""); -} - -void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNum).getImm(); - // A shift amount of 32 is encoded as 0. 
- if (Imm == 0) - Imm = 32; - assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!"); - O << ", asr " << markup(""); -} - -void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - assert(std::is_sorted(MI->begin() + OpNum, MI->end(), - [&](const MCOperand &LHS, const MCOperand &RHS) { - return MRI.getEncodingValue(LHS.getReg()) < - MRI.getEncodingValue(RHS.getReg()); - })); - - O << "{"; - for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { - if (i != OpNum) - O << ", "; - printRegName(O, MI->getOperand(i).getReg()); - } - O << "}"; -} - -void ARMInstPrinter::printGPRPairOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - printRegName(O, MRI.getSubReg(Reg, ARM::gsub_0)); - O << ", "; - printRegName(O, MRI.getSubReg(Reg, ARM::gsub_1)); -} - -void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - if (Op.getImm()) - O << "be"; - else - O << "le"; -} - -void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - O << ARM_PROC::IModToString(Op.getImm()); -} - -void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - unsigned IFlags = Op.getImm(); - for (int i = 2; i >= 0; --i) - if (IFlags & (1 << i)) - O << ARM_PROC::IFlagsToString(1 << i); - - if (IFlags == 0) - O << "none"; -} - -void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - const FeatureBitset &FeatureBits = STI.getFeatureBits(); - if (FeatureBits[ARM::FeatureMClass]) { - - unsigned SYSm = Op.getImm() & 0xFFF; // 12-bit SYSm - unsigned Opcode = MI->getOpcode(); - - // For writes, handle extended mask bits if the DSP extension is present. - if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSP]) { - auto TheReg =ARMSysReg::lookupMClassSysRegBy12bitSYSmValue(SYSm); - if (TheReg && TheReg->isInRequiredFeatures({ARM::FeatureDSP})) { - O << TheReg->Name; - return; - } - } - - // Handle the basic 8-bit mask. - SYSm &= 0xff; - if (Opcode == ARM::t2MSR_M && FeatureBits [ARM::HasV7Ops]) { - // ARMv7-M deprecates using MSR APSR without a _ qualifier as an - // alias for MSR APSR_nzcvq. - auto TheReg = ARMSysReg::lookupMClassSysRegAPSRNonDeprecated(SYSm); - if (TheReg) { - O << TheReg->Name; - return; - } - } - - auto TheReg = ARMSysReg::lookupMClassSysRegBy8bitSYSmValue(SYSm); - if (TheReg) { - O << TheReg->Name; - return; - } - - O << SYSm; - - return; - } - - // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as - // APSR_nzcvq, APSR_g and APSRnzcvqg, respectively. 
- unsigned SpecRegRBit = Op.getImm() >> 4; - unsigned Mask = Op.getImm() & 0xf; - - if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) { - O << "APSR_"; - switch (Mask) { - default: - llvm_unreachable("Unexpected mask value!"); - case 4: - O << "g"; - return; - case 8: - O << "nzcvq"; - return; - case 12: - O << "nzcvqg"; - return; - } - } - - if (SpecRegRBit) - O << "SPSR"; - else - O << "CPSR"; - - if (Mask) { - O << '_'; - if (Mask & 8) - O << 'f'; - if (Mask & 4) - O << 's'; - if (Mask & 2) - O << 'x'; - if (Mask & 1) - O << 'c'; - } -} - -void ARMInstPrinter::printBankedRegOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint32_t Banked = MI->getOperand(OpNum).getImm(); - auto TheReg = ARMBankedReg::lookupBankedRegByEncoding(Banked); - assert(TheReg && "invalid banked register operand"); - std::string Name = TheReg->Name; - - uint32_t isSPSR = (Banked & 0x20) >> 5; - if (isSPSR) - Name.replace(0, 4, "SPSR"); // convert 'spsr_' to 'SPSR_' - O << Name; -} - -void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); - // Handle the undefined 15 CC value here for printing so we don't abort(). - if ((unsigned)CC == 15) - O << ""; - else if (CC != ARMCC::AL) - O << ARMCondCodeToString(CC); -} - -void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); - O << ARMCondCodeToString(CC); -} - -void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNum).getReg()) { - assert(MI->getOperand(OpNum).getReg() == ARM::CPSR && - "Expect ARM CPSR register!"); - O << 's'; - } -} - -void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << MI->getOperand(OpNum).getImm(); -} - -void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << "p" << MI->getOperand(OpNum).getImm(); -} - -void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << "c" << MI->getOperand(OpNum).getImm(); -} - -void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << "{" << MI->getOperand(OpNum).getImm() << "}"; -} - -void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O) { - llvm_unreachable("Unhandled PC-relative pseudo-instruction!"); -} - -template -void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - - if (MO.isExpr()) { - MO.getExpr()->print(O, &MAI); - return; - } - - int32_t OffImm = (int32_t)MO.getImm() << scale; - - O << markup(""); -} - -void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << markup("getOperand(OpNum).getImm() * 4) - << markup(">"); -} - -void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNum).getImm(); - O << markup(""); -} - 
-void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - // (3 - the number of trailing zeros) is the number of then / else. - unsigned Mask = MI->getOperand(OpNum).getImm(); - unsigned Firstcond = MI->getOperand(OpNum - 1).getImm(); - unsigned CondBit0 = Firstcond & 1; - unsigned NumTZ = countTrailingZeros(Mask); - assert(NumTZ <= 3 && "Invalid IT mask!"); - for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { - bool T = ((Mask >> Pos) & 1) == CondBit0; - if (T) - O << 't'; - else - O << 'e'; - } -} - -void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(Op); - const MCOperand &MO2 = MI->getOperand(Op + 1); - - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. - printOperand(MI, Op, STI, O); - return; - } - - O << markup(""); -} - -void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI, - unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O, - unsigned Scale) { - const MCOperand &MO1 = MI->getOperand(Op); - const MCOperand &MO2 = MI->getOperand(Op + 1); - - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. - printOperand(MI, Op, STI, O); - return; - } - - O << markup(""); - } - O << "]" << markup(">"); -} - -void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI, - unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printThumbAddrModeImm5SOperand(MI, Op, STI, O, 1); -} - -void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI, - unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printThumbAddrModeImm5SOperand(MI, Op, STI, O, 2); -} - -void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI, - unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printThumbAddrModeImm5SOperand(MI, Op, STI, O, 4); -} - -void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printThumbAddrModeImm5SOperand(MI, Op, STI, O, 4); -} - -// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 -// register with shift forms. -// REG 0 0 - e.g. R5 -// REG IMM, SH_OPC - e.g. R5, LSL #3 -void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - unsigned Reg = MO1.getReg(); - printRegName(O, Reg); - - // Print the shift opc. - assert(MO2.isImm() && "Not a valid t2_so_reg value!"); - printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()), - ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup); -} - -template -void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. 
- printOperand(MI, OpNum, STI, O); - return; - } - - O << markup(""); - } else if (AlwaysPrintImm0 || OffImm > 0) { - O << ", " << markup(""); - } - O << "]" << markup(">"); -} - -template -void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - O << markup(""); - } else if (AlwaysPrintImm0 || OffImm > 0) { - O << ", " << markup(""); - } - O << "]" << markup(">"); -} - -template -void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - if (!MO1.isReg()) { // For label symbolic references. - printOperand(MI, OpNum, STI, O); - return; - } - - O << markup(""); - } else if (AlwaysPrintImm0 || OffImm > 0) { - O << ", " << markup(""); - } - O << "]" << markup(">"); -} - -void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand( - const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - - O << markup(""); - } - O << "]" << markup(">"); -} - -void ARMInstPrinter::printT2AddrModeImm8OffsetOperand( - const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - int32_t OffImm = (int32_t)MO1.getImm(); - O << ", " << markup(""); -} - -void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand( - const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - int32_t OffImm = (int32_t)MO1.getImm(); - - assert(((OffImm & 0x3) == 0) && "Not a valid immediate!"); - - O << ", " << markup(""); -} - -void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum + 1); - const MCOperand &MO3 = MI->getOperand(OpNum + 2); - - O << markup(""); - } - O << "]" << markup(">"); -} - -void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - O << markup(""); -} - -void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned EncodedImm = MI->getOperand(OpNum).getImm(); - unsigned EltBits; - uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); - O << markup(""); -} - -void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNum).getImm(); - O << markup(""); -} - -void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNum).getImm(); - if (Imm == 0) - return; - assert(Imm <= 3 && "illegal ror immediate!"); - O << ", ror " << markup(""); -} - -void ARMInstPrinter::printModImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - MCOperand Op = MI->getOperand(OpNum); - - // Support for fixups (MCFixup) - if (Op.isExpr()) - return printOperand(MI, OpNum, STI, O); - - unsigned Bits = Op.getImm() & 0xFF; - 
unsigned Rot = (Op.getImm() & 0xF00) >> 7; - - bool PrintUnsigned = false; - switch (MI->getOpcode()) { - case ARM::MOVi: - // Movs to PC should be treated unsigned - PrintUnsigned = (MI->getOperand(OpNum - 1).getReg() == ARM::PC); - break; - case ARM::MSRi: - // Movs to special registers should be treated unsigned - PrintUnsigned = true; - break; - } - - int32_t Rotated = ARM_AM::rotr32(Bits, Rot); - if (ARM_AM::getSOImmVal(Rotated) == Op.getImm()) { - // #rot has the least possible value - O << "#" << markup("(Rotated); - else - O << Rotated; - O << markup(">"); - return; - } - - // Explicit #bits, #rot implied - O << "#" << markup("") << ", #" << markup(""); -} - -void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O) { - O << markup("getOperand(OpNum).getImm() - << markup(">"); -} - -void ARMInstPrinter::printFBits32(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O) { - O << markup("getOperand(OpNum).getImm() - << markup(">"); -} - -void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << "[" << MI->getOperand(OpNum).getImm() << "]"; -} - -void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << "}"; -} - -void ARMInstPrinter::printVectorListTwo(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); - unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1); - O << "{"; - printRegName(O, Reg0); - O << ", "; - printRegName(O, Reg1); - O << "}"; -} - -void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); - unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2); - O << "{"; - printRegName(O, Reg0); - O << ", "; - printRegName(O, Reg1); - O << "}"; -} - -void ARMInstPrinter::printVectorListThree(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D. - O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 1); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 2); - O << "}"; -} - -void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D. 
- O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 1); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 2); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 3); - O << "}"; -} - -void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << "[]}"; -} - -void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); - unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1); - O << "{"; - printRegName(O, Reg0); - O << "[], "; - printRegName(O, Reg1); - O << "[]}"; -} - -void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D. - O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 1); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 2); - O << "[]}"; -} - -void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D. - O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 1); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 2); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 3); - O << "[]}"; -} - -void ARMInstPrinter::printVectorListTwoSpacedAllLanes( - const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); - unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2); - O << "{"; - printRegName(O, Reg0); - O << "[], "; - printRegName(O, Reg1); - O << "[]}"; -} - -void ARMInstPrinter::printVectorListThreeSpacedAllLanes( - const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D. - O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 2); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 4); - O << "[]}"; -} - -void ARMInstPrinter::printVectorListFourSpacedAllLanes( - const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D. 
- O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 2); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 4); - O << "[], "; - printRegName(O, MI->getOperand(OpNum).getReg() + 6); - O << "[]}"; -} - -void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI, - unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D. - O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 2); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 4); - O << "}"; -} - -void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D. - O << "{"; - printRegName(O, MI->getOperand(OpNum).getReg()); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 2); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 4); - O << ", "; - printRegName(O, MI->getOperand(OpNum).getReg() + 6); - O << "}"; -} - -template -void ARMInstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - O << "#" << (Val * Angle) + Remainder; -} - diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h deleted file mode 100644 index afc8515136bc..000000000000 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ /dev/null @@ -1,243 +0,0 @@ -//===- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an ARM MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H -#define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H - -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { - -class ARMInstPrinter : public MCInstPrinter { -public: - ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - - // Autogenerated by tblgen. 
- void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, - const MCSubtargetInfo &STI, - raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - - void printSORegRegOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printSORegImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printAddrModeTBB(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAddrModeTBH(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O, - bool AlwaysPrintImm0); - void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAddrMode7Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printMemBOption(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printInstSyncBOption(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printTraceSyncBOption(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printShiftImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - template - void 
printAdrLabelOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printThumbSRImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printThumbITMask(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O, unsigned Scale); - void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printT2SOOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printSetendOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printCPSIMod(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printCPSIFlag(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printBankedRegOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printPredicateOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printSBitModifierOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printRegisterList(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printNoHashImmediate(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printPImmediate(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printCImmediate(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printCoprocOptionImm(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void 
printFPImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printRotImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printModImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printGPRPairOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - - void printPCLabel(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printFBits16(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printFBits32(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorIndex(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListOne(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListTwo(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListThree(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListFour(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListOneAllLanes(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListThreeAllLanes(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListFourAllLanes(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printVectorListThreeSpacedAllLanes(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printVectorListFourSpacedAllLanes(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O); - void printVectorListThreeSpaced(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - void printVectorListFourSpaced(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); - template - void printComplexRotationOp(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H diff --git a/lib/Target/ARM/LICENSE.TXT b/lib/Target/ARM/LICENSE.TXT deleted file mode 100755 index 68afea12ed44..000000000000 --- a/lib/Target/ARM/LICENSE.TXT +++ /dev/null @@ -1,47 +0,0 @@ -ARM Limited - -Software Grant License Agreement ("Agreement") - -Except for the license granted herein to you, ARM Limited ("ARM") reserves all -right, title, and interest in and to the Software (defined below). 
- -Definition - -"Software" means the code and documentation as well as any original work of -authorship, including any modifications or additions to an existing work, that -is intentionally submitted by ARM to llvm.org (http://llvm.org) ("LLVM") for -inclusion in, or documentation of, any of the products owned or managed by LLVM -(the "Work"). For the purposes of this definition, "submitted" means any form of -electronic, verbal, or written communication sent to LLVM or its -representatives, including but not limited to communication on electronic -mailing lists, source code control systems, and issue tracking systems that are -managed by, or on behalf of, LLVM for the purpose of discussing and improving -the Work, but excluding communication that is conspicuously marked otherwise. - -1. Grant of Copyright License. Subject to the terms and conditions of this - Agreement, ARM hereby grants to you and to recipients of the Software - distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge, - royalty-free, irrevocable copyright license to reproduce, prepare derivative - works of, publicly display, publicly perform, sublicense, and distribute the - Software and such derivative works. - -2. Grant of Patent License. Subject to the terms and conditions of this - Agreement, ARM hereby grants you and to recipients of the Software - distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge, - royalty-free, irrevocable (except as stated in this section) patent license - to make, have made, use, offer to sell, sell, import, and otherwise transfer - the Work, where such license applies only to those patent claims licensable - by ARM that are necessarily infringed by ARM's Software alone or by - combination of the Software with the Work to which such Software was - submitted. If any entity institutes patent litigation against ARM or any - other entity (including a cross-claim or counterclaim in a lawsuit) alleging - that ARM's Software, or the Work to which ARM has contributed constitutes - direct or contributory patent infringement, then any patent licenses granted - to that entity under this Agreement for the Software or Work shall terminate - as of the date such litigation is filed. - -Unless required by applicable law or agreed to in writing, the software is -provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -either express or implied, including, without limitation, any warranties or -conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A -PARTICULAR PURPOSE. diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h index e1ea5964cf67..7732a6485a85 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h @@ -1,9 +1,8 @@ //===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -31,7 +30,8 @@ namespace ARM_AM {
     lsl,
     lsr,
     ror,
-    rrx
+    rrx,
+    uxtw
   };
 
   enum AddrOpc {
@@ -49,6 +49,7 @@ namespace ARM_AM {
     case ARM_AM::lsr: return "lsr";
     case ARM_AM::ror: return "ror";
     case ARM_AM::rrx: return "rrx";
+    case ARM_AM::uxtw: return "uxtw";
     }
   }
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index c2a07d4ddcef..aeab5be78ab4 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -1,9 +1,8 @@
 //===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -30,6 +29,7 @@
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -47,6 +47,13 @@ public:
 };
 } // end anonymous namespace
 
+Optional<MCFixupKind> ARMAsmBackend::getFixupKind(StringRef Name) const {
+  if (STI.getTargetTriple().isOSBinFormatELF() && Name == "R_ARM_NONE")
+    return FK_NONE;
+
+  return MCAsmBackend::getFixupKind(Name);
+}
+
 const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
   const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
       // This table *must* be in the order that the fixup_* kinds are defined in
@@ -98,6 +105,13 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
       {"fixup_t2_movw_lo16", 0, 20, 0},
       {"fixup_arm_mod_imm", 0, 12, 0},
       {"fixup_t2_so_imm", 0, 26, 0},
+      {"fixup_bf_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_bf_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_bfl_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_bfc_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_bfcsel_else_target", 0, 32, 0},
+      {"fixup_wls", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}
   };
   const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
       // This table *must* be in the order that the fixup_* kinds are defined in
@@ -149,6 +163,13 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
       {"fixup_t2_movw_lo16", 12, 20, 0},
       {"fixup_arm_mod_imm", 20, 12, 0},
       {"fixup_t2_so_imm", 26, 6, 0},
+      {"fixup_bf_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_bf_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_bfl_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_bfc_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_bfcsel_else_target", 0, 32, 0},
+      {"fixup_wls", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}
   };
 
   if (Kind < FirstTargetFixupKind)
@@ -203,6 +224,13 @@ bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst,
   return false;
 }
 
+static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) {
+  int64_t Offset = int64_t(Value) - 4;
+  if (Offset < Min || Offset > Max)
+    return "out of range pc-relative fixup value";
+  return nullptr;
+}
+
 const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
                                                     uint64_t Value) const {
   switch ((unsigned)Fixup.getKind()) {
@@ -250,6 +278,32 @@ const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
       return "will be converted to nop";
     break;
   }
+  case ARM::fixup_bf_branch:
+    return checkPCRelOffset(Value, 0, 30);
+  case ARM::fixup_bf_target:
+    return checkPCRelOffset(Value, -0x10000, +0xfffe);
+  case ARM::fixup_bfl_target:
+    return checkPCRelOffset(Value, -0x40000, +0x3fffe);
+  case ARM::fixup_bfc_target:
+    return checkPCRelOffset(Value, -0x1000, +0xffe);
+  case ARM::fixup_wls:
+    return checkPCRelOffset(Value, 0, +0xffe);
+  case ARM::fixup_le:
+    // The offset field in the LE and LETP instructions is an 11-bit
+    // value shifted left by 2 (i.e. 0,2,4,...,4094), and it is
+    // interpreted as a negative offset from the value read from pc,
+    // i.e. from instruction_address+4.
+    //
+    // So an LE instruction can in principle address the instruction
+    // immediately after itself, or (not very usefully) the address
+    // half way through the 4-byte LE.
+    return checkPCRelOffset(Value, -0xffe, 0);
+  case ARM::fixup_bfcsel_else_target: {
+    if (Value != 2 && Value != 4)
+      return "out of range label-relative fixup value";
+    break;
+  }
+
   default:
     llvm_unreachable("Unexpected fixup kind in reasonForFixupRelaxation()!");
   }
@@ -384,6 +438,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
   default:
     Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type");
     return 0;
+  case FK_NONE:
   case FK_Data_1:
   case FK_Data_2:
   case FK_Data_4:
@@ -753,6 +808,60 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
     EncValue |= (Value & 0xff);
     return swapHalfWords(EncValue, Endian == support::little);
   }
+  case ARM::fixup_bf_branch: {
+    const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+    if (FixupDiagnostic) {
+      Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
+      return 0;
+    }
+    uint32_t out = (((Value - 4) >> 1) & 0xf) << 23;
+    return swapHalfWords(out, Endian == support::little);
+  }
+  case ARM::fixup_bf_target:
+  case ARM::fixup_bfl_target:
+  case ARM::fixup_bfc_target: {
+    const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+    if (FixupDiagnostic) {
+      Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
+      return 0;
+    }
+    uint32_t out = 0;
+    uint32_t HighBitMask = (Kind == ARM::fixup_bf_target ? 0xf800 :
+                            Kind == ARM::fixup_bfl_target ? 0x3f800 : 0x800);
+    out |= (((Value - 4) >> 1) & 0x1) << 11;
+    out |= (((Value - 4) >> 1) & 0x7fe);
+    out |= (((Value - 4) >> 1) & HighBitMask) << 5;
+    return swapHalfWords(out, Endian == support::little);
+  }
+  case ARM::fixup_bfcsel_else_target: {
+    // If this is a fixup of a branch future's else target then it should be a
+    // constant MCExpr representing the distance between the branch targeted
+    // and the instruction after that same branch.
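    // [Illustrative note, not part of the patch: for the fixup_le case above,
    //  a loop branching back 100 bytes reaches checkPCRelOffset() with
    //  Value == -100, so the biased offset is -104, inside [-0xffe, 0]; a
    //  forward distance of 8 would bias to +4 and be rejected as an
    //  "out of range pc-relative fixup value".]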
+ Value = Target.getConstant(); + + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx.reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + uint32_t out = ((Value >> 2) & 1) << 17; + return swapHalfWords(out, Endian == support::little); + } + case ARM::fixup_wls: + case ARM::fixup_le: { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx.reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + uint64_t real_value = Value - 4; + uint32_t out = 0; + if (Kind == ARM::fixup_le) + real_value = -real_value; + out |= ((real_value >> 1) & 0x1) << 11; + out |= ((real_value >> 1) & 0x7fe); + return swapHalfWords(out, Endian == support::little); + } } } @@ -762,7 +871,9 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; const unsigned FixupKind = Fixup.getKind() ; - if ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { + if (FixupKind == FK_NONE) + return true; + if (FixupKind == ARM::fixup_arm_thumb_bl) { assert(Sym && "How did we resolve this?"); // If the symbol is external the linker will handle it. @@ -804,6 +915,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { default: llvm_unreachable("Unknown fixup kind!"); + case FK_NONE: + return 0; + case FK_Data_1: case ARM::fixup_arm_thumb_bcc: case ARM::fixup_arm_thumb_cp: @@ -842,6 +956,13 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case ARM::fixup_t2_movt_hi16: case ARM::fixup_t2_movw_lo16: case ARM::fixup_t2_so_imm: + case ARM::fixup_bf_branch: + case ARM::fixup_bf_target: + case ARM::fixup_bfl_target: + case ARM::fixup_bfc_target: + case ARM::fixup_bfcsel_else_target: + case ARM::fixup_wls: + case ARM::fixup_le: return 4; case FK_SecRel_2: @@ -858,6 +979,9 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { default: llvm_unreachable("Unknown fixup kind!"); + case FK_NONE: + return 0; + case FK_Data_1: return 1; case FK_Data_2: @@ -876,6 +1000,7 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { case ARM::fixup_arm_pcrel_10_unscaled: case ARM::fixup_arm_ldst_pcrel_12: case ARM::fixup_arm_pcrel_10: + case ARM::fixup_arm_pcrel_9: case ARM::fixup_arm_adr_pcrel_12: case ARM::fixup_arm_uncondbl: case ARM::fixup_arm_condbl: @@ -895,6 +1020,13 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { case ARM::fixup_t2_movw_lo16: case ARM::fixup_arm_mod_imm: case ARM::fixup_t2_so_imm: + case ARM::fixup_bf_branch: + case ARM::fixup_bf_target: + case ARM::fixup_bfl_target: + case ARM::fixup_bfc_target: + case ARM::fixup_bfcsel_else_target: + case ARM::fixup_wls: + case ARM::fixup_le: // Instruction size is 4 bytes. return 4; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 88c476bf65f4..67722a5e5b64 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -1,9 +1,8 @@ //===-- ARMAsmBackend.h - ARM Assembler Backend -----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
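// Illustrative sketch, not part of the patch: how the fixup_bf_target case
// above scatters the halved, pc-biased offset into the instruction word
// (bit 0 -> bit 11, bits 10:1 in place, high bits shifted up by 5). The
// helper name is hypothetical, and the real code additionally swaps
// halfwords for little-endian emission.
#include <cassert>
#include <cstdint>

static uint32_t encodeBFTargetSketch(uint32_t Value) {
  uint32_t Off = (Value - 4) >> 1; // halfword offset with the pc bias removed
  uint32_t out = 0;
  out |= (Off & 0x1) << 11;        // lowest offset bit lands in bit 11
  out |= (Off & 0x7fe);            // bits 10:1 stay in place
  out |= (Off & 0xf800) << 5;      // high bits (mask for fixup_bf_target)
  return out;
}

int main() {
  // A forward branch 0x24 bytes away: Off == 0x10, no high or low bits set.
  assert(encodeBFTargetSketch(0x24) == 0x10);
}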
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -38,6 +37,8 @@ public:
   // different.
   bool hasNOP() const { return STI.getFeatureBits()[ARM::HasV6T2Ops]; }
 
+  Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+
   const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
 
   bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index de1bfaf203e4..87e56940f46d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -1,9 +1,8 @@
 //===-- ARMAsmBackendDarwin.h ARM Asm Backend Darwin ----------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
index 86a583b19cf7..5d735114d441 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
@@ -1,9 +1,8 @@
 //===-- ARMAsmBackendELF.h ARM Asm Backend ELF -----------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
index 553922d20f43..8cd7a4a00ead 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -1,9 +1,8 @@
 //===-- ARMAsmBackendWinCOFF.h - ARM Asm Backend WinCOFF --------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 33c32d5464af..c4daafe8ee97 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -1,9 +1,8 @@
 //===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
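// Illustrative sketch, not part of the patch: the observable effect of the
// new getFixupKind() override declared above. FK_NONE patches zero bytes
// (getFixupKindNumBytes() returns 0) but still forces a relocation, which is
// exactly what an explicit R_ARM_NONE request wants. Names are hypothetical
// stand-ins for the two real hooks.
#include <cassert>

namespace sketch {
enum FixupKind { FK_NONE, FK_Data_4 };
inline unsigned numBytes(FixupKind K) { return K == FK_NONE ? 0 : 4; }
inline bool forceRelocation(FixupKind K) { return K == FK_NONE; }
} // namespace sketch

int main() {
  assert(sketch::numBytes(sketch::FK_NONE) == 0);
  assert(sketch::forceRelocation(sketch::FK_NONE));
}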
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -203,6 +202,9 @@ namespace ARMII { AddrMode_i12 = 16, AddrMode5FP16 = 17, // i8 * 2 AddrModeT2_ldrex = 18, // i8 * 4, with unscaled offset in MCInst + AddrModeT2_i7s4 = 19, // i7 * 4 + AddrModeT2_i7s2 = 20, // i7 * 2 + AddrModeT2_i7 = 21, // i7 * 1 }; inline static const char *AddrModeToString(AddrMode addrmode) { @@ -226,6 +228,9 @@ namespace ARMII { case AddrModeT2_i8s4: return "AddrModeT2_i8s4"; case AddrMode_i12: return "AddrMode_i12"; case AddrModeT2_ldrex:return "AddrModeT2_ldrex"; + case AddrModeT2_i7s4: return "AddrModeT2_i7s4"; + case AddrModeT2_i7s2: return "AddrModeT2_i7s2"; + case AddrModeT2_i7: return "AddrModeT2_i7"; } } @@ -386,16 +391,17 @@ namespace ARMII { // instruction. Used by the parser to determine whether to require the 'S' // suffix on the mnemonic (when not in an IT block) or preclude it (when // in an IT block). - ThumbArithFlagSetting = 1 << 18, + ThumbArithFlagSetting = 1 << 19, //===------------------------------------------------------------------===// // Code domain. DomainShift = 15, - DomainMask = 7 << DomainShift, + DomainMask = 15 << DomainShift, DomainGeneral = 0 << DomainShift, DomainVFP = 1 << DomainShift, DomainNEON = 2 << DomainShift, DomainNEONA8 = 4 << DomainShift, + DomainMVE = 8 << DomainShift, //===------------------------------------------------------------------===// // Field shifts - such shifts are used to set field while generating diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index b8ba7584911b..fda19eea1de6 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- ARMELFObjectWriter.cpp - ARM ELF Writer ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -138,12 +137,20 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, default: return ELF::R_ARM_THM_CALL; } + case ARM::fixup_bf_target: + return ELF::R_ARM_THM_BF16; + case ARM::fixup_bfc_target: + return ELF::R_ARM_THM_BF12; + case ARM::fixup_bfl_target: + return ELF::R_ARM_THM_BF18; } } switch ((unsigned)Fixup.getKind()) { default: Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; + case FK_NONE: + return ELF::R_ARM_NONE; case FK_Data_1: switch (Modifier) { default: diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index d3744fffac32..f51fbdcd84da 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -1,9 +1,8 @@ //===- lib/MC/ARMELFStreamer.cpp - ELF Object Output for ARM --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
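// Illustrative sketch, not part of the patch: what the new AddrModeT2_i7s*
// enumerators above encode. A 7-bit immediate is scaled by the access size,
// so AddrModeT2_i7s4 reaches byte offsets 0, 4, ..., 508. (The DomainMask
// widening from 3 to 4 bits in the same hunk is what makes room for
// DomainMVE = 8 << DomainShift.) The helper name is hypothetical.
#include <cassert>

static unsigned t2i7ByteOffset(unsigned Imm7, unsigned Scale /* 1, 2 or 4 */) {
  return Imm7 * Scale; // AddrModeT2_i7, _i7s2 and _i7s4 respectively
}

int main() {
  assert(t2i7ByteOffset(127, 4) == 508); // widest AddrModeT2_i7s4 offset
  assert(t2i7ByteOffset(127, 2) == 254); // widest AddrModeT2_i7s2 offset
}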
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -485,8 +484,8 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - bool) override { + void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) override { if (IsThumb) EmitThumbMappingSymbol(); else diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h index 831589ba0581..bdf04a208b24 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h +++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h @@ -1,9 +1,8 @@ //===-- ARMFixupKinds.h - ARM Specific Fixup Entries ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -104,6 +103,15 @@ enum Fixups { // Fixup for Thumb2 8-bit rotated operand fixup_t2_so_imm, + // Fixups for Branch Future. + fixup_bf_branch, + fixup_bf_target, + fixup_bfl_target, + fixup_bfc_target, + fixup_bfcsel_else_target, + fixup_wls, + fixup_le, + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp new file mode 100644 index 000000000000..45be1ee96342 --- /dev/null +++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp @@ -0,0 +1,1678 @@ +//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an ARM MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "ARMInstPrinter.h" +#include "Utils/ARMBaseInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#define PRINT_ALIAS_INSTR +#include "ARMGenAsmWriter.inc" + +/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing. +/// +/// getSORegOffset returns an integer from 0-31, representing '32' as 0. +static unsigned translateShiftImm(unsigned imm) { + // lsr #32 and asr #32 exist, but should be encoded as a 0. 
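  // [Illustrative note, not part of the patch: worked example of the mapping
  //  below -- "lsr r0, r1, #32" encodes its shift amount as 0, so
  //  translateShiftImm(0) must return 32 for printing, while every other
  //  value passes through unchanged, e.g. translateShiftImm(17) == 17.]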
+ assert((imm & ~0x1f) == 0 && "Invalid shift encoding"); + + if (imm == 0) + return 32; + return imm; +} + +/// Prints the shift value with an immediate value. +static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc, + unsigned ShImm, bool UseMarkup) { + if (ShOpc == ARM_AM::no_shift || (ShOpc == ARM_AM::lsl && !ShImm)) + return; + O << ", "; + + assert(!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0"); + O << getShiftOpcStr(ShOpc); + + if (ShOpc != ARM_AM::rrx) { + O << " "; + if (UseMarkup) + O << ""; + } +} + +ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + +bool ARMInstPrinter::applyTargetSpecificCLOption(StringRef Opt) { + if (Opt == "reg-names-std") { + DefaultAltIdx = ARM::NoRegAltName; + return true; + } + if (Opt == "reg-names-raw") { + DefaultAltIdx = ARM::RegNamesRaw; + return true; + } + return false; +} + +void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << markup(""); +} + +void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + unsigned Opcode = MI->getOpcode(); + + switch (Opcode) { + // Check for MOVs and print canonical forms, instead. + case ARM::MOVsr: { + // FIXME: Thumb variants? + const MCOperand &Dst = MI->getOperand(0); + const MCOperand &MO1 = MI->getOperand(1); + const MCOperand &MO2 = MI->getOperand(2); + const MCOperand &MO3 = MI->getOperand(3); + + O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm())); + printSBitModifierOperand(MI, 6, STI, O); + printPredicateOperand(MI, 4, STI, O); + + O << '\t'; + printRegName(O, Dst.getReg()); + O << ", "; + printRegName(O, MO1.getReg()); + + O << ", "; + printRegName(O, MO2.getReg()); + assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); + printAnnotation(O, Annot); + return; + } + + case ARM::MOVsi: { + // FIXME: Thumb variants? + const MCOperand &Dst = MI->getOperand(0); + const MCOperand &MO1 = MI->getOperand(1); + const MCOperand &MO2 = MI->getOperand(2); + + O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm())); + printSBitModifierOperand(MI, 5, STI, O); + printPredicateOperand(MI, 3, STI, O); + + O << '\t'; + printRegName(O, Dst.getReg()); + O << ", "; + printRegName(O, MO1.getReg()); + + if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) { + printAnnotation(O, Annot); + return; + } + + O << ", " << markup(""); + printAnnotation(O, Annot); + return; + } + + // A8.6.123 PUSH + case ARM::STMDB_UPD: + case ARM::t2STMDB_UPD: + if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) { + // Should only print PUSH if there are at least two registers in the list. + O << '\t' << "push"; + printPredicateOperand(MI, 2, STI, O); + if (Opcode == ARM::t2STMDB_UPD) + O << ".w"; + O << '\t'; + printRegisterList(MI, 4, STI, O); + printAnnotation(O, Annot); + return; + } else + break; + + case ARM::STR_PRE_IMM: + if (MI->getOperand(2).getReg() == ARM::SP && + MI->getOperand(3).getImm() == -4) { + O << '\t' << "push"; + printPredicateOperand(MI, 4, STI, O); + O << "\t{"; + printRegName(O, MI->getOperand(1).getReg()); + O << "}"; + printAnnotation(O, Annot); + return; + } else + break; + + // A8.6.122 POP + case ARM::LDMIA_UPD: + case ARM::t2LDMIA_UPD: + if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) { + // Should only print POP if there are at least two registers in the list. 
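  // [Illustrative note, not part of the patch: in this case, operands 0 and 1
  //  are the sp writeback and base, 2-3 the predicate, and the register list
  //  starts at operand 4, so getNumOperands() > 5 means at least two list
  //  registers -- "ldmia sp!, {r4}" keeps its long form, while
  //  "ldmia sp!, {r4, pc}" prints as "pop {r4, pc}".]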
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 2, STI, O);
+      if (Opcode == ARM::t2LDMIA_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::LDR_POST_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(4).getImm() == 4) {
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 5, STI, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(0).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.355 VPUSH
+  case ARM::VSTMSDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpush";
+      printPredicateOperand(MI, 2, STI, O);
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.354 VPOP
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMDIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpop";
+      printPredicateOperand(MI, 2, STI, O);
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::tLDMIA: {
+    bool Writeback = true;
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg)
+        Writeback = false;
+    }
+
+    O << "\tldm";
+
+    printPredicateOperand(MI, 1, STI, O);
+    O << '\t';
+    printRegName(O, BaseReg);
+    if (Writeback)
+      O << "!";
+    O << ", ";
+    printRegisterList(MI, 3, STI, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Combine 2 GPRs from disassembler into a GPRPair to match with instr def.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when decoding them, the two GPRs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  case ARM::LDREXD:
+  case ARM::STREXD:
+  case ARM::LDAEXD:
+  case ARM::STLEXD: {
+    const MCRegisterClass &MRC = MRI.getRegClass(ARM::GPRRegClassID);
+    bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
+    unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
+    if (MRC.contains(Reg)) {
+      MCInst NewMI;
+      MCOperand NewReg;
+      NewMI.setOpcode(Opcode);
+
+      if (isStore)
+        NewMI.addOperand(MI->getOperand(0));
+      NewReg = MCOperand::createReg(MRI.getMatchingSuperReg(
+          Reg, ARM::gsub_0, &MRI.getRegClass(ARM::GPRPairRegClassID)));
+      NewMI.addOperand(NewReg);
+
+      // Copy the rest of the operands into NewMI.
+      for (unsigned i = isStore ? 3 : 2; i < MI->getNumOperands(); ++i)
+        NewMI.addOperand(MI->getOperand(i));
+      printInstruction(&NewMI, STI, O);
+      return;
+    }
+    break;
+  }
+  case ARM::TSB:
+  case ARM::t2TSB:
+    O << "\ttsb\tcsync";
+    return;
+  case ARM::t2DSB:
+    switch (MI->getOperand(0).getImm()) {
+    default:
+      if (!printAliasInstr(MI, STI, O))
+        printInstruction(MI, STI, O);
+      break;
+    case 0:
+      O << "\tssbb";
+      break;
+    case 4:
+      O << "\tpssbb";
+      break;
+    }
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (!printAliasInstr(MI, STI, O))
+    printInstruction(MI, STI, O);
+
+  printAnnotation(O, Annot);
+}
+
+void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    printRegName(O, Reg);
+  } else if (Op.isImm()) {
+    O << markup("");
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    const MCExpr *Expr = Op.getExpr();
+    switch (Expr->getKind()) {
+    case MCExpr::Binary:
+      O << '#';
+      Expr->print(O, &MAI);
+      break;
+    case MCExpr::Constant: {
+      // If a symbolic branch target was added as a constant expression then
+      // print that address in hex. And only print 32 unsigned bits for the
+      // address.
+      const MCConstantExpr *Constant = cast<MCConstantExpr>(Expr);
+      int64_t TargetAddress;
+      if (!Constant->evaluateAsAbsolute(TargetAddress)) {
+        O << '#';
+        Expr->print(O, &MAI);
+      } else {
+        O << "0x";
+        O.write_hex(static_cast<uint32_t>(TargetAddress));
+      }
+      break;
+    }
+    default:
+      // FIXME: Should we always treat this as if it is a constant literal and
+      // prefix it with '#'?
+      Expr->print(O, &MAI);
+      break;
+    }
+  }
+}
+
+void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  if (MO1.isExpr()) {
+    MO1.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  O << markup("");
+  } else {
+    O << markup("");
+  }
+  O << "]" << markup(">");
+}
+
+// so_reg is a 4-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0           - e.g. R5
+//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+  const MCOperand &MO3 = MI->getOperand(OpNum + 2);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
+  ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm());
+  O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
+  if (ShOpc == ARM_AM::rrx)
+    return;
+
+  O << ' ';
+  printRegName(O, MO2.getReg());
+  assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+}
+
+void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
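  // [Illustrative note, not part of the patch: printRegImmShift() above
  //  prints nothing for "lsl #0", prints "rrx" with no shift amount, and,
  //  per translateShiftImm(), renders an encoded shift of 0 as "#32" for
  //  lsr/asr, so "r5, lsr #32" round-trips through a zero immediate field.]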
+ printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()), + ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup); +} + +//===--------------------------------------------------------------------===// +// Addressing Mode #2 +//===--------------------------------------------------------------------===// + +void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + const MCOperand &MO3 = MI->getOperand(Op + 2); + + O << markup(""); + } + O << "]" << markup(">"); + return; + } + + O << ", "; + O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())); + printRegName(O, MO2.getReg()); + + printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO3.getImm()), + ARM_AM::getAM2Offset(MO3.getImm()), UseMarkup); + O << "]" << markup(">"); +} + +void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + O << markup(""); +} + +void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + O << markup("") << "]" << markup(">"); +} + +void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, STI, O); + return; + } + +#ifndef NDEBUG + const MCOperand &MO3 = MI->getOperand(Op + 2); + unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm()); + assert(IdxMode != ARMII::IndexModePost && "Should be pre or offset index op"); +#endif + + printAM2PreOrOffsetIndexOp(MI, Op, STI, O); +} + +void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + if (!MO1.getReg()) { + unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm()); + O << markup(""); + return; + } + + O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())); + printRegName(O, MO1.getReg()); + + printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO2.getImm()), + ARM_AM::getAM2Offset(MO2.getImm()), UseMarkup); +} + +//===--------------------------------------------------------------------===// +// Addressing Mode #3 +//===--------------------------------------------------------------------===// + +void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O, + bool AlwaysPrintImm0) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + const MCOperand &MO3 = MI->getOperand(Op + 2); + + O << markup(""); + return; + } + + // If the op is sub we have to print the immediate even if it is 0 + unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()); + ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm()); + + if (AlwaysPrintImm0 || ImmOffs || (op == ARM_AM::sub)) { + O << ", " << markup(""); + } + O << ']' << markup(">"); +} + +template +void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + if (!MO1.isReg()) { // For label symbolic references. 
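  // [Illustrative note, not part of the patch: for addrmode3 a subtracted
  //  offset keeps its immediate even when zero, since "[r0, #-0]" and "[r0]"
  //  encode differently (the U bit); hence the AlwaysPrintImm0 || ImmOffs ||
  //  (op == ARM_AM::sub) condition in printAM3PreOrOffsetIndexOp() above.]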
+ printOperand(MI, Op, STI, O); + return; + } + + assert(ARM_AM::getAM3IdxMode(MI->getOperand(Op + 2).getImm()) != + ARMII::IndexModePost && + "unexpected idxmode"); + printAM3PreOrOffsetIndexOp(MI, Op, O, AlwaysPrintImm0); +} + +void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + if (MO1.getReg()) { + O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())); + printRegName(O, MO1.getReg()); + return; + } + + unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm()); + O << markup(""); +} + +void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + unsigned Imm = MO.getImm(); + O << markup(""); +} + +void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + O << (MO2.getImm() ? "" : "-"); + printRegName(O, MO1.getReg()); +} + +void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + unsigned Imm = MO.getImm(); + O << markup(""); +} + +template +void ARMInstPrinter::printMveAddrModeRQOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + O << markup(" 0) + printRegImmShift(O, ARM_AM::uxtw, shift, UseMarkup); + + O << "]" << markup(">"); +} + +void ARMInstPrinter::printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + O << markup(""); + + O << "]" << markup(">"); +} + +void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + ARM_AM::AMSubMode Mode = + ARM_AM::getAM4SubMode(MI->getOperand(OpNum).getImm()); + O << ARM_AM::getAMSubModeStr(Mode); +} + +template +void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, OpNum, STI, O); + return; + } + + O << markup(""); + } + O << "]" << markup(">"); +} + +template +void ARMInstPrinter::printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. 
+ printOperand(MI, OpNum, STI, O); + return; + } + + O << markup(""); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + O << markup(""); +} + +void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + O << markup(""); +} + +void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + if (MO.getReg() == 0) + O << "!"; + else { + O << ", "; + printRegName(O, MO.getReg()); + } +} + +void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + uint32_t v = ~MO.getImm(); + int32_t lsb = countTrailingZeros(v); + int32_t width = (32 - countLeadingZeros(v)) - lsb; + assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!"); + O << markup("") << ", " << markup(""); +} + +void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned val = MI->getOperand(OpNum).getImm(); + O << ARM_MB::MemBOptToString(val, STI.getFeatureBits()[ARM::HasV8Ops]); +} + +void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned val = MI->getOperand(OpNum).getImm(); + O << ARM_ISB::InstSyncBOptToString(val); +} + +void ARMInstPrinter::printTraceSyncBOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned val = MI->getOperand(OpNum).getImm(); + O << ARM_TSB::TraceSyncBOptToString(val); +} + +void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned ShiftOp = MI->getOperand(OpNum).getImm(); + bool isASR = (ShiftOp & (1 << 5)) != 0; + unsigned Amt = ShiftOp & 0x1f; + if (isASR) { + O << ", asr " << markup(""); + } else if (Amt) { + O << ", lsl " << markup(""); + } +} + +void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + if (Imm == 0) + return; + assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!"); + O << ", lsl " << markup(""); +} + +void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + // A shift amount of 32 is encoded as 0. 
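  // [Illustrative note, not part of the patch: "pkhtb r0, r1, r2, asr #32"
  //  is encoded with an immediate of 0, so the code below rewrites 0 to 32
  //  before printing -- Imm == 0 prints ", asr #32", Imm == 7 prints
  //  ", asr #7".]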
+ if (Imm == 0) + Imm = 32; + assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!"); + O << ", asr " << markup(""); +} + +void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOpcode() != ARM::t2CLRM) { + assert(std::is_sorted(MI->begin() + OpNum, MI->end(), + [&](const MCOperand &LHS, const MCOperand &RHS) { + return MRI.getEncodingValue(LHS.getReg()) < + MRI.getEncodingValue(RHS.getReg()); + })); + } + + O << "{"; + for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { + if (i != OpNum) + O << ", "; + printRegName(O, MI->getOperand(i).getReg()); + } + O << "}"; +} + +void ARMInstPrinter::printGPRPairOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + printRegName(O, MRI.getSubReg(Reg, ARM::gsub_0)); + O << ", "; + printRegName(O, MRI.getSubReg(Reg, ARM::gsub_1)); +} + +void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + if (Op.getImm()) + O << "be"; + else + O << "le"; +} + +void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + O << ARM_PROC::IModToString(Op.getImm()); +} + +void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + unsigned IFlags = Op.getImm(); + for (int i = 2; i >= 0; --i) + if (IFlags & (1 << i)) + O << ARM_PROC::IFlagsToString(1 << i); + + if (IFlags == 0) + O << "none"; +} + +void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + const FeatureBitset &FeatureBits = STI.getFeatureBits(); + if (FeatureBits[ARM::FeatureMClass]) { + + unsigned SYSm = Op.getImm() & 0xFFF; // 12-bit SYSm + unsigned Opcode = MI->getOpcode(); + + // For writes, handle extended mask bits if the DSP extension is present. + if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSP]) { + auto TheReg =ARMSysReg::lookupMClassSysRegBy12bitSYSmValue(SYSm); + if (TheReg && TheReg->isInRequiredFeatures({ARM::FeatureDSP})) { + O << TheReg->Name; + return; + } + } + + // Handle the basic 8-bit mask. + SYSm &= 0xff; + if (Opcode == ARM::t2MSR_M && FeatureBits [ARM::HasV7Ops]) { + // ARMv7-M deprecates using MSR APSR without a _ qualifier as an + // alias for MSR APSR_nzcvq. + auto TheReg = ARMSysReg::lookupMClassSysRegAPSRNonDeprecated(SYSm); + if (TheReg) { + O << TheReg->Name; + return; + } + } + + auto TheReg = ARMSysReg::lookupMClassSysRegBy8bitSYSmValue(SYSm); + if (TheReg) { + O << TheReg->Name; + return; + } + + O << SYSm; + + return; + } + + // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as + // APSR_nzcvq, APSR_g and APSRnzcvqg, respectively. 
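  // [Illustrative note, not part of the patch: in the A/R-profile path below,
  //  mask bit 8 selects 'f', 4 's', 2 'x' and 1 'c', so a mask of 9 on CPSR
  //  prints "CPSR_fc"; without the R bit, masks 8, 4 and 12 print as
  //  APSR_nzcvq, APSR_g and APSR_nzcvqg instead.]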
+ unsigned SpecRegRBit = Op.getImm() >> 4; + unsigned Mask = Op.getImm() & 0xf; + + if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) { + O << "APSR_"; + switch (Mask) { + default: + llvm_unreachable("Unexpected mask value!"); + case 4: + O << "g"; + return; + case 8: + O << "nzcvq"; + return; + case 12: + O << "nzcvqg"; + return; + } + } + + if (SpecRegRBit) + O << "SPSR"; + else + O << "CPSR"; + + if (Mask) { + O << '_'; + if (Mask & 8) + O << 'f'; + if (Mask & 4) + O << 's'; + if (Mask & 2) + O << 'x'; + if (Mask & 1) + O << 'c'; + } +} + +void ARMInstPrinter::printBankedRegOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint32_t Banked = MI->getOperand(OpNum).getImm(); + auto TheReg = ARMBankedReg::lookupBankedRegByEncoding(Banked); + assert(TheReg && "invalid banked register operand"); + std::string Name = TheReg->Name; + + uint32_t isSPSR = (Banked & 0x20) >> 5; + if (isSPSR) + Name.replace(0, 4, "SPSR"); // convert 'spsr_' to 'SPSR_' + O << Name; +} + +void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + // Handle the undefined 15 CC value here for printing so we don't abort(). + if ((unsigned)CC == 15) + O << ""; + else if (CC != ARMCC::AL) + O << ARMCondCodeToString(CC); +} + +void ARMInstPrinter::printMandatoryRestrictedPredicateOperand( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + if ((ARMCC::CondCodes)MI->getOperand(OpNum).getImm() == ARMCC::HS) + O << "cs"; + else + printMandatoryPredicateOperand(MI, OpNum, STI, O); +} + +void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + O << ARMCondCodeToString(CC); +} + +void ARMInstPrinter::printMandatoryInvertedPredicateOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + O << ARMCondCodeToString(ARMCC::getOppositeCondition(CC)); +} + +void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNum).getReg()) { + assert(MI->getOperand(OpNum).getReg() == ARM::CPSR && + "Expect ARM CPSR register!"); + O << 's'; + } +} + +void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << MI->getOperand(OpNum).getImm(); +} + +void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << "p" << MI->getOperand(OpNum).getImm(); +} + +void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << "c" << MI->getOperand(OpNum).getImm(); +} + +void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << "{" << MI->getOperand(OpNum).getImm() << "}"; +} + +void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + llvm_unreachable("Unhandled PC-relative pseudo-instruction!"); +} + +template +void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + 
const MCOperand &MO = MI->getOperand(OpNum); + + if (MO.isExpr()) { + MO.getExpr()->print(O, &MAI); + return; + } + + int32_t OffImm = (int32_t)MO.getImm() << scale; + + O << markup(""); +} + +void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << markup("getOperand(OpNum).getImm() * 4) + << markup(">"); +} + +void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + O << markup(""); +} + +void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // (3 - the number of trailing zeros) is the number of then / else. + unsigned Mask = MI->getOperand(OpNum).getImm(); + unsigned NumTZ = countTrailingZeros(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { + if ((Mask >> Pos) & 1) + O << 'e'; + else + O << 't'; + } +} + +void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, STI, O); + return; + } + + O << markup(""); +} + +void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI, + unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O, + unsigned Scale) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, STI, O); + return; + } + + O << markup(""); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI, + unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, STI, O, 1); +} + +void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI, + unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, STI, O, 2); +} + +void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI, + unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, STI, O, 4); +} + +void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, STI, O, 4); +} + +// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 +// register with shift forms. +// REG 0 0 - e.g. R5 +// REG IMM, SH_OPC - e.g. R5, LSL #3 +void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + unsigned Reg = MO1.getReg(); + printRegName(O, Reg); + + // Print the shift opc. 
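  // [Illustrative note, not part of the patch: in printThumbITMask() above
  //  the lowest set bit terminates the mask and each higher bit prints 'e'
  //  or 't' -- mask 0b1000 adds no suffix (plain "it"), 0b0100 yields "t"
  //  ("itt"), and 0b1010 yields "et" ("itet").]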
+ assert(MO2.isImm() && "Not a valid t2_so_reg value!"); + printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()), + ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup); +} + +template +void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, OpNum, STI, O); + return; + } + + O << markup(""); + } else if (AlwaysPrintImm0 || OffImm > 0) { + O << ", " << markup(""); + } + O << "]" << markup(">"); +} + +template +void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + O << markup(""); + } else if (AlwaysPrintImm0 || OffImm > 0) { + O << ", " << markup(""); + } + O << "]" << markup(">"); +} + +template +void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + if (!MO1.isReg()) { // For label symbolic references. + printOperand(MI, OpNum, STI, O); + return; + } + + O << markup(""); + } else if (AlwaysPrintImm0 || OffImm > 0) { + O << ", " << markup(""); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + O << markup(""); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printT2AddrModeImm8OffsetOperand( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + int32_t OffImm = (int32_t)MO1.getImm(); + O << ", " << markup(""); +} + +void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + int32_t OffImm = (int32_t)MO1.getImm(); + + assert(((OffImm & 0x3) == 0) && "Not a valid immediate!"); + + O << ", " << markup(""); +} + +void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + const MCOperand &MO3 = MI->getOperand(OpNum + 2); + + O << markup(""); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + O << markup(""); +} + +void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned EncodedImm = MI->getOperand(OpNum).getImm(); + unsigned EltBits; + uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + O << markup(""); +} + +void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + O << markup(""); +} + +void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo 
+                                        raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
+    return;
+  assert(Imm <= 3 && "illegal ror immediate!");
+  O << ", ror " << markup("<imm:") << "#" << 8 * Imm << markup(">");
+}
+
+void ARMInstPrinter::printModImmOperand(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  MCOperand Op = MI->getOperand(OpNum);
+
+  // Support for fixups (MCFixup)
+  if (Op.isExpr())
+    return printOperand(MI, OpNum, STI, O);
+
+  unsigned Bits = Op.getImm() & 0xFF;
+  unsigned Rot = (Op.getImm() & 0xF00) >> 7;
+
+  bool PrintUnsigned = false;
+  switch (MI->getOpcode()) {
+  case ARM::MOVi:
+    // Movs to PC should be treated unsigned
+    PrintUnsigned = (MI->getOperand(OpNum - 1).getReg() == ARM::PC);
+    break;
+  case ARM::MSRi:
+    // Movs to special registers should be treated unsigned
+    PrintUnsigned = true;
+    break;
+  }
+
+  int32_t Rotated = ARM_AM::rotr32(Bits, Rot);
+  if (ARM_AM::getSOImmVal(Rotated) == Op.getImm()) {
+    // #rot has the least possible value
+    O << "#" << markup("<imm:");
+    if (PrintUnsigned)
+      O << static_cast<uint32_t>(Rotated);
+    else
+      O << Rotated;
+    O << markup(">");
+    return;
+  }
+
+  // Explicit #bits, #rot implied
+  O << "#" << markup("<imm:") << Bits << markup(">")
+    << ", #" << markup("<imm:") << Rot << markup(">");
+}
+
+void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  O << markup("<imm:") << "#" << 16 - MI->getOperand(OpNum).getImm()
+    << markup(">");
+}
+
+void ARMInstPrinter::printFBits32(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  O << markup("<imm:") << "#" << 32 - MI->getOperand(OpNum).getImm()
+    << markup(">");
+}
+
+void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListTwo(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1);
+  O << "{";
+  printRegName(O, Reg0);
+  O << ", ";
+  printRegName(O, Reg1);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2);
+  O << "{";
+  printRegName(O, Reg0);
+  O << ", ";
+  printRegName(O, Reg1);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListThree(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
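// [Editorial aside, not part of the patch] Illustration of the comment
// above: the tablegen'd ARM register enum lays D0..D31 out consecutively,
// so for Reg == ARM::D8 the calls below print "{d8, d9, d10, d11}".
// That consecutive layout is specific to this generated enum, not a
// general MCRegister guarantee, which is exactly what the caveat warns about.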
+ O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 1); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 3); + O << "}"; +} + +void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); + unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1); + O << "{"; + printRegName(O, Reg0); + O << "[], "; + printRegName(O, Reg1); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 1); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 1); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 3); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListTwoSpacedAllLanes( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); + unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2); + O << "{"; + printRegName(O, Reg0); + O << "[], "; + printRegName(O, Reg1); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListThreeSpacedAllLanes( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 4); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListFourSpacedAllLanes( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D. 
+ O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 4); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 6); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 4); + O << "}"; +} + +void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 4); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 6); + O << "}"; +} + +template +void ARMInstPrinter::printMVEVectorList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + const char *Prefix = "{"; + for (unsigned i = 0; i < NumRegs; i++) { + O << Prefix; + printRegName(O, MRI.getSubReg(Reg, ARM::qsub_0 + i)); + Prefix = ", "; + } + O << "}"; +} + +template +void ARMInstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + O << "#" << (Val * Angle) + Remainder; +} + +void ARMInstPrinter::printVPTPredicateOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + ARMVCC::VPTCodes CC = (ARMVCC::VPTCodes)MI->getOperand(OpNum).getImm(); + if (CC != ARMVCC::None) + O << ARMVPTPredToString(CC); +} + +void ARMInstPrinter::printVPTMask(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // (3 - the number of trailing zeroes) is the number of them / else. + unsigned Mask = MI->getOperand(OpNum).getImm(); + unsigned NumTZ = countTrailingZeros(Mask); + assert(NumTZ <= 3 && "Invalid VPT mask!"); + for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { + bool T = ((Mask >> Pos) & 1) == 0; + if (T) + O << 't'; + else + O << 'e'; + } +} + +void ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint32_t Val = MI->getOperand(OpNum).getImm(); + O << markup(""); +} diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h new file mode 100644 index 000000000000..69026956b60e --- /dev/null +++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h @@ -0,0 +1,272 @@ +//===- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an ARM MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMINSTPRINTER_H +#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMINSTPRINTER_H + +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class ARMInstPrinter : public MCInstPrinter { +public: + ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI); + + bool applyTargetSpecificCLOption(StringRef Opt) override; + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, + const MCSubtargetInfo &STI, + raw_ostream &O); + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = ARM::NoRegAltName); + + void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + + void printSORegRegOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSORegImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printAddrModeTBB(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrModeTBH(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template + void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O, + bool AlwaysPrintImm0); + void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template + void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template + void printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode7Operand(const MCInst *MI, unsigned OpNum, + 
const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printMemBOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printInstSyncBOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printTraceSyncBOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printShiftImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + template + void printAdrLabelOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbSRImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbITMask(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O, unsigned Scale); + void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printT2SOOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template + void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template + void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template + void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printSetendOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printCPSIMod(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printCPSIFlag(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printBankedRegOperand(const MCInst *MI, 
unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPredicateOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printMandatoryRestrictedPredicateOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printMandatoryInvertedPredicateOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printSBitModifierOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printRegisterList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNoHashImmediate(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPImmediate(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printCImmediate(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printCoprocOptionImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printRotImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printModImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printGPRPairOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printPCLabel(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFBits16(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFBits32(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorIndex(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListOne(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListTwo(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListThree(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListFour(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListOneAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListThreeAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListFourAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printVectorListThreeSpacedAllLanes(const 
MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printVectorListFourSpacedAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printVectorListThreeSpaced(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListFourSpaced(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template + void printMVEVectorList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template + void printComplexRotationOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + // MVE + void printVPTPredicateOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printVPTMask(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template + void printMveAddrModeRQOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printExpandedImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + +private: + unsigned DefaultAltIdx = ARM::NoRegAltName; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMINSTPRINTER_H diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index 3ee63ac374b3..d30d15df3d00 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- ARMMCAsmInfo.cpp - ARM asm properties -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h index 5e548162bec6..55d7b299674d 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h @@ -1,9 +1,8 @@ //===-- ARMMCAsmInfo.h - ARM asm properties --------------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index b37b8073548f..dca6fe37d49a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -50,7 +49,7 @@ namespace { class ARMMCCodeEmitter : public MCCodeEmitter { const MCInstrInfo &MCII; - const MCContext &CTX; + MCContext &CTX; bool IsLittleEndian; public: @@ -163,6 +162,15 @@ public: SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + uint32_t getITMaskOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMVEShiftImmOpValue - Return encoding info for the 'sz:imm5' + /// operand. + uint32_t getMVEShiftImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' /// operand. @@ -181,18 +189,37 @@ public: SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + /// getT2AddrModeImm7s4OpValue - Return encoding info for 'reg +/- imm7<<2' + /// operand. + uint32_t getT2AddrModeImm7s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2' /// operand. uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2' + /// getT2ScaledImmOpValue - Return encoding info for '+/- immX< &Fixups, - const MCSubtargetInfo &STI) const; + template + uint32_t getT2ScaledImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + /// getMveAddrModeRQOpValue - Return encoding info for 'reg, vreg' + /// operand. + uint32_t getMveAddrModeRQOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMveAddrModeQOpValue - Return encoding info for 'reg +/- imm7<<{shift}' + /// operand. + template + uint32_t getMveAddrModeQOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm' /// operand as needed by load/store instructions. @@ -224,8 +251,9 @@ public: case ARM_AM::asr: return 2; case ARM_AM::ror: case ARM_AM::rrx: return 3; + default: + llvm_unreachable("Invalid ShiftOpc!"); } - llvm_unreachable("Invalid ShiftOpc!"); } /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands. @@ -283,40 +311,6 @@ public: return MI.getOperand(Op).getReg() == ARM::CPSR; } - /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value. - unsigned getSOImmOpValue(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(Op); - - // We expect MO to be an immediate or an expression, - // if it is an immediate - that's fine, just encode the value. - // Otherwise - create a Fixup. - if (MO.isExpr()) { - const MCExpr *Expr = MO.getExpr(); - // In instruction code this value always encoded as lowest 12 bits, - // so we don't have to perform any specific adjustments. - // Due to requirements of relocatable records we have to use FK_Data_4. - // See ARMELFObjectWriter::ExplicitRelSym and - // ARMELFObjectWriter::GetRelocTypeInner for more details. 
- MCFixupKind Kind = MCFixupKind(FK_Data_4); - Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); - return 0; - } - - unsigned SoImm = MO.getImm(); - int SoImmVal = ARM_AM::getSOImmVal(SoImm); - assert(SoImmVal != -1 && "Not a valid so_imm value!"); - - // Encode rotate_imm. - unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1) - << ARMII::SoRotImmShift; - - // Encode immed_8. - Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal); - return Binary; - } - unsigned getModImmOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, const MCSubtargetInfo &ST) const { @@ -358,7 +352,8 @@ public: unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum, + template + unsigned getT2AddrModeImmOpValue(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum, @@ -418,6 +413,14 @@ public: unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + template + unsigned getExpandedImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + static_assert(shift <= 32, "Shift count must be less than or equal to 32."); + const MCOperand MO = MI.getOperand(Op); + return (invert ? (MO.getImm() ^ 0xff) : MO.getImm()) >> shift; + } unsigned NEONThumb2DataIPostEncoder(const MCInst &MI, unsigned EncodedValue, @@ -436,6 +439,10 @@ public: unsigned EncodedValue, const MCSubtargetInfo &STI) const; + uint32_t getPowerTwoOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; } @@ -451,6 +458,26 @@ public: void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const override; + + template + uint32_t getBFTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getBFAfterTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getVPTMaskOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getRestrictedCondCodeOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + template + uint32_t getMVEPairVectorIndexOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; }; } // end anonymous namespace @@ -537,7 +564,15 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, unsigned Reg = MO.getReg(); unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg); - // Q registers are encoded as 2x their register number. + // In NEON, Q registers are encoded as 2x their register number, + // because they're using the same indices as the D registers they + // overlap. In MVE, there are no 64-bit vector instructions, so + // the encodings all refer to Q-registers by their literal + // register number. 
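// [Editorial aside, not part of the patch] Worked example of the comment
// above, assuming getEncodingValue returns the literal register index:
//
//   // NEON: Q<n> overlaps D<2n>/D<2n+1>, and the encoding field indexes
//   // D registers, so Q1 encodes as 2.
//   unsigned NeonEnc = 2 * N;  // N = Q register number
//   // MVE: no 64-bit vector instructions, so Q1 encodes literally as 1.
//   unsigned MveEnc = N;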
+ + if (STI.getFeatureBits()[ARM::HasMVEIntegerOps]) + return RegNo; + switch (Reg) { default: return RegNo; @@ -849,6 +884,33 @@ getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx, return Val; } +/// getITMaskOpValue - Return the architectural encoding of an IT +/// predication mask, given the MCOperand format. +uint32_t ARMMCCodeEmitter:: +getITMaskOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MaskMO = MI.getOperand(OpIdx); + assert(MaskMO.isImm() && "Unexpected operand type!"); + + unsigned Mask = MaskMO.getImm(); + + // IT masks are encoded as a sequence of replacement low-order bits + // for the condition code. So if the low bit of the starting + // condition code is 1, then we have to flip all the bits above the + // terminating bit (which is the lowest 1 bit). + assert(OpIdx > 0 && "IT mask appears first!"); + const MCOperand CondMO = MI.getOperand(OpIdx-1); + assert(CondMO.isImm() && "Unexpected operand type!"); + if (CondMO.getImm() & 1) { + unsigned LowBit = Mask & -Mask; + unsigned BitsAboveLowBit = 0xF & (-LowBit << 1); + Mask ^= BitsAboveLowBit; + } + + return Mask; +} + /// getThumbAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label /// target. uint32_t ARMMCCodeEmitter:: @@ -878,6 +940,41 @@ getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx, return (Rm << 3) | Rn; } +/// getMVEShiftImmOpValue - Return encoding info for the 'sz:imm5' +/// operand. +uint32_t +ARMMCCodeEmitter::getMVEShiftImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + // {4-0} = szimm5 + // The value we are trying to encode is an immediate between either the + // range of [1-7] or [1-15] depending on whether we are dealing with the + // u8/s8 or the u16/s16 variants respectively. + // This value is encoded as follows, if ShiftImm is the value within those + // ranges then the encoding szimm5 = ShiftImm + size, where size is either 8 + // or 16. + + unsigned Size, ShiftImm; + switch(MI.getOpcode()) { + case ARM::MVE_VSHLL_imms16bh: + case ARM::MVE_VSHLL_imms16th: + case ARM::MVE_VSHLL_immu16bh: + case ARM::MVE_VSHLL_immu16th: + Size = 16; + break; + case ARM::MVE_VSHLL_imms8bh: + case ARM::MVE_VSHLL_imms8th: + case ARM::MVE_VSHLL_immu8bh: + case ARM::MVE_VSHLL_immu8th: + Size = 8; + break; + default: + llvm_unreachable("Use of operand not supported by this instruction"); + } + ShiftImm = MI.getOperand(OpIdx).getImm(); + return Size + ShiftImm; +} + /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand. uint32_t ARMMCCodeEmitter:: getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx, @@ -929,12 +1026,11 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx, return Binary; } -/// getT2Imm8s4OpValue - Return encoding info for -/// '+/- imm8<<2' operand. +template uint32_t ARMMCCodeEmitter:: -getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +getT2ScaledImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { // FIXME: The immediate operand should have already been encoded like this // before ever getting here. The encoder method should just need to combine // the MI operands for the register and the offset into a single @@ -942,25 +1038,75 @@ getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx, // style, unfortunately. As-is, we can't represent the distinct encoding // for #-0. 
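// [Editorial aside, not part of the patch] Worked example for the
// generalized getT2ScaledImmOpValue<Bits, Shift> above: with Bits = 8,
// Shift = 2 and an operand of -40, isAdd is false, the magnitude 40 is
// scaled down to 10 (40 >> 2), and the result is 0x0A with the U bit
// (1 << 8) clear; an operand of +40 would instead yield 0x10A.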
- // {8} = (U)nsigned (add == '1', sub == '0') - // {7-0} = imm8 - int32_t Imm8 = MI.getOperand(OpIdx).getImm(); - bool isAdd = Imm8 >= 0; + // {Bits} = (U)nsigned (add == '1', sub == '0') + // {(Bits-1)-0} = immediate + int32_t Imm = MI.getOperand(OpIdx).getImm(); + bool isAdd = Imm >= 0; // Immediate is always encoded as positive. The 'U' bit controls add vs sub. - if (Imm8 < 0) - Imm8 = -(uint32_t)Imm8; + if (Imm < 0) + Imm = -(uint32_t)Imm; - // Scaled by 4. - Imm8 /= 4; + Imm >>= Shift; - uint32_t Binary = Imm8 & 0xff; + uint32_t Binary = Imm & ((1U << Bits) - 1); // Immediate is always encoded as positive. The 'U' bit controls add vs sub. if (isAdd) - Binary |= (1 << 8); + Binary |= (1U << Bits); return Binary; } +/// getMveAddrModeRQOpValue - Return encoding info for 'reg, vreg' +/// operand. +uint32_t ARMMCCodeEmitter:: +getMveAddrModeRQOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + // {6-3} Rn + // {2-0} Qm + const MCOperand &M0 = MI.getOperand(OpIdx); + const MCOperand &M1 = MI.getOperand(OpIdx + 1); + + unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(M0.getReg()); + unsigned Qm = CTX.getRegisterInfo()->getEncodingValue(M1.getReg()); + + assert(Qm < 8 && "Qm is supposed to be encodable in 3 bits"); + + return (Rn << 3) | Qm; +} + +/// getMveAddrModeRQOpValue - Return encoding info for 'reg, vreg' +/// operand. +template +uint32_t ARMMCCodeEmitter:: +getMveAddrModeQOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + // {10-8} Qm + // {7-0} Imm + const MCOperand &M0 = MI.getOperand(OpIdx); + const MCOperand &M1 = MI.getOperand(OpIdx + 1); + + unsigned Qm = CTX.getRegisterInfo()->getEncodingValue(M0.getReg()); + int32_t Imm = M1.getImm(); + + bool isAdd = Imm >= 0; + + Imm >>= shift; + + if (!isAdd) + Imm = -(uint32_t)Imm; + + Imm &= 0x7f; + + if (isAdd) + Imm |= 0x80; + + assert(Qm < 8 && "Qm is supposed to be encodable in 3 bits"); + + return (Qm << 8) | Imm; +} + /// getT2AddrModeImm8s4OpValue - Return encoding info for /// 'reg +/- imm8<<2' operand. uint32_t ARMMCCodeEmitter:: @@ -1002,6 +1148,33 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx, return Binary; } +/// getT2AddrModeImm7s4OpValue - Return encoding info for +/// 'reg +/- imm7<<2' operand. +uint32_t +ARMMCCodeEmitter::getT2AddrModeImm7s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + // {11-8} = reg + // {7} = (A)dd (add == '1', sub == '0') + // {6-0} = imm7 + unsigned Reg, Imm7; + // If The first operand isn't a register, we have a label reference. + bool isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm7, Fixups, STI); + + // FIXME: The immediate operand should have already been encoded like this + // before ever getting here. The encoder method should just need to combine + // the MI operands for the register and the offset into a single + // representation for the complex operand in the .td file. This isn't just + // style, unfortunately. As-is, we can't represent the distinct encoding + // for #-0. + uint32_t Binary = (Imm7 >> 2) & 0xff; + // Immediate is always encoded as positive. The 'A' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 7); + Binary |= (Reg << 8); + return Binary; +} + /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for /// 'reg + imm8<<2' operand. 
uint32_t ARMMCCodeEmitter:: @@ -1434,25 +1607,29 @@ getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum, return Value; } +template unsigned ARMMCCodeEmitter:: -getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +getT2AddrModeImmOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO1 = MI.getOperand(OpNum); const MCOperand &MO2 = MI.getOperand(OpNum+1); // FIXME: Needs fixup support. unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg()); - // Even though the immediate is 8 bits long, we need 9 bits in order + // If the immediate is B bits long, we need B+1 bits in order // to represent the (inverse of the) sign bit. - Value <<= 9; + Value <<= (Bits + 1); int32_t tmp = (int32_t)MO2.getImm(); - if (tmp < 0) + if (tmp == INT32_MIN) { // represents subtracting zero rather than adding it + tmp = 0; + } else if (tmp < 0) { tmp = abs(tmp); - else - Value |= 256; // Set the ADD bit - Value |= tmp & 255; + } else { + Value |= (1U << Bits); // Set the ADD bit + } + Value |= (tmp >> Shift) & ((1U << Bits) - 1); return Value; } @@ -1534,7 +1711,7 @@ unsigned ARMMCCodeEmitter:: getRegisterListOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { - // VLDM/VSTM: + // VLDM/VSTM/VSCCLRM: // {12-8} = Vd // {7-0} = Number of registers // @@ -1543,28 +1720,40 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, unsigned Reg = MI.getOperand(Op).getReg(); bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg); bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg); + bool CLRMRegs = MI.getOpcode() == ARM::t2CLRM; unsigned Binary = 0; if (SPRRegs || DPRRegs) { - // VLDM/VSTM + // VLDM/VSTM/VSCCLRM unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg); unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff; Binary |= (RegNo & 0x1f) << 8; + + // Ignore VPR + if (MI.getOpcode() == ARM::VSCCLRMD || MI.getOpcode() == ARM::VSCCLRMS) + --NumRegs; if (SPRRegs) Binary |= NumRegs; else Binary |= NumRegs * 2; } else { const MCRegisterInfo &MRI = *CTX.getRegisterInfo(); - assert(std::is_sorted(MI.begin() + Op, MI.end(), - [&](const MCOperand &LHS, const MCOperand &RHS) { - return MRI.getEncodingValue(LHS.getReg()) < - MRI.getEncodingValue(RHS.getReg()); - })); + if (!CLRMRegs) { + assert(std::is_sorted(MI.begin() + Op, MI.end(), + [&](const MCOperand &LHS, const MCOperand &RHS) { + return MRI.getEncodingValue(LHS.getReg()) < + MRI.getEncodingValue(RHS.getReg()); + })); + } for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) { - unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg()); + unsigned RegNo; + if (CLRMRegs && MI.getOperand(I).getReg() == ARM::APSR) { + RegNo = 15; + } else { + RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg()); + } Binary |= 1 << RegNo; } } @@ -1710,6 +1899,120 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, ++MCNumEmitted; // Keep track of the # of mi's emitted. } +template +uint32_t +ARMMCCodeEmitter::getBFTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, fixup, Fixups, STI); + return isNeg ? 
-(MO.getImm() >> 1) : (MO.getImm() >> 1); +} + +uint32_t +ARMMCCodeEmitter::getBFAfterTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + const MCOperand BranchMO = MI.getOperand(0); + + if (MO.isExpr()) { + assert(BranchMO.isExpr()); + const MCExpr *DiffExpr = MCBinaryExpr::createSub( + MO.getExpr(), BranchMO.getExpr(), CTX); + MCFixupKind Kind = MCFixupKind(ARM::fixup_bfcsel_else_target); + Fixups.push_back(llvm::MCFixup::create(0, DiffExpr, Kind, MI.getLoc())); + return 0; + } + + assert(MO.isImm() && BranchMO.isImm()); + int Diff = MO.getImm() - BranchMO.getImm(); + assert(Diff == 4 || Diff == 2); + + return Diff == 4; +} + +uint32_t ARMMCCodeEmitter::getVPTMaskOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI)const { + const MCOperand MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Unexpected operand type!"); + + int Value = MO.getImm(); + int Imm = 0; + + // VPT Masks are actually encoded as a series of invert/don't invert bits, + // rather than true/false bits. + unsigned PrevBit = 0; + for (int i = 3; i >= 0; --i) { + unsigned Bit = (Value >> i) & 1; + + // Check if we are at the end of the mask. + if ((Value & ~(~0U << i)) == 0) { + Imm |= (1 << i); + break; + } + + // Convert the bit in the mask based on the previous bit. + if (Bit != PrevBit) + Imm |= (1 << i); + + PrevBit = Bit; + } + + return Imm; +} + +uint32_t ARMMCCodeEmitter::getRestrictedCondCodeOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Unexpected operand type!"); + + switch (MO.getImm()) { + default: + assert(0 && "Unexpected Condition!"); + return 0; + case ARMCC::HS: + case ARMCC::EQ: + return 0; + case ARMCC::HI: + case ARMCC::NE: + return 1; + case ARMCC::GE: + return 4; + case ARMCC::LT: + return 5; + case ARMCC::GT: + return 6; + case ARMCC::LE: + return 7; + } +} + +uint32_t ARMMCCodeEmitter:: +getPowerTwoOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Unexpected operand type!"); + return countTrailingZeros((uint64_t)MO.getImm()); +} + +template +uint32_t ARMMCCodeEmitter:: +getMVEPairVectorIndexOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Unexpected operand type!"); + + int Value = MO.getImm(); + return Value - start; +} + #include "ARMGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp index 306f068312f5..fbad05fb1759 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp @@ -1,9 +1,8 @@ //===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h index 75dde8008fca..033a43288f3e 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -1,9 +1,8 @@ //===-- ARMMCExpr.h - ARM specific MC expression classes --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 46434007a854..90022a8d88a6 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- ARMMCTargetDesc.cpp - ARM Target Descriptions ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,8 +12,9 @@ #include "ARMMCTargetDesc.h" #include "ARMBaseInfo.h" +#include "ARMInstPrinter.h" #include "ARMMCAsmInfo.h" -#include "InstPrinter/ARMInstPrinter.h" +#include "TargetInfo/ARMTargetInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" @@ -277,14 +277,29 @@ class ThumbMCInstrAnalysis : public ARMMCInstrAnalysis { public: ThumbMCInstrAnalysis(const MCInstrInfo *Info) : ARMMCInstrAnalysis(Info) {} - bool evaluateBranch(const MCInst &Inst, uint64_t Addr, - uint64_t Size, uint64_t &Target) const override { + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override { + unsigned OpId; + switch (Inst.getOpcode()) { + default: + OpId = 0; + break; + case ARM::t2WLS: + case ARM::t2LEUpdate: + OpId = 2; + break; + case ARM::t2LE: + OpId = 1; + break; + } + // We only handle PCRel branches for now. - if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL) + if (Info->get(Inst.getOpcode()).OpInfo[OpId].OperandType != + MCOI::OPERAND_PCREL) return false; - int64_t Imm = Inst.getOperand(0).getImm(); - Target = Addr+Imm+4; // In Thumb mode the PC is always off by 4 bytes. + // In Thumb mode the PC is always off by 4 bytes. + Target = Addr + Inst.getOperand(OpId).getImm() + 4; return true; } }; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 3ee004592ac6..9cbbd56225ef 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- ARMMCTargetDesc.h - ARM Target Descriptions -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,6 +14,7 @@ #define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H #include "llvm/Support/DataTypes.h" +#include "llvm/MC/MCInstrDesc.h" #include #include @@ -39,11 +39,6 @@ class Triple; class raw_ostream; class raw_pwrite_stream; -Target &getTheARMLETarget(); -Target &getTheThumbLETarget(); -Target &getTheARMBETarget(); -Target &getTheThumbBETarget(); - namespace ARM_MC { std::string ParseARMTriple(const Triple &TT, StringRef CPU); @@ -100,6 +95,20 @@ createARMWinCOFFObjectWriter(bool Is64Bit); /// Construct ARM Mach-O relocation info. MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx); + +namespace ARM { +enum OperandType { + OPERAND_VPRED_R = MCOI::OPERAND_FIRST_TARGET, + OPERAND_VPRED_N, +}; +inline bool isVpred(OperandType op) { + return op == OPERAND_VPRED_R || op == OPERAND_VPRED_N; +} +inline bool isVpred(uint8_t op) { + return isVpred(static_cast(op)); +} +} // end namespace ARM + } // End llvm namespace // Defines symbolic names for ARM registers. This defines a mapping from diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp index 6259c98321f4..886b7e7bc84e 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp @@ -1,9 +1,8 @@ //===- ARMMachORelocationInfo.cpp -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 0ced8195790d..c49885023cb2 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index 91836cff95c8..b863517c0cca 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -1,9 +1,8 @@ //===- ARMTargetStreamer.cpp - ARMTargetStreamer class --*- C++ -*---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -125,7 +124,9 @@ static ARMBuildAttrs::CPUArch getArchForCPU(const MCSubtargetInfo &STI) { if (STI.hasFeature(ARM::FeatureRClass)) return ARMBuildAttrs::v8_R; return ARMBuildAttrs::v8_A; - } else if (STI.hasFeature(ARM::HasV8MMainlineOps)) + } else if (STI.hasFeature(ARM::HasV8_1MMainlineOps)) + return ARMBuildAttrs::v8_1_M_Main; + else if (STI.hasFeature(ARM::HasV8MMainlineOps)) return ARMBuildAttrs::v8_M_Main; else if (STI.hasFeature(ARM::HasV7Ops)) { if (STI.hasFeature(ARM::FeatureMClass) && STI.hasFeature(ARM::FeatureDSP)) @@ -223,37 +224,37 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { ? ARMBuildAttrs::AllowNeonARMv8_1a : ARMBuildAttrs::AllowNeonARMv8); } else { - if (STI.hasFeature(ARM::FeatureFPARMv8)) + if (STI.hasFeature(ARM::FeatureFPARMv8_D16_SP)) // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one // FPU, but there are two different names for it depending on the CPU. - emitFPU(STI.hasFeature(ARM::FeatureD16) - ? (STI.hasFeature(ARM::FeatureVFPOnlySP) ? ARM::FK_FPV5_SP_D16 - : ARM::FK_FPV5_D16) - : ARM::FK_FP_ARMV8); - else if (STI.hasFeature(ARM::FeatureVFP4)) - emitFPU(STI.hasFeature(ARM::FeatureD16) - ? (STI.hasFeature(ARM::FeatureVFPOnlySP) ? ARM::FK_FPV4_SP_D16 - : ARM::FK_VFPV4_D16) - : ARM::FK_VFPV4); - else if (STI.hasFeature(ARM::FeatureVFP3)) + emitFPU(STI.hasFeature(ARM::FeatureD32) + ? ARM::FK_FP_ARMV8 + : (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_FPV5_D16 + : ARM::FK_FPV5_SP_D16)); + else if (STI.hasFeature(ARM::FeatureVFP4_D16_SP)) + emitFPU(STI.hasFeature(ARM::FeatureD32) + ? ARM::FK_VFPV4 + : (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_VFPV4_D16 + : ARM::FK_FPV4_SP_D16)); + else if (STI.hasFeature(ARM::FeatureVFP3_D16_SP)) emitFPU( - STI.hasFeature(ARM::FeatureD16) - // +d16 - ? (STI.hasFeature(ARM::FeatureVFPOnlySP) - ? (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16 - : ARM::FK_VFPV3XD) - : (STI.hasFeature(ARM::FeatureFP16) + STI.hasFeature(ARM::FeatureD32) + // +d32 + ? (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3_FP16 + : ARM::FK_VFPV3) + // -d32 + : (STI.hasFeature(ARM::FeatureFP64) + ? (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3_D16_FP16 - : ARM::FK_VFPV3_D16)) - // -d16 - : (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3_FP16 - : ARM::FK_VFPV3)); - else if (STI.hasFeature(ARM::FeatureVFP2)) + : ARM::FK_VFPV3_D16) + : (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16 + : ARM::FK_VFPV3XD))); + else if (STI.hasFeature(ARM::FeatureVFP2_D16_SP)) emitFPU(ARM::FK_VFPV2); } // ABI_HardFP_use attribute to indicate single precision FP. 
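// [Editorial aside, not part of the patch] The rewritten test below derives
// "single precision only" from having some VFP2-level FP
// (FeatureVFP2_D16_SP) while lacking FeatureFP64, replacing the old
// dedicated FeatureVFPOnlySP bit; e.g. an FPv4-SP core such as Cortex-M4
// still gets HardFPSinglePrecision under the new scheme.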
- if (STI.hasFeature(ARM::FeatureVFPOnlySP)) + if (STI.hasFeature(ARM::FeatureVFP2_D16_SP) && !STI.hasFeature(ARM::FeatureFP64)) emitAttribute(ARMBuildAttrs::ABI_HardFP_use, ARMBuildAttrs::HardFPSinglePrecision); @@ -263,6 +264,11 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { if (STI.hasFeature(ARM::FeatureMP)) emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP); + if (STI.hasFeature(ARM::HasMVEFloatOps)) + emitAttribute(ARMBuildAttrs::MVE_arch, ARMBuildAttrs::AllowMVEIntegerAndFloat); + else if (STI.hasFeature(ARM::HasMVEIntegerOps)) + emitAttribute(ARMBuildAttrs::MVE_arch, ARMBuildAttrs::AllowMVEInteger); + // Hardware divide in ARM mode is part of base arch, starting from ARMv8. // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M). // It is not possible to produce DisallowDIV: if hwdiv is present in the base diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp index d3ab83bbccbc..38667d686b85 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp @@ -1,9 +1,8 @@ //===-- ARMUnwindOpAsm.cpp - ARM Unwind Opcodes Assembler -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h index a7bfbdf4938e..c3134c04b33a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h +++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h @@ -1,9 +1,8 @@ //===-- ARMUnwindOpAsm.h - ARM Unwind Opcodes Assembler ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 30cbde1ca71f..054a95dd1e12 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- ARMWinCOFFObjectWriter.cpp - ARM Windows COFF Object Writer -- C++ -==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index 32cb3dcdcad8..2e816bea5e91 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -1,9 +1,8 @@ //===-- ARMWinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 7f03e1463c1d..4b25986b90a7 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -1,9 +1,8 @@ //===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp index b0491a4108a6..86cb907abfa3 100644 --- a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp +++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp @@ -1,13 +1,12 @@ //===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "TargetInfo/ARMTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.h b/lib/Target/ARM/TargetInfo/ARMTargetInfo.h new file mode 100644 index 000000000000..c217dd5c4612 --- /dev/null +++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.h @@ -0,0 +1,23 @@ +//===-- ARMTargetInfo.h - ARM Target Implementation -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_TARGETINFO_ARMTARGETINFO_H +#define LLVM_LIB_TARGET_ARM_TARGETINFO_ARMTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheARMLETarget(); +Target &getTheARMBETarget(); +Target &getTheThumbLETarget(); +Target &getTheThumbBETarget(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_ARM_TARGETINFO_ARMTARGETINFO_H diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 5c745e112b2e..426e9a0ed9b8 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -1,9 +1,8 @@ //===- Thumb1FrameLowering.cpp - Thumb1 Frame Information -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -64,15 +63,52 @@ bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{ return !MFI.hasVarSizedObjects(); } -static void emitSPUpdate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const TargetInstrInfo &TII, const DebugLoc &dl, - const ThumbRegisterInfo &MRI, int NumBytes, - unsigned MIFlags = MachineInstr::NoFlags) { +static void +emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetInstrInfo &TII, const DebugLoc &dl, + const ThumbRegisterInfo &MRI, int NumBytes, + unsigned ScratchReg, unsigned MIFlags) { + // If it would take more than three instructions to adjust the stack pointer + // using tADDspi/tSUBspi, load an immediate instead. + if (std::abs(NumBytes) > 508 * 3) { + // We use a different codepath here from the normal + // emitThumbRegPlusImmediate so we don't have to deal with register + // scavenging. (Scavenging could try to use the emergency spill slot + // before we've actually finished setting up the stack.) + if (ScratchReg == ARM::NoRegister) + report_fatal_error("Failed to emit Thumb1 stack adjustment"); + MachineFunction &MF = *MBB.getParent(); + const ARMSubtarget &ST = MF.getSubtarget(); + if (ST.genExecuteOnly()) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ScratchReg) + .addImm(NumBytes).setMIFlags(MIFlags); + } else { + MRI.emitLoadConstPool(MBB, MBBI, dl, ScratchReg, 0, NumBytes, ARMCC::AL, + 0, MIFlags); + } + BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDhirr), ARM::SP) + .addReg(ARM::SP).addReg(ScratchReg, RegState::Kill) + .add(predOps(ARMCC::AL)); + return; + } + // FIXME: This is assuming the heuristics in emitThumbRegPlusImmediate + // won't change. 
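A compact restatement of the threshold used in emitPrologueEpilogueSPUpdate above, as a hypothetical standalone helper: tADDspi/tSUBspi carry a 7-bit immediate scaled by 4, so a single instruction can move SP by at most 508 bytes, and the scratch-register path only fires once three of them cannot cover the adjustment (a sketch, not part of the patch):

#include <cstdlib>

static bool needsScratchForSPAdjust(int NumBytes) {
  const int MaxPerInst = 127 * 4;             // 508 bytes per tADDspi/tSUBspi
  return std::abs(NumBytes) > 3 * MaxPerInst; // i.e. beyond 1524 bytes
}

For smaller adjustments the function falls through to emitThumbRegPlusImmediate, as the hunk continues below.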
emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, MRI, MIFlags); + } +static void emitCallSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetInstrInfo &TII, const DebugLoc &dl, + const ThumbRegisterInfo &MRI, int NumBytes, + unsigned MIFlags = MachineInstr::NoFlags) { + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, + MRI, MIFlags); +} + + MachineBasicBlock::iterator Thumb1FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { @@ -96,10 +132,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // Replace the pseudo instruction with a new instruction... unsigned Opc = Old.getOpcode(); if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { - emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount); + emitCallSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount); } else { assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); - emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount); + emitCallSPUpdate(MBB, I, TII, dl, *RegInfo, Amount); } } } @@ -142,8 +178,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, int FramePtrSpillFI = 0; if (ArgRegsSaveSize) { - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize, - MachineInstr::FrameSetup); + emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize, + ARM::NoRegister, MachineInstr::FrameSetup); CFAOffset -= ArgRegsSaveSize; unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset)); @@ -154,8 +190,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) { - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize), - MachineInstr::FrameSetup); + emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, + -(NumBytes - ArgRegsSaveSize), + ARM::NoRegister, MachineInstr::FrameSetup); CFAOffset -= NumBytes - ArgRegsSaveSize; unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset)); @@ -332,8 +369,20 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes) { // Insert it after all the callee-save spills. - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, - MachineInstr::FrameSetup); + // + // For a large stack frame, we might need a scratch register to store + // the size of the frame. We know all callee-save registers are free + // at this point in the prologue, so pick one. + unsigned ScratchRegister = ARM::NoRegister; + for (auto &I : CSI) { + unsigned Reg = I.getReg(); + if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) { + ScratchRegister = Reg; + break; + } + } + emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, + ScratchRegister, MachineInstr::FrameSetup); if (!HasFP) { CFAOffset -= NumBytes; unsigned CFIIndex = MF.addFrameInst( @@ -438,7 +487,9 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize); + emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, + NumBytes - ArgRegsSaveSize, ARM::NoRegister, + MachineInstr::NoFlags); } else { // Unwind MBBI to point to first LDR / VLDRD. 
if (MBBI != MBB.begin()) { @@ -473,13 +524,27 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, .addReg(FramePtr) .add(predOps(ARMCC::AL)); } else { + // For a large stack frame, we might need a scratch register to store + // the size of the frame. We know all callee-save registers are free + // at this point in the epilogue, so pick one. + unsigned ScratchRegister = ARM::NoRegister; + bool HasFP = hasFP(MF); + for (auto &I : MFI.getCalleeSavedInfo()) { + unsigned Reg = I.getReg(); + if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) { + ScratchRegister = Reg; + break; + } + } if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET && &MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) { MachineBasicBlock::iterator PMBBI = std::prev(MBBI); if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes)) - emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); + emitPrologueEpilogueSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes, + ScratchRegister, MachineInstr::NoFlags); } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes)) - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes); + emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes, + ScratchRegister, MachineInstr::NoFlags); } } @@ -666,7 +731,9 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, // Advance past the pop instruction. MBBI++; // Increment the SP. - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize + 4); + emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, + ArgRegsSaveSize + 4, ARM::NoRegister, + MachineInstr::NoFlags); return true; } @@ -707,7 +774,8 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, .add(predOps(ARMCC::AL)) .addReg(PopReg, RegState::Define); - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); + emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize, + ARM::NoRegister, MachineInstr::NoFlags); BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) .addReg(ARM::LR, RegState::Define) @@ -821,8 +889,9 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd); // Create the PUSH, but don't insert it yet (the MOVs need to come first). - MachineInstrBuilder PushMIB = - BuildMI(MF, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); + MachineInstrBuilder PushMIB = BuildMI(MF, DL, TII.get(ARM::tPUSH)) + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameSetup); SmallVector RegsToPush; while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) { @@ -835,7 +904,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr)) .addReg(*CopyReg, RegState::Define) .addReg(*HiRegToSave, getKillRegState(isKill)) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameSetup); // Record the register that must be added to the PUSH. RegsToPush.push_back(*CopyReg); diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h index a4d6451ccf12..61af48712b6c 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.h +++ b/lib/Target/ARM/Thumb1FrameLowering.h @@ -1,9 +1,8 @@ //===- Thumb1FrameLowering.h - Thumb1-specific frame info stuff ---*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 11aa285fc939..f57d93a2e83d 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -1,9 +1,8 @@ //===-- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index 9f04a3ed262f..bc433e7a7a93 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -1,9 +1,8 @@ //===-- Thumb1InstrInfo.h - Thumb-1 Instruction Information -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index e0a5f7f04fa9..3143eb9840ed 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -1,9 +1,8 @@ //===-- Thumb2ITBlockPass.cpp - Insert Thumb-2 IT blocks ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -32,13 +31,16 @@ using namespace llvm; #define DEBUG_TYPE "thumb2-it" +#define PASS_NAME "Thumb IT blocks insertion pass" STATISTIC(NumITs, "Number of IT blocks inserted"); STATISTIC(NumMovedInsts, "Number of predicated instructions moved"); +using RegisterSet = SmallSet<unsigned, 4>; + namespace { - class Thumb2ITBlockPass : public MachineFunctionPass { + class Thumb2ITBlock : public MachineFunctionPass { public: static char ID; @@ -47,7 +49,7 @@ namespace { const TargetRegisterInfo *TRI; ARMFunctionInfo *AFI; - Thumb2ITBlockPass() : MachineFunctionPass(ID) {} + Thumb2ITBlock() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &Fn) override; @@ -57,33 +59,32 @@ } StringRef getPassName() const override { - return "Thumb IT blocks insertion pass"; + return PASS_NAME; } private: bool MoveCopyOutOfITBlock(MachineInstr *MI, ARMCC::CondCodes CC, ARMCC::CondCodes OCC, - SmallSet<unsigned, 4> &Defs, - SmallSet<unsigned, 4> &Uses); - bool InsertITInstructions(MachineBasicBlock &MBB); + RegisterSet &Defs, RegisterSet &Uses); + bool InsertITInstructions(MachineBasicBlock &Block); }; - char Thumb2ITBlockPass::ID = 0; + char Thumb2ITBlock::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(Thumb2ITBlock, DEBUG_TYPE, PASS_NAME, false, false) + /// TrackDefUses - Tracking what registers are being defined and used by /// instructions in the IT block. This also tracks "dependencies", i.e. uses /// in the IT block that are defined before the IT instruction. -static void TrackDefUses(MachineInstr *MI, - SmallSet<unsigned, 4> &Defs, - SmallSet<unsigned, 4> &Uses, +static void TrackDefUses(MachineInstr *MI, RegisterSet &Defs, RegisterSet &Uses, const TargetRegisterInfo *TRI) { - SmallVector<unsigned, 4> LocalDefs; - SmallVector<unsigned, 4> LocalUses; + using RegList = SmallVector<unsigned, 4>; + RegList LocalDefs; + RegList LocalUses; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (auto &MO : MI->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); @@ -95,27 +96,21 @@ static void TrackDefUses(MachineInstr *MI, LocalDefs.push_back(Reg); } - for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { - unsigned Reg = LocalUses[i]; - for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true); - Subreg.isValid(); ++Subreg) - Uses.insert(*Subreg); - } + auto InsertUsesDefs = [&](RegList &Regs, RegisterSet &UsesDefs) { + for (unsigned Reg : Regs) + for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true); + Subreg.isValid(); ++Subreg) + UsesDefs.insert(*Subreg); + }; - for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { - unsigned Reg = LocalDefs[i]; - for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true); - Subreg.isValid(); ++Subreg) - Defs.insert(*Subreg); - if (Reg == ARM::CPSR) - continue; - } + InsertUsesDefs(LocalDefs, Defs); + InsertUsesDefs(LocalUses, Uses); } /// Clear kill flags for any uses in the given set. This will likely /// conservatively remove more kill flags than are necessary, but removing them /// is safer than incorrect kill flags remaining on instructions.
-static void ClearKillFlags(MachineInstr *MI, SmallSet<unsigned, 4> &Uses) { +static void ClearKillFlags(MachineInstr *MI, RegisterSet &Uses) { for (MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isDef() || !MO.isKill()) continue; @@ -138,10 +133,9 @@ static bool isCopy(MachineInstr *MI) { } bool -Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, - ARMCC::CondCodes CC, ARMCC::CondCodes OCC, - SmallSet<unsigned, 4> &Defs, - SmallSet<unsigned, 4> &Uses) { +Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + RegisterSet &Defs, RegisterSet &Uses) { if (!isCopy(MI)) return false; // llvm models select's as two-address instructions. That means a copy @@ -181,10 +175,13 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, // Then peek at the next instruction to see if it's predicated on CC or OCC. // If not, then there is nothing to be gained by moving the copy. - MachineBasicBlock::iterator I = MI; ++I; + MachineBasicBlock::iterator I = MI; + ++I; MachineBasicBlock::iterator E = MI->getParent()->end(); + while (I != E && I->isDebugInstr()) ++I; + if (I != E) { unsigned NPredReg = 0; ARMCC::CondCodes NCC = getITInstrPredicate(*I, NPredReg); @@ -194,12 +191,11 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, return false; } -bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { +bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) { bool Modified = false; - - SmallSet<unsigned, 4> Defs; - SmallSet<unsigned, 4> Uses; + RegisterSet Defs, Uses; MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { MachineInstr *MI = &*MBBI; DebugLoc dl = MI->getDebugLoc(); @@ -246,7 +242,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { unsigned NPredReg = 0; ARMCC::CondCodes NCC = getITInstrPredicate(*NMI, NPredReg); if (NCC == CC || NCC == OCC) { - Mask |= (NCC & 1) << Pos; + Mask |= ((NCC ^ CC) & 1) << Pos; // Add implicit use of ITSTATE. NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/, true/*isImp*/, false/*isKill*/)); @@ -270,8 +266,6 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { // Finalize IT mask. Mask |= (1 << Pos); - // Tag along (firstcond[0] << 4) with the mask. - Mask |= (CC & 1) << 4; MIB.addImm(Mask); // Last instruction in IT block kills ITSTATE. @@ -288,7 +282,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { return Modified; } -bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { +bool Thumb2ITBlock::runOnMachineFunction(MachineFunction &Fn) { const ARMSubtarget &STI = static_cast<const ARMSubtarget &>(Fn.getSubtarget()); if (!STI.isThumb2()) return false; @@ -302,11 +296,8 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { return false; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) { - MachineBasicBlock &MBB = *MFI; - ++MFI; + for (auto &MBB : Fn) Modified |= InsertITInstructions(MBB); - } if (Modified) AFI->setHasITBlocks(true); @@ -316,6 +307,132 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { /// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks /// insertion pass.
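Stepping back to the mask computation changed in this hunk: each of the up to three trailing slots of the IT instruction now records whether its condition equals the first condition (then) or is its opposite (else), rather than storing the raw low bit of the condition and tagging firstcond[0] on at bit 4. A sketch under that reading (hypothetical helper, assuming the LLVM headers of the file above; only the bit arithmetic mirrors the hunk):

static unsigned computeITMask(ARMCC::CondCodes FirstCC,
                              ArrayRef<ARMCC::CondCodes> Trailing) {
  unsigned Mask = 0, Pos = 3;
  for (ARMCC::CondCodes CC : Trailing) { // at most three further instructions
    Mask |= ((CC ^ FirstCC) & 1) << Pos; // bit clear = then, bit set = else
    --Pos;
  }
  Mask |= (1 << Pos); // a trailing 1 terminates the block
  return Mask;
}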
-FunctionPass *llvm::createThumb2ITBlockPass() { - return new Thumb2ITBlockPass(); +FunctionPass *llvm::createThumb2ITBlockPass() { return new Thumb2ITBlock(); } + +#undef DEBUG_TYPE +#define DEBUG_TYPE "arm-mve-vpt" + +namespace { + class MVEVPTBlock : public MachineFunctionPass { + public: + static char ID; + const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; + + MVEVPTBlock() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &Fn) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return "MVE VPT block insertion pass"; + } + + private: + bool InsertVPTBlocks(MachineBasicBlock &MBB); + }; + + char MVEVPTBlock::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false) + +enum VPTMaskValue { + T = 8, // 0b1000 + TT = 4, // 0b0100 + TE = 12, // 0b1100 + TTT = 2, // 0b0010 + TTE = 6, // 0b0110 + TEE = 10, // 0b1010 + TET = 14, // 0b1110 + TTTT = 1, // 0b0001 + TTTE = 3, // 0b0011 + TTEE = 5, // 0b0101 + TTET = 7, // 0b0111 + TEEE = 9, // 0b1001 + TEET = 11, // 0b1011 + TETT = 13, // 0b1101 + TETE = 15 // 0b1111 +}; + +bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { + bool Modified = false; + MachineBasicBlock::iterator MBIter = Block.begin(); + MachineBasicBlock::iterator EndIter = Block.end(); + + while (MBIter != EndIter) { + MachineInstr *MI = &*MBIter; + unsigned PredReg = 0; + DebugLoc dl = MI->getDebugLoc(); + + ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg); + + // The idea of the predicate is that None, Then and Else are for use when + // handling assembly language: they correspond to the three possible + // suffixes "", "t" and "e" on the mnemonic. So when instructions are read + // from assembly source or disassembled from object code, you expect to see + // a mixture whenever there's a long VPT block. But in code generation, we + // hope we'll never generate an Else as input to this pass. 
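Given that convention, a minimal illustration of how a caller distinguishes predicated from unpredicated instructions, using the getVPTInstrPredicate helper this patch adds to Thumb2InstrInfo (the wrapper itself is hypothetical):

static bool isVPTPredicated(const MachineInstr &MI) {
  unsigned PredReg = 0;
  return getVPTInstrPredicate(MI, PredReg) != ARMVCC::None;
}

The pass below applies exactly this None check before opening a VPST block.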
+ + assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds"); + + if (Pred == ARMVCC::None) { + ++MBIter; + continue; + } + + MachineInstrBuilder MIBuilder = + BuildMI(Block, MBIter, dl, TII->get(ARM::MVE_VPST)); + // The mask value for the VPST instruction is T = 0b1000 = 8 + MIBuilder.addImm(VPTMaskValue::T); + + MachineBasicBlock::iterator VPSTInsertPos = MIBuilder.getInstr(); + int VPTInstCnt = 1; + ARMVCC::VPTCodes NextPred; + + do { + ++MBIter; + NextPred = getVPTInstrPredicate(*MBIter, PredReg); + } while (NextPred != ARMVCC::None && NextPred == Pred && ++VPTInstCnt < 4); + + MachineInstr *LastMI = &*MBIter; + finalizeBundle(Block, VPSTInsertPos.getInstrIterator(), + ++LastMI->getIterator()); + + Modified = true; + LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump();); + + ++MBIter; + } + return Modified; +} + +bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { + const ARMSubtarget &STI = + static_cast<const ARMSubtarget &>(Fn.getSubtarget()); + + if (!STI.isThumb2() || !STI.hasMVEIntegerOps()) + return false; + + TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); + TRI = STI.getRegisterInfo(); + + LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n" + << "********** Function: " << Fn.getName() << '\n'); + + bool Modified = false; + for (MachineBasicBlock &MBB : Fn) + Modified |= InsertVPTBlocks(MBB); + + LLVM_DEBUG(dbgs() << "**************************************\n"); + return Modified; } + +/// createMVEVPTBlock - Returns an instance of the MVE VPT block +/// insertion pass. +FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index d567d3339049..5a965f7a6b9b 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -1,9 +1,8 @@ //===- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -162,7 +161,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // otherwise). if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { MachineRegisterInfo *MRI = &MF.getRegInfo(); - MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass); + MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass); } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); @@ -204,7 +203,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (TargetRegisterInfo::isVirtualRegister(DestReg)) { MachineRegisterInfo *MRI = &MF.getRegInfo(); MRI->constrainRegClass(DestReg, - &ARM::GPRPair_with_gsub_1_in_rGPRRegClass); + &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass); } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); @@ -478,7 +477,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, bool isSub = false; // Memory operands in inline assembly always use AddrModeT2_i12. - if (Opcode == ARM::INLINEASM) + if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
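The rewriteT2FrameIndex hunk continuing below adds the MVE AddrModeT2_i7* cases; their NumBits/OffsetMask table amounts to a 7-bit offset field pre-scaled by 1, 2 or 4, so an in-range byte offset must be scale-aligned and smaller than 128 times the scale. A hypothetical checker capturing that rule (a sketch, not code from the patch):

#include <cstdlib>

static bool fitsAddrModeT2i7(int Offset, int Scale) { // Scale is 1, 2 or 4
  // Mirrors NumBits = 7 + log2(Scale) and OffsetMask = Scale - 1 below.
  return Offset % Scale == 0 && std::abs(Offset) < 128 * Scale;
}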
if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) { @@ -611,9 +610,23 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, Offset = -Offset; isSub = true; } + } else if (AddrMode == ARMII::AddrModeT2_i7s4 || + AddrMode == ARMII::AddrModeT2_i7s2 || + AddrMode == ARMII::AddrModeT2_i7) { + Offset += MI.getOperand(FrameRegIdx + 1).getImm(); + unsigned OffsetMask; + switch (AddrMode) { + case ARMII::AddrModeT2_i7s4: NumBits = 9; OffsetMask = 0x3; break; + case ARMII::AddrModeT2_i7s2: NumBits = 8; OffsetMask = 0x1; break; + default: NumBits = 7; OffsetMask = 0x0; break; + } + // MCInst operand expects already scaled value. + Scale = 1; + assert((Offset & OffsetMask) == 0 && "Can't encode this offset!"); + (void)OffsetMask; // squash unused-variable warning at -NDEBUG } else if (AddrMode == ARMII::AddrModeT2_i8s4) { Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4; - NumBits = 10; // 8 bits scaled by 4 + NumBits = 8 + 2; // MCInst operand expects already scaled value. Scale = 1; assert((Offset & 3) == 0 && "Can't encode this offset!"); @@ -639,7 +652,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, // Replace the FrameIndex with fp/sp MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); if (isSub) { - if (AddrMode == ARMII::AddrMode5) + if (AddrMode == ARMII::AddrMode5 || AddrMode == ARMII::AddrMode5FP16) // FIXME: Not consistent. ImmedOffset |= 1 << NumBits; else @@ -653,7 +666,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, // Otherwise, offset doesn't fit. Pull in what we can to simplify ImmedOffset = ImmedOffset & Mask; if (isSub) { - if (AddrMode == ARMII::AddrMode5) + if (AddrMode == ARMII::AddrMode5 || AddrMode == ARMII::AddrMode5FP16) // FIXME: Not consistent. ImmedOffset |= 1 << NumBits; else { @@ -678,3 +691,28 @@ ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI, return ARMCC::AL; return getInstrPredicate(MI, PredReg); } + +int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + if (!MCID.OpInfo) + return -1; + + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) + if (ARM::isVpred(MCID.OpInfo[i].OperandType)) + return i; + + return -1; +} + +ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI, + unsigned &PredReg) { + int PIdx = findFirstVPTPredOperandIdx(MI); + if (PIdx == -1) { + PredReg = 0; + return ARMVCC::None; + } + + PredReg = MI.getOperand(PIdx+1).getReg(); + return (ARMVCC::VPTCodes)MI.getOperand(PIdx).getImm(); +} diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index c834ba73bfea..a6712d5a0e72 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -1,9 +1,8 @@ //===-- Thumb2InstrInfo.h - Thumb-2 Instruction Information -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -69,6 +68,12 @@ private: /// to llvm::getInstrPredicate except it returns AL for conditional branch /// instructions which are "predicated", but are not in IT blocks. 
ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg); + +// getVPTInstrPredicate: VPT analogue of that, plus a helper function +// corresponding to MachineInstr::findFirstPredOperandIdx. +int findFirstVPTPredOperandIdx(const MachineInstr &MI); +ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI, + unsigned &PredReg); } #endif diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 65889fc4e28b..37a85fa38417 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -1,9 +1,8 @@ //===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -454,7 +453,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, break; case ARM::t2LDR_POST: case ARM::t2STR_POST: { - if (!MBB.getParent()->getFunction().optForMinSize()) + if (!MinimizeSize) return false; if (!MI->hasOneMemOperand() || @@ -1128,8 +1127,8 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { TII = static_cast(STI->getInstrInfo()); // Optimizing / minimizing size? Minimizing size implies optimizing for size. - OptimizeSize = MF.getFunction().optForSize(); - MinimizeSize = MF.getFunction().optForMinSize(); + OptimizeSize = MF.getFunction().hasOptSize(); + MinimizeSize = STI->hasMinSize(); BlockInfo.clear(); BlockInfo.resize(MF.getNumBlockIDs()); diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp index e4bdd40fb743..a96417ffce4d 100644 --- a/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- ThumbRegisterInfo.cpp - Thumb-1 Register Information -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -447,63 +446,6 @@ void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, (void)Done; } -/// saveScavengerRegister - Spill the register so it can be used by the -/// register scavenger. Return true. -bool ThumbRegisterInfo::saveScavengerRegister( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC, - unsigned Reg) const { - - const ARMSubtarget &STI = MBB.getParent()->getSubtarget(); - if (!STI.isThumb1Only()) - return ARMBaseRegisterInfo::saveScavengerRegister(MBB, I, UseMI, RC, Reg); - - // Thumb1 can't use the emergency spill slot on the stack because - // ldr/str immediate offsets must be positive, and if we're referencing - // off the frame pointer (if, for example, there are alloca() calls in - // the function, the offset will be negative. 
Use R12 instead since that's - // a call clobbered register that we know won't be used in Thumb1 mode. - const TargetInstrInfo &TII = *STI.getInstrInfo(); - DebugLoc DL; - BuildMI(MBB, I, DL, TII.get(ARM::tMOVr)) - .addReg(ARM::R12, RegState::Define) - .addReg(Reg, RegState::Kill) - .add(predOps(ARMCC::AL)); - - // The UseMI is where we would like to restore the register. If there's - // interference with R12 before then, however, we'll need to restore it - // before that instead and adjust the UseMI. - bool done = false; - for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) { - if (II->isDebugInstr()) - continue; - // If this instruction affects R12, adjust our restore point. - for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = II->getOperand(i); - if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) { - UseMI = II; - done = true; - break; - } - if (!MO.isReg() || MO.isUndef() || !MO.getReg() || - TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; - if (MO.getReg() == ARM::R12) { - UseMI = II; - done = true; - break; - } - } - } - // Restore the register from R12 - BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr)) - .addReg(Reg, RegState::Define) - .addReg(ARM::R12, RegState::Kill) - .add(predOps(ARMCC::AL)); - - return true; -} - void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -619,3 +561,14 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (MI.isPredicable()) MIB.add(predOps(ARMCC::AL)); } + +bool +ThumbRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { + if (MF.getSubtarget().isThumb1Only()) { + // For Thumb1, the emergency spill slot must be some small positive + // offset from the base/stack pointer. + return false; + } + // For Thumb2, put the emergency spill slot next to FP. + return true; +} diff --git a/lib/Target/ARM/ThumbRegisterInfo.h b/lib/Target/ARM/ThumbRegisterInfo.h index 75c3fe9ae8ad..08cf67284d4c 100644 --- a/lib/Target/ARM/ThumbRegisterInfo.h +++ b/lib/Target/ARM/ThumbRegisterInfo.h @@ -1,9 +1,8 @@ //===- ThumbRegisterInfo.h - Thumb Register Information Impl -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -52,14 +51,10 @@ public: const ARMBaseInstrInfo &TII) const; void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const override; - bool saveScavengerRegister(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator &UseMI, - const TargetRegisterClass *RC, - unsigned Reg) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; + bool useFPForScavengingIndex(const MachineFunction &MF) const override; }; } diff --git a/lib/Target/ARM/Utils/ARMBaseInfo.cpp b/lib/Target/ARM/Utils/ARMBaseInfo.cpp index 534f78c6d4d2..4ace61cccd0f 100644 --- a/lib/Target/ARM/Utils/ARMBaseInfo.cpp +++ b/lib/Target/ARM/Utils/ARMBaseInfo.cpp @@ -1,9 +1,8 @@ //===-- ARMBaseInfo.cpp - ARM Base encoding information------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/ARM/Utils/ARMBaseInfo.h b/lib/Target/ARM/Utils/ARMBaseInfo.h index f32d8223f53c..aa3aca359cb8 100644 --- a/lib/Target/ARM/Utils/ARMBaseInfo.h +++ b/lib/Target/ARM/Utils/ARMBaseInfo.h @@ -1,9 +1,8 @@ //===-- ARMBaseInfo.h - Top level definitions for ARM ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -67,6 +66,30 @@ inline static CondCodes getOppositeCondition(CondCodes CC) { } } // end namespace ARMCC +namespace ARMVCC { + enum VPTCodes { + None = 0, + Then, + Else + }; +} + +inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) { + switch (CC) { + case ARMVCC::None: return "none"; + case ARMVCC::Then: return "t"; + case ARMVCC::Else: return "e"; + } + llvm_unreachable("Unknown VPT code"); +} + +inline static unsigned ARMVectorCondCodeFromString(StringRef CC) { + return StringSwitch(CC.lower()) + .Case("t", ARMVCC::Then) + .Case("e", ARMVCC::Else) + .Default(~0U); +} + inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { switch (CC) { case ARMCC::EQ: return "eq"; diff --git a/lib/Target/AVR/AVR.h b/lib/Target/AVR/AVR.h index 48327fd377b2..f0746d73c95f 100644 --- a/lib/Target/AVR/AVR.h +++ b/lib/Target/AVR/AVR.h @@ -1,9 +1,8 @@ //===-- AVR.h - Top-level interface for AVR representation ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/AVR.td b/lib/Target/AVR/AVR.td index d03b983aa70b..53768f99df3b 100644 --- a/lib/Target/AVR/AVR.td +++ b/lib/Target/AVR/AVR.td @@ -1,9 +1,8 @@ //===-- AVR.td - Describe the AVR Target Machine ----------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===---------------------------------------------------------------------===// // This is the top level entry point for the AVR target. diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp index f9a6e77387b2..7586bd7b78fc 100644 --- a/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/lib/Target/AVR/AVRAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- AVRAsmPrinter.cpp - AVR LLVM assembly writer ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,7 +14,8 @@ #include "AVR.h" #include "AVRMCInstLower.h" #include "AVRSubtarget.h" -#include "InstPrinter/AVRInstPrinter.h" +#include "MCTargetDesc/AVRInstPrinter.h" +#include "TargetInfo/AVRTargetInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" @@ -43,16 +43,13 @@ public: StringRef getPassName() const override { return "AVR Assembly Printer"; } - void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0); + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; void EmitInstruction(const MachineInstr *MI) override; @@ -61,7 +58,7 @@ private: }; void AVRAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier) { + raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); switch (MO.getType()) { @@ -86,11 +83,10 @@ void AVRAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, } bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) { + const char *ExtraCode, raw_ostream &O) { // Default asm printer can only deal with some extra codes, // so try it first. 
- bool Error = AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O); + bool Error = AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O); if (Error && ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) @@ -138,8 +134,7 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, } bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNum, unsigned AsmVariant, - const char *ExtraCode, + unsigned OpNum, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0]) { llvm_unreachable("This branch is not implemented yet"); diff --git a/lib/Target/AVR/AVRCallingConv.td b/lib/Target/AVR/AVRCallingConv.td index 68dbce02706f..213e35fca66d 100644 --- a/lib/Target/AVR/AVRCallingConv.td +++ b/lib/Target/AVR/AVRCallingConv.td @@ -1,9 +1,8 @@ //===-- AVRCallingConv.td - Calling Conventions for AVR ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This describes the calling conventions for AVR architecture. diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp index 536a54759c77..c45b2d0e39c1 100644 --- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -1,9 +1,8 @@ //===-- AVRExpandPseudoInsts.cpp - Expand pseudo instructions -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -583,8 +582,8 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { unsigned TmpReg = 0; // 0 for no temporary register unsigned SrcReg = MI.getOperand(1).getReg(); bool SrcIsKill = MI.getOperand(1).isKill(); - OpLo = AVR::LDRdPtrPi; - OpHi = AVR::LDRdPtr; + OpLo = AVR::LDRdPtr; + OpHi = AVR::LDDRdPtrQ; TRI->splitReg(DstReg, DstLoReg, DstHiReg); // Use a temporary register if src and dst registers are the same. @@ -597,8 +596,7 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { // Load low byte. auto MIBLO = buildMI(MBB, MBBI, OpLo) .addReg(CurDstLoReg, RegState::Define) - .addReg(SrcReg, RegState::Define) - .addReg(SrcReg); + .addReg(SrcReg, RegState::Define); // Push low byte onto stack if necessary. if (TmpReg) @@ -607,7 +605,8 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { // Load high byte. auto MIBHI = buildMI(MBB, MBBI, OpHi) .addReg(CurDstHiReg, RegState::Define) - .addReg(SrcReg, getKillRegState(SrcIsKill)); + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(1); if (TmpReg) { // Move the high byte into the final destination. 
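One remark on the AVRExpandPseudoInsts change above: as I read the hunk, the 16-bit load-through-pointer expansion now uses a plain LDRdPtr for the low byte and LDDRdPtrQ with displacement 1 for the high byte, instead of the old post-increment pair, so the pointer register is no longer advanced as a side effect. In plain C++ the expansion computes, roughly (illustrative function, not from the patch):

#include <cstdint>

static uint16_t loadWordViaPtr(const uint8_t *P) {
  uint8_t Lo = P[0]; // LDRdPtr: low byte at P
  uint8_t Hi = P[1]; // LDDRdPtrQ, displacement 1: high byte at P + 1
  return static_cast<uint16_t>(Lo) | (static_cast<uint16_t>(Hi) << 8);
}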
diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp index 3b7322365772..5e91bb8632c1 100644 --- a/lib/Target/AVR/AVRFrameLowering.cpp +++ b/lib/Target/AVR/AVRFrameLowering.cpp @@ -1,9 +1,8 @@ //===-- AVRFrameLowering.cpp - AVR Frame Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -362,13 +361,12 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { const AVRSubtarget &STI = MF.getSubtarget(); - const TargetFrameLowering &TFI = *STI.getFrameLowering(); const AVRInstrInfo &TII = *STI.getInstrInfo(); // There is nothing to insert when the call frame memory is allocated during // function entry. Delete the call frame pseudo and replace all pseudo stores // with real store instructions. - if (TFI.hasReservedCallFrame(MF)) { + if (hasReservedCallFrame(MF)) { fixStackStores(MBB, MI, TII, false); return MBB.erase(MI); } @@ -382,7 +380,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr( // For adjcallstackdown we convert it into an 'adiw reg, ' handling // the read and write of SP in I/O space. if (Amount != 0) { - assert(TFI.getStackAlignment() == 1 && "Unsupported stack alignment"); + assert(getStackAlignment() == 1 && "Unsupported stack alignment"); if (Opcode == TII.getCallFrameSetupOpcode()) { fixStackStores(MBB, MI, TII, true); diff --git a/lib/Target/AVR/AVRFrameLowering.h b/lib/Target/AVR/AVRFrameLowering.h index a0ba6c951276..a7658438232a 100644 --- a/lib/Target/AVR/AVRFrameLowering.h +++ b/lib/Target/AVR/AVRFrameLowering.h @@ -1,9 +1,8 @@ //===-- AVRFrameLowering.h - Define frame lowering for AVR ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp index 85abf42eaa67..5cb4441c4380 100644 --- a/lib/Target/AVR/AVRISelDAGToDAG.cpp +++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- AVRISelDAGToDAG.cpp - A dag to dag inst selector for AVR ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index 57fc978b54bb..b6ba5f22fafb 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -1,9 +1,8 @@ //===-- AVRISelLowering.cpp - AVR DAG Lowering Implementation -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -26,19 +25,21 @@ #include "AVR.h" #include "AVRMachineFunctionInfo.h" +#include "AVRSubtarget.h" #include "AVRTargetMachine.h" #include "MCTargetDesc/AVRMCTargetDesc.h" namespace llvm { -AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm) - : TargetLowering(tm) { +AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM, + const AVRSubtarget &STI) + : TargetLowering(TM), Subtarget(STI) { // Set up the register classes. addRegisterClass(MVT::i8, &AVR::GPR8RegClass); addRegisterClass(MVT::i16, &AVR::DREGSRegClass); // Compute derived properties from the register classes. - computeRegisterProperties(tm.getSubtargetImpl()->getRegisterInfo()); + computeRegisterProperties(Subtarget.getRegisterInfo()); setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); @@ -88,9 +89,9 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm) setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand); setOperationAction(ISD::ROTL, MVT::i8, Custom); - setOperationAction(ISD::ROTL, MVT::i16, Custom); + setOperationAction(ISD::ROTL, MVT::i16, Expand); setOperationAction(ISD::ROTR, MVT::i8, Custom); - setOperationAction(ISD::ROTR, MVT::i16, Custom); + setOperationAction(ISD::ROTR, MVT::i16, Expand); setOperationAction(ISD::BR_CC, MVT::i8, Custom); setOperationAction(ISD::BR_CC, MVT::i16, Custom); @@ -163,6 +164,13 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm) setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); + // Expand multiplications to libcalls when there is + // no hardware MUL. + if (!Subtarget.supportsMultiplication()) { + setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand); + } + for (MVT VT : MVT::integer_valuetypes()) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); @@ -229,7 +237,7 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm) setLibcallName(RTLIB::COS_F32, "cos"); setMinFunctionAlignment(1); - setMinimumJumpTableEntries(INT_MAX); + setMinimumJumpTableEntries(UINT_MAX); } const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -935,7 +943,7 @@ static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI, AVR::R19R18, AVR::R17R16, AVR::R15R14, AVR::R13R12, AVR::R11R10, AVR::R9R8}; if (IsVarArg) { - // Variadic functions do not need all the analisys below. + // Variadic functions do not need all the analysis below. 
if (IsCall) { CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_Vararg); } else { @@ -1270,8 +1278,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } // Add a register mask operand representing the call-preserved registers. - const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine(); - const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv); assert(Mask && "Missing call preserved mask for calling convention"); @@ -1433,8 +1440,7 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI, bool HasRepeatedOperand = false; MachineFunction *F = BB->getParent(); MachineRegisterInfo &RI = F->getRegInfo(); - const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine(); - const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); switch (MI.getOpcode()) { @@ -1574,8 +1580,7 @@ static bool isCopyMulResult(MachineBasicBlock::iterator const &I) { // it, but it works for now. MachineBasicBlock *AVRTargetLowering::insertMul(MachineInstr &MI, MachineBasicBlock *BB) const { - const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine(); - const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineBasicBlock::iterator I(MI); ++I; // in any case insert *after* the mul instruction if (isCopyMulResult(I)) @@ -1629,6 +1634,15 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineFunction *MF = MBB->getParent(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineBasicBlock *FallThrough = MBB->getFallThrough(); + + // If the current basic block falls through to another basic block, + // we must insert an unconditional branch to the fallthrough destination + // if we are to insert basic blocks at the prior fallthrough point. + if (FallThrough != nullptr) { + BuildMI(MBB, dl, TII.get(AVR::RJMPk)).addMBB(FallThrough); + } + MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -1838,9 +1852,6 @@ std::pair AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - auto STI = static_cast(this->getTargetMachine()) - .getSubtargetImpl(); - // We only support i8 and i16. // //:FIXME: remove this assert for now since it gets sometimes executed @@ -1884,8 +1895,8 @@ AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } } - return TargetLowering::getRegForInlineAsmConstraint(STI->getRegisterInfo(), - Constraint, VT); + return TargetLowering::getRegForInlineAsmConstraint( + Subtarget.getRegisterInfo(), Constraint, VT); } void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op, diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h index c90c65c81f70..ed2d0835903c 100644 --- a/lib/Target/AVR/AVRISelLowering.h +++ b/lib/Target/AVR/AVRISelLowering.h @@ -1,9 +1,8 @@ //===-- AVRISelLowering.h - AVR DAG Lowering Interface ----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h
index c90c65c81f70..ed2d0835903c 100644
--- a/lib/Target/AVR/AVRISelLowering.h
+++ b/lib/Target/AVR/AVRISelLowering.h
@@ -1,9 +1,8 @@
 //===-- AVRISelLowering.h - AVR DAG Lowering Interface ----------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -64,12 +63,14 @@ enum NodeType {
 
 } // end of namespace AVRISD
 
+class AVRSubtarget;
 class AVRTargetMachine;
 
 /// Performs target lowering for the AVR.
 class AVRTargetLowering : public TargetLowering {
 public:
-  explicit AVRTargetLowering(AVRTargetMachine &TM);
+  explicit AVRTargetLowering(const AVRTargetMachine &TM,
+                             const AVRSubtarget &STI);
 
 public:
   MVT getScalarShiftAmountTy(const DataLayout &, EVT LHSTy) const override {
@@ -127,6 +128,11 @@ public:
   unsigned getRegisterByName(const char* RegName, EVT VT,
                              SelectionDAG &DAG) const override;
 
+  bool shouldSplitFunctionArgumentsAsLittleEndian(const DataLayout &DL)
+      const override {
+    return false;
+  }
+
 private:
   SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
                     SelectionDAG &DAG, SDLoc dl) const;
@@ -164,6 +170,10 @@ private:
                        const SDLoc &dl, SelectionDAG &DAG,
                        SmallVectorImpl<SDValue> &InVals) const;
 
+protected:
+
+  const AVRSubtarget &Subtarget;
+
 private:
   MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
   MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
diff --git a/lib/Target/AVR/AVRInstrFormats.td b/lib/Target/AVR/AVRInstrFormats.td
index ce5e606f9787..347e683cd47f 100644
--- a/lib/Target/AVR/AVRInstrFormats.td
+++ b/lib/Target/AVR/AVRInstrFormats.td
@@ -1,9 +1,8 @@
 //===-- AVRInstrInfo.td - AVR Instruction Formats ----------*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AVR/AVRInstrInfo.cpp b/lib/Target/AVR/AVRInstrInfo.cpp
index 0c32334167f0..ba7a95e92c5c 100644
--- a/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/lib/Target/AVR/AVRInstrInfo.cpp
@@ -1,9 +1,8 @@
 //===-- AVRInstrInfo.cpp - AVR Instruction Information --------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -488,7 +487,8 @@ unsigned AVRInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
   case TargetOpcode::KILL:
   case TargetOpcode::DBG_VALUE:
     return 0;
-  case TargetOpcode::INLINEASM: {
+  case TargetOpcode::INLINEASM:
+  case TargetOpcode::INLINEASM_BR: {
     const MachineFunction &MF = *MI.getParent()->getParent();
     const AVRTargetMachine &TM =
         static_cast<const AVRTargetMachine &>(MF.getTarget());
     const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
diff --git a/lib/Target/AVR/AVRInstrInfo.h b/lib/Target/AVR/AVRInstrInfo.h
index 354edcec3466..ba74af325474 100644
--- a/lib/Target/AVR/AVRInstrInfo.h
+++ b/lib/Target/AVR/AVRInstrInfo.h
@@ -1,9 +1,8 @@
 //===-- AVRInstrInfo.h - AVR Instruction Information ------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index 5720af7d8df6..caca9b617609 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -1,9 +1,8 @@
 //===-- AVRInstrInfo.td - AVR Instruction defs -------------*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -90,6 +89,22 @@ def imm0_63_neg : PatLeaf<(imm),
 
 def uimm6 : PatLeaf<(imm), [{ return isUInt<6>(N->getZExtValue()); }]>;
 
+// imm_com8_XFORM - Return the complement of an imm_com8 value
+def imm_com8_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint8_t)N->getZExtValue()), SDLoc(N),
+                                   MVT::i8);
+}]>;
+
+// imm_com8 - Match an immediate that is a complement
+// of an 8-bit immediate.
+// Note: this pattern doesn't require an encoder method and such, as it's
+// only used on aliases (Pat<> and InstAlias<>). The actual encoding
+// is handled by the destination instructions, which use imm_com8.
+def imm_com8_asmoperand : AsmOperandClass { let Name = "ImmCom8"; }
+def imm_com8 : Operand<i8> {
+  let ParserMatchClass = imm_com8_asmoperand;
+}
+
 def ioaddr_XFORM : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()) - 0x20, SDLoc(N),
                                    MVT::i8);
@@ -157,13 +172,6 @@ def memspi : Operand<iPTR>
   let MIOperandInfo = (ops GPRSP, i16imm);
 }
 
-def imm_com8 : Operand<i8>
-{
-  let EncoderMethod = "encodeComplement";
-
-  let MIOperandInfo = (ops i8imm);
-}
-
 def relbrtarget_7 : Operand<OtherVT>
 {
   let PrintMethod = "printPCRelImm";
@@ -1151,11 +1159,11 @@ isReMaterializable = 1 in
   // LDW Rd+1:Rd, P
   //
   // Expands to:
-  // ld  Rd,   P+
-  // ld  Rd+1, P
+  // ld  Rd,   P
+  // ldd Rd+1, P+1
   let Constraints = "@earlyclobber $reg" in
   def LDWRdPtr : Pseudo<(outs DREGS:$reg),
-                        (ins PTRREGS:$ptrreg),
+                        (ins PTRDISPREGS:$ptrreg),
                         "ldw\t$reg, $ptrreg",
                         [(set i16:$reg, (load i16:$ptrreg))]>,
                  Requires<[HasSRAM]>;
@@ -1222,7 +1230,7 @@ isReMaterializable = 1 in
   // ldd Rd,   P+q
   // ldd Rd+1, P+q+1
   let Constraints = "@earlyclobber $dst" in
-  def LDDWRdPtrQ : Pseudo<(outs DREGS_WITHOUT_Z_WORKAROUND:$dst),
+  def LDDWRdPtrQ : Pseudo<(outs DREGS_WITHOUT_YZ_WORKAROUND:$dst),
                           (ins memri:$memri),
                           "lddw\t$dst, $memri",
                           [(set i16:$dst, (load addr:$memri))]>,
@@ -1729,20 +1737,7 @@ def BLD : FRdB<0b00,
                "bld\t$rd, $b",
                []>;
 
-// Set/clear bit in register operations.
-let Constraints = "$src = $rd",
-Defs = [SREG] in
-{
-  // CBR Rd, K
-  // Alias for `ANDI Rd, COM(K)` where COM(K) is the complement of K.
-  // FIXME: This uses the 'complement' encoder. We need it to also use the
-  // imm_ldi8 encoder. This will cause no fixups to be created on this instruction.
-  def CBRRdK : FRdK<0b0111,
-               (outs LD8:$rd),
-               (ins LD8:$src, imm_com8:$k),
-               "cbr\t$rd, $k",
-               []>;
-}
+def CBR : InstAlias<"cbr\t$rd, $k", (ANDIRdK LD8:$rd, imm_com8:$k), 0>;
 
 // CLR Rd
 // Alias for EOR Rd, Rd
diff --git a/lib/Target/AVR/AVRMCInstLower.cpp b/lib/Target/AVR/AVRMCInstLower.cpp
index dfefd09bc4b8..49a318762b63 100644
--- a/lib/Target/AVR/AVRMCInstLower.cpp
+++ b/lib/Target/AVR/AVRMCInstLower.cpp
@@ -1,9 +1,8 @@
 //===-- AVRMCInstLower.cpp - Convert AVR MachineInstr to an MCInst --------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AVR/AVRMCInstLower.h b/lib/Target/AVR/AVRMCInstLower.h
index 2e2d1014485e..5e0f42ac16a7 100644
--- a/lib/Target/AVR/AVRMCInstLower.h
+++ b/lib/Target/AVR/AVRMCInstLower.h
@@ -1,9 +1,8 @@
 //===-- AVRMCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
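The rewrite above turns CBR from a real (and miscoded) instruction definition into a pure assembler alias: cbr Rd, K clears the bits of K, which is exactly andi Rd, ~K, so imm_com8 plus the InstAlias let the matcher complement the immediate and emit ANDIRdK. A small stand-alone C++ check of the arithmetic that imm_com8_XFORM and the parser perform (illustrative only):

#include <cassert>
#include <cstdint>

// Mirrors imm_com8_XFORM: ~((uint8_t)K), truncated back to 8 bits.
static uint8_t complementImm(uint8_t K) {
  return static_cast<uint8_t>(~K);
}

int main() {
  // "cbr r16, 0x0F" must assemble exactly like "andi r16, 0xF0".
  assert(complementImm(0x0F) == 0xF0);
  // Clearing all bits is ANDI with zero.
  assert(complementImm(0xFF) == 0x00);
  return 0;
}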
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AVR/AVRMachineFunctionInfo.h b/lib/Target/AVR/AVRMachineFunctionInfo.h index cf0c73576301..5226e30491c3 100644 --- a/lib/Target/AVR/AVRMachineFunctionInfo.h +++ b/lib/Target/AVR/AVRMachineFunctionInfo.h @@ -1,9 +1,8 @@ //===-- AVRMachineFuctionInfo.h - AVR machine function info -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 808a85e459c1..a6b36f80485d 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- AVRRegisterInfo.cpp - AVR Register Information --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/CodeGen/TargetFrameLowering.h" @@ -233,9 +233,9 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // No need to set SREG as dead here otherwise if the next instruction is a // cond branch it will be using a dead register. 
- New = BuildMI(MBB, std::next(II), dl, TII.get(SubOpc), AVR::R29R28) - .addReg(AVR::R29R28, RegState::Kill) - .addImm(Offset - 63 + 1); + BuildMI(MBB, std::next(II), dl, TII.get(SubOpc), AVR::R29R28) + .addReg(AVR::R29R28, RegState::Kill) + .addImm(Offset - 63 + 1); Offset = 62; } @@ -245,7 +245,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); } -unsigned AVRRegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register AVRRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); if (TFI->hasFP(MF)) { // The Y pointer register @@ -273,4 +273,18 @@ void AVRRegisterInfo::splitReg(unsigned Reg, HiReg = getSubReg(Reg, AVR::sub_hi); } +bool AVRRegisterInfo::shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC, + LiveIntervals &LIS) const { + if(this->getRegClass(AVR::PTRDISPREGSRegClassID)->hasSubClassEq(NewRC)) { + return false; + } + + return TargetRegisterInfo::shouldCoalesce(MI, SrcRC, SubReg, DstRC, DstSubReg, NewRC, LIS); +} + } // end of namespace llvm diff --git a/lib/Target/AVR/AVRRegisterInfo.h b/lib/Target/AVR/AVRRegisterInfo.h index 104b336b9c48..8e6e63af3d57 100644 --- a/lib/Target/AVR/AVRRegisterInfo.h +++ b/lib/Target/AVR/AVRRegisterInfo.h @@ -1,9 +1,8 @@ //===-- AVRRegisterInfo.h - AVR Register Information Impl -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -42,7 +41,7 @@ public: unsigned FIOperandNum, RegScavenger *RS = NULL) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, @@ -56,6 +55,13 @@ public: return true; } + bool shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC, + LiveIntervals &LIS) const override; }; } // end namespace llvm diff --git a/lib/Target/AVR/AVRRegisterInfo.td b/lib/Target/AVR/AVRRegisterInfo.td index d55252bcac46..ea38fedd22ce 100644 --- a/lib/Target/AVR/AVRRegisterInfo.td +++ b/lib/Target/AVR/AVRRegisterInfo.td @@ -1,9 +1,8 @@ //===-- AVRRegisterInfo.td - AVR Register defs -------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -166,14 +165,14 @@ def DREGS : RegisterClass<"AVR", [i16], 8, // cannot use Z; it's simply a workaround a regalloc bug. // // More information can be found in PR39553. 
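The eliminateFrameIndex hunk above deals with the 6-bit displacement limit of ldd/std: offsets 0..63 encode directly, while anything larger first advances the Y pointer (the subtract pair built with SubOpc) and leaves a maximal in-range residual, which is why the adjustment is Offset - 63 + 1 with the residual pinned to 62. A small numeric model of that clamping in plain C++ (illustrative only, not backend code):

#include <cassert>

// Model of the AVR frame-offset clamp: returns the Y adjustment and
// rewrites Offset to the residual ldd/std displacement.
static int clampFrameOffset(int &Offset) {
  int Adjust = 0;
  if (Offset > 63) {
    Adjust = Offset - 63 + 1; // emitted as a subtract pair on Y
    Offset = 62;              // residual displacement, still in 0..63
  }
  return Adjust;
}

int main() {
  int Off = 100;
  int Adj = clampFrameOffset(Off);
  assert(Adj == 38 && Off == 62 && Adj + Off == 100); // sum is preserved
  Off = 40;
  assert(clampFrameOffset(Off) == 0 && Off == 40);    // already in range
  return 0;
}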
-def DREGS_WITHOUT_Z_WORKAROUND : RegisterClass<"AVR", [i16], 8, +def DREGS_WITHOUT_YZ_WORKAROUND : RegisterClass<"AVR", [i16], 8, ( // Return value and arguments. add R25R24, R19R18, R21R20, R23R22, // Scratch registers. R27R26, // Callee saved registers. - R29R28, R17R16, R15R14, R13R12, R11R10, + R17R16, R15R14, R13R12, R11R10, R9R8, R7R6, R5R4, R3R2, R1R0 )>; diff --git a/lib/Target/AVR/AVRRelaxMemOperations.cpp b/lib/Target/AVR/AVRRelaxMemOperations.cpp index fdb09897eda8..6be901743e82 100644 --- a/lib/Target/AVR/AVRRelaxMemOperations.cpp +++ b/lib/Target/AVR/AVRRelaxMemOperations.cpp @@ -1,9 +1,8 @@ //===-- AVRRelaxMemOperations.cpp - Relax out of range loads/stores -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/AVRSelectionDAGInfo.h b/lib/Target/AVR/AVRSelectionDAGInfo.h index 6474c8779330..3e7bd57f10cf 100644 --- a/lib/Target/AVR/AVRSelectionDAGInfo.h +++ b/lib/Target/AVR/AVRSelectionDAGInfo.h @@ -1,9 +1,8 @@ //===-- AVRSelectionDAGInfo.h - AVR SelectionDAG Info -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/AVRSubtarget.cpp b/lib/Target/AVR/AVRSubtarget.cpp index 556d69ec5234..6a41036fdd6c 100644 --- a/lib/Target/AVR/AVRSubtarget.cpp +++ b/lib/Target/AVR/AVRSubtarget.cpp @@ -1,9 +1,8 @@ //===-- AVRSubtarget.cpp - AVR Subtarget Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -29,9 +28,9 @@ namespace llvm { AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU, - const std::string &FS, AVRTargetMachine &TM) + const std::string &FS, const AVRTargetMachine &TM) : AVRGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(), - TLInfo(TM), TSInfo(), + TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)), TSInfo(), // Subtarget features m_hasSRAM(false), m_hasJMPCALL(false), m_hasIJMPCALL(false), @@ -44,4 +43,12 @@ AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU, ParseSubtargetFeatures(CPU, FS); } +AVRSubtarget & +AVRSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS, + const TargetMachine &TM) { + // Parse features string. 
+ ParseSubtargetFeatures(CPU, FS); + return *this; +} + } // end of namespace llvm diff --git a/lib/Target/AVR/AVRSubtarget.h b/lib/Target/AVR/AVRSubtarget.h index fa26738da190..da9289af7c8d 100644 --- a/lib/Target/AVR/AVRSubtarget.h +++ b/lib/Target/AVR/AVRSubtarget.h @@ -1,9 +1,8 @@ //===-- AVRSubtarget.h - Define Subtarget for the AVR -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -37,7 +36,7 @@ public: //! \param FS The feature string. //! \param TM The target machine. AVRSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, - AVRTargetMachine &TM); + const AVRTargetMachine &TM); const AVRInstrInfo *getInstrInfo() const override { return &InstrInfo; } const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; } @@ -49,6 +48,9 @@ public: /// \note Definition of function is auto generated by `tblgen`. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + AVRSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS, + const TargetMachine &TM); + // Subtarget feature getters. // See AVR.td for details. bool hasSRAM() const { return m_hasSRAM; } diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp index 9828cdab68c3..a36c8b0f9649 100644 --- a/lib/Target/AVR/AVRTargetMachine.cpp +++ b/lib/Target/AVR/AVRTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- AVRTargetMachine.cpp - Define TargetMachine for AVR ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -22,6 +21,7 @@ #include "AVR.h" #include "AVRTargetObjectFile.h" #include "MCTargetDesc/AVRMCTargetDesc.h" +#include "TargetInfo/AVRTargetInfo.h" namespace llvm { diff --git a/lib/Target/AVR/AVRTargetMachine.h b/lib/Target/AVR/AVRTargetMachine.h index ffcf4350d45a..f9015c8741ea 100644 --- a/lib/Target/AVR/AVRTargetMachine.h +++ b/lib/Target/AVR/AVRTargetMachine.h @@ -1,9 +1,8 @@ //===-- AVRTargetMachine.h - Define TargetMachine for AVR -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
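initializeSubtargetDependencies, added above, is the standard LLVM idiom for a chicken-and-egg problem in subtarget construction: TLInfo is a member whose constructor reads feature flags, so the features must be parsed before the member initializer list reaches TLInfo. Calling a helper that returns *this inside that list sequences the parse first. A compressed sketch of the idiom with hypothetical names (member order matters, since members initialize in declaration order):

class MySubtarget : public MyGenSubtargetInfo {
  MySubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
    ParseSubtargetFeatures(CPU, FS); // feature flags are valid from here on
    return *this;
  }

  // Declared after the helper so its constructor may consult the parsed
  // feature flags.
  MyTargetLowering TLInfo;

public:
  MySubtarget(const Triple &TT, StringRef CPU, StringRef FS,
              const TargetMachine &TM)
      : MyGenSubtargetInfo(TT, CPU, FS),
        TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {}
};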
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AVR/AVRTargetObjectFile.cpp b/lib/Target/AVR/AVRTargetObjectFile.cpp
index 0cebb0f043f9..980096a09835 100644
--- a/lib/Target/AVR/AVRTargetObjectFile.cpp
+++ b/lib/Target/AVR/AVRTargetObjectFile.cpp
@@ -1,9 +1,8 @@
 //===-- AVRTargetObjectFile.cpp - AVR Object Files ------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/AVRTargetObjectFile.h b/lib/Target/AVR/AVRTargetObjectFile.h
index ba91036fd64c..53d8510d9a21 100644
--- a/lib/Target/AVR/AVRTargetObjectFile.h
+++ b/lib/Target/AVR/AVRTargetObjectFile.h
@@ -1,9 +1,8 @@
 //===-- AVRTargetObjectFile.h - AVR Object Info -----------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index f2bb59265271..aac5644711e2 100644
--- a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -1,9 +1,8 @@
 //===---- AVRAsmParser.cpp - Parse AVR assembly to MCInst instructions ----===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -12,6 +11,7 @@
 #include "MCTargetDesc/AVRMCELFStreamer.h"
 #include "MCTargetDesc/AVRMCExpr.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
+#include "TargetInfo/AVRTargetInfo.h"
 
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -160,6 +160,22 @@ public:
     addExpr(Inst, getImm());
   }
 
+  void addImmCom8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually an imm8, but we have its bitwise
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(~(uint8_t)CE->getValue()));
+  }
+
+  bool isImmCom8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return isUInt<8>(Value);
+  }
+
   bool isReg() const { return Kind == k_Register; }
   bool isImm() const { return Kind == k_Immediate; }
   bool isToken() const { return Kind == k_Token; }
diff --git a/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index e69accfa9393..e203a5069c85 100644
--- a/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -1,9 +1,8 @@
 //===- AVRDisassembler.cpp - Disassembler for AVR ---------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -15,6 +14,7 @@
 #include "AVRRegisterInfo.h"
 #include "AVRSubtarget.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
+#include "TargetInfo/AVRTargetInfo.h"
 
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
diff --git a/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp b/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
deleted file mode 100644
index 0f34b8e18ff9..000000000000
--- a/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-//===-- AVRInstPrinter.cpp - Convert AVR MCInst to assembly syntax --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an AVR MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AVRInstPrinter.h"
-
-#include "MCTargetDesc/AVRMCTargetDesc.h"
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-
-#include <cstring>
-
-#define DEBUG_TYPE "asm-printer"
-
-namespace llvm {
-
-// Include the auto-generated portion of the assembly writer.
-#define PRINT_ALIAS_INSTR
-#include "AVRGenAsmWriter.inc"
-
-void AVRInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                               StringRef Annot, const MCSubtargetInfo &STI) {
-  unsigned Opcode = MI->getOpcode();
-
-  // First handle load and store instructions with postinc or predec
-  // of the form "ld reg, X+".
-  // TODO: We should be able to rewrite this using TableGen data.
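The new operand support above follows the two-method contract that the TableGen'd assembly matcher expects: an is<Class>() predicate deciding whether a parsed operand fits the operand class, and an add<Class>Operands() method materializing the MCOperands, here with the complement already applied so the MCInst carries the real ANDI immediate. One slightly more defensive variant of the pair, sketched for clarity (the diff's version dereferences the dyn_cast result directly, relying on the predicate having run first):

bool isImmCom8() const {
  if (!isImm())
    return false;
  // Only plain constants can be complemented at parse time; symbolic
  // or relocatable expressions are rejected.
  const auto *CE = dyn_cast<MCConstantExpr>(getImm());
  return CE && isUInt<8>(CE->getValue());
}

void addImmCom8Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  // Safe: the matcher only calls this after isImmCom8() returned true.
  const auto *CE = cast<MCConstantExpr>(getImm());
  Inst.addOperand(MCOperand::createImm(~(uint8_t)CE->getValue()));
}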
- switch (Opcode) { - case AVR::LDRdPtr: - case AVR::LDRdPtrPi: - case AVR::LDRdPtrPd: - O << "\tld\t"; - printOperand(MI, 0, O); - O << ", "; - - if (Opcode == AVR::LDRdPtrPd) - O << '-'; - - printOperand(MI, 1, O); - - if (Opcode == AVR::LDRdPtrPi) - O << '+'; - break; - case AVR::STPtrRr: - O << "\tst\t"; - printOperand(MI, 0, O); - O << ", "; - printOperand(MI, 1, O); - break; - case AVR::STPtrPiRr: - case AVR::STPtrPdRr: - O << "\tst\t"; - - if (Opcode == AVR::STPtrPdRr) - O << '-'; - - printOperand(MI, 1, O); - - if (Opcode == AVR::STPtrPiRr) - O << '+'; - - O << ", "; - printOperand(MI, 2, O); - break; - default: - if (!printAliasInstr(MI, O)) - printInstruction(MI, O); - - printAnnotation(O, Annot); - break; - } -} - -const char *AVRInstPrinter::getPrettyRegisterName(unsigned RegNum, - MCRegisterInfo const &MRI) { - // GCC prints register pairs by just printing the lower register - // If the register contains a subregister, print it instead - if (MRI.getNumSubRegIndices() > 0) { - unsigned RegLoNum = MRI.getSubReg(RegNum, AVR::sub_lo); - RegNum = (RegLoNum != AVR::NoRegister) ? RegLoNum : RegNum; - } - - return getRegisterName(RegNum); -} - -void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - const MCOperandInfo &MOI = this->MII.get(MI->getOpcode()).OpInfo[OpNo]; - - if (Op.isReg()) { - bool isPtrReg = (MOI.RegClass == AVR::PTRREGSRegClassID) || - (MOI.RegClass == AVR::PTRDISPREGSRegClassID) || - (MOI.RegClass == AVR::ZREGRegClassID); - - if (isPtrReg) { - O << getRegisterName(Op.getReg(), AVR::ptr); - } else { - O << getPrettyRegisterName(Op.getReg(), MRI); - } - } else if (Op.isImm()) { - O << Op.getImm(); - } else { - assert(Op.isExpr() && "Unknown operand kind in printOperand"); - O << *Op.getExpr(); - } -} - -/// This is used to print an immediate value that ends up -/// being encoded as a pc-relative value. -void AVRInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - - if (Op.isImm()) { - int64_t Imm = Op.getImm(); - O << '.'; - - // Print a position sign if needed. - // Negative values have their sign printed automatically. - if (Imm >= 0) - O << '+'; - - O << Imm; - } else { - assert(Op.isExpr() && "Unknown pcrel immediate operand"); - O << *Op.getExpr(); - } -} - -void AVRInstPrinter::printMemri(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - assert(MI->getOperand(OpNo).isReg() && "Expected a register for the first operand"); - - const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); - - // Print the register. - printOperand(MI, OpNo, O); - - // Print the {+,-}offset. - if (OffsetOp.isImm()) { - int64_t Offset = OffsetOp.getImm(); - - if (Offset >= 0) - O << '+'; - - O << Offset; - } else if (OffsetOp.isExpr()) { - O << *OffsetOp.getExpr(); - } else { - llvm_unreachable("unknown type for offset"); - } -} - -} // end of namespace llvm - diff --git a/lib/Target/AVR/InstPrinter/AVRInstPrinter.h b/lib/Target/AVR/InstPrinter/AVRInstPrinter.h deleted file mode 100644 index c9f65b922745..000000000000 --- a/lib/Target/AVR/InstPrinter/AVRInstPrinter.h +++ /dev/null @@ -1,54 +0,0 @@ -//===- AVRInstPrinter.h - Convert AVR MCInst to assembly syntax -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
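The deleted printer above survives verbatim at its new MCTargetDesc location further down; its hand-written switch exists because the +/- of post-increment and pre-decrement addressing is spelled as part of the pointer operand, which the TableGen'd per-operand printer cannot express. The intended spellings, collected as a small runnable reference (register choices are arbitrary examples):

#include <cstdio>

int main() {
  // printed form   ; opcode       (addressing mode)
  puts("ld r24, X    ; LDRdPtr    (plain pointer load)");
  puts("ld r24, X+   ; LDRdPtrPi  (post-increment)");
  puts("ld r24, -X   ; LDRdPtrPd  (pre-decrement)");
  puts("st X+, r24   ; STPtrPiRr  (post-increment store)");
  puts("st -X, r24   ; STPtrPdRr  (pre-decrement store)");
  return 0;
}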
-// -//===----------------------------------------------------------------------===// -// -// This class prints an AVR MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_AVR_INST_PRINTER_H -#define LLVM_AVR_INST_PRINTER_H - -#include "llvm/MC/MCInstPrinter.h" - -#include "MCTargetDesc/AVRMCTargetDesc.h" - -namespace llvm { - -/// Prints AVR instructions to a textual stream. -class AVRInstPrinter : public MCInstPrinter { -public: - AVRInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - static const char *getPrettyRegisterName(unsigned RegNo, - MCRegisterInfo const &MRI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - -private: - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = AVR::NoRegAltName); - - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemri(const MCInst *MI, unsigned OpNo, raw_ostream &O); - - // Autogenerated by TableGen. - void printInstruction(const MCInst *MI, raw_ostream &O); - bool printAliasInstr(const MCInst *MI, raw_ostream &O); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); -}; - -} // end namespace llvm - -#endif // LLVM_AVR_INST_PRINTER_H - diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index f81a57dd71e3..e92b16c8ee9d 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -1,9 +1,8 @@ //===-- AVRAsmBackend.cpp - AVR Asm Backend ------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h index d48077c3ab8e..1e713db38145 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h +++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h @@ -1,9 +1,8 @@ //===-- AVRAsmBackend.h - AVR Asm Backend --------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp index 4a921a1601a9..6025e4b2437c 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- AVRELFObjectWriter.cpp - AVR ELF Writer ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h index e5df6cc34e40..461f1660c952 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h +++ b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h @@ -1,9 +1,8 @@ //===----- AVRELFStreamer.h - AVR Target Streamer --------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h b/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h index cdb0b215bc60..b3504b89e4d3 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h +++ b/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h @@ -1,9 +1,8 @@ //===-- AVRFixupKinds.h - AVR Specific Fixup Entries ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp b/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp new file mode 100644 index 000000000000..88ce9a25680e --- /dev/null +++ b/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp @@ -0,0 +1,170 @@ +//===-- AVRInstPrinter.cpp - Convert AVR MCInst to assembly syntax --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an AVR MCInst to a .s file. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRInstPrinter.h"
+
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include <cstring>
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace llvm {
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "AVRGenAsmWriter.inc"
+
+void AVRInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot, const MCSubtargetInfo &STI) {
+  unsigned Opcode = MI->getOpcode();
+
+  // First handle load and store instructions with postinc or predec
+  // of the form "ld reg, X+".
+  // TODO: We should be able to rewrite this using TableGen data.
+  switch (Opcode) {
+  case AVR::LDRdPtr:
+  case AVR::LDRdPtrPi:
+  case AVR::LDRdPtrPd:
+    O << "\tld\t";
+    printOperand(MI, 0, O);
+    O << ", ";
+
+    if (Opcode == AVR::LDRdPtrPd)
+      O << '-';
+
+    printOperand(MI, 1, O);
+
+    if (Opcode == AVR::LDRdPtrPi)
+      O << '+';
+    break;
+  case AVR::STPtrRr:
+    O << "\tst\t";
+    printOperand(MI, 0, O);
+    O << ", ";
+    printOperand(MI, 1, O);
+    break;
+  case AVR::STPtrPiRr:
+  case AVR::STPtrPdRr:
+    O << "\tst\t";
+
+    if (Opcode == AVR::STPtrPdRr)
+      O << '-';
+
+    printOperand(MI, 1, O);
+
+    if (Opcode == AVR::STPtrPiRr)
+      O << '+';
+
+    O << ", ";
+    printOperand(MI, 2, O);
+    break;
+  default:
+    if (!printAliasInstr(MI, O))
+      printInstruction(MI, O);
+
+    printAnnotation(O, Annot);
+    break;
+  }
+}
+
+const char *AVRInstPrinter::getPrettyRegisterName(unsigned RegNum,
+                                                  MCRegisterInfo const &MRI) {
+  // GCC prints register pairs by just printing the lower register
+  // If the register contains a subregister, print it instead
+  if (MRI.getNumSubRegIndices() > 0) {
+    unsigned RegLoNum = MRI.getSubReg(RegNum, AVR::sub_lo);
+    RegNum = (RegLoNum != AVR::NoRegister) ? RegLoNum : RegNum;
+  }
+
+  return getRegisterName(RegNum);
+}
+
+void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const MCOperandInfo &MOI = this->MII.get(MI->getOpcode()).OpInfo[OpNo];
+
+  if (Op.isReg()) {
+    bool isPtrReg = (MOI.RegClass == AVR::PTRREGSRegClassID) ||
+                    (MOI.RegClass == AVR::PTRDISPREGSRegClassID) ||
+                    (MOI.RegClass == AVR::ZREGRegClassID);
+
+    if (isPtrReg) {
+      O << getRegisterName(Op.getReg(), AVR::ptr);
+    } else {
+      O << getPrettyRegisterName(Op.getReg(), MRI);
+    }
+  } else if (Op.isImm()) {
+    O << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "Unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+/// This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.
+void AVRInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+
+  if (Op.isImm()) {
+    int64_t Imm = Op.getImm();
+    O << '.';
+
+    // Print a position sign if needed.
+    // Negative values have their sign printed automatically.
+ if (Imm >= 0) + O << '+'; + + O << Imm; + } else { + assert(Op.isExpr() && "Unknown pcrel immediate operand"); + O << *Op.getExpr(); + } +} + +void AVRInstPrinter::printMemri(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + assert(MI->getOperand(OpNo).isReg() && "Expected a register for the first operand"); + + const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); + + // Print the register. + printOperand(MI, OpNo, O); + + // Print the {+,-}offset. + if (OffsetOp.isImm()) { + int64_t Offset = OffsetOp.getImm(); + + if (Offset >= 0) + O << '+'; + + O << Offset; + } else if (OffsetOp.isExpr()) { + O << *OffsetOp.getExpr(); + } else { + llvm_unreachable("unknown type for offset"); + } +} + +} // end of namespace llvm + diff --git a/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h b/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h new file mode 100644 index 000000000000..5b758a7503c9 --- /dev/null +++ b/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h @@ -0,0 +1,53 @@ +//===- AVRInstPrinter.h - Convert AVR MCInst to assembly syntax -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an AVR MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_INST_PRINTER_H +#define LLVM_AVR_INST_PRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +#include "MCTargetDesc/AVRMCTargetDesc.h" + +namespace llvm { + +/// Prints AVR instructions to a textual stream. +class AVRInstPrinter : public MCInstPrinter { +public: + AVRInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + static const char *getPrettyRegisterName(unsigned RegNo, + MCRegisterInfo const &MRI); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + +private: + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AVR::NoRegAltName); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemri(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + // Autogenerated by TableGen. + void printInstruction(const MCInst *MI, raw_ostream &O); + bool printAliasInstr(const MCInst *MI, raw_ostream &O); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); +}; + +} // end namespace llvm + +#endif // LLVM_AVR_INST_PRINTER_H + diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp index 535bb012eb07..99b2172c562f 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- AVRMCAsmInfo.cpp - AVR asm properties -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -24,6 +23,7 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT) { PrivateGlobalPrefix = ".L"; UsesELFSectionDirectiveForBSS = true; UseIntegratedAssembler = true; + SupportsDebugInformation = true; } } // end of namespace llvm diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h index cc2207a3cfae..b2fa18777bc0 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h +++ b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h @@ -1,9 +1,8 @@ //===-- AVRMCAsmInfo.h - AVR asm properties ---------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp index 4dbbce8c205e..bc0488778685 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- AVRMCCodeEmitter.cpp - Convert AVR Code to Machine Code -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h index 883abf8db78a..2e24d885c155 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h +++ b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h @@ -1,9 +1,8 @@ //===-- AVRMCCodeEmitter.h - Convert AVR Code to Machine Code -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp index 861acd47347f..d9169f90a765 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp @@ -1,9 +1,8 @@ //===--------- AVRMCELFStreamer.cpp - AVR subclass of MCELFStreamer -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h index 12e805fc7d13..37a610bc4248 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h +++ b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h @@ -1,9 +1,8 @@ //===--------- AVRMCELFStreamer.h - AVR subclass of MCELFStreamer ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp index d4a67973af7f..0a53e5346779 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp @@ -1,9 +1,8 @@ //===-- AVRMCExpr.cpp - AVR specific MC expression classes ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h index a166b0946749..3b696bab1715 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h +++ b/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h @@ -1,9 +1,8 @@ //===-- AVRMCExpr.h - AVR specific MC expression classes --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp index 8c39b5f4039e..f6607b26a065 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- AVRMCTargetDesc.cpp - AVR Target Descriptions ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,11 +11,12 @@ //===----------------------------------------------------------------------===// #include "AVRELFStreamer.h" +#include "AVRInstPrinter.h" #include "AVRMCAsmInfo.h" #include "AVRMCELFStreamer.h" #include "AVRMCTargetDesc.h" #include "AVRTargetStreamer.h" -#include "InstPrinter/AVRInstPrinter.h" +#include "TargetInfo/AVRTargetInfo.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCELFStreamer.h" diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h index a764f15bd065..470db01ff468 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h +++ b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- AVRMCTargetDesc.h - AVR Target Descriptions -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,8 +32,6 @@ class Target; class Triple; class raw_pwrite_stream; -Target &getTheAVRTarget(); - MCInstrInfo *createAVRMCInstrInfo(); /// Creates a machine code emitter for AVR. diff --git a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp index 2b45d9adc7e9..3487a2bbb864 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp @@ -1,9 +1,8 @@ //===-- AVRTargetStreamer.cpp - AVR Target Streamer Methods ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h index 815088b0a5de..5c4d1a22f6c6 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h +++ b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h @@ -1,9 +1,8 @@ //===-- AVRTargetStreamer.h - AVR Target Streamer --------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
index abe9cf45fcb3..c62d5cb85bc4 100644
--- a/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
+++ b/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
@@ -1,13 +1,12 @@
 //===-- AVRTargetInfo.cpp - AVR Target Implementation ---------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-#include "llvm/IR/Module.h"
+#include "TargetInfo/AVRTargetInfo.h"
 #include "llvm/Support/TargetRegistry.h"
 namespace llvm {
 Target &getTheAVRTarget() {
diff --git a/lib/Target/AVR/TargetInfo/AVRTargetInfo.h b/lib/Target/AVR/TargetInfo/AVRTargetInfo.h
new file mode 100644
index 000000000000..7e0186bbdae1
--- /dev/null
+++ b/lib/Target/AVR/TargetInfo/AVRTargetInfo.h
@@ -0,0 +1,18 @@
+//===-- AVRTargetInfo.h - AVR Target Implementation -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_INFO_H
+#define LLVM_AVR_TARGET_INFO_H
+
+namespace llvm {
+class Target;
+
+Target &getTheAVRTarget();
+} // namespace llvm
+
+#endif // LLVM_AVR_TARGET_INFO_H
diff --git a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 8890fb8adf4d..75885fd058a7 100644
--- a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -1,13 +1,13 @@
 //===-- BPFAsmParser.cpp - Parse BPF assembly to MCInst instructions --===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "TargetInfo/BPFTargetInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCContext.h"
@@ -126,7 +126,7 @@ public:
   bool isMem() const override { return false; }
 
   bool isConstantImm() const {
-    return isImm() && dyn_cast<MCConstantExpr>(getImm());
+    return isImm() && isa<MCConstantExpr>(getImm());
  }
 
   int64_t getConstantImm() const {
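The one-line BPF parser change above (dyn_cast to isa) is the idiomatic cleanup: dyn_cast returns a pointer meant to be used, so calling it purely as a truth test obscures intent, while isa is the dedicated type predicate. The division of labor among LLVM's casting helpers, in a short sketch:

#include "llvm/MC/MCExpr.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// isa<T>(V)      : type test only, returns bool.
// cast<T>(V)     : checked conversion; asserts if V is not a T.
// dyn_cast<T>(V) : conversion returning nullptr on mismatch; use it only
//                  when the resulting pointer is actually needed.
static bool isConstantImm(const MCExpr *Imm) {
  return isa<MCConstantExpr>(Imm);              // was: dyn_cast<...>(Imm)
}

static int64_t getConstantImm(const MCExpr *Imm) {
  return cast<MCConstantExpr>(Imm)->getValue(); // guarded by the test above
}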
diff --git a/lib/Target/BPF/BPF.h b/lib/Target/BPF/BPF.h
index 9749e369c2c1..d311fc154094 100644
--- a/lib/Target/BPF/BPF.h
+++ b/lib/Target/BPF/BPF.h
@@ -1,9 +1,8 @@
 //===-- BPF.h - Top-level interface for BPF representation ------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -16,11 +15,16 @@
 namespace llvm {
 class BPFTargetMachine;
 
+ModulePass *createBPFAbstractMemberAccess();
+
 FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
+FunctionPass *createBPFMISimplifyPatchablePass();
 FunctionPass *createBPFMIPeepholePass();
 FunctionPass *createBPFMIPreEmitPeepholePass();
 FunctionPass *createBPFMIPreEmitCheckingPass();
 
+void initializeBPFAbstractMemberAccessPass(PassRegistry&);
+void initializeBPFMISimplifyPatchablePass(PassRegistry&);
 void initializeBPFMIPeepholePass(PassRegistry&);
 void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
 void initializeBPFMIPreEmitCheckingPass(PassRegistry&);
diff --git a/lib/Target/BPF/BPF.td b/lib/Target/BPF/BPF.td
index 877bd15f4f2b..fad966ff5a13 100644
--- a/lib/Target/BPF/BPF.td
+++ b/lib/Target/BPF/BPF.td
@@ -1,9 +1,8 @@
 //===-- BPF.td - Describe the BPF Target Machine -----------*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,6 +20,7 @@ class Proc<string Name, list<SubtargetFeature> Features>
 def : Proc<"generic", []>;
 def : Proc<"v1", []>;
 def : Proc<"v2", []>;
+def : Proc<"v3", []>;
 def : Proc<"probe", []>;
 
 def DummyFeature : SubtargetFeature<"dummy", "isDummyMode",
diff --git a/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/lib/Target/BPF/BPFAbstractMemberAccess.cpp
new file mode 100644
index 000000000000..51d4cbc8a429
--- /dev/null
+++ b/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -0,0 +1,482 @@
+//===------ BPFAbstractMemberAccess.cpp - Abstracting Member Accesses -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass abstracts struct/union member accesses in order to support
+// compile-once run-everywhere (CO-RE). CO-RE aims to compile a program
+// once so that it can run on different kernels. In particular, if a BPF
+// program tries to access a particular kernel data structure member, the
+// details of the intermediate member access will be remembered so the BPF
+// loader can make the necessary adjustment right before program loading.
+//
+// For example,
+//
+//   struct s {
+//     int a;
+//     int b;
+//   };
+//   struct t {
+//     struct s c;
+//     int d;
+//   };
+//   struct t e;
+//
+// For the member access e.c.b, the compiler will generate code
+//   &e + 4
+//
+// Compile-once run-everywhere instead generates the following code
+//   r = 4
+//   &e + r
+// The "4" in "r = 4" can be changed based on a particular kernel version.
+// For example, on a particular kernel version, if struct s is changed to
+//
+//   struct s {
+//     int new_field;
+//     int a;
+//     int b;
+//   }
+//
+// By repeating the member access on the host, the BPF loader can
+// adjust "r = 4" as "r = 8".
+//
+// This feature relies on the following three intrinsic calls:
+//   addr = preserve_array_access_index(base, dimension, index)
+//   addr = preserve_union_access_index(base, di_index)
+//          !llvm.preserve.access.index <union_ditype>
+//   addr = preserve_struct_access_index(base, gep_index, di_index)
+//          !llvm.preserve.access.index <struct_ditype>
+//
+//===----------------------------------------------------------------------===//
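The header comment above is the whole CO-RE design in miniature: member offsets that would normally be folded into the instruction stream are instead left as named, patchable quantities that the BPF loader resolves against the running kernel's type layout. The idea can be modeled without any BPF machinery; in the sketch below the loader-side BTF lookup is faked with a table, and all names are illustrative:

#include <cstdint>
#include <map>
#include <string>

// Loader-side table: access keys -> offsets valid for the *running*
// kernel. Here struct s grew a leading field, so c.b moved from 4 to 8.
static const std::map<std::string, uint32_t> KernelOffsets = {
    {"t::c.b", 8},
};

// Program-side: read the offset from the patchable slot ("r = 4" in the
// comment above) instead of hard-coding the compile-time constant.
static uint32_t patchableOffset(const std::string &Key) {
  return KernelOffsets.at(Key);
}

int main() {
  unsigned char e[64] = {0};
  // &e + r, with r supplied by the loader rather than the compiler.
  volatile unsigned char *addr = e + patchableOffset("t::c.b");
  (void)addr;
  return 0;
}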
+//
+// This feature relies on the following three intrinsic calls:
+//   addr = preserve_array_access_index(base, dimension, index)
+//   addr = preserve_union_access_index(base, di_index)
+//          !llvm.preserve.access.index <union_ditype>
+//   addr = preserve_struct_access_index(base, gep_index, di_index)
+//          !llvm.preserve.access.index <struct_ditype>
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFCORE.h"
+#include "BPFTargetMachine.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "bpf-abstract-member-access"
+
+namespace llvm {
+const std::string BPFCoreSharedInfo::AmaAttr = "btf_ama";
+const std::string BPFCoreSharedInfo::PatchableExtSecName =
+    ".BPF.patchable_externs";
+} // namespace llvm
+
+using namespace llvm;
+
+namespace {
+
+class BPFAbstractMemberAccess final : public ModulePass {
+  StringRef getPassName() const override {
+    return "BPF Abstract Member Access";
+  }
+
+  bool runOnModule(Module &M) override;
+
+public:
+  static char ID;
+  BPFAbstractMemberAccess() : ModulePass(ID) {}
+
+private:
+  enum : uint32_t {
+    BPFPreserveArrayAI = 1,
+    BPFPreserveUnionAI = 2,
+    BPFPreserveStructAI = 3,
+  };
+
+  std::map<std::string, GlobalVariable *> GEPGlobals;
+  // A map to link preserve_*_access_index intrinsic calls.
+  std::map<CallInst *, std::pair<CallInst *, uint32_t>> AIChain;
+  // A map to hold all the base preserve_*_access_index intrinsic calls.
+  // The base call is not an input of any other preserve_*_access_index
+  // intrinsics.
+  std::map<CallInst *, uint32_t> BaseAICalls;
+
+  bool doTransformation(Module &M);
+
+  void traceAICall(CallInst *Call, uint32_t Kind);
+  void traceBitCast(BitCastInst *BitCast, CallInst *Parent, uint32_t Kind);
+  void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, uint32_t Kind);
+  void collectAICallChains(Module &M, Function &F);
+
+  bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind);
+  bool removePreserveAccessIndexIntrinsic(Module &M);
+  void replaceWithGEP(std::vector<CallInst *> &CallList,
+                      uint32_t NumOfZerosIndex, uint32_t DIIndex);
+
+  Value *computeBaseAndAccessStr(CallInst *Call, std::string &AccessStr,
+                                 std::string &AccessKey, uint32_t Kind,
+                                 MDNode *&TypeMeta);
+  bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex);
+  bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind);
+};
+} // End anonymous namespace
+
+char BPFAbstractMemberAccess::ID = 0;
+INITIALIZE_PASS(BPFAbstractMemberAccess, DEBUG_TYPE,
+                "abstracting struct/union member accesses", false, false)
+
+ModulePass *llvm::createBPFAbstractMemberAccess() {
+  return new BPFAbstractMemberAccess();
+}
+
+bool BPFAbstractMemberAccess::runOnModule(Module &M) {
+  LLVM_DEBUG(dbgs() << "********** Abstract Member Accesses **********\n");
+
+  // Bail out if there is no debug info.
+  if (empty(M.debug_compile_units()))
+    return false;
+
+  return doTransformation(M);
+}
+
+/// Check whether a call is a preserve_*_access_index intrinsic call or not.
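+/// On a match, Kind is set to one of BPFPreserveArrayAI, BPFPreserveUnionAI
+/// or BPFPreserveStructAI. Matching is by name prefix (startswith), so the
+/// type-suffixed overloads of these intrinsics are recognized as well.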
+bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
+                                                          uint32_t &Kind) {
+  if (!Call)
+    return false;
+
+  const auto *GV = dyn_cast<GlobalValue>(Call->getCalledValue());
+  if (!GV)
+    return false;
+  if (GV->getName().startswith("llvm.preserve.array.access.index")) {
+    Kind = BPFPreserveArrayAI;
+    return true;
+  }
+  if (GV->getName().startswith("llvm.preserve.union.access.index")) {
+    Kind = BPFPreserveUnionAI;
+    return true;
+  }
+  if (GV->getName().startswith("llvm.preserve.struct.access.index")) {
+    Kind = BPFPreserveStructAI;
+    return true;
+  }
+
+  return false;
+}
+
+void BPFAbstractMemberAccess::replaceWithGEP(std::vector<CallInst *> &CallList,
+                                             uint32_t DimensionIndex,
+                                             uint32_t GEPIndex) {
+  for (auto Call : CallList) {
+    uint32_t Dimension = 1;
+    if (DimensionIndex > 0)
+      Dimension = cast<ConstantInt>(Call->getArgOperand(DimensionIndex))
+                      ->getZExtValue();
+
+    Constant *Zero =
+        ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0);
+    SmallVector<Value *, 4> IdxList;
+    for (unsigned I = 0; I < Dimension; ++I)
+      IdxList.push_back(Zero);
+    IdxList.push_back(Call->getArgOperand(GEPIndex));
+
+    auto *GEP = GetElementPtrInst::CreateInBounds(Call->getArgOperand(0),
+                                                  IdxList, "", Call);
+    Call->replaceAllUsesWith(GEP);
+    Call->eraseFromParent();
+  }
+}
+
+bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
+  std::vector<CallInst *> PreserveArrayIndexCalls;
+  std::vector<CallInst *> PreserveUnionIndexCalls;
+  std::vector<CallInst *> PreserveStructIndexCalls;
+  bool Found = false;
+
+  for (Function &F : M)
+    for (auto &BB : F)
+      for (auto &I : BB) {
+        auto *Call = dyn_cast<CallInst>(&I);
+        uint32_t Kind;
+        if (!IsPreserveDIAccessIndexCall(Call, Kind))
+          continue;
+
+        Found = true;
+        if (Kind == BPFPreserveArrayAI)
+          PreserveArrayIndexCalls.push_back(Call);
+        else if (Kind == BPFPreserveUnionAI)
+          PreserveUnionIndexCalls.push_back(Call);
+        else
+          PreserveStructIndexCalls.push_back(Call);
+      }
+
+  // Do the following transformation:
+  // . addr = preserve_array_access_index(base, dimension, index)
+  //   is transformed to
+  //     addr = GEP(base, dimension's zeros, index)
+  // . addr = preserve_union_access_index(base, di_index)
+  //   is transformed to
+  //     addr = base, i.e., all usages of "addr" are replaced by "base".
+  // . addr = preserve_struct_access_index(base, gep_index, di_index)
+  //   is transformed to
+  //     addr = GEP(base, 0, gep_index)
+  replaceWithGEP(PreserveArrayIndexCalls, 1, 2);
+  replaceWithGEP(PreserveStructIndexCalls, 0, 1);
+  for (auto Call : PreserveUnionIndexCalls) {
+    Call->replaceAllUsesWith(Call->getArgOperand(0));
+    Call->eraseFromParent();
+  }
+
+  return Found;
+}
+
+void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind) {
+  for (User *U : Call->users()) {
+    Instruction *Inst = dyn_cast<Instruction>(U);
+    if (!Inst)
+      continue;
+
+    if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
+      traceBitCast(BI, Call, Kind);
+    } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
+      uint32_t CIKind;
+      if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
+        AIChain[CI] = std::make_pair(Call, Kind);
+        traceAICall(CI, CIKind);
+      } else {
+        BaseAICalls[Call] = Kind;
+      }
+    } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
+      if (GI->hasAllZeroIndices())
+        traceGEP(GI, Call, Kind);
+      else
+        BaseAICalls[Call] = Kind;
+    }
+  }
+}
+
+void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast,
+                                           CallInst *Parent, uint32_t Kind) {
+  for (User *U : BitCast->users()) {
+    Instruction *Inst = dyn_cast<Instruction>(U);
+    if (!Inst)
+      continue;
+
+    if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
+      traceBitCast(BI, Parent, Kind);
+    } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
+      uint32_t CIKind;
+      if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
+        AIChain[CI] = std::make_pair(Parent, Kind);
+        traceAICall(CI, CIKind);
+      } else {
+        BaseAICalls[Parent] = Kind;
+      }
+    } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
+      if (GI->hasAllZeroIndices())
+        traceGEP(GI, Parent, Kind);
+      else
+        BaseAICalls[Parent] = Kind;
+    }
+  }
+}
+
+void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent,
+                                       uint32_t Kind) {
+  for (User *U : GEP->users()) {
+    Instruction *Inst = dyn_cast<Instruction>(U);
+    if (!Inst)
+      continue;
+
+    if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
+      traceBitCast(BI, Parent, Kind);
+    } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
+      uint32_t CIKind;
+      if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
+        AIChain[CI] = std::make_pair(Parent, Kind);
+        traceAICall(CI, CIKind);
+      } else {
+        BaseAICalls[Parent] = Kind;
+      }
+    } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
+      if (GI->hasAllZeroIndices())
+        traceGEP(GI, Parent, Kind);
+      else
+        BaseAICalls[Parent] = Kind;
+    }
+  }
+}
+
+void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) {
+  AIChain.clear();
+  BaseAICalls.clear();
+
+  for (auto &BB : F)
+    for (auto &I : BB) {
+      uint32_t Kind;
+      auto *Call = dyn_cast<CallInst>(&I);
+      if (!IsPreserveDIAccessIndexCall(Call, Kind) ||
+          AIChain.find(Call) != AIChain.end())
+        continue;
+
+      traceAICall(Call, Kind);
+    }
+}
+
+/// Get the access index from a preserve_*_access_index intrinsic call.
+bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue,
+                                             uint64_t &AccessIndex) {
+  const ConstantInt *CV = dyn_cast<ConstantInt>(IndexValue);
+  if (!CV)
+    return false;
+
+  AccessIndex = CV->getValue().getZExtValue();
+  return true;
+}
+
+/// Compute the base of the whole preserve_*_access_index chain, i.e., the base
+/// pointer of the first preserve_*_access_index call, and construct the access
+/// string, which will be the name of a global variable.
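+/// As a worked sketch (derived from the loop below, not from an upstream
+/// comment): for the e.c.b access in the file header, the two struct calls
+/// contribute di_index 0 (for .c) and 1 (for .b), so AccessStr comes out as
+/// "0:0:1:" and AccessKey as "t:0:0:1:"; identical accesses then share one
+/// patchable global keyed by that AccessKey.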
+Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call,
+                                                        std::string &AccessStr,
+                                                        std::string &AccessKey,
+                                                        uint32_t Kind,
+                                                        MDNode *&TypeMeta) {
+  Value *Base = nullptr;
+  std::vector<uint64_t> AccessIndices;
+  uint64_t TypeNameIndex = 0;
+  std::string LastTypeName;
+
+  while (Call) {
+    // Base of the original corresponding GEP.
+    Base = Call->getArgOperand(0);
+
+    // Type name.
+    std::string TypeName;
+    MDNode *MDN;
+    if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) {
+      MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+      if (!MDN)
+        return nullptr;
+
+      DIType *Ty = dyn_cast<DIType>(MDN);
+      if (!Ty)
+        return nullptr;
+
+      TypeName = Ty->getName();
+    }
+
+    // Access index.
+    uint64_t AccessIndex;
+    uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ? 1 : 2;
+    if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex))
+      return nullptr;
+
+    AccessIndices.push_back(AccessIndex);
+    if (TypeName.size()) {
+      TypeNameIndex = AccessIndices.size() - 1;
+      LastTypeName = TypeName;
+      TypeMeta = MDN;
+    }
+
+    Kind = AIChain[Call].second;
+    Call = AIChain[Call].first;
+  }
+
+  // The initial type name is required.
+  // FIXME: if the initial type access is an array index, e.g.,
+  // &a[3].b.c, only a one-dimensional array is supported.
+  if (!LastTypeName.size() || AccessIndices.size() > TypeNameIndex + 2)
+    return nullptr;
+
+  // Construct the type string AccessStr.
+  for (unsigned I = 0; I < AccessIndices.size(); ++I)
+    AccessStr = std::to_string(AccessIndices[I]) + ":" + AccessStr;
+
+  if (TypeNameIndex == AccessIndices.size() - 1)
+    AccessStr = "0:" + AccessStr;
+
+  // The access key is the type name + the access string; it uniquely
+  // identifies one kernel memory access.
+  AccessKey = LastTypeName + ":" + AccessStr;
+
+  return Base;
+}
+
+/// Call/Kind is the base preserve_*_access_index() call. Attempt the
+/// transformation to a chain of relocatable GEPs.
+bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
+                                                uint32_t Kind) {
+  std::string AccessStr, AccessKey;
+  MDNode *TypeMeta = nullptr;
+  Value *Base =
+      computeBaseAndAccessStr(Call, AccessStr, AccessKey, Kind, TypeMeta);
+  if (!Base)
+    return false;
+
+  // Do the transformation.
+  // For any original GEP Call and Base %2 like
+  //   %4 = bitcast %struct.net_device** %dev1 to i64*
+  // it is transformed to:
+  //   %6 = load __BTF_0:sk_buff:0:0:2:0:
+  //   %7 = bitcast %struct.sk_buff* %2 to i8*
+  //   %8 = getelementptr i8, i8* %7, %6
+  //   %9 = bitcast i8* %8 to i64*
+  //   using %9 instead of %4
+  // The original Call inst is removed.
+  BasicBlock *BB = Call->getParent();
+  GlobalVariable *GV;
+
+  if (GEPGlobals.find(AccessKey) == GEPGlobals.end()) {
+    GV = new GlobalVariable(M, Type::getInt64Ty(BB->getContext()), false,
+                            GlobalVariable::ExternalLinkage, NULL, AccessStr);
+    GV->addAttribute(BPFCoreSharedInfo::AmaAttr);
+    // Set the metadata (debuginfo types) for the global.
+    if (TypeMeta)
+      GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta);
+    GEPGlobals[AccessKey] = GV;
+  } else {
+    GV = GEPGlobals[AccessKey];
+  }
+
+  // Load the global variable.
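+  // The global's value is unknown at compile time; the bpf loader patches it
+  // at load time, so the address arithmetic below must stay in load-and-add
+  // form rather than be folded into a constant-offset GEP.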
+ auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV); + BB->getInstList().insert(Call->getIterator(), LDInst); + + // Generate a BitCast + auto *BCInst = new BitCastInst(Base, Type::getInt8PtrTy(BB->getContext())); + BB->getInstList().insert(Call->getIterator(), BCInst); + + // Generate a GetElementPtr + auto *GEP = GetElementPtrInst::Create(Type::getInt8Ty(BB->getContext()), + BCInst, LDInst); + BB->getInstList().insert(Call->getIterator(), GEP); + + // Generate a BitCast + auto *BCInst2 = new BitCastInst(GEP, Call->getType()); + BB->getInstList().insert(Call->getIterator(), BCInst2); + + Call->replaceAllUsesWith(BCInst2); + Call->eraseFromParent(); + + return true; +} + +bool BPFAbstractMemberAccess::doTransformation(Module &M) { + bool Transformed = false; + + for (Function &F : M) { + // Collect PreserveDIAccessIndex Intrinsic call chains. + // The call chains will be used to generate the access + // patterns similar to GEP. + collectAICallChains(M, F); + + for (auto &C : BaseAICalls) + Transformed = transformGEPChain(M, C.first, C.second) || Transformed; + } + + return removePreserveAccessIndexIntrinsic(M) || Transformed; +} diff --git a/lib/Target/BPF/BPFAsmPrinter.cpp b/lib/Target/BPF/BPFAsmPrinter.cpp index ada5eb923f40..e61e73468057 100644 --- a/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/lib/Target/BPF/BPFAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- BPFAsmPrinter.cpp - BPF LLVM assembly writer ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,7 +16,8 @@ #include "BPFMCInstLower.h" #include "BPFTargetMachine.h" #include "BTFDebug.h" -#include "InstPrinter/BPFInstPrinter.h" +#include "MCTargetDesc/BPFInstPrinter.h" +#include "TargetInfo/BPFTargetInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -38,27 +38,30 @@ class BPFAsmPrinter : public AsmPrinter { public: explicit BPFAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) - : AsmPrinter(TM, std::move(Streamer)) {} + : AsmPrinter(TM, std::move(Streamer)), BTF(nullptr) {} StringRef getPassName() const override { return "BPF Assembly Printer"; } bool doInitialization(Module &M) override; void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; void EmitInstruction(const MachineInstr *MI) override; + +private: + BTFDebug *BTF; }; } // namespace bool BPFAsmPrinter::doInitialization(Module &M) { AsmPrinter::doInitialization(M); - if (MAI->doesSupportDebugInformation()) { - Handlers.push_back(HandlerInfo(new BTFDebug(this), "emit", + // Only emit BTF when debuginfo available. 
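+  // (A module built without -g has no debug compile units, so the BTF
+  // handler is not installed at all in that case.)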
+  if (MAI->doesSupportDebugInformation() && !empty(M.debug_compile_units())) {
+    BTF = new BTFDebug(this);
+    Handlers.push_back(HandlerInfo(std::unique_ptr<AsmPrinterHandler>(BTF),
+                                   "emit", "Debug Info Emission", "BTF",
+                                   "BTF Emission"));
   }
 
@@ -105,18 +108,16 @@ void BPFAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
 }
 
 bool BPFAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
-                                    unsigned /*AsmVariant*/,
                                     const char *ExtraCode, raw_ostream &O) {
   if (ExtraCode && ExtraCode[0])
-    return true; // BPF does not have special modifiers
+    return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
 
   printOperand(MI, OpNo, O);
   return false;
 }
 
 bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
-                                          unsigned OpNum, unsigned AsmVariant,
-                                          const char *ExtraCode,
+                                          unsigned OpNum, const char *ExtraCode,
                                           raw_ostream &O) {
   assert(OpNum + 1 < MI->getNumOperands() && "Insufficient operands");
   const MachineOperand &BaseMO = MI->getOperand(OpNum);
@@ -137,11 +138,12 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 }
 
 void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-
-  BPFMCInstLower MCInstLowering(OutContext, *this);
   MCInst TmpInst;
-  MCInstLowering.Lower(MI, TmpInst);
+
+  if (!BTF || !BTF->InstLower(MI, TmpInst)) {
+    BPFMCInstLower MCInstLowering(OutContext, *this);
+    MCInstLowering.Lower(MI, TmpInst);
+  }
   EmitToStreamer(*OutStreamer, TmpInst);
 }
diff --git a/lib/Target/BPF/BPFCORE.h b/lib/Target/BPF/BPFCORE.h
new file mode 100644
index 000000000000..e0950d95f8d7
--- /dev/null
+++ b/lib/Target/BPF/BPFCORE.h
@@ -0,0 +1,24 @@
+//===- BPFCORE.h - Common info for Compile-Once Run-Everywhere --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFCORE_H
+#define LLVM_LIB_TARGET_BPF_BPFCORE_H
+
+namespace llvm {
+
+class BPFCoreSharedInfo {
+public:
+  /// The attribute attached to globals representing a member offset
+  static const std::string AmaAttr;
+  /// The section name to identify a patchable external global
+  static const std::string PatchableExtSecName;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/BPF/BPFCallingConv.td b/lib/Target/BPF/BPFCallingConv.td
index 637f9752ec42..ef4ef1930aa8 100644
--- a/lib/Target/BPF/BPFCallingConv.td
+++ b/lib/Target/BPF/BPFCallingConv.td
@@ -1,9 +1,8 @@
 //===-- BPFCallingConv.td - Calling Conventions BPF --------*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/BPF/BPFFrameLowering.cpp b/lib/Target/BPF/BPFFrameLowering.cpp
index c2806c85f24f..8812cfdd86da 100644
--- a/lib/Target/BPF/BPFFrameLowering.cpp
+++ b/lib/Target/BPF/BPFFrameLowering.cpp
@@ -1,9 +1,8 @@
 //===-- BPFFrameLowering.cpp - BPF Frame Information ----------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License.
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h index b4ffa0713fa6..2dc6277d2244 100644 --- a/lib/Target/BPF/BPFFrameLowering.h +++ b/lib/Target/BPF/BPFFrameLowering.h @@ -1,9 +1,8 @@ //===-- BPFFrameLowering.h - Define frame lowering for BPF -----*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp index 8b9bc08e144f..1bd705c55188 100644 --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- BPFISelDAGToDAG.cpp - A dag to dag inst selector for BPF ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index 9272cf692dc9..ff69941d26fb 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -1,9 +1,8 @@ //===-- BPFISelLowering.cpp - BPF DAG Lowering Implementation ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -106,7 +105,8 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, if (STI.getHasAlu32()) { setOperationAction(ISD::BSWAP, MVT::i32, Promote); - setOperationAction(ISD::BR_CC, MVT::i32, Promote); + setOperationAction(ISD::BR_CC, MVT::i32, + STI.getHasJmp32() ? 
Custom : Promote); } setOperationAction(ISD::CTTZ, MVT::i64, Custom); @@ -163,6 +163,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, // CPU/Feature control HasAlu32 = STI.getHasAlu32(); + HasJmp32 = STI.getHasJmp32(); HasJmpExt = STI.getHasJmpExt(); } @@ -507,7 +508,7 @@ SDValue BPFTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { NegateCC(LHS, RHS, CC); return DAG.getNode(BPFISD::BR_CC, DL, Op.getValueType(), Chain, LHS, RHS, - DAG.getConstant(CC, DL, MVT::i64), Dest); + DAG.getConstant(CC, DL, LHS.getValueType()), Dest); } SDValue BPFTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { @@ -677,36 +678,23 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, int CC = MI.getOperand(3).getImm(); int NewCC; switch (CC) { - case ISD::SETGT: - NewCC = isSelectRROp ? BPF::JSGT_rr : BPF::JSGT_ri; - break; - case ISD::SETUGT: - NewCC = isSelectRROp ? BPF::JUGT_rr : BPF::JUGT_ri; - break; - case ISD::SETGE: - NewCC = isSelectRROp ? BPF::JSGE_rr : BPF::JSGE_ri; - break; - case ISD::SETUGE: - NewCC = isSelectRROp ? BPF::JUGE_rr : BPF::JUGE_ri; - break; - case ISD::SETEQ: - NewCC = isSelectRROp ? BPF::JEQ_rr : BPF::JEQ_ri; - break; - case ISD::SETNE: - NewCC = isSelectRROp ? BPF::JNE_rr : BPF::JNE_ri; - break; - case ISD::SETLT: - NewCC = isSelectRROp ? BPF::JSLT_rr : BPF::JSLT_ri; - break; - case ISD::SETULT: - NewCC = isSelectRROp ? BPF::JULT_rr : BPF::JULT_ri; - break; - case ISD::SETLE: - NewCC = isSelectRROp ? BPF::JSLE_rr : BPF::JSLE_ri; - break; - case ISD::SETULE: - NewCC = isSelectRROp ? BPF::JULE_rr : BPF::JULE_ri; - break; +#define SET_NEWCC(X, Y) \ + case ISD::X: \ + if (is32BitCmp && HasJmp32) \ + NewCC = isSelectRROp ? BPF::Y##_rr_32 : BPF::Y##_ri_32; \ + else \ + NewCC = isSelectRROp ? BPF::Y##_rr : BPF::Y##_ri; \ + break + SET_NEWCC(SETGT, JSGT); + SET_NEWCC(SETUGT, JUGT); + SET_NEWCC(SETGE, JSGE); + SET_NEWCC(SETUGE, JUGE); + SET_NEWCC(SETEQ, JEQ); + SET_NEWCC(SETNE, JNE); + SET_NEWCC(SETLT, JSLT); + SET_NEWCC(SETULT, JULT); + SET_NEWCC(SETLE, JSLE); + SET_NEWCC(SETULE, JULE); default: report_fatal_error("unimplemented select CondCode " + Twine(CC)); } @@ -724,13 +712,13 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // // We simply do extension for all situations in this method, but we will // try to remove those unnecessary in BPFMIPeephole pass. - if (is32BitCmp) + if (is32BitCmp && !HasJmp32) LHS = EmitSubregExt(MI, BB, LHS, isSignedCmp); if (isSelectRROp) { unsigned RHS = MI.getOperand(2).getReg(); - if (is32BitCmp) + if (is32BitCmp && !HasJmp32) RHS = EmitSubregExt(MI, BB, RHS, isSignedCmp); BuildMI(BB, DL, TII.get(NewCC)).addReg(LHS).addReg(RHS).addMBB(Copy1MBB); diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h index 0aa8b9ac57ac..b81bf4e1320d 100644 --- a/lib/Target/BPF/BPFISelLowering.h +++ b/lib/Target/BPF/BPFISelLowering.h @@ -1,9 +1,8 @@ //===-- BPFISelLowering.h - BPF DAG Lowering Interface ----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -56,6 +55,7 @@ public: MachineBasicBlock *BB) const override; bool getHasAlu32() const { return HasAlu32; } + bool getHasJmp32() const { return HasJmp32; } bool getHasJmpExt() const { return HasJmpExt; } EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, @@ -66,6 +66,7 @@ public: private: // Control Instruction Selection Features bool HasAlu32; + bool HasJmp32; bool HasJmpExt; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; @@ -100,7 +101,7 @@ private: EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override { + const AttributeList &FuncAttributes) const override { return Size >= 8 ? MVT::i64 : MVT::i32; } diff --git a/lib/Target/BPF/BPFInstrFormats.td b/lib/Target/BPF/BPFInstrFormats.td index 92d4a62fd875..9f00dc85d789 100644 --- a/lib/Target/BPF/BPFInstrFormats.td +++ b/lib/Target/BPF/BPFInstrFormats.td @@ -1,9 +1,8 @@ //===-- BPFInstrFormats.td - BPF Instruction Formats -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -17,6 +16,7 @@ def BPF_ST : BPFOpClass<0x2>; def BPF_STX : BPFOpClass<0x3>; def BPF_ALU : BPFOpClass<0x4>; def BPF_JMP : BPFOpClass<0x5>; +def BPF_JMP32 : BPFOpClass<0x6>; def BPF_ALU64 : BPFOpClass<0x7>; class BPFSrcType val> { diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp index 4d47debdaa74..932f718d5490 100644 --- a/lib/Target/BPF/BPFInstrInfo.cpp +++ b/lib/Target/BPF/BPFInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- BPFInstrInfo.cpp - BPF Instruction Information ----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/BPF/BPFInstrInfo.h b/lib/Target/BPF/BPFInstrInfo.h index fb65a86a6d18..e4bd757da560 100644 --- a/lib/Target/BPF/BPFInstrInfo.h +++ b/lib/Target/BPF/BPFInstrInfo.h @@ -1,9 +1,8 @@ //===-- BPFInstrInfo.h - BPF Instruction Information ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index aaef5fb706e0..c44702a78ec8 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -1,9 +1,8 @@ //===-- BPFInstrInfo.td - Target Description for BPF Target ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -102,6 +101,26 @@ def BPF_CC_LTU : PatLeaf<(i64 imm), [{return (N->getZExtValue() == ISD::SETULT);}]>; def BPF_CC_LEU : PatLeaf<(i64 imm), [{return (N->getZExtValue() == ISD::SETULE);}]>; +def BPF_CC_EQ_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETEQ);}]>; +def BPF_CC_NE_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETNE);}]>; +def BPF_CC_GE_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETGE);}]>; +def BPF_CC_GT_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETGT);}]>; +def BPF_CC_GTU_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETUGT);}]>; +def BPF_CC_GEU_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETUGE);}]>; +def BPF_CC_LE_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETLE);}]>; +def BPF_CC_LT_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETLT);}]>; +def BPF_CC_LTU_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETULT);}]>; +def BPF_CC_LEU_32 : PatLeaf<(i32 imm), + [{return (N->getZExtValue() == ISD::SETULE);}]>; // For arithmetic and jump instructions the 8-bit 'code' // field is divided into three parts: @@ -167,23 +186,57 @@ class JMP_RI let BPFClass = BPF_JMP; } -multiclass J { +class JMP_RR_32 + : TYPE_ALU_JMP { + bits<4> dst; + bits<4> src; + bits<16> BrDst; + + let Inst{55-52} = src; + let Inst{51-48} = dst; + let Inst{47-32} = BrDst; + let BPFClass = BPF_JMP32; +} + +class JMP_RI_32 + : TYPE_ALU_JMP { + bits<4> dst; + bits<16> BrDst; + bits<32> imm; + + let Inst{51-48} = dst; + let Inst{47-32} = BrDst; + let Inst{31-0} = imm; + let BPFClass = BPF_JMP32; +} + +multiclass J { def _rr : JMP_RR; def _ri : JMP_RI; + def _rr_32 : JMP_RR_32; + def _ri_32 : JMP_RI_32; } let isBranch = 1, isTerminator = 1, hasDelaySlot=0 in { // cmp+goto instructions -defm JEQ : J; -defm JUGT : J", BPF_CC_GTU>; -defm JUGE : J=", BPF_CC_GEU>; -defm JNE : J", BPF_CC_GT>; -defm JSGE : J=", BPF_CC_GE>; -defm JULT : J; -defm JULE : J; -defm JSLE : J", BPF_CC_GTU, BPF_CC_GTU_32>; +defm JUGE : J=", BPF_CC_GEU, BPF_CC_GEU_32>; +defm JNE : J", BPF_CC_GT, BPF_CC_GT_32>; +defm JSGE : J=", BPF_CC_GE, BPF_CC_GE_32>; +defm JULT : J; +defm JULE : J; +defm JSLE : J { + bits<4> dst; + bits<20> addr; + + let Inst{51-48} = addr{19-16}; // base reg + let Inst{55-52} = dst; + let Inst{47-32} = addr{15-0}; // offset + let BPFClass = BPF_STX; +} + let Constraints = "$dst = $val" in { -def XADD32 : XADD; -def XADD64 : XADD; -// undefined def XADD16 : XADD<1, "xadd16", atomic_load_add_16>; -// undefined def XADD8 : XADD<2, "xadd8", atomic_load_add_8>; + let Predicates = [BPFNoALU32] in { + def XADDW : 
XADD;
+  }
+
+  let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+    def XADDW32 : XADD32;
+  }
+
+  def XADDD : XADD;
 }
 
 // bswap16, bswap32, bswap64
diff --git a/lib/Target/BPF/BPFMCInstLower.cpp b/lib/Target/BPF/BPFMCInstLower.cpp
index c8528e867310..846798a63cb7 100644
--- a/lib/Target/BPF/BPFMCInstLower.cpp
+++ b/lib/Target/BPF/BPFMCInstLower.cpp
@@ -1,9 +1,8 @@
 //=-- BPFMCInstLower.cpp - Convert BPF MachineInstr to an MCInst ------------=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/BPF/BPFMCInstLower.h b/lib/Target/BPF/BPFMCInstLower.h
index eac811f4cf88..0622d20814d3 100644
--- a/lib/Target/BPF/BPFMCInstLower.h
+++ b/lib/Target/BPF/BPFMCInstLower.h
@@ -1,9 +1,8 @@
 //===-- BPFMCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/BPF/BPFMIChecking.cpp b/lib/Target/BPF/BPFMIChecking.cpp
index 0a311378e777..4c46289656b4 100644
--- a/lib/Target/BPF/BPFMIChecking.cpp
+++ b/lib/Target/BPF/BPFMIChecking.cpp
@@ -1,9 +1,8 @@
 //===-------------- BPFMIChecking.cpp - MI Checking Legality -------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -62,14 +61,107 @@ void BPFMIPreEmitChecking::initialize(MachineFunction &MFParm) {
   LLVM_DEBUG(dbgs() << "*** BPF PreEmit checking pass ***\n\n");
 }
 
+// Make sure all Defs of XADD are dead, meaning any result of XADD insn is not
+// used.
+//
+// NOTE: BPF backend hasn't enabled sub-register liveness tracking, so when the
+// source and destination operands of XADD are GPR32, there is no sub-register
+// dead info. If we rely on the generic MachineInstr::allDefsAreDead, then we
+// will raise false alarms on GPR32 Defs.
+//
+// To support GPR32 Defs, ideally we could just enable sub-register liveness
+// tracking on the BPF backend; then allDefsAreDead would work on GPR32 Defs.
+// This requires implementing TargetSubtargetInfo::enableSubRegLiveness on BPF.
+//
+// However, the sub-register liveness tracking module inside LLVM is really
+// designed for the situation where one register could be split into more than
+// one sub-register, in which case each sub-register can have its own liveness
+// and killing one of them doesn't kill the others. So tracking liveness for
+// each one makes sense.
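+// (On X86, for instance, $ax splits into $al and $ah with independent live
+// ranges, so per-sub-register tracking does pay off there.)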
+//
+// For BPF, each 64-bit register has only one 32-bit sub-register. This is
+// exactly the case which LLVM thinks brings no benefit for doing sub-register
+// tracking, because the live range of a sub-register must always equal that of
+// its parent register, so liveness tracking is disabled even if the back-end
+// has implemented enableSubRegLiveness. The detailed information is at
+// r232695:
+//
+//   Author: Matthias Braun
+//   Date:   Thu Mar 19 00:21:58 2015 +0000
+//   Do not track subregister liveness when it brings no benefits
+//
+// Hence, for BPF, we enhance MachineInstr::allDefsAreDead. Given that the solo
+// sub-register always has the same liveness as its parent register, LLVM is
+// already attaching an implicit 64-bit register Def whenever there is
+// a sub-register Def. The liveness of the implicit 64-bit Def is available.
+// For example, for "lock *(u32 *)(r0 + 4) += w9", the MachineOperand info
+// could be:
+//
+//   $w9 = XADDW32 killed $r0, 4, $w9(tied-def 0),
+//                 implicit killed $r9, implicit-def dead $r9
+//
+// Even though w9 is not marked as dead, the parent register r9 is marked as
+// dead correctly, and it is safe to use such information for our purpose.
+static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
+  const MCRegisterClass *GPR64RegClass =
+      &BPFMCRegisterClasses[BPF::GPRRegClassID];
+  std::vector<unsigned> GPR32LiveDefs;
+  std::vector<unsigned> GPR64DeadDefs;
+
+  for (const MachineOperand &MO : MI.operands()) {
+    bool RegIsGPR64;
+
+    if (!MO.isReg() || MO.isUse())
+      continue;
+
+    RegIsGPR64 = GPR64RegClass->contains(MO.getReg());
+    if (!MO.isDead()) {
+      // It is a GPR64 live Def, so we are sure it is live.
+      if (RegIsGPR64)
+        return true;
+      // It is a GPR32 live Def; we are unsure whether it is really dead due to
+      // missing sub-register liveness tracking. Push it to a vector for a
+      // deferred check.
+      GPR32LiveDefs.push_back(MO.getReg());
+      continue;
+    }
+
+    // Record any GPR64 dead Def, as some unmarked GPR32 could be an alias of
+    // its low 32 bits.
+    if (RegIsGPR64)
+      GPR64DeadDefs.push_back(MO.getReg());
+  }
+
+  // No GPR32 live Def, safe to return false.
+  if (GPR32LiveDefs.empty())
+    return false;
+
+  // No GPR64 dead Def, so all those GPR32 live Defs can't have an alias and
+  // must be truly live; safe to return true.
+  if (GPR64DeadDefs.empty())
+    return true;
+
+  // Otherwise, return true if any aliased SuperReg of a GPR32 is not dead.
+  std::vector<unsigned>::iterator search_begin = GPR64DeadDefs.begin();
+  std::vector<unsigned>::iterator search_end = GPR64DeadDefs.end();
+  for (auto I : GPR32LiveDefs)
+    for (MCSuperRegIterator SR(I, TRI); SR.isValid(); ++SR)
+      if (std::find(search_begin, search_end, *SR) == search_end)
+        return true;
+
+  return false;
+}
+
 void BPFMIPreEmitChecking::checkingIllegalXADD(void) {
   for (MachineBasicBlock &MBB : *MF) {
     for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() != BPF::XADD32 && MI.getOpcode() != BPF::XADD64)
+      if (MI.getOpcode() != BPF::XADDW &&
+          MI.getOpcode() != BPF::XADDD &&
+          MI.getOpcode() != BPF::XADDW32)
         continue;
 
       LLVM_DEBUG(MI.dump());
-      if (!MI.allDefsAreDead()) {
+      if (hasLiveDefs(MI, TRI)) {
         DebugLoc Empty;
         const DebugLoc &DL = MI.getDebugLoc();
         if (DL != Empty)
diff --git a/lib/Target/BPF/BPFMIPeephole.cpp b/lib/Target/BPF/BPFMIPeephole.cpp
index 9e984d0facfb..156ba793e359 100644
--- a/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/lib/Target/BPF/BPFMIPeephole.cpp
@@ -1,9 +1,8 @@
 //===-------------- BPFMIPeephole.cpp - MI Peephole Cleanups -------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/lib/Target/BPF/BPFMISimplifyPatchable.cpp
new file mode 100644
index 000000000000..e9114d7187e3
--- /dev/null
+++ b/lib/Target/BPF/BPFMISimplifyPatchable.cpp
@@ -0,0 +1,163 @@
+//===----- BPFMISimplifyPatchable.cpp - MI Simplify Patchable Insts -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass targets a subset of instructions like below
+//    ld_imm64 r1, @global
+//    ldd r2, r1, 0
+//    add r3, struct_base_reg, r2
+//
+// Here @global should represent either an AMA (abstract member access) or
+// a patchable extern variable, and these two kinds of accesses
+// are subject to bpf load-time patching. After this pass, the
+// code becomes
+//    ld_imm64 r1, @global
+//    add r3, struct_base_reg, r1
+//
+// Eventually, at the BTF output stage, a relocation record will be generated
+// for the ld_imm64, which should be replaced later by the bpf loader:
+//    r1 = <calculated offset> or <extern variable address>
+//    add r3, struct_base_reg, r1
+// or
+//    ld_imm64 r1, <extern variable address>
+//    add r3, struct_base_reg, r1
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFCORE.h"
+#include "BPFInstrInfo.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-mi-simplify-patchable"
+
+namespace {
+
+struct BPFMISimplifyPatchable : public MachineFunctionPass {
+
+  static char ID;
+  const BPFInstrInfo *TII;
+  MachineFunction *MF;
+
+  BPFMISimplifyPatchable() : MachineFunctionPass(ID) {
+    initializeBPFMISimplifyPatchablePass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize class variables.
+  void initialize(MachineFunction &MFParm);
+
+  bool removeLD(void);
+
+public:
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    initialize(MF);
+    return removeLD();
+  }
+};
+
+// Initialize class variables.
+void BPFMISimplifyPatchable::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
+  LLVM_DEBUG(dbgs() << "*** BPF simplify patchable insts pass ***\n\n");
+}
+
+/// Remove unneeded Load instructions.
+bool BPFMISimplifyPatchable::removeLD() {
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  MachineInstr *ToErase = nullptr;
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+      if (ToErase) {
+        ToErase->eraseFromParent();
+        ToErase = nullptr;
+      }
+
+      // Ensure the instruction has the form LOAD <reg>, <reg>, 0
+      if (MI.getOpcode() != BPF::LDD && MI.getOpcode() != BPF::LDW &&
+          MI.getOpcode() != BPF::LDH && MI.getOpcode() != BPF::LDB &&
+          MI.getOpcode() != BPF::LDW32 && MI.getOpcode() != BPF::LDH32 &&
+          MI.getOpcode() != BPF::LDB32)
+        continue;
+
+      if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg())
+        continue;
+
+      if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm())
+        continue;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      int64_t ImmVal = MI.getOperand(2).getImm();
+
+      MachineInstr *DefInst = MRI->getUniqueVRegDef(SrcReg);
+      if (!DefInst)
+        continue;
+
+      bool IsCandidate = false;
+      if (DefInst->getOpcode() == BPF::LD_imm64) {
+        const MachineOperand &MO = DefInst->getOperand(1);
+        if (MO.isGlobal()) {
+          const GlobalValue *GVal = MO.getGlobal();
+          auto *GVar = dyn_cast<GlobalVariable>(GVal);
+          if (GVar) {
+            // Global variables representing structure offsets or
+            // patchable extern globals.
+            if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
+              assert(ImmVal == 0);
+              IsCandidate = true;
+            } else if (!GVar->hasInitializer() && GVar->hasExternalLinkage() &&
+                       GVar->getSection() ==
+                           BPFCoreSharedInfo::PatchableExtSecName) {
+              if (ImmVal == 0)
+                IsCandidate = true;
+              else
+                errs() << "WARNING: unhandled patchable extern "
+                       << GVar->getName() << " with load offset " << ImmVal
+                       << "\n";
+            }
+          }
+        }
+      }
+
+      if (!IsCandidate)
+        continue;
+
+      auto Begin = MRI->use_begin(DstReg), End = MRI->use_end();
+      decltype(End) NextI;
+      for (auto I = Begin; I != End; I = NextI) {
+        NextI = std::next(I);
+        I->setReg(SrcReg);
+      }
+
+      ToErase = &MI;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+} // namespace
+
+INITIALIZE_PASS(BPFMISimplifyPatchable, DEBUG_TYPE,
+                "BPF PreEmit SimplifyPatchable", false, false)
+
+char BPFMISimplifyPatchable::ID = 0;
+FunctionPass *llvm::createBPFMISimplifyPatchablePass() {
+  return new BPFMISimplifyPatchable();
+}
diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp
index 635c11113151..714af06e11d9 100644
--- a/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -1,9 +1,8 @@
 //===-- BPFRegisterInfo.cpp - BPF Register Information ----------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -122,6 +121,6 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } } -unsigned BPFRegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register BPFRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return BPF::R10; } diff --git a/lib/Target/BPF/BPFRegisterInfo.h b/lib/Target/BPF/BPFRegisterInfo.h index 4202850e9eb9..e7b870b720a4 100644 --- a/lib/Target/BPF/BPFRegisterInfo.h +++ b/lib/Target/BPF/BPFRegisterInfo.h @@ -1,9 +1,8 @@ //===-- BPFRegisterInfo.h - BPF Register Information Impl -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,7 +32,7 @@ struct BPFRegisterInfo : public BPFGenRegisterInfo { unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; }; } diff --git a/lib/Target/BPF/BPFRegisterInfo.td b/lib/Target/BPF/BPFRegisterInfo.td index da1d6b505f84..88dec063be70 100644 --- a/lib/Target/BPF/BPFRegisterInfo.td +++ b/lib/Target/BPF/BPFRegisterInfo.td @@ -1,9 +1,8 @@ //===-- BPFRegisterInfo.td - BPF Register defs -------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/lib/Target/BPF/BPFSelectionDAGInfo.cpp index 24d5f59bbfd7..a711294048ba 100644 --- a/lib/Target/BPF/BPFSelectionDAGInfo.cpp +++ b/lib/Target/BPF/BPFSelectionDAGInfo.cpp @@ -1,9 +1,8 @@ //===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.h b/lib/Target/BPF/BPFSelectionDAGInfo.h index 19d3c5769573..fb88c32ceb0c 100644 --- a/lib/Target/BPF/BPFSelectionDAGInfo.h +++ b/lib/Target/BPF/BPFSelectionDAGInfo.h @@ -1,9 +1,8 @@ //===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/BPF/BPFSubtarget.cpp b/lib/Target/BPF/BPFSubtarget.cpp index 56780bd9d46f..ab3452501b95 100644 --- a/lib/Target/BPF/BPFSubtarget.cpp +++ b/lib/Target/BPF/BPFSubtarget.cpp @@ -1,9 +1,8 @@ //===-- BPFSubtarget.cpp - BPF Subtarget Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -36,6 +35,7 @@ BPFSubtarget &BPFSubtarget::initializeSubtargetDependencies(StringRef CPU, void BPFSubtarget::initializeEnvironment() { HasJmpExt = false; + HasJmp32 = false; HasAlu32 = false; UseDwarfRIS = false; } @@ -49,6 +49,11 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { HasJmpExt = true; return; } + if (CPU == "v3") { + HasJmpExt = true; + HasJmp32 = true; + return; + } } BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU, diff --git a/lib/Target/BPF/BPFSubtarget.h b/lib/Target/BPF/BPFSubtarget.h index 60e56435fe4c..3da6a026ab7e 100644 --- a/lib/Target/BPF/BPFSubtarget.h +++ b/lib/Target/BPF/BPFSubtarget.h @@ -1,9 +1,8 @@ //===-- BPFSubtarget.h - Define Subtarget for the BPF -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -48,6 +47,10 @@ protected: // whether the cpu supports jmp ext bool HasJmpExt; + // whether the cpu supports jmp32 ext. + // NOTE: jmp32 is not enabled when alu32 enabled. + bool HasJmp32; + // whether the cpu supports alu32 instructions. bool HasAlu32; @@ -66,6 +69,7 @@ public: // subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); bool getHasJmpExt() const { return HasJmpExt; } + bool getHasJmp32() const { return HasJmp32; } bool getHasAlu32() const { return HasAlu32; } bool getUseDwarfRIS() const { return UseDwarfRIS; } diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp index 350465b118ed..24c0ff0f7f15 100644 --- a/lib/Target/BPF/BPFTargetMachine.cpp +++ b/lib/Target/BPF/BPFTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- BPFTargetMachine.cpp - Define TargetMachine for BPF ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,6 +13,7 @@ #include "BPFTargetMachine.h" #include "BPF.h" #include "MCTargetDesc/BPFMCAsmInfo.h" +#include "TargetInfo/BPFTargetInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -34,6 +34,7 @@ extern "C" void LLVMInitializeBPFTarget() { RegisterTargetMachine Z(getTheBPFTarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeBPFAbstractMemberAccessPass(PR); initializeBPFMIPeepholePass(PR); } @@ -68,6 +69,7 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT, static_cast(const_cast(AsmInfo.get())); MAI->setDwarfUsesRelocationsAcrossSections(!Subtarget.getUseDwarfRIS()); } + namespace { // BPF Code Generator Pass Configuration Options. class BPFPassConfig : public TargetPassConfig { @@ -79,6 +81,7 @@ public: return getTM(); } + void addIRPasses() override; bool addInstSelector() override; void addMachineSSAOptimization() override; void addPreEmitPass() override; @@ -89,6 +92,13 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) { return new BPFPassConfig(*this, PM); } +void BPFPassConfig::addIRPasses() { + + addPass(createBPFAbstractMemberAccess()); + + TargetPassConfig::addIRPasses(); +} + // Install an instruction selector pass using // the ISelDag to gen BPF code. bool BPFPassConfig::addInstSelector() { @@ -98,6 +108,8 @@ bool BPFPassConfig::addInstSelector() { } void BPFPassConfig::addMachineSSAOptimization() { + addPass(createBPFMISimplifyPatchablePass()); + // The default implementation must be called first as we want eBPF // Peephole ran at last. TargetPassConfig::addMachineSSAOptimization(); diff --git a/lib/Target/BPF/BPFTargetMachine.h b/lib/Target/BPF/BPFTargetMachine.h index a560dd27335a..beac7bd862da 100644 --- a/lib/Target/BPF/BPFTargetMachine.h +++ b/lib/Target/BPF/BPFTargetMachine.h @@ -1,9 +1,8 @@ //===-- BPFTargetMachine.h - Define TargetMachine for BPF --- C++ ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/BPF/BTF.def b/lib/Target/BPF/BTF.def index 54c5bc3cf092..2d2e9a04aa6d 100644 --- a/lib/Target/BPF/BTF.def +++ b/lib/Target/BPF/BTF.def @@ -1,9 +1,8 @@ //===- BTF.def - BTF definitions --------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -29,5 +28,7 @@ HANDLE_BTF_KIND(10, CONST) HANDLE_BTF_KIND(11, RESTRICT) HANDLE_BTF_KIND(12, FUNC) HANDLE_BTF_KIND(13, FUNC_PROTO) +HANDLE_BTF_KIND(14, VAR) +HANDLE_BTF_KIND(15, DATASEC) #undef HANDLE_BTF_KIND diff --git a/lib/Target/BPF/BTF.h b/lib/Target/BPF/BTF.h index 1e1680faf1b8..ad56716710a6 100644 --- a/lib/Target/BPF/BTF.h +++ b/lib/Target/BPF/BTF.h @@ -1,9 +1,8 @@ //===-- BTF.h --------------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -18,7 +17,7 @@ /// /// The binary layout for .BTF.ext section: /// struct ExtHeader -/// FuncInfo and LineInfo subsections +/// FuncInfo, LineInfo, OffsetReloc and ExternReloc subsections /// The FuncInfo subsection is defined as below: /// BTFFuncInfo Size /// struct SecFuncInfo for ELF section #1 @@ -33,6 +32,20 @@ /// struct SecLineInfo for ELF section #2 /// A number of struct BPFLineInfo for ELF section #2 /// ... +/// The OffsetReloc subsection is defined as below: +/// BPFOffsetReloc Size +/// struct SecOffsetReloc for ELF section #1 +/// A number of struct BPFOffsetReloc for ELF section #1 +/// struct SecOffsetReloc for ELF section #2 +/// A number of struct BPFOffsetReloc for ELF section #2 +/// ... +/// The ExternReloc subsection is defined as below: +/// BPFExternReloc Size +/// struct SecExternReloc for ELF section #1 +/// A number of struct BPFExternReloc for ELF section #1 +/// struct SecExternReloc for ELF section #2 +/// A number of struct BPFExternReloc for ELF section #2 +/// ... /// /// The section formats are also defined at /// https://github.com/torvalds/linux/blob/master/include/uapi/linux/btf.h @@ -50,16 +63,21 @@ enum : uint32_t { MAGIC = 0xeB9F, VERSION = 1 }; /// Sizes in bytes of various things in the BTF format. enum { HeaderSize = 24, - ExtHeaderSize = 24, + ExtHeaderSize = 40, CommonTypeSize = 12, BTFArraySize = 12, BTFEnumSize = 8, BTFMemberSize = 12, BTFParamSize = 8, + BTFDataSecVarSize = 12, SecFuncInfoSize = 8, SecLineInfoSize = 8, + SecOffsetRelocSize = 8, + SecExternRelocSize = 8, BPFFuncInfoSize = 8, - BPFLineInfoSize = 16 + BPFLineInfoSize = 16, + BPFOffsetRelocSize = 12, + BPFExternRelocSize = 8, }; /// The .BTF section header definition. @@ -77,7 +95,7 @@ struct Header { }; enum : uint32_t { - MAX_VLEN = 0xffff ///< Max # of struct/union/enum members or func args + MAX_VLEN = 0xffff ///< Max # of struct/union/enum members or func args }; enum TypeKinds : uint8_t { @@ -104,7 +122,7 @@ struct CommonType { /// "Size" tells the size of the type it is describing. /// /// "Type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - /// FUNC and FUNC_PROTO. + /// FUNC, FUNC_PROTO and VAR. /// "Type" is a type_id referring to another type. union { uint32_t Size; @@ -122,7 +140,11 @@ struct CommonType { // BTF_INT_BITS(VAL) : ((VAL) & 0x000000ff) /// Attributes stored in the INT_ENCODING. 
-enum : uint8_t { INT_SIGNED = (1 << 0), INT_CHAR = (1 << 1), INT_BOOL = (1 << 2) }; +enum : uint8_t { + INT_SIGNED = (1 << 0), + INT_CHAR = (1 << 1), + INT_BOOL = (1 << 2) +}; /// BTF_KIND_ENUM is followed by multiple "struct BTFEnum". /// The exact number of btf_enum is stored in the vlen (of the @@ -163,6 +185,23 @@ struct BTFParam { uint32_t Type; }; +/// Variable scoping information. +enum : uint8_t { + VAR_STATIC = 0, ///< Linkage: InternalLinkage + VAR_GLOBAL_ALLOCATED = 1, ///< Linkage: ExternalLinkage + VAR_GLOBAL_TENTATIVE = 2, ///< Linkage: CommonLinkage + VAR_GLOBAL_EXTERNAL = 3, ///< Linkage: ExternalLinkage +}; + +/// BTF_KIND_DATASEC is followed by multiple "struct BTFDataSec". +/// The exact number of BTFDataSec entries is stored in the vlen (of the info +/// in "struct CommonType"). +struct BTFDataSec { + uint32_t Type; ///< A BTF_KIND_VAR type + uint32_t Offset; ///< In-section offset + uint32_t Size; ///< Occupied memory size +}; + /// The .BTF.ext section header definition. struct ExtHeader { uint16_t Magic; @@ -170,10 +209,14 @@ uint8_t Flags; uint32_t HdrLen; - uint32_t FuncInfoOff; ///< Offset of func info section - uint32_t FuncInfoLen; ///< Length of func info section - uint32_t LineInfoOff; ///< Offset of line info section - uint32_t LineInfoLen; ///< Length of line info section + uint32_t FuncInfoOff; ///< Offset of func info section + uint32_t FuncInfoLen; ///< Length of func info section + uint32_t LineInfoOff; ///< Offset of line info section + uint32_t LineInfoLen; ///< Length of line info section + uint32_t OffsetRelocOff; ///< Offset of offset reloc section + uint32_t OffsetRelocLen; ///< Length of offset reloc section + uint32_t ExternRelocOff; ///< Offset of extern reloc section + uint32_t ExternRelocLen; ///< Length of extern reloc section }; /// Specifying one function info. @@ -199,10 +242,35 @@ struct BPFLineInfo { /// Specifying line info's in one section. struct SecLineInfo { - uint32_t SecNameOff; ///< Section name index in the .BTF string tble + uint32_t SecNameOff; ///< Section name index in the .BTF string table uint32_t NumLineInfo; ///< Number of line info's in this section }; +/// Specifying one offset relocation. +struct BPFOffsetReloc { + uint32_t InsnOffset; ///< Byte offset in this section + uint32_t TypeID; ///< TypeID for the relocation + uint32_t OffsetNameOff; ///< The string to traverse types +}; + +/// Specifying offset relocations in one section. +struct SecOffsetReloc { + uint32_t SecNameOff; ///< Section name index in the .BTF string table + uint32_t NumOffsetReloc; ///< Number of offset relocs in this section +}; + +/// Specifying one extern relocation. +struct BPFExternReloc { + uint32_t InsnOffset; ///< Byte offset in this section + uint32_t ExternNameOff; ///< The string for external variable +}; + +/// Specifying extern relocations in one section. +struct SecExternReloc { + uint32_t SecNameOff; ///< Section name index in the .BTF string table + uint32_t NumExternReloc; ///< Number of extern relocs in this section +}; + } // End namespace BTF. } // End namespace llvm. diff --git a/lib/Target/BPF/BTFDebug.cpp b/lib/Target/BPF/BTFDebug.cpp index 96efea4ba8ee..fa35c6619e21 100644 --- a/lib/Target/BPF/BTFDebug.cpp +++ b/lib/Target/BPF/BTFDebug.cpp @@ -1,9 +1,8 @@ //===- BTFDebug.cpp - BTF Generator ---------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,6 +11,9 @@ //===----------------------------------------------------------------------===// #include "BTFDebug.h" +#include "BPF.h" +#include "BPFCORE.h" +#include "MCTargetDesc/BPFMCTargetDesc.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -19,8 +21,7 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" -#include <fstream> -#include <sstream> +#include "llvm/Support/LineIterator.h" using namespace llvm; @@ -39,8 +40,9 @@ void BTFTypeBase::emitType(MCStreamer &OS) { OS.EmitIntValue(BTFType.Size, 4); } -BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag) - : DTy(DTy) { +BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag, + bool NeedsFixup) + : DTy(DTy), NeedsFixup(NeedsFixup) { switch (Tag) { case dwarf::DW_TAG_pointer_type: Kind = BTF::BTF_KIND_PTR; @@ -64,10 +66,17 @@ BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag) } void BTFTypeDerived::completeType(BTFDebug &BDebug) { + if (IsCompleted) + return; + IsCompleted = true; + BTFType.NameOff = BDebug.addString(DTy->getName()); + if (NeedsFixup) + return; + // The base type for PTR/CONST/VOLATILE could be void. - const DIType *ResolvedType = DTy->getBaseType().resolve(); + const DIType *ResolvedType = DTy->getBaseType(); if (!ResolvedType) { assert((Kind == BTF::BTF_KIND_PTR || Kind == BTF::BTF_KIND_CONST || Kind == BTF::BTF_KIND_VOLATILE) && @@ -80,6 +89,10 @@ void BTFTypeDerived::completeType(BTFDebug &BDebug) { void BTFTypeDerived::emitType(MCStreamer &OS) { BTFTypeBase::emitType(OS); } +void BTFTypeDerived::setPointeeType(uint32_t PointeeType) { + BTFType.Type = PointeeType; +} + /// Represent a struct/union forward declaration. BTFTypeFwd::BTFTypeFwd(StringRef Name, bool IsUnion) : Name(Name) { Kind = BTF::BTF_KIND_FWD; @@ -88,6 +101,10 @@ BTFTypeFwd::BTFTypeFwd(StringRef Name, bool IsUnion) : Name(Name) { } void BTFTypeFwd::completeType(BTFDebug &BDebug) { + if (IsCompleted) + return; + IsCompleted = true; + BTFType.NameOff = BDebug.addString(Name); } @@ -121,6 +138,10 @@ BTFTypeInt::BTFTypeInt(uint32_t Encoding, uint32_t SizeInBits, } void BTFTypeInt::completeType(BTFDebug &BDebug) { + if (IsCompleted) + return; + IsCompleted = true; + BTFType.NameOff = BDebug.addString(Name); } @@ -137,6 +158,10 @@ BTFTypeEnum::BTFTypeEnum(const DICompositeType *ETy, uint32_t VLen) : ETy(ETy) { } void BTFTypeEnum::completeType(BTFDebug &BDebug) { + if (IsCompleted) + return; + IsCompleted = true; + BTFType.NameOff = BDebug.addString(ETy->getName()); DINodeArray Elements = ETy->getElements(); @@ -159,45 +184,29 @@ void BTFTypeEnum::emitType(MCStreamer &OS) { } } -BTFTypeArray::BTFTypeArray(const DICompositeType *ATy) : ATy(ATy) { +BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, + uint32_t NumElems) + : ElemSize(ElemSize) { Kind = BTF::BTF_KIND_ARRAY; + BTFType.NameOff = 0; BTFType.Info = Kind << 24; + BTFType.Size = 0; + + ArrayInfo.ElemType = ElemTypeId; + ArrayInfo.Nelems = NumElems; } -/// Represent a BTF array. BTF does not record array dimensions, -// so conceptually a BTF array is a one-dimensional array. +/// Represent a BTF array.
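// BTF itself records only one-dimensional arrays; visitArrayType() in this
// file (further below) therefore chains one BTFTypeArray entry per
// dimension. A sketch for "int a[2][3]", with illustrative type ids:
//
//   auto Inner = llvm::make_unique<BTFTypeArray>(IntTypeId, /*ElemSize=*/4,
//                                                /*NumElems=*/3);
//   uint32_t InnerId = addType(std::move(Inner));      // [3 x int]
//   auto Outer = llvm::make_unique<BTFTypeArray>(InnerId, /*ElemSize=*/12,
//                                                /*NumElems=*/2);
//   uint32_t OuterId = addType(std::move(Outer), CTy); // [2 x [3 x int]]
//
// Only the outermost dimension is registered against the DICompositeType,
// so references to the C array resolve to the [2 x [3 x int]] entry.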
void BTFTypeArray::completeType(BTFDebug &BDebug) { - BTFType.NameOff = BDebug.addString(ATy->getName()); - BTFType.Size = 0; - - auto *BaseType = ATy->getBaseType().resolve(); - ArrayInfo.ElemType = BDebug.getTypeId(BaseType); + if (IsCompleted) + return; + IsCompleted = true; // The IR does not really have a type for the index. // A special type for array index should have been // created during initial type traversal. Just // retrieve that type id. ArrayInfo.IndexType = BDebug.getArrayIndexTypeId(); - - // Get the number of array elements. - // If the array size is 0, set the number of elements as 0. - // Otherwise, recursively traverse the base types to - // find the element size. The number of elements is - // the totoal array size in bits divided by - // element size in bits. - uint64_t ArraySizeInBits = ATy->getSizeInBits(); - if (!ArraySizeInBits) { - ArrayInfo.Nelems = 0; - } else { - uint32_t BaseTypeSize = BaseType->getSizeInBits(); - while (!BaseTypeSize) { - const auto *DDTy = cast<DIDerivedType>(BaseType); - BaseType = DDTy->getBaseType().resolve(); - assert(BaseType); - BaseTypeSize = BaseType->getSizeInBits(); - } - ArrayInfo.Nelems = ATy->getSizeInBits() / BaseTypeSize; - } } void BTFTypeArray::emitType(MCStreamer &OS) { @@ -207,6 +216,12 @@ OS.EmitIntValue(ArrayInfo.Nelems, 4); } +void BTFTypeArray::getLocInfo(uint32_t Loc, uint32_t &LocOffset, + uint32_t &ElementTypeId) { + ElementTypeId = ArrayInfo.ElemType; + LocOffset = Loc * ElemSize; +} + /// Represent either a struct or a union. BTFTypeStruct::BTFTypeStruct(const DICompositeType *STy, bool IsStruct, bool HasBitField, uint32_t Vlen) @@ -217,6 +232,10 @@ } void BTFTypeStruct::completeType(BTFDebug &BDebug) { + if (IsCompleted) + return; + IsCompleted = true; + BTFType.NameOff = BDebug.addString(STy->getName()); // Add struct/union members. @@ -232,7 +251,7 @@ void BTFTypeStruct::completeType(BTFDebug &BDebug) { } else { BTFMember.Offset = DDTy->getOffsetInBits(); } - BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType().resolve()); + BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType()); Members.push_back(BTFMember); } } @@ -247,6 +266,17 @@ void BTFTypeStruct::emitType(MCStreamer &OS) { } } +std::string BTFTypeStruct::getName() { return STy->getName(); } + +void BTFTypeStruct::getMemberInfo(uint32_t Loc, uint32_t &MemberOffset, + uint32_t &MemberType) { + MemberType = Members[Loc].Type; + MemberOffset = + HasBitField ? Members[Loc].Offset & 0xffffff : Members[Loc].Offset; +} + +uint32_t BTFTypeStruct::getStructSize() { return STy->getSizeInBits() >> 3; } + /// The Func kind represents both subprogram and pointee of function /// pointers. If the FuncName is empty, it represents a pointee of function /// pointer. Otherwise, it represents a subprogram. The func arg names @@ -261,8 +291,12 @@ BTFTypeFuncProto::BTFTypeFuncProto( } void BTFTypeFuncProto::completeType(BTFDebug &BDebug) { + if (IsCompleted) + return; + IsCompleted = true; + DITypeRefArray Elements = STy->getTypeArray(); - auto RetType = Elements[0].resolve(); + auto RetType = Elements[0]; BTFType.Type = RetType ? BDebug.getTypeId(RetType) : 0; BTFType.NameOff = 0; @@ -270,7 +304,7 @@ // to represent the vararg, encode the NameOff/Type to be 0.
for (unsigned I = 1, N = Elements.size(); I < N; ++I) { struct BTF::BTFParam Param; - auto Element = Elements[I].resolve(); + auto Element = Elements[I]; if (Element) { Param.NameOff = BDebug.addString(FuncArgNames[I]); Param.Type = BDebug.getTypeId(Element); @@ -298,11 +332,54 @@ BTFTypeFunc::BTFTypeFunc(StringRef FuncName, uint32_t ProtoTypeId) } void BTFTypeFunc::completeType(BTFDebug &BDebug) { + if (IsCompleted) + return; + IsCompleted = true; + BTFType.NameOff = BDebug.addString(Name); } void BTFTypeFunc::emitType(MCStreamer &OS) { BTFTypeBase::emitType(OS); } +BTFKindVar::BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo) + : Name(VarName) { + Kind = BTF::BTF_KIND_VAR; + BTFType.Info = Kind << 24; + BTFType.Type = TypeId; + Info = VarInfo; +} + +void BTFKindVar::completeType(BTFDebug &BDebug) { + BTFType.NameOff = BDebug.addString(Name); +} + +void BTFKindVar::emitType(MCStreamer &OS) { + BTFTypeBase::emitType(OS); + OS.EmitIntValue(Info, 4); +} + +BTFKindDataSec::BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName) + : Asm(AsmPrt), Name(SecName) { + Kind = BTF::BTF_KIND_DATASEC; + BTFType.Info = Kind << 24; + BTFType.Size = 0; +} + +void BTFKindDataSec::completeType(BTFDebug &BDebug) { + BTFType.NameOff = BDebug.addString(Name); + BTFType.Info |= Vars.size(); +} + +void BTFKindDataSec::emitType(MCStreamer &OS) { + BTFTypeBase::emitType(OS); + + for (const auto &V : Vars) { + OS.EmitIntValue(std::get<0>(V), 4); + Asm->EmitLabelReference(std::get<1>(V), 4); + OS.EmitIntValue(std::get<2>(V), 4); + } +} + uint32_t BTFStringTable::addString(StringRef S) { // Check whether the string already exists. for (auto &OffsetM : OffsetToIdMap) { @@ -319,15 +396,18 @@ BTFDebug::BTFDebug(AsmPrinter *AP) : DebugHandlerBase(AP), OS(*Asm->OutStreamer), SkipInstruction(false), - LineInfoGenerated(false), SecNameOff(0), ArrayIndexTypeId(0) { + LineInfoGenerated(false), SecNameOff(0), ArrayIndexTypeId(0), + MapDefNotCollected(true) { addString("\0"); } -void BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry, - const DIType *Ty) { +uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry, + const DIType *Ty) { TypeEntry->setId(TypeEntries.size() + 1); - DIToIdMap[Ty] = TypeEntry->getId(); + uint32_t Id = TypeEntry->getId(); + DIToIdMap[Ty] = Id; TypeEntries.push_back(std::move(TypeEntry)); + return Id; } uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry) { @@ -337,7 +417,7 @@ return Id; } -void BTFDebug::visitBasicType(const DIBasicType *BTy) { +void BTFDebug::visitBasicType(const DIBasicType *BTy, uint32_t &TypeId) { // Only int types are supported in BTF. uint32_t Encoding = BTy->getEncoding(); if (Encoding != dwarf::DW_ATE_boolean && Encoding != dwarf::DW_ATE_signed && @@ -350,7 +430,7 @@ // DIToIdMap for cross-type reference check. auto TypeEntry = llvm::make_unique<BTFTypeInt>( Encoding, BTy->getSizeInBits(), BTy->getOffsetInBits(), BTy->getName()); - addType(std::move(TypeEntry), BTy); + TypeId = addType(std::move(TypeEntry), BTy); } /// Handle subprogram or subroutine types. @@ -371,16 +451,17 @@ void BTFDebug::visitSubroutineType( if (ForSubprog) TypeId = addType(std::move(TypeEntry)); // For subprogram else - addType(std::move(TypeEntry), STy); // For func ptr + TypeId = addType(std::move(TypeEntry), STy); // For func ptr // Visit return type and func arg types.
for (const auto Element : Elements) { - visitTypeEntry(Element.resolve()); + visitTypeEntry(Element); } } /// Handle structure/union types. -void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct) { +void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct, + uint32_t &TypeId) { const DINodeArray Elements = CTy->getElements(); uint32_t VLen = Elements.size(); if (VLen > BTF::MAX_VLEN) @@ -398,16 +479,49 @@ auto TypeEntry = llvm::make_unique<BTFTypeStruct>(CTy, IsStruct, HasBitField, VLen); - addType(std::move(TypeEntry), CTy); + StructTypes.push_back(TypeEntry.get()); + TypeId = addType(std::move(TypeEntry), CTy); // Visit all struct members. for (const auto *Element : Elements) visitTypeEntry(cast<DIDerivedType>(Element)); } -void BTFDebug::visitArrayType(const DICompositeType *CTy) { - auto TypeEntry = llvm::make_unique<BTFTypeArray>(CTy); - addType(std::move(TypeEntry), CTy); +void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) { + // Visit array element type. + uint32_t ElemTypeId, ElemSize; + const DIType *ElemType = CTy->getBaseType(); + visitTypeEntry(ElemType, ElemTypeId, false, false); + ElemSize = ElemType->getSizeInBits() >> 3; + + if (!CTy->getSizeInBits()) { + auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemTypeId, 0, 0); + ArrayTypes.push_back(TypeEntry.get()); + ElemTypeId = addType(std::move(TypeEntry), CTy); + } else { + // Visit array dimensions. + DINodeArray Elements = CTy->getElements(); + for (int I = Elements.size() - 1; I >= 0; --I) { + if (auto *Element = dyn_cast_or_null<DINode>(Elements[I])) + if (Element->getTag() == dwarf::DW_TAG_subrange_type) { + const DISubrange *SR = cast<DISubrange>(Element); + auto *CI = SR->getCount().dyn_cast<ConstantInt *>(); + int64_t Count = CI->getSExtValue(); + + auto TypeEntry = + llvm::make_unique<BTFTypeArray>(ElemTypeId, ElemSize, Count); + ArrayTypes.push_back(TypeEntry.get()); + if (I == 0) + ElemTypeId = addType(std::move(TypeEntry), CTy); + else + ElemTypeId = addType(std::move(TypeEntry)); + ElemSize = ElemSize * Count; + } + } + } + + // The array TypeId is the type id of the outermost dimension. + TypeId = ElemTypeId; // The IR does not have a type for array index while BTF wants one. // So create an array index type if there is none. @@ -416,85 +530,162 @@ 0, "__ARRAY_SIZE_TYPE__"); ArrayIndexTypeId = addType(std::move(TypeEntry)); } - - // Visit array element type. - visitTypeEntry(CTy->getBaseType().resolve()); } -void BTFDebug::visitEnumType(const DICompositeType *CTy) { +void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) { DINodeArray Elements = CTy->getElements(); uint32_t VLen = Elements.size(); if (VLen > BTF::MAX_VLEN) return; auto TypeEntry = llvm::make_unique<BTFTypeEnum>(CTy, VLen); - addType(std::move(TypeEntry), CTy); + TypeId = addType(std::move(TypeEntry), CTy); // No need to visit base type as BTF does not encode it. } /// Handle structure/union forward declarations. -void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion) { +void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion, + uint32_t &TypeId) { auto TypeEntry = llvm::make_unique<BTFTypeFwd>(CTy->getName(), IsUnion); - addType(std::move(TypeEntry), CTy); + TypeId = addType(std::move(TypeEntry), CTy); } /// Handle structure, union, array and enumeration types.
-void BTFDebug::visitCompositeType(const DICompositeType *CTy) { +void BTFDebug::visitCompositeType(const DICompositeType *CTy, + uint32_t &TypeId) { auto Tag = CTy->getTag(); if (Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) { // Handle forward declaration differently as it does not have members. if (CTy->isForwardDecl()) - visitFwdDeclType(CTy, Tag == dwarf::DW_TAG_union_type); + visitFwdDeclType(CTy, Tag == dwarf::DW_TAG_union_type, TypeId); else - visitStructType(CTy, Tag == dwarf::DW_TAG_structure_type); + visitStructType(CTy, Tag == dwarf::DW_TAG_structure_type, TypeId); } else if (Tag == dwarf::DW_TAG_array_type) - visitArrayType(CTy); + visitArrayType(CTy, TypeId); else if (Tag == dwarf::DW_TAG_enumeration_type) - visitEnumType(CTy); + visitEnumType(CTy, TypeId); } /// Handle pointer, typedef, const, volatile, restrict and member types. -void BTFDebug::visitDerivedType(const DIDerivedType *DTy) { +void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, + bool CheckPointer, bool SeenPointer) { unsigned Tag = DTy->getTag(); + /// Try to avoid chasing pointees, esp. structure pointees which may + /// unnecessarily bring in a lot of types. + if (CheckPointer && !SeenPointer) { + SeenPointer = Tag == dwarf::DW_TAG_pointer_type; + } + + if (CheckPointer && SeenPointer) { + const DIType *Base = DTy->getBaseType(); + if (Base) { + if (const auto *CTy = dyn_cast<DICompositeType>(Base)) { + auto CTag = CTy->getTag(); + if ((CTag == dwarf::DW_TAG_structure_type || + CTag == dwarf::DW_TAG_union_type) && + !CTy->isForwardDecl()) { + /// Find a candidate, generate a fixup. Later on the struct/union + /// pointee type will be replaced with either a real type or + /// a forward declaration. + auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, true); + auto &Fixup = FixupDerivedTypes[CTy->getName()]; + Fixup.first = CTag == dwarf::DW_TAG_union_type; + Fixup.second.push_back(TypeEntry.get()); + TypeId = addType(std::move(TypeEntry), DTy); + return; + } + } + } + } + if (Tag == dwarf::DW_TAG_pointer_type || Tag == dwarf::DW_TAG_typedef || Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type || Tag == dwarf::DW_TAG_restrict_type) { - auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag); - addType(std::move(TypeEntry), DTy); + auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, false); + TypeId = addType(std::move(TypeEntry), DTy); } else if (Tag != dwarf::DW_TAG_member) { return; }
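// The NeedsFixup path above intentionally leaves the pointee type id unset:
// the BTFTypeDerived entry is parked in FixupDerivedTypes, keyed by the
// struct/union name, and endModule() later patches it roughly as follows
// (simplified from the code further below):
//
//   // StructTypeId: id of a BTFTypeStruct with a matching name if one was
//   // generated, otherwise a freshly added BTF_KIND_FWD entry.
//   for (BTFTypeDerived *D : Fixup.second.second)
//     D->setPointeeType(StructTypeId);
//
// This keeps a pointer to an otherwise-unused struct from pulling the full
// struct definition (and everything it references) into .BTF.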
// Visit base type of pointer, typedef, const, volatile, restrict or // struct/union member. - visitTypeEntry(DTy->getBaseType().resolve()); + uint32_t TempTypeId = 0; + if (Tag == dwarf::DW_TAG_member) + visitTypeEntry(DTy->getBaseType(), TempTypeId, true, false); + else + visitTypeEntry(DTy->getBaseType(), TempTypeId, CheckPointer, SeenPointer); } -void BTFDebug::visitTypeEntry(const DIType *Ty) { - if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) +void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId, + bool CheckPointer, bool SeenPointer) { + if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) { + TypeId = DIToIdMap[Ty]; return; + } - uint32_t TypeId; if (const auto *BTy = dyn_cast<DIBasicType>(Ty)) - visitBasicType(BTy); + visitBasicType(BTy, TypeId); else if (const auto *STy = dyn_cast<DISubroutineType>(Ty)) visitSubroutineType(STy, false, std::unordered_map<uint32_t, StringRef>(), TypeId); else if (const auto *CTy = dyn_cast<DICompositeType>(Ty)) - visitCompositeType(CTy); + visitCompositeType(CTy, TypeId); else if (const auto *DTy = dyn_cast<DIDerivedType>(Ty)) - visitDerivedType(DTy); + visitDerivedType(DTy, TypeId, CheckPointer, SeenPointer); else llvm_unreachable("Unknown DIType"); } +void BTFDebug::visitTypeEntry(const DIType *Ty) { + uint32_t TypeId; + visitTypeEntry(Ty, TypeId, false, false); +} + +void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) { + if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) { + TypeId = DIToIdMap[Ty]; + return; + } + + // A MapDef type must be a struct type. + const auto *CTy = dyn_cast<DICompositeType>(Ty); + if (!CTy) + return; + + auto Tag = CTy->getTag(); + if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl()) + return; + + // Record this type + const DINodeArray Elements = CTy->getElements(); + bool HasBitField = false; + for (const auto *Element : Elements) { + auto E = cast<DIDerivedType>(Element); + if (E->isBitField()) { + HasBitField = true; + break; + } + } + + auto TypeEntry = + llvm::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size()); + StructTypes.push_back(TypeEntry.get()); + TypeId = addType(std::move(TypeEntry), CTy); + + // Visit all struct members + for (const auto *Element : Elements) { + const auto *MemberType = cast<DIDerivedType>(Element); + visitTypeEntry(MemberType->getBaseType()); + } +} + /// Read file contents from the actual file or from the source std::string BTFDebug::populateFileContent(const DISubprogram *SP) { auto File = SP->getFile(); std::string FileName; - if (File->getDirectory().size()) + if (!File->getFilename().startswith("/") && File->getDirectory().size()) FileName = File->getDirectory().str() + "/" + File->getFilename().str(); else FileName = File->getFilename(); @@ -507,16 +698,16 @@ std::string BTFDebug::populateFileContent(const DISubprogram *SP) { std::string Line; Content.push_back(Line); // Line 0 for empty string + std::unique_ptr<MemoryBuffer> Buf; auto Source = File->getSource(); - if (Source) { - std::istringstream InputString(Source.getValue()); - while (std::getline(InputString, Line)) - Content.push_back(Line); - } else { - std::ifstream InputFile(FileName); - while (std::getline(InputFile, Line)) - Content.push_back(Line); - } + if (Source) + Buf = MemoryBuffer::getMemBufferCopy(*Source); + else if (ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr = + MemoryBuffer::getFile(FileName)) + Buf = std::move(*BufOrErr); + if (Buf) + for (line_iterator I(*Buf, false), E; I != E; ++I) + Content.push_back(*I); FileContent[FileName] = Content; return FileName; } @@ -547,6 +738,10 @@ } void BTFDebug::emitCommonHeader() { } void BTFDebug::emitBTFSection() {
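// An aside on populateFileContent() above: it now reads sources through
// MemoryBuffer and line_iterator instead of <fstream>/<sstream>. The same
// pattern in isolation (file name illustrative, not from this patch):
//
//   if (ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
//           MemoryBuffer::getFile("prog.c"))
//     for (line_iterator I(**BufOrErr, /*SkipBlanks=*/false), E; I != E; ++I)
//       Content.push_back(*I);
//
// line_iterator hands out StringRefs into the buffer, so a line is only
// copied when it is pushed into Content.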
+ // Do not emit the section if there are no types and the string table holds only the empty string. + if (!TypeEntries.size() && StringTable.getSize() == 1) + return; + MCContext &Ctx = OS.getContext(); OS.SwitchSection(Ctx.getELFSection(".BTF", ELF::SHT_PROGBITS, 0)); @@ -579,6 +774,11 @@ } void BTFDebug::emitBTFExtSection() { + // Do not emit the section if FuncInfoTable, LineInfoTable, OffsetRelocTable and ExternRelocTable are all empty. + if (!FuncInfoTable.size() && !LineInfoTable.size() && + !OffsetRelocTable.size() && !ExternRelocTable.size()) + return; + MCContext &Ctx = OS.getContext(); OS.SwitchSection(Ctx.getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0)); @@ -588,6 +788,8 @@ // Account for FuncInfo/LineInfo record size as well. uint32_t FuncLen = 4, LineLen = 4; + // The record size for the optional OffsetReloc/ExternReloc subsections is only added below when they are non-empty. + uint32_t OffsetRelocLen = 0, ExternRelocLen = 0; for (const auto &FuncSec : FuncInfoTable) { FuncLen += BTF::SecFuncInfoSize; FuncLen += FuncSec.second.size() * BTF::BPFFuncInfoSize; } for (const auto &LineSec : LineInfoTable) { LineLen += BTF::SecLineInfoSize; LineLen += LineSec.second.size() * BTF::BPFLineInfoSize; } + for (const auto &OffsetRelocSec : OffsetRelocTable) { + OffsetRelocLen += BTF::SecOffsetRelocSize; + OffsetRelocLen += OffsetRelocSec.second.size() * BTF::BPFOffsetRelocSize; + } + for (const auto &ExternRelocSec : ExternRelocTable) { + ExternRelocLen += BTF::SecExternRelocSize; + ExternRelocLen += ExternRelocSec.second.size() * BTF::BPFExternRelocSize; + } + + if (OffsetRelocLen) + OffsetRelocLen += 4; + if (ExternRelocLen) + ExternRelocLen += 4; OS.EmitIntValue(0, 4); OS.EmitIntValue(FuncLen, 4); OS.EmitIntValue(FuncLen, 4); OS.EmitIntValue(LineLen, 4); + OS.EmitIntValue(FuncLen + LineLen, 4); + OS.EmitIntValue(OffsetRelocLen, 4); + OS.EmitIntValue(FuncLen + LineLen + OffsetRelocLen, 4); + OS.EmitIntValue(ExternRelocLen, 4); // Emit func_info table. OS.AddComment("FuncInfo"); @@ -633,6 +852,39 @@ OS.EmitIntValue(LineInfo.LineNum << 10 | LineInfo.ColumnNum, 4); } } + + // Emit offset reloc table. + if (OffsetRelocLen) { + OS.AddComment("OffsetReloc"); + OS.EmitIntValue(BTF::BPFOffsetRelocSize, 4); + for (const auto &OffsetRelocSec : OffsetRelocTable) { + OS.AddComment("Offset reloc section string offset=" + + std::to_string(OffsetRelocSec.first)); + OS.EmitIntValue(OffsetRelocSec.first, 4); + OS.EmitIntValue(OffsetRelocSec.second.size(), 4); + for (const auto &OffsetRelocInfo : OffsetRelocSec.second) { + Asm->EmitLabelReference(OffsetRelocInfo.Label, 4); + OS.EmitIntValue(OffsetRelocInfo.TypeID, 4); + OS.EmitIntValue(OffsetRelocInfo.OffsetNameOff, 4); + } + } + } + + // Emit extern reloc table. + if (ExternRelocLen) { + OS.AddComment("ExternReloc"); + OS.EmitIntValue(BTF::BPFExternRelocSize, 4); + for (const auto &ExternRelocSec : ExternRelocTable) { + OS.AddComment("Extern reloc section string offset=" + + std::to_string(ExternRelocSec.first)); + OS.EmitIntValue(ExternRelocSec.first, 4); + OS.EmitIntValue(ExternRelocSec.second.size(), 4); + for (const auto &ExternRelocInfo : ExternRelocSec.second) { + Asm->EmitLabelReference(ExternRelocInfo.Label, 4); + OS.EmitIntValue(ExternRelocInfo.ExternNameOff, 4); + } + } + } } void BTFDebug::beginFunctionImpl(const MachineFunction *MF) { @@ -645,18 +897,42 @@ } SkipInstruction = false;
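// Worked example for the .BTF.ext header emitted in emitBTFExtSection()
// above (lengths are illustrative): with FuncLen = 20, LineLen = 36,
// OffsetRelocLen = 16 and ExternRelocLen = 0, the eight fields become
//
//   FuncInfoOff    = 0    FuncInfoLen    = 20
//   LineInfoOff    = 20   LineInfoLen    = 36
//   OffsetRelocOff = 56   OffsetRelocLen = 16
//   ExternRelocOff = 72   ExternRelocLen = 0
//
// Each offset is relative to the end of ExtHeader, and an absent optional
// subsection simply keeps its length at 0.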
+ // Collect MapDef types. Map definition needs to collect + // pointee types. Do it first. Otherwise, for the following + // case: + // struct m { ...}; + // struct t { + // struct m *key; + // }; + // foo(struct t *arg); + // + // struct mapdef { + // ... + // struct m *key; + // ... + // } __attribute__((section(".maps"))) hash_map; + // + // If subroutine foo is traversed first, a type chain + // "ptr->struct m(fwd)" will be created and later on + // when traversing mapdef, since "ptr->struct m" exists, + // the traversal of "struct m" will be omitted. + if (MapDefNotCollected) { + processGlobals(true); + MapDefNotCollected = false; + } + // Collect all types locally referenced in this function. // Use RetainedNodes so we can collect all argument names // even if the argument is not used. std::unordered_map<uint32_t, StringRef> FuncArgNames; for (const DINode *DN : SP->getRetainedNodes()) { if (const auto *DV = dyn_cast<DILocalVariable>(DN)) { - visitTypeEntry(DV->getType().resolve()); - // Collect function arguments for subprogram func type. uint32_t Arg = DV->getArg(); - if (Arg) + if (Arg) { + visitTypeEntry(DV->getType()); FuncArgNames[Arg] = DV->getName(); + } } } @@ -669,6 +945,9 @@ llvm::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId); uint32_t FuncTypeId = addType(std::move(FuncTypeEntry)); + for (const auto &TypeEntry : TypeEntries) + TypeEntry->completeType(*this); + // Construct funcinfo and the first lineinfo for the function. MCSymbol *FuncLabel = Asm->getFunctionBegin(); BTFFuncInfo FuncInfo; @@ -691,6 +970,133 @@ void BTFDebug::endFunctionImpl(const MachineFunction *MF) { SecNameOff = 0; } +/// Populate struct types on demand, as requested by abstract member +/// accesses. +unsigned BTFDebug::populateStructType(const DIType *Ty) { + unsigned Id; + visitTypeEntry(Ty, Id, false, false); + for (const auto &TypeEntry : TypeEntries) + TypeEntry->completeType(*this); + return Id; +} + +// Find struct/array debuginfo types given a type id. +void BTFDebug::setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType, + BTFTypeArray **PrevArrayType) { + for (const auto &StructType : StructTypes) { + if (StructType->getId() == TypeId) { + *PrevStructType = StructType; + return; + } + } + for (const auto &ArrayType : ArrayTypes) { + if (ArrayType->getId() == TypeId) { + *PrevArrayType = ArrayType; + return; + } + } +} +
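// A hypothetical walk-through of the pattern decoding in
// generateOffsetReloc() below: for
//
//   struct s { int a; int b[4]; };   // RootTySize = 20 bytes
//
// an access such as s->b[2] arrives as the string "0:1:2:" (note the
// trailing ':'). The loop then accumulates:
//   "0" -> first access, Offset = 0 * RootTySize            = 0
//   "1" -> struct member 1 (b), getMemberInfo: 32 bits >> 3 = 4
//   "2" -> array element 2, getLocInfo: 2 * ElemSize(4)     = 8
// for a final offset of 12 bytes, which is what the LD_imm64 carrying this
// relocation is eventually patched with.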
+/// Generate a struct member offset relocation. +void BTFDebug::generateOffsetReloc(const MachineInstr *MI, + const MCSymbol *ORSym, DIType *RootTy, + StringRef AccessPattern) { + BTFTypeStruct *PrevStructType = nullptr; + BTFTypeArray *PrevArrayType = nullptr; + unsigned RootId = populateStructType(RootTy); + setTypeFromId(RootId, &PrevStructType, &PrevArrayType); + unsigned RootTySize = PrevStructType->getStructSize(); + + BTFOffsetReloc OffsetReloc; + OffsetReloc.Label = ORSym; + OffsetReloc.OffsetNameOff = addString(AccessPattern.drop_back()); + OffsetReloc.TypeID = RootId; + + uint32_t Start = 0, End = 0, Offset = 0; + bool FirstAccess = true; + for (auto C : AccessPattern) { + if (C != ':') { + End++; + } else { + std::string SubStr = AccessPattern.substr(Start, End - Start); + int Loc = std::stoi(SubStr); + + if (FirstAccess) { + Offset = Loc * RootTySize; + FirstAccess = false; + } else if (PrevStructType) { + uint32_t MemberOffset, MemberTypeId; + PrevStructType->getMemberInfo(Loc, MemberOffset, MemberTypeId); + + Offset += MemberOffset >> 3; + PrevStructType = nullptr; + setTypeFromId(MemberTypeId, &PrevStructType, &PrevArrayType); + } else if (PrevArrayType) { + uint32_t LocOffset, ElementTypeId; + PrevArrayType->getLocInfo(Loc, LocOffset, ElementTypeId); + + Offset += LocOffset; + PrevArrayType = nullptr; + setTypeFromId(ElementTypeId, &PrevStructType, &PrevArrayType); + } + Start = End + 1; + End = Start; + } + } + AccessOffsets[RootTy->getName().str() + ":" + AccessPattern.str()] = Offset; + OffsetRelocTable[SecNameOff].push_back(OffsetReloc); +} + +void BTFDebug::processLDimm64(const MachineInstr *MI) { + // If the insn is an LD_imm64, the following two cases + // will generate a .BTF.ext record. + // + // If the insn is "r2 = LD_imm64 @__BTF_...", + // add this insn into the .BTF.ext OffsetReloc subsection. + // Relocation looks like: + // . SecName: + // . InstOffset + // . TypeID + // . OffsetNameOff + // Later, the insn is replaced with "r2 = <offset>" + // where "<offset>" equals the offset based on current + // type definitions. + // + // If the insn is "r2 = LD_imm64 @VAR" and VAR is + // a patchable external global, add this insn into the .BTF.ext + // ExternReloc subsection. + // Relocation looks like: + // . SecName: + // . InstOffset + // . ExternNameOff + // Later, the insn is replaced with "r2 = <value>" or + // "LD_imm64 r2, <value>" where "<value>" = 0.
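// Tying the two halves together with a hypothetical example: for struct s
// from the sketch above, BPFAbstractMemberAccess leaves behind a global
// carrying the "btf_ama" attribute (BPFCoreSharedInfo::AmaAttr) whose name
// is the access string, so an instruction like
//
//   r2 = LD_imm64 @"0:1:2:"   ; placeholder global (name illustrative)
//
// is recorded here as an OffsetReloc and later rewritten by InstLower()
// (further below) into
//
//   r2 = mov 12               ; AccessOffsets["s:0:1:2:"]
//
// while a BPF loader that understands .BTF.ext can recompute the 12
// against the kernel's own type layout before loading the program.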
+ + // Check whether this is a candidate or not. + const MachineOperand &MO = MI->getOperand(1); + if (MO.isGlobal()) { + const GlobalValue *GVal = MO.getGlobal(); + auto *GVar = dyn_cast<GlobalVariable>(GVal); + if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) { + MCSymbol *ORSym = OS.getContext().createTempSymbol(); + OS.EmitLabel(ORSym); + + MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index); + DIType *Ty = dyn_cast<DIType>(MDN); + generateOffsetReloc(MI, ORSym, Ty, GVar->getName()); + } else if (GVar && !GVar->hasInitializer() && GVar->hasExternalLinkage() && + GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) { + MCSymbol *ORSym = OS.getContext().createTempSymbol(); + OS.EmitLabel(ORSym); + + BTFExternReloc ExternReloc; + ExternReloc.Label = ORSym; + ExternReloc.ExternNameOff = addString(GVar->getName()); + ExternRelocTable[SecNameOff].push_back(ExternReloc); + } + } +} + void BTFDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); @@ -711,6 +1117,9 @@ return; } + if (MI->getOpcode() == BPF::LD_imm64) + processLDimm64(MI); + // Skip this instruction if no DebugLoc or the DebugLoc // is the same as the previous instruction. const DebugLoc &DL = MI->getDebugLoc(); @@ -739,13 +1148,145 @@ PrevInstLoc = DL; } -void BTFDebug::endModule() { +void BTFDebug::processGlobals(bool ProcessingMapDef) { // Collect all types referenced by globals. const Module *M = MMI->getModule(); - for (const DICompileUnit *CUNode : M->debug_compile_units()) { - for (const auto *GVE : CUNode->getGlobalVariables()) { - DIGlobalVariable *GV = GVE->getVariable(); - visitTypeEntry(GV->getType().resolve()); + for (const GlobalVariable &Global : M->globals()) { + // Ignore external globals for now. + if (!Global.hasInitializer() && Global.hasExternalLinkage()) + continue; + + // Decide the section name. + StringRef SecName; + if (Global.hasSection()) { + SecName = Global.getSection(); + } else { + // data, bss, or readonly sections + if (Global.isConstant()) + SecName = ".rodata"; + else + SecName = Global.getInitializer()->isZeroValue() ? ".bss" : ".data"; + } + + if (ProcessingMapDef != SecName.startswith(".maps")) + continue; + + SmallVector<DIGlobalVariableExpression *, 1> GVs; + Global.getDebugInfo(GVs); + uint32_t GVTypeId = 0; + for (auto *GVE : GVs) { + if (SecName.startswith(".maps")) + visitMapDefType(GVE->getVariable()->getType(), GVTypeId); + else + visitTypeEntry(GVE->getVariable()->getType(), GVTypeId, false, false); + break; + } + + // Only support the following globals: + // . static variables + // . non-static global variables with section attributes + // Essentially means: + // . .bss/.data/.rodata DataSec entities only contain static data + // . Other DataSec entities contain static or initialized global data. + // Initialized global data are mostly used for finding map key/value type + // id's. Whether DataSec is readonly or not can be found from + // corresponding ELF section flags. + auto Linkage = Global.getLinkage(); + if (Linkage != GlobalValue::InternalLinkage && + (Linkage != GlobalValue::ExternalLinkage || !Global.hasSection())) + continue; + + uint32_t GVarInfo = Linkage == GlobalValue::ExternalLinkage + ?
BTF::VAR_GLOBAL_ALLOCATED + : BTF::VAR_STATIC; + auto VarEntry = + llvm::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo); + uint32_t VarId = addType(std::move(VarEntry)); + + // Find or create a DataSec + if (DataSecEntries.find(SecName) == DataSecEntries.end()) { + DataSecEntries[SecName] = llvm::make_unique<BTFKindDataSec>(Asm, SecName); + } + + // Calculate symbol size + const DataLayout &DL = Global.getParent()->getDataLayout(); + uint32_t Size = DL.getTypeAllocSize(Global.getType()->getElementType()); + + DataSecEntries[SecName]->addVar(VarId, Asm->getSymbol(&Global), Size); + } +} + +/// Emit proper patchable instructions. +bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) { + if (MI->getOpcode() == BPF::LD_imm64) { + const MachineOperand &MO = MI->getOperand(1); + if (MO.isGlobal()) { + const GlobalValue *GVal = MO.getGlobal(); + auto *GVar = dyn_cast<GlobalVariable>(GVal); + if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) { + MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index); + DIType *Ty = dyn_cast<DIType>(MDN); + std::string TypeName = Ty->getName(); + int64_t Imm = AccessOffsets[TypeName + ":" + GVar->getName().str()]; + + // Emit "mov ri, <offset>" for abstract member accesses. + OutMI.setOpcode(BPF::MOV_ri); + OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); + OutMI.addOperand(MCOperand::createImm(Imm)); + return true; + } else if (GVar && !GVar->hasInitializer() && + GVar->hasExternalLinkage() && + GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) { + const IntegerType *IntTy = dyn_cast<IntegerType>(GVar->getValueType()); + assert(IntTy); + // For patchable externals, emit "LD_imm64, ri, 0" if the external + // variable is 64-bit wide, emit "mov ri, 0" otherwise. + if (IntTy->getBitWidth() == 64) + OutMI.setOpcode(BPF::LD_imm64); + else + OutMI.setOpcode(BPF::MOV_ri); + OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); + OutMI.addOperand(MCOperand::createImm(0)); + return true; + } + } + } + return false; +} + +void BTFDebug::endModule() { + // Collect MapDef globals if not collected yet. + if (MapDefNotCollected) { + processGlobals(true); + MapDefNotCollected = false; + } + + // Collect global types/variables except MapDef globals. + processGlobals(false); + for (auto &DataSec : DataSecEntries) + addType(std::move(DataSec.second)); + + // Fixups + for (auto &Fixup : FixupDerivedTypes) { + StringRef TypeName = Fixup.first; + bool IsUnion = Fixup.second.first; + + // Search through struct types + uint32_t StructTypeId = 0; + for (const auto &StructType : StructTypes) { + if (StructType->getName() == TypeName) { + StructTypeId = StructType->getId(); + break; + } + } + + if (StructTypeId == 0) { + auto FwdTypeEntry = llvm::make_unique<BTFTypeFwd>(TypeName, IsUnion); + StructTypeId = addType(std::move(FwdTypeEntry)); + } + + for (auto &DType : Fixup.second.second) { + DType->setPointeeType(StructTypeId); } } diff --git a/lib/Target/BPF/BTFDebug.h b/lib/Target/BPF/BTFDebug.h index afd4ed87f63d..6c0cdde17d9b 100644 --- a/lib/Target/BPF/BTFDebug.h +++ b/lib/Target/BPF/BTFDebug.h @@ -1,9 +1,8 @@ //===- BTFDebug.h -----------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -33,10 +32,12 @@ class MachineFunction; class BTFTypeBase { protected: uint8_t Kind; + bool IsCompleted; uint32_t Id; struct BTF::CommonType BTFType; public: + BTFTypeBase() : IsCompleted(false) {} virtual ~BTFTypeBase() = default; void setId(uint32_t Id) { this->Id = Id; } uint32_t getId() { return Id; } @@ -55,11 +56,13 @@ public: /// volatile, typedef and restrict. class BTFTypeDerived : public BTFTypeBase { const DIDerivedType *DTy; + bool NeedsFixup; public: - BTFTypeDerived(const DIDerivedType *Ty, unsigned Tag); + BTFTypeDerived(const DIDerivedType *Ty, unsigned Tag, bool NeedsFixup); void completeType(BTFDebug &BDebug); void emitType(MCStreamer &OS); + void setPointeeType(uint32_t PointeeType); }; /// Handle struct or union forward declaration. @@ -101,14 +104,15 @@ public: /// Handle array type. class BTFTypeArray : public BTFTypeBase { - const DICompositeType *ATy; + uint32_t ElemSize; struct BTF::BTFArray ArrayInfo; public: - BTFTypeArray(const DICompositeType *ATy); + BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, uint32_t NumElems); uint32_t getSize() { return BTFTypeBase::getSize() + BTF::BTFArraySize; } void completeType(BTFDebug &BDebug); void emitType(MCStreamer &OS); + void getLocInfo(uint32_t Loc, uint32_t &LocOffset, uint32_t &ElementTypeId); }; /// Handle struct/union type. @@ -125,6 +129,9 @@ public: } void completeType(BTFDebug &BDebug); void emitType(MCStreamer &OS); + std::string getName(); + void getMemberInfo(uint32_t Loc, uint32_t &Offset, uint32_t &MemberType); + uint32_t getStructSize(); }; /// Handle function pointer. @@ -154,6 +161,37 @@ public: void emitType(MCStreamer &OS); }; +/// Handle variable instances +class BTFKindVar : public BTFTypeBase { + StringRef Name; + uint32_t Info; + +public: + BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo); + uint32_t getSize() { return BTFTypeBase::getSize() + 4; } + void completeType(BTFDebug &BDebug); + void emitType(MCStreamer &OS); +}; + +/// Handle data sections +class BTFKindDataSec : public BTFTypeBase { + AsmPrinter *Asm; + std::string Name; + std::vector<std::tuple<uint32_t, const MCSymbol *, uint32_t>> Vars; + +public: + BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName); + uint32_t getSize() { + return BTFTypeBase::getSize() + BTF::BTFDataSecVarSize * Vars.size(); + } + void addVar(uint32_t Id, const MCSymbol *Sym, uint32_t Size) { + Vars.push_back(std::make_tuple(Id, Sym, Size)); + } + std::string getName() { return Name; } + void completeType(BTFDebug &BDebug); + void emitType(MCStreamer &OS); +}; + /// String table. class BTFStringTable { /// String table size in bytes. @@ -189,6 +227,19 @@ struct BTFLineInfo { uint32_t ColumnNum; ///< the column number }; +/// Represent one offset relocation. +struct BTFOffsetReloc { + const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc + uint32_t TypeID; ///< Type ID + uint32_t OffsetNameOff; ///< The string to traverse types +}; + +/// Represent one extern relocation. +struct BTFExternReloc { + const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc + uint32_t ExternNameOff; ///< The extern variable name +}; +
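// Usage sketch for BTFKindDataSec above (values illustrative): one entry is
// created per ELF data section while globals are processed, e.g.
//
//   auto Sec = llvm::make_unique<BTFKindDataSec>(Asm, std::string(".data"));
//   Sec->addVar(VarTypeId, Asm->getSymbol(&Global), /*Size=*/4);
//   addType(std::move(Sec)); // becomes one BTF_KIND_DATASEC entry
//
// emitType() then writes one (var type id, address, size) triple per
// variable, with the address emitted as a label reference whose value is
// already fixed up at assembly time (see the FK_Data_4 handling in
// BPFELFObjectWriter.cpp further below).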
/// Collect and emit BTF information. class BTFDebug : public DebugHandlerBase { MCStreamer &OS; @@ -196,17 +247,26 @@ bool LineInfoGenerated; uint32_t SecNameOff; uint32_t ArrayIndexTypeId; + bool MapDefNotCollected; BTFStringTable StringTable; std::vector<std::unique_ptr<BTFTypeBase>> TypeEntries; std::unordered_map<const DIType *, uint32_t> DIToIdMap; - std::unordered_map<uint32_t, std::vector<BTFFuncInfo>> FuncInfoTable; - std::unordered_map<uint32_t, std::vector<BTFLineInfo>> LineInfoTable; + std::map<uint32_t, std::vector<BTFFuncInfo>> FuncInfoTable; + std::map<uint32_t, std::vector<BTFLineInfo>> LineInfoTable; + std::map<uint32_t, std::vector<BTFOffsetReloc>> OffsetRelocTable; + std::map<uint32_t, std::vector<BTFExternReloc>> ExternRelocTable; StringMap<std::vector<std::string>> FileContent; + std::map<std::string, std::unique_ptr<BTFKindDataSec>> DataSecEntries; + std::vector<BTFTypeStruct *> StructTypes; + std::vector<BTFTypeArray *> ArrayTypes; + std::map<std::string, uint32_t> AccessOffsets; + std::map<StringRef, std::pair<bool, std::vector<BTFTypeDerived *>>> + FixupDerivedTypes; /// Add types to TypeEntries. /// @{ /// Add types to TypeEntries and DIToIdMap. - void addType(std::unique_ptr<BTFTypeBase> TypeEntry, const DIType *Ty); + uint32_t addType(std::unique_ptr<BTFTypeBase> TypeEntry, const DIType *Ty); /// Add types to TypeEntries only and return type id. uint32_t addType(std::unique_ptr<BTFTypeBase> TypeEntry); /// @} @@ -214,17 +274,23 @@ /// IR type visiting functions. /// @{ void visitTypeEntry(const DIType *Ty); - void visitBasicType(const DIBasicType *BTy); + void visitTypeEntry(const DIType *Ty, uint32_t &TypeId, bool CheckPointer, + bool SeenPointer); + void visitBasicType(const DIBasicType *BTy, uint32_t &TypeId); void visitSubroutineType( const DISubroutineType *STy, bool ForSubprog, const std::unordered_map<uint32_t, StringRef> &FuncArgNames, uint32_t &TypeId); - void visitFwdDeclType(const DICompositeType *CTy, bool IsUnion); - void visitCompositeType(const DICompositeType *CTy); - void visitStructType(const DICompositeType *STy, bool IsStruct); - void visitArrayType(const DICompositeType *ATy); - void visitEnumType(const DICompositeType *ETy); - void visitDerivedType(const DIDerivedType *DTy); + void visitFwdDeclType(const DICompositeType *CTy, bool IsUnion, + uint32_t &TypeId); + void visitCompositeType(const DICompositeType *CTy, uint32_t &TypeId); + void visitStructType(const DICompositeType *STy, bool IsStruct, + uint32_t &TypeId); + void visitArrayType(const DICompositeType *ATy, uint32_t &TypeId); + void visitEnumType(const DICompositeType *ETy, uint32_t &TypeId); + void visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, + bool CheckPointer, bool SeenPointer); + void visitMapDefType(const DIType *Ty, uint32_t &TypeId); /// @} /// Get the file content for the subprogram. Certain lines of the file @@ -235,6 +301,23 @@ void constructLineInfo(const DISubprogram *SP, MCSymbol *Label, uint32_t Line, uint32_t Column); + /// Generate types and variables for globals. + void processGlobals(bool ProcessingMapDef); + + /// Generate one offset relocation record. + void generateOffsetReloc(const MachineInstr *MI, const MCSymbol *ORSym, + DIType *RootTy, StringRef AccessPattern); + + /// Set the to-be-traversed Struct/Array Type based on TypeId. + void setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType, + BTFTypeArray **PrevArrayType); + + /// Populate an unprocessed struct type. + unsigned populateStructType(const DIType *Ty); + + /// Process LD_imm64 instructions. + void processLDimm64(const MachineInstr *MI); + /// Emit common header of .BTF and .BTF.ext sections. void emitCommonHeader(); @@ -254,6 +337,9 @@ protected: public: BTFDebug(AsmPrinter *AP); + /// Emit proper patchable instructions. + bool InstLower(const MachineInstr *MI, MCInst &OutMI); + /// Get the special array index type id.
uint32_t getArrayIndexTypeId() { assert(ArrayIndexTypeId); diff --git a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp index 9f80b762fe36..c845524ad657 100644 --- a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp +++ b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp @@ -1,9 +1,8 @@ //===- BPFDisassembler.cpp - Disassembler for BPF ---------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/BPFMCTargetDesc.h" +#include "TargetInfo/BPFTargetInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -40,7 +40,7 @@ public: BPF_STX = 0x3, BPF_ALU = 0x4, BPF_JMP = 0x5, - BPF_RES = 0x6, + BPF_JMP32 = 0x6, BPF_ALU64 = 0x7 }; @@ -172,9 +172,10 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, if (Result == MCDisassembler::Fail) return MCDisassembler::Fail; uint8_t InstClass = getInstClass(Insn); + uint8_t InstMode = getInstMode(Insn); if ((InstClass == BPF_LDX || InstClass == BPF_STX) && getInstSize(Insn) != BPF_DW && - getInstMode(Insn) == BPF_MEM && + (InstMode == BPF_MEM || InstMode == BPF_XADD) && STI.getFeatureBits()[BPF::ALU32]) Result = decodeInstruction(DecoderTableBPFALU3264, Instr, Insn, Address, this, STI); diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp deleted file mode 100644 index 20627da38817..000000000000 --- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp +++ /dev/null @@ -1,108 +0,0 @@ -//===-- BPFInstPrinter.cpp - Convert BPF MCInst to asm syntax -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an BPF MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#include "BPFInstPrinter.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -// Include the auto-generated portion of the assembly writer. 
-#include "BPFGenAsmWriter.inc" - -void BPFInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { - printInstruction(MI, O); - printAnnotation(O, Annot); -} - -static void printExpr(const MCExpr *Expr, raw_ostream &O) { -#ifndef NDEBUG - const MCSymbolRefExpr *SRE; - - if (const MCBinaryExpr *BE = dyn_cast(Expr)) - SRE = dyn_cast(BE->getLHS()); - else - SRE = dyn_cast(Expr); - assert(SRE && "Unexpected MCExpr type."); - - MCSymbolRefExpr::VariantKind Kind = SRE->getKind(); - - assert(Kind == MCSymbolRefExpr::VK_None); -#endif - O << *Expr; -} - -void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier) { - assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - O << getRegisterName(Op.getReg()); - } else if (Op.isImm()) { - O << formatImm((int32_t)Op.getImm()); - } else { - assert(Op.isExpr() && "Expected an expression"); - printExpr(Op.getExpr(), O); - } -} - -void BPFInstPrinter::printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier) { - const MCOperand &RegOp = MI->getOperand(OpNo); - const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); - - // register - assert(RegOp.isReg() && "Register operand not a register"); - O << getRegisterName(RegOp.getReg()); - - // offset - if (OffsetOp.isImm()) { - auto Imm = OffsetOp.getImm(); - if (Imm >= 0) - O << " + " << formatImm(Imm); - else - O << " - " << formatImm(-Imm); - } else { - assert(0 && "Expected an immediate"); - } -} - -void BPFInstPrinter::printImm64Operand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) - O << formatImm(Op.getImm()); - else if (Op.isExpr()) - printExpr(Op.getExpr(), O); - else - O << Op; -} - -void BPFInstPrinter::printBrTargetOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - int16_t Imm = Op.getImm(); - O << ((Imm >= 0) ? "+" : "") << formatImm(Imm); - } else if (Op.isExpr()) { - printExpr(Op.getExpr(), O); - } else { - O << Op; - } -} diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/lib/Target/BPF/InstPrinter/BPFInstPrinter.h deleted file mode 100644 index bb0b0d71da53..000000000000 --- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.h +++ /dev/null @@ -1,41 +0,0 @@ -//===-- BPFInstPrinter.h - Convert BPF MCInst to asm syntax -------*- C++ -*--// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints a BPF MCInst to a .s file. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H -#define LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H - -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { -class BPFInstPrinter : public MCInstPrinter { -public: - BPFInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = nullptr); - void printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = nullptr); - void printImm64Operand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printBrTargetOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - - // Autogenerated by tblgen. - void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); -}; -} - -#endif diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 1822d8688fa2..ba35a175b9a7 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -1,9 +1,8 @@ //===-- BPFAsmBackend.cpp - BPF Assembler Backend -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -73,12 +72,12 @@ void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, bool IsResolved, const MCSubtargetInfo *STI) const { if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) { - if (Value) { - MCContext &Ctx = Asm.getContext(); - Ctx.reportError(Fixup.getLoc(), - "Unsupported relocation: try to compile with -O2 or above, " - "or check your static variable usage"); - } + // The Value is 0 for global variables, and the in-section offset + // for static variables. Write to the immediate field of the inst. + assert(Value <= UINT32_MAX); + support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], + static_cast<uint32_t>(Value), + Endian); } else if (Fixup.getKind() == FK_Data_4) { support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian); } else if (Fixup.getKind() == FK_Data_8) { diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 32e79d0f527e..057bbf5c3b06 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- BPFELFObjectWriter.cpp - BPF ELF Writer ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -51,21 +50,33 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, case FK_Data_8: return ELF::R_BPF_64_64; case FK_Data_4: - // .BTF.ext generates FK_Data_4 relocations for - // insn offset by creating temporary labels. - // The insn offset is within the code section and - // already been fulfilled by applyFixup(). No - // further relocation is needed. if (const MCSymbolRefExpr *A = Target.getSymA()) { - if (A->getSymbol().isTemporary()) { - MCSection &Section = A->getSymbol().getSection(); + const MCSymbol &Sym = A->getSymbol(); + + if (Sym.isDefined()) { + MCSection &Section = Sym.getSection(); const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section); assert(SectionELF && "Null section for reloc symbol"); - // The reloc symbol should be in text section. unsigned Flags = SectionELF->getFlags(); + + if (Sym.isTemporary()) { + // .BTF.ext generates FK_Data_4 relocations for + // insn offset by creating temporary labels. + // The insn offset is within the code section and + // already been fulfilled by applyFixup(). No + // further relocation is needed. + // The reloc symbol should be in text section. + if ((Flags & ELF::SHF_ALLOC) && (Flags & ELF::SHF_EXECINSTR)) + return ELF::R_BPF_NONE; + } else { + // .BTF generates FK_Data_4 relocations for variable + // offset in DataSec kind. Similar to the above .BTF.ext + // insn offset, no further relocation is needed. + // The reloc symbol should be in data section. + if ((Flags & ELF::SHF_ALLOC) && (Flags & ELF::SHF_WRITE)) + return ELF::R_BPF_NONE; + } } } return ELF::R_BPF_64_32; diff --git a/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp new file mode 100644 index 000000000000..079202994c8d --- /dev/null +++ b/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp @@ -0,0 +1,107 @@ +//===-- BPFInstPrinter.cpp - Convert BPF MCInst to asm syntax -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a BPF MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/BPFInstPrinter.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// Include the auto-generated portion of the assembly writer.
+#include "BPFGenAsmWriter.inc" + +void BPFInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + printInstruction(MI, O); + printAnnotation(O, Annot); +} + +static void printExpr(const MCExpr *Expr, raw_ostream &O) { +#ifndef NDEBUG + const MCSymbolRefExpr *SRE; + + if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) + SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS()); + else + SRE = dyn_cast<MCSymbolRefExpr>(Expr); + assert(SRE && "Unexpected MCExpr type."); + + MCSymbolRefExpr::VariantKind Kind = SRE->getKind(); + + assert(Kind == MCSymbolRefExpr::VK_None); +#endif + O << *Expr; +} + +void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { + assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + O << getRegisterName(Op.getReg()); + } else if (Op.isImm()) { + O << formatImm((int32_t)Op.getImm()); + } else { + assert(Op.isExpr() && "Expected an expression"); + printExpr(Op.getExpr(), O); + } +} + +void BPFInstPrinter::printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O, + const char *Modifier) { + const MCOperand &RegOp = MI->getOperand(OpNo); + const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); + + // register + assert(RegOp.isReg() && "Register operand not a register"); + O << getRegisterName(RegOp.getReg()); + + // offset + if (OffsetOp.isImm()) { + auto Imm = OffsetOp.getImm(); + if (Imm >= 0) + O << " + " << formatImm(Imm); + else + O << " - " << formatImm(-Imm); + } else { + assert(0 && "Expected an immediate"); + } +} + +void BPFInstPrinter::printImm64Operand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + O << formatImm(Op.getImm()); + else if (Op.isExpr()) + printExpr(Op.getExpr(), O); + else + O << Op; +} + +void BPFInstPrinter::printBrTargetOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) { + int16_t Imm = Op.getImm(); + O << ((Imm >= 0) ? "+" : "") << formatImm(Imm); + } else if (Op.isExpr()) { + printExpr(Op.getExpr(), O); + } else { + O << Op; + } +} diff --git a/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h b/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h new file mode 100644 index 000000000000..8c9a0bc94cff --- /dev/null +++ b/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h @@ -0,0 +1,40 @@ +//===-- BPFInstPrinter.h - Convert BPF MCInst to asm syntax -------*- C++ -*--// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a BPF MCInst to a .s file.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFINSTPRINTER_H +#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { +class BPFInstPrinter : public MCInstPrinter { +public: + BPFInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, + const char *Modifier = nullptr); + void printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O, + const char *Modifier = nullptr); + void printImm64Operand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBrTargetOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); +}; +} + +#endif diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index af3ad5315253..04a6a87cebc9 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -1,9 +1,8 @@ //===-- BPFMCAsmInfo.h - BPF asm properties -------------------*- C++ -*--====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp index 437f658caf6e..f9abe76c976b 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- BPFMCCodeEmitter.cpp - Convert BPF code to machine code -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -64,9 +63,10 @@ public: const MCSubtargetInfo &STI) const override; private: - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index 834b57527882..fa27b335f3a1 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- BPFMCTargetDesc.cpp - BPF Target Descriptions ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,9 +11,9 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/BPFMCTargetDesc.h" -#include "BPF.h" -#include "InstPrinter/BPFInstPrinter.h" +#include "MCTargetDesc/BPFInstPrinter.h" #include "MCTargetDesc/BPFMCAsmInfo.h" +#include "TargetInfo/BPFTargetInfo.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h index 6d2f0a1601e6..1a391321f60d 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h +++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- BPFMCTargetDesc.h - BPF Target Descriptions -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -34,10 +33,6 @@ class Triple; class raw_ostream; class raw_pwrite_stream; -Target &getTheBPFleTarget(); -Target &getTheBPFbeTarget(); -Target &getTheBPFTarget(); - MCCodeEmitter *createBPFMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); diff --git a/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp index 1f7b8a04d589..5dfa915034ba 100644 --- a/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp +++ b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp @@ -1,30 +1,28 @@ //===-- BPFTargetInfo.cpp - BPF Target Implementation ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "BPF.h" +#include "TargetInfo/BPFTargetInfo.h" #include "llvm/Support/TargetRegistry.h" + using namespace llvm; -namespace llvm { -Target &getTheBPFleTarget() { +Target &llvm::getTheBPFleTarget() { static Target TheBPFleTarget; return TheBPFleTarget; } -Target &getTheBPFbeTarget() { +Target &llvm::getTheBPFbeTarget() { static Target TheBPFbeTarget; return TheBPFbeTarget; } -Target &getTheBPFTarget() { +Target &llvm::getTheBPFTarget() { static Target TheBPFTarget; return TheBPFTarget; } -} // namespace llvm extern "C" void LLVMInitializeBPFTargetInfo() { TargetRegistry::RegisterTarget(getTheBPFTarget(), "bpf", "BPF (host endian)", diff --git a/lib/Target/BPF/TargetInfo/BPFTargetInfo.h b/lib/Target/BPF/TargetInfo/BPFTargetInfo.h new file mode 100644 index 000000000000..150526c1a9db --- /dev/null +++ b/lib/Target/BPF/TargetInfo/BPFTargetInfo.h @@ -0,0 +1,22 @@ +//===-- BPFTargetInfo.h - BPF Target Implementation -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_BPF_TARGETINFO_BPFTARGETINFO_H +#define LLVM_LIB_TARGET_BPF_TARGETINFO_BPFTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheBPFleTarget(); +Target &getTheBPFbeTarget(); +Target &getTheBPFTarget(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_BPF_TARGETINFO_BPFTARGETINFO_H diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 2eb1f0fc8bd9..0881bf841f90 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -1,15 +1,13 @@ //===-- HexagonAsmParser.cpp - Parse Hexagon asm to MCInst instructions----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #define DEBUG_TYPE "mcasmparser" -#include "Hexagon.h" #include "HexagonTargetStreamer.h" #include "MCTargetDesc/HexagonMCChecker.h" #include "MCTargetDesc/HexagonMCELFStreamer.h" @@ -17,6 +15,7 @@ #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" #include "MCTargetDesc/HexagonShuffler.h" +#include "TargetInfo/HexagonTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -1684,8 +1683,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, int64_t Value; MCExpr const &Expr = *Imm.getExpr(); bool Absolute = Expr.evaluateAsAbsolute(Value); - assert(Absolute); - (void)Absolute; + if (!Absolute) + return Match_InvalidOperand; if (!HexagonMCInstrInfo::mustExtend(Expr) && ((Value <= -256) || Value >= 256)) return Match_InvalidOperand; @@ -1707,8 +1706,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, MCInst TmpInst; int64_t Value; bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); - assert(Absolute); - (void)Absolute; + if (!Absolute) + return Match_InvalidOperand; if (Value == 0) { // convert to $Rd = $Rs TmpInst.setOpcode(Hexagon::A2_tfr); MCOperand &Rd = Inst.getOperand(0); @@ -1737,8 +1736,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, MCOperand &Imm = Inst.getOperand(2); int64_t Value; bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); - assert(Absolute); - (void)Absolute; + if (!Absolute) + return Match_InvalidOperand; if (Value == 0) { // convert to $Rdd = combine ($Rs[0], $Rs[1]) MCInst TmpInst; unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg()); @@ -1861,8 +1860,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, MCOperand &Imm = Inst.getOperand(2); int64_t Value; bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); - assert(Absolute); - (void)Absolute; + if (!Absolute) + return Match_InvalidOperand; if (Value == 0) Inst.setOpcode(Hexagon::S2_vsathub); else { @@ -1881,8 +1880,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, MCOperand &Imm = Inst.getOperand(2); int64_t Value; bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); - assert(Absolute); - (void)Absolute; + if (!Absolute) + return Match_InvalidOperand; if (Value == 0) { MCInst TmpInst; unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg()); diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp index 69529b0d1162..b7e95caf24fb 100644 --- a/lib/Target/Hexagon/BitTracker.cpp +++ b/lib/Target/Hexagon/BitTracker.cpp @@ -1,9 +1,8 @@ //===- BitTracker.cpp -----------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h index 058225c0d812..efb21805b801 100644 --- a/lib/Target/Hexagon/BitTracker.h +++ b/lib/Target/Hexagon/BitTracker.h @@ -1,9 +1,8 @@ //===- BitTracker.h ---------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 428b42eba30d..99e3ee871570 100644 --- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -1,9 +1,8 @@ //===- HexagonDisassembler.cpp - Disassembler for Hexagon ISA -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -13,6 +12,7 @@ #include "MCTargetDesc/HexagonMCChecker.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "TargetInfo/HexagonTargetInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCContext.h" @@ -149,7 +149,7 @@ static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder); static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); -#include "HexagonDepDecoders.h" +#include "HexagonDepDecoders.inc" #include "HexagonGenDisassemblerTables.inc" static MCDisassembler *createHexagonDisassembler(const Target &T, diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h index c18492da803b..58dadf012da5 100644 --- a/lib/Target/Hexagon/Hexagon.h +++ b/lib/Target/Hexagon/Hexagon.h @@ -1,9 +1,8 @@ //=-- Hexagon.h - Top-level interface for Hexagon representation --*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td index 868353e18832..26869391c7a3 100644 --- a/lib/Target/Hexagon/Hexagon.td +++ b/lib/Target/Hexagon/Hexagon.td @@ -1,9 +1,8 @@ //===-- Hexagon.td - Describe the Hexagon Target Machine --*- tablegen -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index f44fb16e2d8e..b07d15609ede 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -1,9 +1,8 @@ //===- HexagonAsmPrinter.cpp - Print machine instrs to Hexagon assembly ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -22,6 +21,7 @@ #include "MCTargetDesc/HexagonMCExpr.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "TargetInfo/HexagonTargetInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -92,9 +92,7 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, GetCPISymbol(MO.getIndex())->print(O, MAI); return; case MachineOperand::MO_GlobalAddress: - // Computing the address of a global symbol, not calling it. - getSymbol(MO.getGlobal())->print(O, MAI); - printOffset(MO.getOffset(), O); + PrintSymbolOperand(MO, O); return; } } @@ -114,7 +112,6 @@ bool HexagonAsmPrinter::isBlockOnlyReachableByFallthrough( /// PrintAsmOperand - Print out an operand for an inline asm expression. bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) { // Does this asm operand have a single letter operand modifier? @@ -125,11 +122,7 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, switch (ExtraCode[0]) { default: // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS); - case 'c': // Don't print "$" before a global var name or constant. - // Hexagon never has a prefix. - printOperand(MI, OpNo, OS); - return false; + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS); case 'L': case 'H': { // The highest-numbered register of a pair. 
const MachineOperand &MO = MI->getOperand(OpNo); @@ -161,7 +154,6 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0]) diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h index d0629d173a65..6c4b664e83f5 100755 --- a/lib/Target/Hexagon/HexagonAsmPrinter.h +++ b/lib/Target/Hexagon/HexagonAsmPrinter.h @@ -1,9 +1,8 @@ //===- HexagonAsmPrinter.h - Print machine code to an Hexagon .s file -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,7 +13,6 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H #define LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H -#include "Hexagon.h" #include "HexagonSubtarget.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" @@ -53,11 +51,9 @@ class TargetMachine; void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index 1bdebe557a8c..7b75d251ccd3 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1,9 +1,8 @@ //===- HexagonBitSimplify.cpp ---------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp index 92b6da871a4c..ba50faac2cf9 100644 --- a/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -1,9 +1,8 @@ //===- HexagonBitTracker.cpp ----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonBitTracker.h b/lib/Target/Hexagon/HexagonBitTracker.h index f0b7c9d91950..02607d50f686 100644 --- a/lib/Target/Hexagon/HexagonBitTracker.h +++ b/lib/Target/Hexagon/HexagonBitTracker.h @@ -1,9 +1,8 @@ //===- HexagonBitTracker.h --------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonBlockRanges.cpp b/lib/Target/Hexagon/HexagonBlockRanges.cpp index 48a4505458ae..999150fc8c6e 100644 --- a/lib/Target/Hexagon/HexagonBlockRanges.cpp +++ b/lib/Target/Hexagon/HexagonBlockRanges.cpp @@ -1,9 +1,8 @@ //===- HexagonBlockRanges.cpp ---------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonBlockRanges.h b/lib/Target/Hexagon/HexagonBlockRanges.h index 4da5a970a659..61115e29a708 100644 --- a/lib/Target/Hexagon/HexagonBlockRanges.h +++ b/lib/Target/Hexagon/HexagonBlockRanges.h @@ -1,9 +1,8 @@ //===- HexagonBlockRanges.h -------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp index 2fa7888dd02b..ee93739b2c7b 100644 --- a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp +++ b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp @@ -1,9 +1,8 @@ //===--- HexagonBranchRelaxation.cpp - Identify and relax long jumps ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index a22ac8c9fdf5..11a455ce4347 100644 --- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -1,9 +1,8 @@ //===- HexagonCFGOptimizer.cpp - CFG optimizations ------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonCallingConv.td b/lib/Target/Hexagon/HexagonCallingConv.td index ed2f87570d6b..5c31a81a1e87 100644 --- a/lib/Target/Hexagon/HexagonCallingConv.td +++ b/lib/Target/Hexagon/HexagonCallingConv.td @@ -1,9 +1,8 @@ //===- HexagonCallingConv.td ----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp index f315e24eba62..cf1b0a0f7daa 100644 --- a/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -1,9 +1,8 @@ //===- HexagonCommonGEP.cpp -----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -12,6 +11,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/LoopInfo.h" @@ -71,7 +71,7 @@ namespace { using NodeToValueMap = std::map<GepNode *, Value *>; using NodeVect = std::vector<GepNode *>; using NodeChildrenMap = std::map<GepNode *, NodeVect>; - using UseSet = std::set<Use *>; + using UseSet = SetVector<Use *>; using NodeToUsesMap = std::map<GepNode *, UseSet>; // Numbering map for gep nodes. Used to keep track of ordering for @@ -980,15 +980,13 @@ void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U, assert(UF != Uses.end()); UseSet &Us = UF->second; UseSet NewUs; - for (UseSet::iterator I = Us.begin(); I != Us.end(); ) { - User *S = (*I)->getUser(); - UseSet::iterator Nx = std::next(I); - if (S == R) { - NewUs.insert(*I); - Us.erase(I); - } - I = Nx; + for (Use *U : Us) { + if (U->getUser() == R) + NewUs.insert(U); } + for (Use *U : NewUs) + Us.remove(U); // erase takes an iterator.
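[Editor's aside; the hunk resumes after this note.] The UseSet change just above swaps std::set<Use *> for llvm::SetVector<Use *>. A std::set keyed on pointers iterates in address order, which can differ from run to run and makes the pass's processing order non-deterministic; SetVector iterates in insertion order. It also explains the remove() call in the rewritten loop: SetVector::erase takes an iterator, so erase-by-value goes through remove(). A minimal sketch of the container's behavior, using int pointers as stand-ins for Use pointers:

#include "llvm/ADT/SetVector.h"
#include <cassert>

void setVectorSketch() {
  llvm::SetVector<int *> Ptrs;
  int A = 0, B = 0;
  Ptrs.insert(&A);
  Ptrs.insert(&B);
  Ptrs.insert(&A);              // duplicate insert is a no-op, as with a set
  assert(Ptrs.size() == 2);
  assert(*Ptrs.begin() == &A);  // iteration follows insertion order
  Ptrs.remove(&A);              // erase-by-value; returns true if removed
  assert(Ptrs.front() == &B);
}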
+ if (Us.empty()) { Node->Flags &= ~GepNode::Used; Uses.erase(UF); diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp index ba9f638796eb..cfed0ecef272 100644 --- a/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -1,9 +1,8 @@ //===- HexagonConstExtenders.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonConstPropagation.cpp b/lib/Target/Hexagon/HexagonConstPropagation.cpp index fa192391313e..d1fde5da5fe8 100644 --- a/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -1,9 +1,8 @@ //===- HexagonConstPropagation.cpp ----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -80,18 +79,21 @@ namespace { // A representation of a register as it can appear in a MachineOperand, // i.e. a pair register:subregister. - struct Register { + + // FIXME: Use TargetInstrInfo::RegSubRegPair. Also duplicated in + // HexagonGenPredicate + struct RegisterSubReg { unsigned Reg, SubReg; - explicit Register(unsigned R, unsigned SR = 0) : Reg(R), SubReg(SR) {} - explicit Register(const MachineOperand &MO) + explicit RegisterSubReg(unsigned R, unsigned SR = 0) : Reg(R), SubReg(SR) {} + explicit RegisterSubReg(const MachineOperand &MO) : Reg(MO.getReg()), SubReg(MO.getSubReg()) {} void print(const TargetRegisterInfo *TRI = nullptr) const { dbgs() << printReg(Reg, TRI, SubReg); } - bool operator== (const Register &R) const { + bool operator== (const RegisterSubReg &R) const { return (Reg == R.Reg) && (SubReg == R.SubReg); } }; @@ -301,7 +303,7 @@ namespace { using CellMap = MachineConstPropagator::CellMap; virtual bool evaluate(const MachineInstr &MI, const CellMap &Inputs, CellMap &Outputs) = 0; - virtual bool evaluate(const Register &R, const LatticeCell &SrcC, + virtual bool evaluate(const RegisterSubReg &R, const LatticeCell &SrcC, LatticeCell &Result) = 0; virtual bool evaluate(const MachineInstr &BrI, const CellMap &Inputs, SetVector<const MachineBasicBlock*> &Targets, @@ -344,17 +346,17 @@ namespace { // Helper functions. - bool getCell(const Register &R, const CellMap &Inputs, LatticeCell &RC); + bool getCell(const RegisterSubReg &R, const CellMap &Inputs, LatticeCell &RC); bool constToInt(const Constant *C, APInt &Val) const; bool constToFloat(const Constant *C, APFloat &Val) const; const ConstantInt *intToConst(const APInt &Val) const; // Compares.
- bool evaluateCMPrr(uint32_t Cmp, const Register &R1, const Register &R2, + bool evaluateCMPrr(uint32_t Cmp, const RegisterSubReg &R1, const RegisterSubReg &R2, const CellMap &Inputs, bool &Result); - bool evaluateCMPri(uint32_t Cmp, const Register &R1, const APInt &A2, + bool evaluateCMPri(uint32_t Cmp, const RegisterSubReg &R1, const APInt &A2, const CellMap &Inputs, bool &Result); - bool evaluateCMPrp(uint32_t Cmp, const Register &R1, uint64_t Props2, + bool evaluateCMPrp(uint32_t Cmp, const RegisterSubReg &R1, uint64_t Props2, const CellMap &Inputs, bool &Result); bool evaluateCMPii(uint32_t Cmp, const APInt &A1, const APInt &A2, bool &Result); @@ -363,52 +365,52 @@ namespace { bool evaluateCMPpp(uint32_t Cmp, uint32_t Props1, uint32_t Props2, bool &Result); - bool evaluateCOPY(const Register &R1, const CellMap &Inputs, + bool evaluateCOPY(const RegisterSubReg &R1, const CellMap &Inputs, LatticeCell &Result); // Logical operations. - bool evaluateANDrr(const Register &R1, const Register &R2, + bool evaluateANDrr(const RegisterSubReg &R1, const RegisterSubReg &R2, const CellMap &Inputs, LatticeCell &Result); - bool evaluateANDri(const Register &R1, const APInt &A2, + bool evaluateANDri(const RegisterSubReg &R1, const APInt &A2, const CellMap &Inputs, LatticeCell &Result); bool evaluateANDii(const APInt &A1, const APInt &A2, APInt &Result); - bool evaluateORrr(const Register &R1, const Register &R2, + bool evaluateORrr(const RegisterSubReg &R1, const RegisterSubReg &R2, const CellMap &Inputs, LatticeCell &Result); - bool evaluateORri(const Register &R1, const APInt &A2, + bool evaluateORri(const RegisterSubReg &R1, const APInt &A2, const CellMap &Inputs, LatticeCell &Result); bool evaluateORii(const APInt &A1, const APInt &A2, APInt &Result); - bool evaluateXORrr(const Register &R1, const Register &R2, + bool evaluateXORrr(const RegisterSubReg &R1, const RegisterSubReg &R2, const CellMap &Inputs, LatticeCell &Result); - bool evaluateXORri(const Register &R1, const APInt &A2, + bool evaluateXORri(const RegisterSubReg &R1, const APInt &A2, const CellMap &Inputs, LatticeCell &Result); bool evaluateXORii(const APInt &A1, const APInt &A2, APInt &Result); // Extensions. - bool evaluateZEXTr(const Register &R1, unsigned Width, unsigned Bits, + bool evaluateZEXTr(const RegisterSubReg &R1, unsigned Width, unsigned Bits, const CellMap &Inputs, LatticeCell &Result); bool evaluateZEXTi(const APInt &A1, unsigned Width, unsigned Bits, APInt &Result); - bool evaluateSEXTr(const Register &R1, unsigned Width, unsigned Bits, + bool evaluateSEXTr(const RegisterSubReg &R1, unsigned Width, unsigned Bits, const CellMap &Inputs, LatticeCell &Result); bool evaluateSEXTi(const APInt &A1, unsigned Width, unsigned Bits, APInt &Result); // Leading/trailing bits. - bool evaluateCLBr(const Register &R1, bool Zeros, bool Ones, + bool evaluateCLBr(const RegisterSubReg &R1, bool Zeros, bool Ones, const CellMap &Inputs, LatticeCell &Result); bool evaluateCLBi(const APInt &A1, bool Zeros, bool Ones, APInt &Result); - bool evaluateCTBr(const Register &R1, bool Zeros, bool Ones, + bool evaluateCTBr(const RegisterSubReg &R1, bool Zeros, bool Ones, const CellMap &Inputs, LatticeCell &Result); bool evaluateCTBi(const APInt &A1, bool Zeros, bool Ones, APInt &Result); // Bitfield extract. 
- bool evaluateEXTRACTr(const Register &R1, unsigned Width, unsigned Bits, + bool evaluateEXTRACTr(const RegisterSubReg &R1, unsigned Width, unsigned Bits, unsigned Offset, bool Signed, const CellMap &Inputs, LatticeCell &Result); bool evaluateEXTRACTi(const APInt &A1, unsigned Bits, unsigned Offset, bool Signed, APInt &Result); // Vector operations. - bool evaluateSplatr(const Register &R1, unsigned Bits, unsigned Count, + bool evaluateSplatr(const RegisterSubReg &R1, unsigned Bits, unsigned Count, const CellMap &Inputs, LatticeCell &Result); bool evaluateSplati(const APInt &A1, unsigned Bits, unsigned Count, APInt &Result); @@ -620,7 +622,7 @@ void MachineConstPropagator::visitPHI(const MachineInstr &PN) { LLVM_DEBUG(dbgs() << "Visiting FI(" << printMBBReference(*MB) << "): " << PN); const MachineOperand &MD = PN.getOperand(0); - Register DefR(MD); + RegisterSubReg DefR(MD); assert(TargetRegisterInfo::isVirtualRegister(DefR.Reg)); bool Changed = false; @@ -647,7 +649,7 @@ Bottomize: continue; } const MachineOperand &SO = PN.getOperand(i); - Register UseR(SO); + RegisterSubReg UseR(SO); // If the input is not a virtual register, we don't really know what // value it holds. if (!TargetRegisterInfo::isVirtualRegister(UseR.Reg)) @@ -690,7 +692,7 @@ void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - Register DefR(MO); + RegisterSubReg DefR(MO); // Only track virtual registers. if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg)) continue; @@ -1066,7 +1068,7 @@ bool MachineConstPropagator::run(MachineFunction &MF) { // -------------------------------------------------------------------- // Machine const evaluator. -bool MachineConstEvaluator::getCell(const Register &R, const CellMap &Inputs, +bool MachineConstEvaluator::getCell(const RegisterSubReg &R, const CellMap &Inputs, LatticeCell &RC) { if (!TargetRegisterInfo::isVirtualRegister(R.Reg)) return false; @@ -1092,8 +1094,8 @@ const ConstantInt *MachineConstEvaluator::intToConst(const APInt &Val) const { return ConstantInt::get(CX, Val); } -bool MachineConstEvaluator::evaluateCMPrr(uint32_t Cmp, const Register &R1, - const Register &R2, const CellMap &Inputs, bool &Result) { +bool MachineConstEvaluator::evaluateCMPrr(uint32_t Cmp, const RegisterSubReg &R1, + const RegisterSubReg &R2, const CellMap &Inputs, bool &Result) { assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg)); LatticeCell LS1, LS2; if (!getCell(R1, Inputs, LS1) || !getCell(R2, Inputs, LS2)) @@ -1131,7 +1133,7 @@ bool MachineConstEvaluator::evaluateCMPrr(uint32_t Cmp, const Register &R1, return IsTrue || IsFalse; } -bool MachineConstEvaluator::evaluateCMPri(uint32_t Cmp, const Register &R1, +bool MachineConstEvaluator::evaluateCMPri(uint32_t Cmp, const RegisterSubReg &R1, const APInt &A2, const CellMap &Inputs, bool &Result) { assert(Inputs.has(R1.Reg)); LatticeCell LS; @@ -1158,7 +1160,7 @@ bool MachineConstEvaluator::evaluateCMPri(uint32_t Cmp, const Register &R1, return IsTrue || IsFalse; } -bool MachineConstEvaluator::evaluateCMPrp(uint32_t Cmp, const Register &R1, +bool MachineConstEvaluator::evaluateCMPrp(uint32_t Cmp, const RegisterSubReg &R1, uint64_t Props2, const CellMap &Inputs, bool &Result) { assert(Inputs.has(R1.Reg)); LatticeCell LS; @@ -1351,13 +1353,13 @@ bool MachineConstEvaluator::evaluateCMPpp(uint32_t Cmp, uint32_t Props1, return false; } -bool MachineConstEvaluator::evaluateCOPY(const Register &R1, +bool MachineConstEvaluator::evaluateCOPY(const 
RegisterSubReg &R1, const CellMap &Inputs, LatticeCell &Result) { return getCell(R1, Inputs, Result); } -bool MachineConstEvaluator::evaluateANDrr(const Register &R1, - const Register &R2, const CellMap &Inputs, LatticeCell &Result) { +bool MachineConstEvaluator::evaluateANDrr(const RegisterSubReg &R1, + const RegisterSubReg &R2, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg)); const LatticeCell &L1 = Inputs.get(R2.Reg); const LatticeCell &L2 = Inputs.get(R2.Reg); @@ -1387,7 +1389,7 @@ bool MachineConstEvaluator::evaluateANDrr(const Register &R1, return !Result.isBottom(); } -bool MachineConstEvaluator::evaluateANDri(const Register &R1, +bool MachineConstEvaluator::evaluateANDri(const RegisterSubReg &R1, const APInt &A2, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg)); if (A2 == -1) @@ -1423,8 +1425,8 @@ bool MachineConstEvaluator::evaluateANDii(const APInt &A1, return true; } -bool MachineConstEvaluator::evaluateORrr(const Register &R1, - const Register &R2, const CellMap &Inputs, LatticeCell &Result) { +bool MachineConstEvaluator::evaluateORrr(const RegisterSubReg &R1, + const RegisterSubReg &R2, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg)); const LatticeCell &L1 = Inputs.get(R2.Reg); const LatticeCell &L2 = Inputs.get(R2.Reg); @@ -1454,7 +1456,7 @@ bool MachineConstEvaluator::evaluateORrr(const Register &R1, return !Result.isBottom(); } -bool MachineConstEvaluator::evaluateORri(const Register &R1, +bool MachineConstEvaluator::evaluateORri(const RegisterSubReg &R1, const APInt &A2, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg)); if (A2 == 0) @@ -1490,8 +1492,8 @@ bool MachineConstEvaluator::evaluateORii(const APInt &A1, return true; } -bool MachineConstEvaluator::evaluateXORrr(const Register &R1, - const Register &R2, const CellMap &Inputs, LatticeCell &Result) { +bool MachineConstEvaluator::evaluateXORrr(const RegisterSubReg &R1, + const RegisterSubReg &R2, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg)); LatticeCell LS1, LS2; if (!getCell(R1, Inputs, LS1) || !getCell(R2, Inputs, LS2)) @@ -1519,7 +1521,7 @@ bool MachineConstEvaluator::evaluateXORrr(const Register &R1, return !Result.isBottom(); } -bool MachineConstEvaluator::evaluateXORri(const Register &R1, +bool MachineConstEvaluator::evaluateXORri(const RegisterSubReg &R1, const APInt &A2, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg)); LatticeCell LS1; @@ -1552,7 +1554,7 @@ bool MachineConstEvaluator::evaluateXORii(const APInt &A1, return true; } -bool MachineConstEvaluator::evaluateZEXTr(const Register &R1, unsigned Width, +bool MachineConstEvaluator::evaluateZEXTr(const RegisterSubReg &R1, unsigned Width, unsigned Bits, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg)); LatticeCell LS1; @@ -1583,7 +1585,7 @@ bool MachineConstEvaluator::evaluateZEXTi(const APInt &A1, unsigned Width, return true; } -bool MachineConstEvaluator::evaluateSEXTr(const Register &R1, unsigned Width, +bool MachineConstEvaluator::evaluateSEXTr(const RegisterSubReg &R1, unsigned Width, unsigned Bits, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg)); LatticeCell LS1; @@ -1648,7 +1650,7 @@ bool MachineConstEvaluator::evaluateSEXTi(const APInt &A1, unsigned Width, return true; } -bool MachineConstEvaluator::evaluateCLBr(const Register &R1, bool Zeros, +bool MachineConstEvaluator::evaluateCLBr(const 
RegisterSubReg &R1, bool Zeros, bool Ones, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg)); LatticeCell LS1; @@ -1683,7 +1685,7 @@ bool MachineConstEvaluator::evaluateCLBi(const APInt &A1, bool Zeros, return true; } -bool MachineConstEvaluator::evaluateCTBr(const Register &R1, bool Zeros, +bool MachineConstEvaluator::evaluateCTBr(const RegisterSubReg &R1, bool Zeros, bool Ones, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg)); LatticeCell LS1; @@ -1718,7 +1720,7 @@ bool MachineConstEvaluator::evaluateCTBi(const APInt &A1, bool Zeros, return true; } -bool MachineConstEvaluator::evaluateEXTRACTr(const Register &R1, +bool MachineConstEvaluator::evaluateEXTRACTr(const RegisterSubReg &R1, unsigned Width, unsigned Bits, unsigned Offset, bool Signed, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg)); @@ -1776,7 +1778,7 @@ bool MachineConstEvaluator::evaluateEXTRACTi(const APInt &A1, unsigned Bits, return true; } -bool MachineConstEvaluator::evaluateSplatr(const Register &R1, +bool MachineConstEvaluator::evaluateSplatr(const RegisterSubReg &R1, unsigned Bits, unsigned Count, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(R1.Reg)); @@ -1833,7 +1835,7 @@ namespace { bool evaluate(const MachineInstr &MI, const CellMap &Inputs, CellMap &Outputs) override; - bool evaluate(const Register &R, const LatticeCell &SrcC, + bool evaluate(const RegisterSubReg &R, const LatticeCell &SrcC, LatticeCell &Result) override; bool evaluate(const MachineInstr &BrI, const CellMap &Inputs, SetVector<const MachineBasicBlock*> &Targets, bool &FallsThru) override; @@ -1848,7 +1850,7 @@ namespace { const MachineOperand &MO); void replaceWithNop(MachineInstr &MI); - bool evaluateHexRSEQ32(Register RL, Register RH, const CellMap &Inputs, + bool evaluateHexRSEQ32(RegisterSubReg RL, RegisterSubReg RH, const CellMap &Inputs, LatticeCell &Result); bool evaluateHexCompare(const MachineInstr &MI, const CellMap &Inputs, CellMap &Outputs); @@ -1922,14 +1924,14 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI, return false; unsigned Opc = MI.getOpcode(); - Register DefR(MD); + RegisterSubReg DefR(MD); assert(!DefR.SubReg); if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg)) return false; if (MI.isCopy()) { LatticeCell RC; - Register SrcR(MI.getOperand(1)); + RegisterSubReg SrcR(MI.getOperand(1)); bool Eval = evaluateCOPY(SrcR, Inputs, RC); if (!Eval) return false; @@ -1951,7 +1953,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI, const MachineOperand &OpLo = LoIs1 ? MI.getOperand(1) : MI.getOperand(3); const MachineOperand &OpHi = LoIs1 ?
MI.getOperand(3) : MI.getOperand(1); LatticeCell RC; - Register SrcRL(OpLo), SrcRH(OpHi); + RegisterSubReg SrcRL(OpLo), SrcRH(OpHi); bool Eval = evaluateHexRSEQ32(SrcRL, SrcRH, Inputs, RC); if (!Eval) return false; @@ -2038,7 +2040,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI, int64_t B = MI.getOperand(2).getImm(); assert(B >=0 && B < 32); APInt A(32, (1ull << B), false); - Register R(MI.getOperand(1)); + RegisterSubReg R(MI.getOperand(1)); LatticeCell RC = Outputs.get(DefR.Reg); bool Eval = evaluateORri(R, A, Inputs, RC); if (!Eval) @@ -2078,7 +2080,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI, using namespace Hexagon; bool Ones = (Opc == S2_ct1) || (Opc == S2_ct1p); - Register R1(MI.getOperand(1)); + RegisterSubReg R1(MI.getOperand(1)); assert(Inputs.has(R1.Reg)); LatticeCell T; bool Eval = evaluateCTBr(R1, !Ones, Ones, Inputs, T); @@ -2110,7 +2112,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI, bool OnlyZeros = (Opc == S2_cl0) || (Opc == S2_cl0p); bool OnlyOnes = (Opc == S2_cl1) || (Opc == S2_cl1p); - Register R1(MI.getOperand(1)); + RegisterSubReg R1(MI.getOperand(1)); assert(Inputs.has(R1.Reg)); LatticeCell T; bool Eval = evaluateCLBr(R1, !OnlyOnes, !OnlyZeros, Inputs, T); @@ -2138,7 +2140,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI, { bool Signed = (Opc == Hexagon::S4_extract) || (Opc == Hexagon::S4_extractp); - Register R1(MI.getOperand(1)); + RegisterSubReg R1(MI.getOperand(1)); unsigned BW = getRegBitWidth(R1.Reg); unsigned Bits = MI.getOperand(2).getImm(); unsigned Offset = MI.getOperand(3).getImm(); @@ -2189,7 +2191,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI, return true; } -bool HexagonConstEvaluator::evaluate(const Register &R, +bool HexagonConstEvaluator::evaluate(const RegisterSubReg &R, const LatticeCell &Input, LatticeCell &Result) { if (!R.SubReg) { Result = Input; @@ -2280,7 +2282,7 @@ Undetermined: if (SimpleBranch) { const MachineOperand &MD = BrI.getOperand(0); - Register PR(MD); + RegisterSubReg PR(MD); // If the condition operand has a subregister, this is not something // we currently recognize. if (PR.SubReg) @@ -2502,7 +2504,7 @@ void HexagonConstEvaluator::replaceWithNop(MachineInstr &MI) { MI.RemoveOperand(0); } -bool HexagonConstEvaluator::evaluateHexRSEQ32(Register RL, Register RH, +bool HexagonConstEvaluator::evaluateHexRSEQ32(RegisterSubReg RL, RegisterSubReg RH, const CellMap &Inputs, LatticeCell &Result) { assert(Inputs.has(RL.Reg) && Inputs.has(RH.Reg)); LatticeCell LSL, LSH; @@ -2571,7 +2573,7 @@ bool HexagonConstEvaluator::evaluateHexCompare(const MachineInstr &MI, if (Computed) { // Only create a zero/non-zero cell. At this time there isn't really // much need for specific values. - Register DefR(MI.getOperand(0)); + RegisterSubReg DefR(MI.getOperand(0)); LatticeCell L = Outputs.get(DefR.Reg); uint32_t P = Result ? 
ConstantProperties::NonZero : ConstantProperties::Zero; @@ -2591,9 +2593,9 @@ bool HexagonConstEvaluator::evaluateHexCompare2(unsigned Opc, bool Reg1 = Src1.isReg(), Reg2 = Src2.isReg(); bool Imm1 = Src1.isImm(), Imm2 = Src2.isImm(); if (Reg1) { - Register R1(Src1); + RegisterSubReg R1(Src1); if (Reg2) { - Register R2(Src2); + RegisterSubReg R2(Src2); return evaluateCMPrr(Cmp, R1, R2, Inputs, Result); } else if (Imm2) { APInt A2 = getCmpImm(Opc, 2, Src2); @@ -2602,7 +2604,7 @@ bool HexagonConstEvaluator::evaluateHexCompare2(unsigned Opc, } else if (Imm1) { APInt A1 = getCmpImm(Opc, 1, Src1); if (Reg2) { - Register R2(Src2); + RegisterSubReg R2(Src2); uint32_t NegCmp = Comparison::negate(Cmp); return evaluateCMPri(NegCmp, R2, A1, Inputs, Result); } else if (Imm2) { @@ -2621,7 +2623,7 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI, return false; const MachineOperand &Src1 = MI.getOperand(1); const MachineOperand &Src2 = MI.getOperand(2); - Register R1(Src1); + RegisterSubReg R1(Src1); bool Eval = false; LatticeCell RC; switch (Opc) { @@ -2629,7 +2631,7 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI, return false; case Hexagon::A2_and: case Hexagon::A2_andp: - Eval = evaluateANDrr(R1, Register(Src2), Inputs, RC); + Eval = evaluateANDrr(R1, RegisterSubReg(Src2), Inputs, RC); break; case Hexagon::A2_andir: { if (!Src2.isImm()) @@ -2640,7 +2642,7 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI, } case Hexagon::A2_or: case Hexagon::A2_orp: - Eval = evaluateORrr(R1, Register(Src2), Inputs, RC); + Eval = evaluateORrr(R1, RegisterSubReg(Src2), Inputs, RC); break; case Hexagon::A2_orir: { if (!Src2.isImm()) @@ -2651,11 +2653,11 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI, } case Hexagon::A2_xor: case Hexagon::A2_xorp: - Eval = evaluateXORrr(R1, Register(Src2), Inputs, RC); + Eval = evaluateXORrr(R1, RegisterSubReg(Src2), Inputs, RC); break; } if (Eval) { - Register DefR(MI.getOperand(0)); + RegisterSubReg DefR(MI.getOperand(0)); Outputs.update(DefR.Reg, RC); } return Eval; @@ -2664,7 +2666,7 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI, bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI, const CellMap &Inputs, CellMap &Outputs) { // Dst0 = Cond1 ? 
Src2 : Src3 - Register CR(MI.getOperand(1)); + RegisterSubReg CR(MI.getOperand(1)); assert(Inputs.has(CR.Reg)); LatticeCell LS; if (!getCell(CR, Inputs, LS)) @@ -2679,7 +2681,7 @@ bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI, return false; const MachineOperand &ValOp = MI.getOperand(TakeOp); - Register DefR(MI.getOperand(0)); + RegisterSubReg DefR(MI.getOperand(0)); LatticeCell RC = Outputs.get(DefR.Reg); if (ValOp.isImm()) { @@ -2692,7 +2694,7 @@ bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI, return true; } if (ValOp.isReg()) { - Register R(ValOp); + RegisterSubReg R(ValOp); const LatticeCell &LR = Inputs.get(R.Reg); LatticeCell LSR; if (!evaluate(R, LR, LSR)) @@ -2707,7 +2709,7 @@ bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI, bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI, const CellMap &Inputs, CellMap &Outputs) { // Dst0 = ext R1 - Register R1(MI.getOperand(1)); + RegisterSubReg R1(MI.getOperand(1)); assert(Inputs.has(R1.Reg)); unsigned Opc = MI.getOpcode(); @@ -2724,6 +2726,8 @@ bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI, case Hexagon::A2_sxtw: Bits = 32; break; + default: + llvm_unreachable("Unhandled extension opcode"); } bool Signed = false; @@ -2735,7 +2739,7 @@ bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI, break; } - Register DefR(MI.getOperand(0)); + RegisterSubReg DefR(MI.getOperand(0)); unsigned BW = getRegBitWidth(DefR.Reg); LatticeCell RC = Outputs.get(DefR.Reg); bool Eval = Signed ? evaluateSEXTr(R1, BW, Bits, Inputs, RC) @@ -2749,8 +2753,8 @@ bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI, bool HexagonConstEvaluator::evaluateHexVector1(const MachineInstr &MI, const CellMap &Inputs, CellMap &Outputs) { // DefR = op R1 - Register DefR(MI.getOperand(0)); - Register R1(MI.getOperand(1)); + RegisterSubReg DefR(MI.getOperand(0)); + RegisterSubReg R1(MI.getOperand(1)); assert(Inputs.has(R1.Reg)); LatticeCell RC = Outputs.get(DefR.Reg); bool Eval; @@ -2788,7 +2792,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isUse() || MO.isImplicit()) continue; - Register R(MO); + RegisterSubReg R(MO); if (!TargetRegisterInfo::isVirtualRegister(R.Reg)) continue; HasUse = true; @@ -2954,10 +2958,10 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, // to DefR += mpyi(R, #imm), // or DefR -= mpyi(R, #imm). { - Register DefR(MI.getOperand(0)); + RegisterSubReg DefR(MI.getOperand(0)); assert(!DefR.SubReg); - Register R2(MI.getOperand(2)); - Register R3(MI.getOperand(3)); + RegisterSubReg R2(MI.getOperand(2)); + RegisterSubReg R3(MI.getOperand(3)); assert(Inputs.has(R2.Reg) && Inputs.has(R3.Reg)); LatticeCell LS2, LS3; // It is enough to get one of the input cells, since we will only try @@ -2971,7 +2975,7 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, if (Zero) { // DefR == R1 (tied operands). MachineOperand &Acc = MI.getOperand(1); - Register R1(Acc); + RegisterSubReg R1(Acc); unsigned NewR = R1.Reg; if (R1.SubReg) { // Generate COPY. FIXME: Replace with the register:subregister. 
@@ -3018,8 +3022,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, case Hexagon::A2_and: { - Register R1(MI.getOperand(1)); - Register R2(MI.getOperand(2)); + RegisterSubReg R1(MI.getOperand(1)); + RegisterSubReg R2(MI.getOperand(2)); assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg)); LatticeCell LS1, LS2; unsigned CopyOf = 0; @@ -3037,8 +3041,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, if (!CopyOf) return false; MachineOperand &SO = MI.getOperand(CopyOf); - Register SR(SO); - Register DefR(MI.getOperand(0)); + RegisterSubReg SR(SO); + RegisterSubReg DefR(MI.getOperand(0)); unsigned NewR = SR.Reg; if (SR.SubReg) { const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg); @@ -3054,8 +3058,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, case Hexagon::A2_or: { - Register R1(MI.getOperand(1)); - Register R2(MI.getOperand(2)); + RegisterSubReg R1(MI.getOperand(1)); + RegisterSubReg R2(MI.getOperand(2)); assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg)); LatticeCell LS1, LS2; unsigned CopyOf = 0; @@ -3069,8 +3073,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, if (!CopyOf) return false; MachineOperand &SO = MI.getOperand(CopyOf); - Register SR(SO); - Register DefR(MI.getOperand(0)); + RegisterSubReg SR(SO); + RegisterSubReg DefR(MI.getOperand(0)); unsigned NewR = SR.Reg; if (SR.SubReg) { const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg); diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp index 28965b69e284..a09ccab483cf 100644 --- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -1,9 +1,8 @@ //===------- HexagonCopyToCombine.cpp - Hexagon Copy-To-Combine Pass ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This pass replaces transfer instructions by combine instructions. @@ -255,8 +254,8 @@ static bool isUnsafeToMoveAcross(MachineInstr &MI, unsigned UseReg, MI.isMetaInstruction(); } -static unsigned UseReg(const MachineOperand& MO) { - return MO.isReg() ? MO.getReg() : 0; +static Register UseReg(const MachineOperand& MO) { + return MO.isReg() ? MO.getReg() : Register(); } /// isSafeToMoveTogether - Returns true if it is safe to move I1 next to I2 such diff --git a/lib/Target/Hexagon/HexagonDepArch.h b/lib/Target/Hexagon/HexagonDepArch.h index dff2b2f471d0..529be7ef0ac7 100644 --- a/lib/Target/Hexagon/HexagonDepArch.h +++ b/lib/Target/Hexagon/HexagonDepArch.h @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. 
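[Editor's aside; the patch resumes below.] In the rewriteHexConstUses hunks a few files back, the A2_and and A2_or cases select a CopyOf operand index when the other input is a known lattice constant, relying on the identities x & ~0 == x and x | 0 == x (evaluateANDri already short-circuits on A2 == -1, evaluateORri on A2 == 0); CopyOf == 0 means no rewrite applies. A compressed sketch of that decision, with hypothetical names invented for illustration:

// Returns the operand index whose value the AND/OR reduces to, or 0 if
// neither input is the identity constant for the operation.
unsigned pickCopySource(bool Const1, long long C1, bool Const2, long long C2,
                        bool IsAnd) {
  const long long Identity = IsAnd ? -1 : 0; // AND folds on all-ones, OR on zero
  if (Const1 && C1 == Identity)
    return 2; // result equals operand 2
  if (Const2 && C2 == Identity)
    return 1; // result equals operand 1
  return 0;   // no rewrite
}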
diff --git a/lib/Target/Hexagon/HexagonDepArch.td b/lib/Target/Hexagon/HexagonDepArch.td index f1aadae555c8..115cf2383a7a 100644 --- a/lib/Target/Hexagon/HexagonDepArch.td +++ b/lib/Target/Hexagon/HexagonDepArch.td @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepDecoders.h b/lib/Target/Hexagon/HexagonDepDecoders.h deleted file mode 100644 index 9f78412f45d2..000000000000 --- a/lib/Target/Hexagon/HexagonDepDecoders.h +++ /dev/null @@ -1,79 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Automatically generated file, please consult code owner before editing. -//===----------------------------------------------------------------------===// - -// clang-format off - -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-function" -#endif - -static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<4>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<14>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<8>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<7>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<12>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<3>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<13>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<6>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<9>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<5>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, - uint64_t, const void *Decoder) { - signedDecoder<6>(MI, tmp, 
Decoder); - return MCDisassembler::Success; -} - -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - -// clang-format on diff --git a/lib/Target/Hexagon/HexagonDepDecoders.inc b/lib/Target/Hexagon/HexagonDepDecoders.inc new file mode 100644 index 000000000000..10068abce7ec --- /dev/null +++ b/lib/Target/Hexagon/HexagonDepDecoders.inc @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Automatically generated file, please consult code owner before editing. +//===----------------------------------------------------------------------===// + +// clang-format off + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif + +static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<4>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<14>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<8>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<7>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<12>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<3>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<13>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<6>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<9>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<5>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<6>(MI, tmp, Decoder); + return MCDisassembler::Success; +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +// clang-format on diff --git a/lib/Target/Hexagon/HexagonDepIICHVX.td b/lib/Target/Hexagon/HexagonDepIICHVX.td index 9e3dea9f3e9b..fefbbfd3f1ac 100644 --- a/lib/Target/Hexagon/HexagonDepIICHVX.td +++ b/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepIICScalar.td b/lib/Target/Hexagon/HexagonDepIICScalar.td index 9da25952fb1c..34da0be02d19 100644 --- a/lib/Target/Hexagon/HexagonDepIICScalar.td +++ b/lib/Target/Hexagon/HexagonDepIICScalar.td @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepITypes.h b/lib/Target/Hexagon/HexagonDepITypes.h index 81e3971e21d2..358345e027d8 100644 --- a/lib/Target/Hexagon/HexagonDepITypes.h +++ b/lib/Target/Hexagon/HexagonDepITypes.h @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepITypes.td b/lib/Target/Hexagon/HexagonDepITypes.td index f694062a5232..91c02b84b87c 100644 --- a/lib/Target/Hexagon/HexagonDepITypes.td +++ b/lib/Target/Hexagon/HexagonDepITypes.td @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepInstrFormats.td b/lib/Target/Hexagon/HexagonDepInstrFormats.td index ffe212ef9d97..c08d9a388d3e 100644 --- a/lib/Target/Hexagon/HexagonDepInstrFormats.td +++ b/lib/Target/Hexagon/HexagonDepInstrFormats.td @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td index 3ef1c49eb7ee..a49051888c77 100644 --- a/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index 2346fa572626..2ce1419e4790 100644 --- a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepMappings.td b/lib/Target/Hexagon/HexagonDepMappings.td index b3132d41b903..22ee495b25e6 100644 --- a/lib/Target/Hexagon/HexagonDepMappings.td +++ b/lib/Target/Hexagon/HexagonDepMappings.td @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepOperands.td b/lib/Target/Hexagon/HexagonDepOperands.td index ef2d4fa45702..fdba7b971258 100644 --- a/lib/Target/Hexagon/HexagonDepOperands.td +++ b/lib/Target/Hexagon/HexagonDepOperands.td @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonDepTimingClasses.h b/lib/Target/Hexagon/HexagonDepTimingClasses.h index 0fd55e8b7997..b6be74f848bb 100644 --- a/lib/Target/Hexagon/HexagonDepTimingClasses.h +++ b/lib/Target/Hexagon/HexagonDepTimingClasses.h @@ -1,9 +1,8 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Automatically generated file, please consult code owner before editing. diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index 8e2f5093038e..c1f32e54e98d 100644 --- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -1,9 +1,8 @@ //===- HexagonEarlyIfConv.cpp ---------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 1a762c0c9de7..c343e426ac7d 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -1,9 +1,8 @@ //===- HexagonExpandCondsets.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -734,7 +733,7 @@ bool HexagonExpandCondsets::isPredicable(MachineInstr *MI) { HasDef = true; } for (auto &Mo : MI->memoperands()) - if (Mo->isVolatile()) + if (Mo->isVolatile() || Mo->isAtomic()) return false; return true; } diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp index e9067e2285a8..f7edc168de4a 100644 --- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp +++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp @@ -1,9 +1,8 @@ //===---- HexagonFixupHwLoops.cpp - Fixup HW loops too far from LOOPn. ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // The loop start address in the LOOPn instruction is encoded as a distance // from the LOOPn instruction itself. If the start address is too far from diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index f5736546a87c..3368ee4fb3b9 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1,9 +1,8 @@ //===- HexagonFrameLowering.cpp - Define frame lowering -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // //===----------------------------------------------------------------------===// @@ -375,17 +374,17 @@ static bool isRestoreCall(unsigned Opc) { } static inline bool isOptNone(const MachineFunction &MF) { - return MF.getFunction().hasFnAttribute(Attribute::OptimizeNone) || + return MF.getFunction().hasOptNone() || MF.getTarget().getOptLevel() == CodeGenOpt::None; } static inline bool isOptSize(const MachineFunction &MF) { const Function &F = MF.getFunction(); - return F.optForSize() && !F.optForMinSize(); + return F.hasOptSize() && !F.hasMinSize(); } static inline bool isMinSize(const MachineFunction &MF) { - return MF.getFunction().optForMinSize(); + return MF.getFunction().hasMinSize(); } /// Implements shrink-wrapping of the stack frame. By default, stack frame @@ -2102,7 +2101,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, } if (!Bad) { for (auto *Mo : In.memoperands()) { - if (!Mo->isVolatile()) + if (!Mo->isVolatile() && !Mo->isAtomic()) continue; Bad = true; break; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index d65d870750f8..65e8c7686640 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -1,9 +1,8 @@ //==- HexagonFrameLowering.h - Define frame lowering for Hexagon -*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp index 08a016b74650..3417c74e359b 100644 --- a/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -1,9 +1,8 @@ //===- HexagonGenExtract.cpp ----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -211,7 +210,7 @@ bool HexagonGenExtract::convert(Instruction *In) { Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu : Intrinsic::hexagon_S2_extractup; Module *Mod = BB->getParent()->getParent(); - Value *ExtF = Intrinsic::getDeclaration(Mod, IntId); + Function *ExtF = Intrinsic::getDeclaration(Mod, IntId); Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)}); if (SL != 0) NewIn = IRB.CreateShl(NewIn, SL, CSL->getName()); diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp index e3492e7374e9..81025c1c5325 100644 --- a/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -1,9 +1,8 @@ //===- HexagonGenInsert.cpp -----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -437,7 +436,7 @@ namespace { } // end anonymous namespace void OrderedRegisterList::insert(unsigned VR) { - iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord); + iterator L = llvm::lower_bound(Seq, VR, Ord); if (L == Seq.end()) Seq.push_back(VR); else @@ -450,7 +449,7 @@ void OrderedRegisterList::insert(unsigned VR) { } void OrderedRegisterList::remove(unsigned VR) { - iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord); + iterator L = llvm::lower_bound(Seq, VR, Ord); if (L != Seq.end()) Seq.erase(L); } diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp index e5af96468af1..cdafbc20ab86 100644 --- a/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/lib/Target/Hexagon/HexagonGenMux.cpp @@ -1,9 +1,8 @@ //===- HexagonGenMux.cpp --------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -304,8 +303,8 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { std::advance(It2, MaxX); MachineInstr &Def1 = *It1, &Def2 = *It2; MachineOperand *Src1 = &Def1.getOperand(2), *Src2 = &Def2.getOperand(2); - unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0; - unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0; + Register SR1 = Src1->isReg() ? Src1->getReg() : Register(); + Register SR2 = Src2->isReg() ? 
Src2->getReg() : Register(); bool Failure = false, CanUp = true, CanDown = true; for (unsigned X = MinX+1; X < MaxX; X++) { const DefUseInfo &DU = DUM.lookup(X); diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp index c0d2de90467a..e991fa8b61c8 100644 --- a/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -1,9 +1,8 @@ //===- HexagonGenPredicate.cpp --------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -46,17 +45,19 @@ namespace llvm { namespace { - struct Register { + // FIXME: Use TargetInstrInfo::RegSubRegPair + struct RegisterSubReg { unsigned R, S; - Register(unsigned r = 0, unsigned s = 0) : R(r), S(s) {} - Register(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {} + RegisterSubReg(unsigned r = 0, unsigned s = 0) : R(r), S(s) {} + RegisterSubReg(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {} + RegisterSubReg(const Register &Reg) : R(Reg), S(0) {} - bool operator== (const Register &Reg) const { + bool operator== (const RegisterSubReg &Reg) const { return R == Reg.R && S == Reg.S; } - bool operator< (const Register &Reg) const { + bool operator< (const RegisterSubReg &Reg) const { return R < Reg.R || (R == Reg.R && S < Reg.S); } }; @@ -64,10 +65,10 @@ namespace { struct PrintRegister { friend raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR); - PrintRegister(Register R, const TargetRegisterInfo &I) : Reg(R), TRI(I) {} + PrintRegister(RegisterSubReg R, const TargetRegisterInfo &I) : Reg(R), TRI(I) {} private: - Register Reg; + RegisterSubReg Reg; const TargetRegisterInfo &TRI; }; @@ -99,8 +100,8 @@ namespace { private: using VectOfInst = SetVector<MachineInstr *>; - using SetOfReg = std::set<Register>; - using RegToRegMap = std::map<Register, Register>; + using SetOfReg = std::set<RegisterSubReg>; + using RegToRegMap = std::map<RegisterSubReg, RegisterSubReg>; const HexagonInstrInfo *TII = nullptr; const HexagonRegisterInfo *TRI = nullptr; @@ -111,12 +112,12 @@ namespace { bool isPredReg(unsigned R); void collectPredicateGPR(MachineFunction &MF); - void processPredicateGPR(const Register &Reg); + void processPredicateGPR(const RegisterSubReg &Reg); unsigned getPredForm(unsigned Opc); bool isConvertibleToPredForm(const MachineInstr *MI); bool isScalarCmp(unsigned Opc); - bool isScalarPred(Register PredReg); - Register getPredRegFor(const Register &Reg); + bool isScalarPred(RegisterSubReg PredReg); + RegisterSubReg getPredRegFor(const RegisterSubReg &Reg); bool convertToPredForm(MachineInstr *MI); bool eliminatePredCopies(MachineFunction &MF); }; @@ -211,7 +212,7 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) { case Hexagon::C2_tfrpr: case TargetOpcode::COPY: if (isPredReg(MI->getOperand(1).getReg())) { - Register RD = MI->getOperand(0); + RegisterSubReg RD = MI->getOperand(0); if (TargetRegisterInfo::isVirtualRegister(RD.R)) PredGPRs.insert(RD); } @@ -221,7 +222,7 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) { } } -void HexagonGenPredicate::processPredicateGPR(const Register &Reg) { +void HexagonGenPredicate::processPredicateGPR(const RegisterSubReg
&Reg) { LLVM_DEBUG(dbgs() << __func__ << ": " << printReg(Reg.R, TRI, Reg.S) << "\n"); using use_iterator = MachineRegisterInfo::use_iterator; @@ -240,7 +241,7 @@ void HexagonGenPredicate::processPredicateGPR(const Register &Reg) { } } -Register HexagonGenPredicate::getPredRegFor(const Register &Reg) { +RegisterSubReg HexagonGenPredicate::getPredRegFor(const RegisterSubReg &Reg) { // Create a predicate register for a given Reg. The newly created register // will have its value copied from Reg, so that it can be later used as // an operand in other instructions. @@ -255,7 +256,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) { unsigned Opc = DefI->getOpcode(); if (Opc == Hexagon::C2_tfrpr || Opc == TargetOpcode::COPY) { assert(DefI->getOperand(0).isDef() && DefI->getOperand(1).isUse()); - Register PR = DefI->getOperand(1); + RegisterSubReg PR = DefI->getOperand(1); G2P.insert(std::make_pair(Reg, PR)); LLVM_DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n'); return PR; @@ -272,10 +273,10 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) { MachineBasicBlock::iterator DefIt = DefI; BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR) .addReg(Reg.R, 0, Reg.S); - G2P.insert(std::make_pair(Reg, Register(NewPR))); - LLVM_DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI) + G2P.insert(std::make_pair(Reg, RegisterSubReg(NewPR))); + LLVM_DEBUG(dbgs() << " -> !" << PrintRegister(RegisterSubReg(NewPR), *TRI) << '\n'); - return Register(NewPR); + return RegisterSubReg(NewPR); } llvm_unreachable("Invalid argument"); @@ -317,12 +318,12 @@ bool HexagonGenPredicate::isScalarCmp(unsigned Opc) { return false; } -bool HexagonGenPredicate::isScalarPred(Register PredReg) { - std::queue<Register> WorkQ; +bool HexagonGenPredicate::isScalarPred(RegisterSubReg PredReg) { + std::queue<RegisterSubReg> WorkQ; WorkQ.push(PredReg); while (!WorkQ.empty()) { - Register PR = WorkQ.front(); + RegisterSubReg PR = WorkQ.front(); WorkQ.pop(); const MachineInstr *DefI = MRI->getVRegDef(PR.R); if (!DefI) @@ -351,7 +352,7 @@ bool HexagonGenPredicate::isScalarPred(Register PredReg) { // Add operands to the queue. for (const MachineOperand &MO : DefI->operands()) if (MO.isReg() && MO.isUse()) - WorkQ.push(Register(MO.getReg())); + WorkQ.push(RegisterSubReg(MO.getReg())); break; // All non-vector compares are ok, everything else is bad. @@ -373,7 +374,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isUse()) continue; - Register Reg(MO); + RegisterSubReg Reg(MO); if (Reg.S && Reg.S != Hexagon::isub_lo) return false; if (!PredGPRs.count(Reg)) @@ -400,7 +401,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { // If it's a scalar predicate register, then all bits in it are // the same. Otherwise, to determine whether all bits are 0 or not // we would need to use any8. - Register PR = getPredRegFor(MI->getOperand(1)); + RegisterSubReg PR = getPredRegFor(MI->getOperand(1)); if (!isScalarPred(PR)) return false; // This will skip the immediate argument when creating the predicate @@ -411,19 +412,19 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { // Some sanity: check that def is in operand #0. MachineOperand &Op0 = MI->getOperand(0); assert(Op0.isDef()); - Register OutR(Op0); + RegisterSubReg OutR(Op0); // Don't use getPredRegFor, since it will create an association between // the argument and a created predicate register (i.e.
it will insert a // copy if a new predicate register is created). const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass; - Register NewPR = MRI->createVirtualRegister(PredRC); + RegisterSubReg NewPR = MRI->createVirtualRegister(PredRC); MachineInstrBuilder MIB = BuildMI(B, MI, DL, TII->get(NewOpc), NewPR.R); // Add predicate counterparts of the GPRs. for (unsigned i = 1; i < NumOps; ++i) { - Register GPR = MI->getOperand(i); - Register Pred = getPredRegFor(GPR); + RegisterSubReg GPR = MI->getOperand(i); + RegisterSubReg Pred = getPredRegFor(GPR); MIB.addReg(Pred.R, 0, Pred.S); } LLVM_DEBUG(dbgs() << "generated: " << *MIB); @@ -441,7 +442,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { // then the output will be a predicate register. Do not visit the // users of it. if (!isPredReg(NewOutR)) { - Register R(NewOutR); + RegisterSubReg R(NewOutR); PredGPRs.insert(R); processPredicateGPR(R); } @@ -468,8 +469,8 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) { for (MachineInstr &MI : MBB) { if (MI.getOpcode() != TargetOpcode::COPY) continue; - Register DR = MI.getOperand(0); - Register SR = MI.getOperand(1); + RegisterSubReg DR = MI.getOperand(0); + RegisterSubReg SR = MI.getOperand(1); if (!TargetRegisterInfo::isVirtualRegister(DR.R)) continue; if (!TargetRegisterInfo::isVirtualRegister(SR.R)) diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 239cf49ca8a2..cecbaedb6d70 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -1,9 +1,8 @@ //===- HexagonHardwareLoops.cpp - Identify and generate hardware loops ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/lib/Target/Hexagon/HexagonHazardRecognizer.cpp index 44f1f554c662..e45126bec6ef 100644 --- a/lib/Target/Hexagon/HexagonHazardRecognizer.cpp +++ b/lib/Target/Hexagon/HexagonHazardRecognizer.cpp @@ -1,9 +1,8 @@ //===-- HexagonHazardRecognizer.cpp - Hexagon Post RA Hazard Recognizer ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonHazardRecognizer.h b/lib/Target/Hexagon/HexagonHazardRecognizer.h index 2874d73ce819..53b9cb43b4b6 100644 --- a/lib/Target/Hexagon/HexagonHazardRecognizer.h +++ b/lib/Target/Hexagon/HexagonHazardRecognizer.h @@ -1,9 +1,8 @@ //===--- HexagonHazardRecognizer.h - Hexagon Post RA Hazard Recognizer ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This file defines the hazard recognizer for scheduling on Hexagon. diff --git a/lib/Target/Hexagon/HexagonIICHVX.td b/lib/Target/Hexagon/HexagonIICHVX.td index a804c5a80d03..06e9c83cf306 100644 --- a/lib/Target/Hexagon/HexagonIICHVX.td +++ b/lib/Target/Hexagon/HexagonIICHVX.td @@ -1,9 +1,8 @@ //===--- HexagonIICHVX.td -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -17,12 +16,14 @@ class HVXItin { InstrStage<1, [CVI_XLANE,CVI_SHIFT, CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, - // Used by Gather Pseudo Instructions which are expanded into - // V6_vgather* and V6_vS32b_new_ai. Even though these instructions - // use CVI_ST resource, it's not included below to avoid having more than - // 4 InstrStages and thus changing 'MaxResTerms' to 5. + // Used by gather pseudo-instructions which are expanded into V6_vgather* + // and V6_vS32b_new_ai. Even though these instructions use CVI_LD resource, + // it's not included below to avoid having more than 4 InstrStages and + // thus changing 'MaxResTerms' to 5. Instead, both SLOT0 and SLOT1 are + // used, which should be sufficient. InstrItinData<CVI_GATHER_PSEUDO, [InstrStage<1, [SLOT0], 0>, - InstrStage<1, [CVI_LD], 0>, InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_MPY01, CVI_XLSHF]>]>]; } diff --git a/lib/Target/Hexagon/HexagonIICScalar.td b/lib/Target/Hexagon/HexagonIICScalar.td index 5fe713346e38..d37cc3a2cc3e 100644 --- a/lib/Target/Hexagon/HexagonIICScalar.td +++ b/lib/Target/Hexagon/HexagonIICScalar.td @@ -1,9 +1,8 @@ //===--- HexagonIICScalar.td ----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 470b05bda4c6..605fcfc25559 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- HexagonISelDAGToDAG.cpp - A dag to dag inst selector for Hexagon --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -849,6 +848,9 @@ void HexagonDAGToDAGISel::SelectD2P(SDNode *N) { void HexagonDAGToDAGISel::SelectV2Q(SDNode *N) { const SDLoc &dl(N); MVT ResTy = N->getValueType(0).getSimpleVT(); + // The argument to V2Q should be a single vector. + MVT OpTy = N->getOperand(0).getValueType().getSimpleVT(); (void)OpTy; + assert(HST->getVectorLength() * 8 == OpTy.getSizeInBits()); SDValue C = CurDAG->getTargetConstant(-1, dl, MVT::i32); SDNode *R = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, C); @@ -860,6 +862,8 @@ void HexagonDAGToDAGISel::SelectV2Q(SDNode *N) { void HexagonDAGToDAGISel::SelectQ2V(SDNode *N) { const SDLoc &dl(N); MVT ResTy = N->getValueType(0).getSimpleVT(); + // The result of V2Q should be a single vector. + assert(HST->getVectorLength() * 8 == ResTy.getSizeInBits()); SDValue C = CurDAG->getTargetConstant(-1, dl, MVT::i32); SDNode *R = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, C); diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/lib/Target/Hexagon/HexagonISelDAGToDAG.h index f4f09dd4e758..65edb09603b3 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.h +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.h @@ -1,9 +1,8 @@ //===-- HexagonISelDAGToDAG.h -----------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Hexagon specific code to select Hexagon machine instructions for diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index b796e442d4fa..e7f1c345af1d 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -1,9 +1,8 @@ //===-- HexagonISelDAGToDAGHVX.cpp ----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 1edf3e498dfa..fef5a98cdb00 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1,9 +1,8 @@ //===-- HexagonISelLowering.cpp - Hexagon DAG Lowering Implementation -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -579,7 +578,8 @@ HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); unsigned LR = HRI.getRARegister(); - if (Op.getOpcode() != ISD::INLINEASM || HMFI.hasClobberLR()) + if ((Op.getOpcode() != ISD::INLINEASM && + Op.getOpcode() != ISD::INLINEASM_BR) || HMFI.hasClobberLR()) return Op; unsigned NumOps = Op.getNumOperands(); @@ -1292,6 +1292,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); setOperationAction(ISD::INLINEASM, MVT::Other, Custom); + setOperationAction(ISD::INLINEASM_BR, MVT::Other, Custom); setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); @@ -1324,7 +1325,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, if (EmitJumpTables) setMinimumJumpTableEntries(MinimumJumpTables); else - setMinimumJumpTableEntries(std::numeric_limits<int>::max()); + setMinimumJumpTableEntries(std::numeric_limits<unsigned>::max()); setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::ABS, MVT::i32, Legal); @@ -1333,8 +1334,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Hexagon has A4_addp_c and A4_subp_c that take and generate a carry bit, // but they only operate on i64. for (MVT VT : MVT::integer_valuetypes()) { - setOperationAction(ISD::UADDO, VT, Expand); - setOperationAction(ISD::USUBO, VT, Expand); + setOperationAction(ISD::UADDO, VT, Custom); + setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SADDO, VT, Expand); setOperationAction(ISD::SSUBO, VT, Expand); setOperationAction(ISD::ADDCARRY, VT, Expand); @@ -2619,7 +2620,6 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) const SDLoc &dl(Op); const DataLayout &DL = DAG.getDataLayout(); LLVMContext &Ctx = *DAG.getContext(); - unsigned AS = LN->getAddressSpace(); // If the load aligning is disabled or the load can be broken up into two // smaller legal loads, do the default (target-independent) expansion. @@ -2629,15 +2629,15 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) DoDefault = true; if (!AlignLoads) { - if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), AS, HaveAlign)) + if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), *LN->getMemOperand())) return Op; DoDefault = true; } - if (!DoDefault && 2*HaveAlign == NeedAlign) { + if (!DoDefault && (2 * HaveAlign) == NeedAlign) { // The PartTy is the equivalent of "getLoadableTypeOfSize(HaveAlign)". - MVT PartTy = HaveAlign <= 8 ? MVT::getIntegerVT(8*HaveAlign) + MVT PartTy = HaveAlign <= 8 ?
MVT::getIntegerVT(8 * HaveAlign) : MVT::getVectorVT(MVT::i8, HaveAlign); - DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, AS, HaveAlign); + DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, *LN->getMemOperand()); } if (DoDefault) { std::pair<SDValue, SDValue> P = expandUnalignedLoad(LN, DAG); @@ -2691,6 +2691,43 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) return M; } +SDValue +HexagonTargetLowering::LowerUAddSubO(SDValue Op, SelectionDAG &DAG) const { + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); + auto *CY = dyn_cast<ConstantSDNode>(Y); + if (!CY) + return SDValue(); + + const SDLoc &dl(Op); + SDVTList VTs = Op.getNode()->getVTList(); + assert(VTs.NumVTs == 2); + assert(VTs.VTs[1] == MVT::i1); + unsigned Opc = Op.getOpcode(); + + if (CY) { + uint32_t VY = CY->getZExtValue(); + assert(VY != 0 && "This should have been folded"); + // X +/- 1 + if (VY != 1) + return SDValue(); + + if (Opc == ISD::UADDO) { + SDValue Op = DAG.getNode(ISD::ADD, dl, VTs.VTs[0], {X, Y}); + SDValue Ov = DAG.getSetCC(dl, MVT::i1, Op, getZero(dl, ty(Op), DAG), + ISD::SETEQ); + return DAG.getMergeValues({Op, Ov}, dl); + } + if (Opc == ISD::USUBO) { + SDValue Op = DAG.getNode(ISD::SUB, dl, VTs.VTs[0], {X, Y}); + SDValue Ov = DAG.getSetCC(dl, MVT::i1, Op, + DAG.getConstant(-1, dl, ty(Op)), ISD::SETEQ); + return DAG.getMergeValues({Op, Ov}, dl); + } + } + + return SDValue(); +} + SDValue HexagonTargetLowering::LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const { const SDLoc &dl(Op); @@ -2741,7 +2778,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); // Handle INLINEASM first. - if (Opc == ISD::INLINEASM) + if (Opc == ISD::INLINEASM || Opc == ISD::INLINEASM_BR) return LowerINLINEASM(Op, DAG); if (isHvxOperation(Op)) { @@ -2768,6 +2805,8 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::LOAD: return LowerLoad(Op, DAG); case ISD::STORE: return LowerStore(Op, DAG); + case ISD::UADDO: + case ISD::USUBO: return LowerUAddSubO(Op, DAG); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerAddSubCarry(Op, DAG); case ISD::SRA: @@ -2923,7 +2962,8 @@ HexagonTargetLowering::getRegForInlineAsmConstraint( /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. -bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { +bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { return true; } @@ -3047,7 +3087,7 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization( /// determined using generic target-independent logic.
EVT HexagonTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, - bool MemcpyStrSrc, MachineFunction &MF) const { + bool MemcpyStrSrc, const AttributeList &FuncAttributes) const { auto Aligned = [](unsigned GivenA, unsigned MinA) -> bool { return (GivenA % MinA) == 0; @@ -3063,8 +3103,9 @@ EVT HexagonTargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::Other; } -bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned AS, unsigned Align, bool *Fast) const { +bool HexagonTargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned AS, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { if (Fast) *Fast = false; return Subtarget.isHVXVectorType(VT.getSimpleVT()); @@ -3111,13 +3152,21 @@ Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { BasicBlock *BB = Builder.GetInsertBlock(); Module *M = BB->getParent()->getParent(); - Type *Ty = cast<PointerType>(Addr->getType())->getElementType(); + auto PT = cast<PointerType>(Addr->getType()); + Type *Ty = PT->getElementType(); unsigned SZ = Ty->getPrimitiveSizeInBits(); assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported"); Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_L2_loadw_locked : Intrinsic::hexagon_L4_loadd_locked; - Value *Fn = Intrinsic::getDeclaration(M, IntID); - return Builder.CreateCall(Fn, Addr, "larx"); + Function *Fn = Intrinsic::getDeclaration(M, IntID); + + PointerType *NewPtrTy + = Builder.getIntNTy(SZ)->getPointerTo(PT->getAddressSpace()); + Addr = Builder.CreateBitCast(Addr, NewPtrTy); + + Value *Call = Builder.CreateCall(Fn, Addr, "larx"); + + return Builder.CreateBitCast(Call, Ty); } /// Perform a store-conditional operation to Addr. Return the status of the @@ -3128,10 +3177,17 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Module *M = BB->getParent()->getParent(); Type *Ty = Val->getType(); unsigned SZ = Ty->getPrimitiveSizeInBits(); + + Type *CastTy = Builder.getIntNTy(SZ); assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported"); Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked : Intrinsic::hexagon_S4_stored_locked; - Value *Fn = Intrinsic::getDeclaration(M, IntID); + Function *Fn = Intrinsic::getDeclaration(M, IntID); + + unsigned AS = Addr->getType()->getPointerAddressSpace(); + Addr = Builder.CreateBitCast(Addr, CastTy->getPointerTo(AS)); + Val = Builder.CreateBitCast(Val, CastTy); + Value *Call = Builder.CreateCall(Fn, {Addr, Val}, "stcx"); Value *Cmp = Builder.CreateICmpEQ(Call, Builder.getInt32(0), ""); Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext())); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 265c37e6ae61..4e467cb22727 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -1,9 +1,8 @@ //===-- HexagonISelLowering.h - Hexagon DAG Lowering Interface --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -168,6 +167,7 @@ namespace HexagonISD { SDValue LowerLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUAddSubO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; @@ -285,7 +285,8 @@ namespace HexagonISD { /// is legal. It is frequently not legal in PIC relocation models. bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can @@ -295,10 +296,10 @@ namespace HexagonISD { EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; + const AttributeList &FuncAttributes) const override; bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, - unsigned Align, bool *Fast) const override; + unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const override; /// Returns relocation base for the given PIC jumptable. SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index a6400b5d8266..345c657787a0 100644 --- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1,9 +1,8 @@ //===-- HexagonISelLoweringHVX.cpp --- Lowering HVX operations ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -1542,6 +1541,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL: case ISD::SETCC: case ISD::VSELECT: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND_INREG: return SplitHvxPairOp(Op, DAG); } diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td index 2236140d5dd7..f156de671059 100644 --- a/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/lib/Target/Hexagon/HexagonInstrFormats.td @@ -1,9 +1,8 @@ //==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV5.td b/lib/Target/Hexagon/HexagonInstrFormatsV5.td index c8de5cbcc1e0..68ef2d2d3a8a 100644 --- a/lib/Target/Hexagon/HexagonInstrFormatsV5.td +++ b/lib/Target/Hexagon/HexagonInstrFormatsV5.td @@ -1,9 +1,8 @@ //==- HexagonInstrFormatsV5.td - Hexagon Instruction Formats --*- tablegen -==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/lib/Target/Hexagon/HexagonInstrFormatsV60.td index 1347a655353f..86a82183a1ad 100644 --- a/lib/Target/Hexagon/HexagonInstrFormatsV60.td +++ b/lib/Target/Hexagon/HexagonInstrFormatsV60.td @@ -1,9 +1,8 @@ //==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV65.td b/lib/Target/Hexagon/HexagonInstrFormatsV65.td index cddb8777b417..eaecffe9c89e 100644 --- a/lib/Target/Hexagon/HexagonInstrFormatsV65.td +++ b/lib/Target/Hexagon/HexagonInstrFormatsV65.td @@ -1,9 +1,8 @@ //==- HexagonInstrFormatsV65.td - Hexagon Instruction Formats -*- tablegen -==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index de0d6c4d9e4e..a156de5ba128 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1,9 +1,8 @@ //===- HexagonInstrInfo.cpp - Hexagon Instruction Information -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -698,11 +697,11 @@ bool HexagonInstrInfo::analyzeLoop(MachineLoop &L, /// Generate code to reduce the loop iteration by one and check if the loop is /// finished. Return the value/register of the new loop count. 
This function /// assumes the nth iteration is peeled first. -unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB, - MachineInstr *IndVar, MachineInstr &Cmp, - SmallVectorImpl<MachineOperand> &Cond, - SmallVectorImpl<MachineInstr *> &PrevInsts, - unsigned Iter, unsigned MaxIter) const { +unsigned HexagonInstrInfo::reduceLoopCount( + MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, + MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond, + SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter, + unsigned MaxIter) const { // We expect a hardware loop currently. This means that IndVar is set // to null, and the compare is the ENDLOOP instruction. assert((!IndVar) && isEndLoopN(Cmp.getOpcode()) @@ -1314,6 +1313,38 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } + case Hexagon::PS_crash: { + // Generate a misaligned load that is guaranteed to cause a crash. + class CrashPseudoSourceValue : public PseudoSourceValue { + public: + CrashPseudoSourceValue(const TargetInstrInfo &TII) + : PseudoSourceValue(TargetCustom, TII) {} + + bool isConstant(const MachineFrameInfo *) const override { + return false; + } + bool isAliased(const MachineFrameInfo *) const override { + return false; + } + bool mayAlias(const MachineFrameInfo *) const override { + return false; + } + void printCustom(raw_ostream &OS) const override { + OS << "MisalignedCrash"; + } + }; + + static const CrashPseudoSourceValue CrashPSV(*this); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(&CrashPSV), + MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 8, 1); + BuildMI(MBB, MI, DL, get(Hexagon::PS_loadrdabs), Hexagon::D13) + .addImm(0xBADC0FEE) // Misaligned load. + .addMemOperand(MMO); + MBB.erase(MI); + return true; + } + case Hexagon::PS_tailcall_i: MI.setDesc(get(Hexagon::J2_jump)); return true; @@ -1681,17 +1712,19 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI, /// Hexagon counts the number of ##'s and adjusts for that many /// constant extenders. unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str, - const MCAsmInfo &MAI) const { + const MCAsmInfo &MAI, + const TargetSubtargetInfo *STI) const { StringRef AStr(Str); // Count the number of instructions in the asm. bool atInsnStart = true; unsigned Length = 0; + const unsigned MaxInstLength = MAI.getMaxInstLength(STI); for (; *Str; ++Str) { if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(), strlen(MAI.getSeparatorString())) == 0) atInsnStart = true; if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) { - Length += MAI.getMaxInstLength(); + Length += MaxInstLength; atInsnStart = false; } if (atInsnStart && strncmp(Str, MAI.getCommentString().data(), @@ -1823,7 +1856,8 @@ DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState( // S2_storeri_io %r29, 132, killed %r1; flags: mem:ST4[FixedStack1] // Currently AA considers the addresses in these instructions to be aliasing.
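The getInlineAsmLength change above threads the subtarget into MCAsmInfo::getMaxInstLength and hoists the call out of the scanning loop. The estimate itself is simple: every non-blank run after a statement separator counts as one instruction of worst-case length. A stripped-down sketch of that counting scheme (approxInlineAsmLength is an illustrative name; the real loop also skips comment lines via MAI.getCommentString, which is omitted here):

  #include <cctype>
  #include <cstring>

  unsigned approxInlineAsmLength(const char *Str, unsigned MaxInstLength,
                                 const char *Sep = "\n") {
    bool AtInsnStart = true;
    unsigned Length = 0;
    for (; *Str; ++Str) {
      // A newline or separator string marks the start of the next statement.
      if (*Str == '\n' || std::strncmp(Str, Sep, std::strlen(Sep)) == 0)
        AtInsnStart = true;
      if (AtInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
        Length += MaxInstLength; // assume each instruction is maximally long
        AtInsnStart = false;
      }
    }
    return Length;
  }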
bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint( - MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const { + const MachineInstr &MIa, const MachineInstr &MIb, + AliasAnalysis *AA) const { if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; @@ -2425,7 +2459,7 @@ bool HexagonInstrInfo::isPredicated(unsigned Opcode) const { bool HexagonInstrInfo::isPredicateLate(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; - return ~(F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask; + return (F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask; } bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const { @@ -2894,7 +2928,7 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1, /// Get the base register and byte offset of a load/store instr. bool HexagonInstrInfo::getMemOperandWithOffset( - MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset, + const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const { unsigned AccessSize = 0; BaseOp = getBaseAndOffset(LdSt, Offset, AccessSize); diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 9b840762e88a..e0a999d0f4c4 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -1,9 +1,8 @@ //===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -140,7 +139,7 @@ public: /// is finished. Return the value/register of the new loop count. We need /// this function when peeling off one or more iterations of a loop. This /// function assumes the nth iteration is peeled first. - unsigned reduceLoopCount(MachineBasicBlock &MBB, + unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond, SmallVectorImpl<MachineInstr *> &PrevInsts, @@ -216,7 +215,8 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; /// Get the base register and byte offset of a load/store instr. - bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + bool getMemOperandWithOffset(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const override; @@ -264,8 +264,10 @@ public: /// Measure the specified inline asm to determine an approximation of its /// length. - unsigned getInlineAsmLength(const char *Str, - const MCAsmInfo &MAI) const override; + unsigned getInlineAsmLength( + const char *Str, + const MCAsmInfo &MAI, + const TargetSubtargetInfo *STI = nullptr) const override; /// Allocate and return a hazard recognizer to use for this target when /// scheduling the machine instructions after register allocation. @@ -296,7 +298,8 @@ public: // memory addresses. This function returns true if two MIs access different // memory addresses and false otherwise.
bool - areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, + areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; /// For instructions with a base and offset, return the position of the diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index 9cab5748bef2..cabfd783effa 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -1,9 +1,8 @@ //===-- HexagonIntrinsics.td - Instruction intrinsics ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td index a852394f2160..44f39a3e9b16 100644 --- a/lib/Target/Hexagon/HexagonIntrinsicsV5.td +++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td @@ -1,9 +1,8 @@ //===- HexagonIntrinsicsV5.td - V5 Instruction intrinsics --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/lib/Target/Hexagon/HexagonIntrinsicsV60.td index 5e5c77b38e8e..a60c80beb5d6 100644 --- a/lib/Target/Hexagon/HexagonIntrinsicsV60.td +++ b/lib/Target/Hexagon/HexagonIntrinsicsV60.td @@ -1,9 +1,8 @@ //=- HexagonIntrinsicsV60.td - Target Description for Hexagon -*- tablegen *-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 985f41f3a7d9..ac48e1dc30b0 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1,9 +1,8 @@ //===- HexagonLoopIdiomRecognition.cpp ------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -1001,6 +1000,7 @@ bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val, void PolynomialMultiplyRecognize::promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB) { Type *OrigTy = In->getType(); + assert(!OrigTy->isVoidTy() && "Invalid instruction to promote"); // Leave boolean values alone. if (!In->getType()->isIntegerTy(1)) @@ -1081,7 +1081,8 @@ bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB, std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns), [](Instruction &In) { return &In; }); for (Instruction *In : LoopIns) - promoteTo(In, DestTy, LoopB); + if (!In->isTerminator()) + promoteTo(In, DestTy, LoopB); // Fix up the PHI nodes in the exit block. Instruction *EndI = ExitB->getFirstNonPHI(); @@ -1522,7 +1523,7 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At, ParsedValues &PV) { IRBuilder<> B(&*At); Module *M = At->getParent()->getParent()->getParent(); - Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw); + Function *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw); Value *P = PV.P, *Q = PV.Q, *P0 = P; unsigned IC = PV.IterCount; @@ -2252,10 +2253,8 @@ CleanupAndExit: Type *Int32PtrTy = Type::getInt32PtrTy(Ctx); Type *VoidTy = Type::getVoidTy(Ctx); Module *M = Func->getParent(); - Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy, - Int32PtrTy, Int32PtrTy, Int32Ty); - Function *Fn = cast(CF); - Fn->setLinkage(Function::ExternalLinkage); + FunctionCallee Fn = M->getOrInsertFunction( + HexagonVolatileMemcpyName, VoidTy, Int32PtrTy, Int32PtrTy, Int32Ty); const SCEV *OneS = SE->getConstant(Int32Ty, 1); const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty); diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp index fb5752ade1de..d1a153920e5e 100644 --- a/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -1,9 +1,8 @@ //===- HexagonMCInstLower.cpp - Convert Hexagon MachineInstr to an MCInst -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp index 9579c8b6df16..aabae009d7c3 100644 --- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp +++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //= HexagonMachineFunctionInfo.cpp - Hexagon machine function info *- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
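The HexagonLoopIdiomRecognition hunk above also picks up the IR-level change of Module::getOrInsertFunction returning a FunctionCallee instead of a Constant*, which removes both the cast to Function* and the explicit linkage fix-up. A hedged sketch of the resulting call-site idiom, assuming an IRBuilder B and argument values in scope (the argument names here are invented):

  // FunctionCallee carries the FunctionType together with the callee, so it
  // can be passed straight to CreateCall without casting to Function*.
  FunctionCallee Fn = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
                                             Int32PtrTy, Int32PtrTy, Int32Ty);
  B.CreateCall(Fn, {DstPtr, SrcPtr, NumWords}); // hypothetical arguments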
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h index d83bcbc41553..2961e16cc9dc 100644 --- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h +++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h @@ -1,9 +1,8 @@ //=- HexagonMachineFunctionInfo.h - Hexagon machine function info -*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 908ce24136c7..0e6555024303 100644 --- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -1,9 +1,8 @@ //===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -113,6 +112,7 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) { case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::COPY: case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: break; } @@ -168,6 +168,7 @@ bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) { case TargetOpcode::EH_LABEL: case TargetOpcode::COPY: case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: break; } Packet.push_back(SU); diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h index 585a7858ad2b..fb0a7abd339b 100644 --- a/lib/Target/Hexagon/HexagonMachineScheduler.h +++ b/lib/Target/Hexagon/HexagonMachineScheduler.h @@ -1,9 +1,8 @@ //===- HexagonMachineScheduler.h - Custom Hexagon MI scheduler --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td b/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td index b7b0de0efaea..2fcefe6a4ef6 100644 --- a/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td +++ b/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td @@ -1,9 +1,8 @@ //===--- HexagonMapAsm2IntrinV62.gen.td -----------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td b/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td index c29a75e6fe74..7293075532c6 100644 --- a/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td +++ b/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td @@ -1,9 +1,8 @@ //===--- HexagonMapAsm2IntrinV65.gen.td -----------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index f2a6627c99be..db44901ca706 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -1,9 +1,8 @@ //===- HexagonNewValueJump.cpp - Hexagon Backend New Value Jump -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonOperands.td b/lib/Target/Hexagon/HexagonOperands.td index 232946ec1579..212cf03bee67 100644 --- a/lib/Target/Hexagon/HexagonOperands.td +++ b/lib/Target/Hexagon/HexagonOperands.td @@ -1,9 +1,8 @@ //===--- HexagonOperands.td -----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp index c3a5bd5d57bf..547da9fd598f 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -1,9 +1,8 @@ //===- HexagonOptAddrMode.cpp ---------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This implements a Hexagon-specific pass to optimize addressing mode for diff --git a/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp index 101de3d8fbee..d00fc23102a5 100644 --- a/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp +++ b/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp @@ -1,9 +1,8 @@ //===- HexagonOptimizeSZextends.cpp - Remove unnecessary argument extends -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index 89177564057e..fb731f56bfbf 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -1,9 +1,8 @@ //==- HexagonPatterns.td - Target Description for Hexagon -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -279,7 +278,7 @@ class Su_ni1<PatFrag Op> if (hasOneUse(N)){ // Check if Op1 is an immediate operand. SDValue Op1 = N->getOperand(1); - return !dyn_cast<ConstantSDNode>(Op1); + return !isa<ConstantSDNode>(Op1); } return false;}], Op.OperandTransform>; @@ -3082,7 +3081,7 @@ def: Pat<(HexagonALLOCA I32:$Rs, (i32 imm:$A)), def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDTNone, [SDNPHasChain]>; def: Pat<(HexagonBARRIER), (Y2_barrier)>; -def: Pat<(trap), (J2_trap0 (i32 0))>; +def: Pat<(trap), (PS_crash)>; // Read cycle counter. def SDTInt64Leaf: SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>; diff --git a/lib/Target/Hexagon/HexagonPatternsV65.td b/lib/Target/Hexagon/HexagonPatternsV65.td index 50b76847b563..4cd45ecbe1a1 100644 --- a/lib/Target/Hexagon/HexagonPatternsV65.td +++ b/lib/Target/Hexagon/HexagonPatternsV65.td @@ -1,9 +1,8 @@ //==- HexagonPatternsV65.td -------------------------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
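The pattern change above reroutes ISD::trap from the J2_trap0 instruction to the new PS_crash pseudo, whose post-RA expansion (see the HexagonInstrInfo.cpp hunk earlier in this patch) emits a volatile doubleword load from the misaligned address 0xBADC0FEE. The observable effect of __builtin_trap() is therefore roughly the following, expressed as an illustrative C++ sketch rather than the actual lowering:

  // Sketch of what the PS_crash expansion amounts to at run time, assuming
  // (as the patch comment states) that the misaligned access faults on Hexagon.
  [[noreturn]] void trapViaMisalignedLoad() {
    volatile long long *P =
        reinterpret_cast<volatile long long *>(0xBADC0FEEu); // not 8-byte aligned
    (void)*P;                // volatile misaligned load: guaranteed crash
    __builtin_unreachable(); // never reached
  }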
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp index 3c588a89b0da..8f761d2d4805 100644 --- a/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/lib/Target/Hexagon/HexagonPeephole.cpp @@ -1,9 +1,8 @@ //===-- HexagonPeephole.cpp - Hexagon Peephole Optimizations --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // This peephole pass optimizes in the following cases. // 1. Optimizes redundant sign extends for the following case diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td index b9748c7e189c..7dd25d7d93d5 100644 --- a/lib/Target/Hexagon/HexagonPseudo.td +++ b/lib/Target/Hexagon/HexagonPseudo.td @@ -1,9 +1,8 @@ //===--- HexagonPseudo.td -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -560,3 +559,8 @@ defm PS_storerh : NewCircularStore; defm PS_storerf : NewCircularStore; defm PS_storeri : NewCircularStore; defm PS_storerd : NewCircularStore; + +// A pseudo that generates a runtime crash. This is used to implement +// __builtin_trap. +let hasSideEffects = 1, isPseudo = 1, isCodeGenOnly = 1, isSolo = 1 in +def PS_crash: InstHexagon<(outs), (ins), "", [], "", PSEUDO, TypePSEUDO>; diff --git a/lib/Target/Hexagon/HexagonRDFOpt.cpp b/lib/Target/Hexagon/HexagonRDFOpt.cpp index 413bc8edf2b6..910a17540e6e 100644 --- a/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -1,9 +1,8 @@ //===- HexagonRDFOpt.cpp --------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 9b8f4e07376f..4f5f750e5842 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- HexagonRegisterInfo.cpp - Hexagon Register Information ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -287,7 +286,7 @@ unsigned HexagonRegisterInfo::getRARegister() const { } -unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction +Register HexagonRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const HexagonFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h index 3e7b63a462f0..fc166b5a3410 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -1,9 +1,8 @@ //==- HexagonRegisterInfo.h - Hexagon Register Information Impl --*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -67,7 +66,7 @@ public: // Debug information queries. unsigned getRARegister() const; - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; unsigned getFrameRegister() const; unsigned getStackRegister() const; diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td index da90911e2c05..f12189052699 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -1,9 +1,8 @@ //===-- HexagonRegisterInfo.td - Hexagon Register defs -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td index 1024198e9b3f..0834e9000460 100644 --- a/lib/Target/Hexagon/HexagonSchedule.td +++ b/lib/Target/Hexagon/HexagonSchedule.td @@ -1,9 +1,8 @@ //===- HexagonSchedule.td - Hexagon Scheduling Definitions -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonScheduleV5.td b/lib/Target/Hexagon/HexagonScheduleV5.td index 9a893f6dde02..ba0da2c196ab 100644 --- a/lib/Target/Hexagon/HexagonScheduleV5.td +++ b/lib/Target/Hexagon/HexagonScheduleV5.td @@ -1,9 +1,8 @@ //=-HexagonScheduleV5.td - HexagonV5 Scheduling Definitions --*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonScheduleV55.td b/lib/Target/Hexagon/HexagonScheduleV55.td index ca738be5d6ef..f88dd5d2056d 100644 --- a/lib/Target/Hexagon/HexagonScheduleV55.td +++ b/lib/Target/Hexagon/HexagonScheduleV55.td @@ -1,9 +1,8 @@ //=-HexagonScheduleV55.td - HexagonV55 Scheduling Definitions -*- tablegen -*=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonScheduleV60.td b/lib/Target/Hexagon/HexagonScheduleV60.td index 861a8d2b0339..c6539597a9e7 100644 --- a/lib/Target/Hexagon/HexagonScheduleV60.td +++ b/lib/Target/Hexagon/HexagonScheduleV60.td @@ -1,9 +1,8 @@ //=-HexagonScheduleV60.td - HexagonV60 Scheduling Definitions *- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonScheduleV62.td b/lib/Target/Hexagon/HexagonScheduleV62.td index 1c274191277c..782d76760992 100644 --- a/lib/Target/Hexagon/HexagonScheduleV62.td +++ b/lib/Target/Hexagon/HexagonScheduleV62.td @@ -1,9 +1,8 @@ //=-HexagonScheduleV62.td - HexagonV62 Scheduling Definitions *- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonScheduleV65.td b/lib/Target/Hexagon/HexagonScheduleV65.td index 46a79d521795..ac64410e559b 100644 --- a/lib/Target/Hexagon/HexagonScheduleV65.td +++ b/lib/Target/Hexagon/HexagonScheduleV65.td @@ -1,9 +1,8 @@ //=-HexagonScheduleV65.td - HexagonV65 Scheduling Definitions *- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonScheduleV66.td b/lib/Target/Hexagon/HexagonScheduleV66.td index 38e3d21d3701..56dc59e2a948 100644 --- a/lib/Target/Hexagon/HexagonScheduleV66.td +++ b/lib/Target/Hexagon/HexagonScheduleV66.td @@ -1,9 +1,8 @@ //=-HexagonScheduleV66.td - HexagonV66 Scheduling Definitions *- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index 002e87fb32ce..c5ba7ced4c30 100644 --- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -1,9 +1,8 @@ //===-- HexagonSelectionDAGInfo.cpp - Hexagon SelectionDAG Info -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h index a83a8efb7588..af8b8318b059 100644 --- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h @@ -1,9 +1,8 @@ //===-- HexagonSelectionDAGInfo.h - Hexagon SelectionDAG Info ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp index 55de25120943..bd4254aea276 100644 --- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp +++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp @@ -1,9 +1,8 @@ //=== HexagonSplitConst32AndConst64.cpp - split CONST32/Const64 into HI/LO ===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp index e018785f24d8..013eede2d414 100644 --- a/lib/Target/Hexagon/HexagonSplitDouble.cpp +++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp @@ -1,9 +1,8 @@ //===- HexagonSplitDouble.cpp ---------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -153,8 +152,8 @@ bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const { } bool HexagonSplitDoubleRegs::isVolatileInstr(const MachineInstr *MI) const { - for (auto &I : MI->memoperands()) - if (I->isVolatile()) + for (auto &MO : MI->memoperands()) + if (MO->isVolatile() || MO->isAtomic()) return true; return false; } diff --git a/lib/Target/Hexagon/HexagonStoreWidening.cpp b/lib/Target/Hexagon/HexagonStoreWidening.cpp index 61c2121163b8..b8b61517ff95 100644 --- a/lib/Target/Hexagon/HexagonStoreWidening.cpp +++ b/lib/Target/Hexagon/HexagonStoreWidening.cpp @@ -1,9 +1,8 @@ //===- HexagonStoreWidening.cpp -------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Replace sequences of "narrow" stores to adjacent memory locations with @@ -338,8 +337,7 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin, return false; OG.push_back(FirstMI); - MachineInstr *S1 = FirstMI, *S2 = *(Begin+1); - InstrGroup::iterator I = Begin+1; + MachineInstr *S1 = FirstMI; // Pow2Num will be the largest number of elements in OG such that the sum // of sizes of stores 0...Pow2Num-1 will be a power of 2. @@ -351,8 +349,8 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin, // does not exceed the limit (MaxSize). // Keep track of when the total size covered is a power of 2, since // this is a size a single store can cover. - while (I != End) { - S2 = *I; + for (InstrGroup::iterator I = Begin + 1; I != End; ++I) { + MachineInstr *S2 = *I; // Stores are sorted, so if S1 and S2 are not adjacent, there won't be // any other store to fill the "hole". if (!storesAreAdjacent(S1, S2)) @@ -372,7 +370,6 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin, break; S1 = S2; - ++I; } // The stores don't add up to anything that can be widened. Clean up. 
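The selectStores rewrite above replaces manual iterator bumping with a for loop; the policy is unchanged: walk a run of adjacent narrow stores and remember the longest prefix whose combined size is a power of two, since that is what one wide store can cover. A toy model of that selection (sizes as plain integers; the adjacency, ordering, and profitability checks of the real pass are assumed to have passed):

  #include <vector>

  static bool isPow2(unsigned V) { return V && (V & (V - 1)) == 0; }

  // Returns how many leading stores can be merged into a single wide store.
  unsigned selectWidenablePrefix(const std::vector<unsigned> &Sizes,
                                 unsigned MaxSize) {
    unsigned Total = 0, Pow2Num = 0;
    for (unsigned I = 0; I < Sizes.size(); ++I) {
      if (Total + Sizes[I] > MaxSize)
        break;
      Total += Sizes[I];
      if (isPow2(Total))
        Pow2Num = I + 1; // stores 0..Pow2Num-1 add up to a power of two
    }
    return Pow2Num;
  }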
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index 9c77135c2f2f..7ec63a642b0c 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -1,9 +1,8 @@ //===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 3a5acb53682c..007423ef1902 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -1,9 +1,8 @@ //===- HexagonSubtarget.h - Define Subtarget for the Hexagon ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index ddfda7e27793..80b8480448fe 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- HexagonTargetMachine.cpp - Define TargetMachine for Hexagon -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "HexagonMachineScheduler.h" #include "HexagonTargetObjectFile.h" #include "HexagonTargetTransformInfo.h" +#include "TargetInfo/HexagonTargetInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LegacyPassManager.h" diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h index a7c6a3437fbc..7ee4474e90e3 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/lib/Target/Hexagon/HexagonTargetMachine.h @@ -1,9 +1,8 @@ //=-- HexagonTargetMachine.h - Define TargetMachine for Hexagon ---*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 2185bf8eebc6..fdcc41a4ca41 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- HexagonTargetObjectFile.cpp ---------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -239,10 +238,7 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO, return false; } - Type *GType = GVar->getType(); - if (PointerType *PT = dyn_cast(GType)) - GType = PT->getElementType(); - + Type *GType = GVar->getValueType(); if (isa(GType)) { LLVM_DEBUG(dbgs() << "no, is an array\n"); return false; @@ -342,7 +338,7 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty, MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { - const Type *GTy = GO->getType()->getElementType(); + const Type *GTy = GO->getValueType(); unsigned Size = getSmallestAddressableSize(GTy, GO, TM); // If we have -ffunction-section or -fdata-section then we should emit the diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h index 18863630fde2..b36282578950 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.h +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- HexagonTargetObjectFile.h -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonTargetStreamer.h b/lib/Target/Hexagon/HexagonTargetStreamer.h index e19c404450e6..c5200b76933e 100644 --- a/lib/Target/Hexagon/HexagonTargetStreamer.h +++ b/lib/Target/Hexagon/HexagonTargetStreamer.h @@ -1,9 +1,8 @@ //===-- HexagonTargetStreamer.h - Hexagon Target Streamer ------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
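The HexagonTargetObjectFile hunks above swap manual unwrapping of the global's pointer type for GlobalValue::getValueType(), a form that does not depend on inspecting the pointer's pointee type. The before/after idiom, with GVar assumed to be a GlobalVariable* in scope:

  // Pre-patch: peel the pointer type by hand to get at the value type.
  Type *Old = cast<PointerType>(GVar->getType())->getElementType();
  // Post-patch: ask the global for its value type directly.
  Type *New = GVar->getValueType();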
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index c942f645aa88..38062e8e922c 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -1,9 +1,8 @@ //===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file /// This file implements a TargetTransformInfo analysis pass specific to the @@ -161,14 +160,15 @@ unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned VecWidth = VecTy->getBitWidth(); if (useHVX() && isTypeForHVX(VecTy)) { unsigned RegWidth = getRegisterBitWidth(true); - Alignment = std::min(Alignment, RegWidth/8); + assert(RegWidth && "Non-zero vector register width expected"); // Cost of HVX loads. if (VecWidth % RegWidth == 0) return VecWidth / RegWidth; // Cost of constructing HVX vector from scalar loads. + Alignment = std::min(Alignment, RegWidth / 8); unsigned AlignWidth = 8 * std::max(1u, Alignment); unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; - return 3*NumLoads; + return 3 * NumLoads; } // Non-HVX vectors. diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 5c6f85584ec2..27e8fc019007 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -1,9 +1,8 @@ //==- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass -*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file /// This file implements a TargetTransformInfo analysis pass specific to the diff --git a/lib/Target/Hexagon/HexagonVExtract.cpp b/lib/Target/Hexagon/HexagonVExtract.cpp index 929ac2bd0d93..a9692f42e468 100644 --- a/lib/Target/Hexagon/HexagonVExtract.cpp +++ b/lib/Target/Hexagon/HexagonVExtract.cpp @@ -1,9 +1,8 @@ //===- HexagonVExtract.cpp ------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
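The getMemoryOpCost hunk above asserts a non-zero HVX register width before dividing and moves the alignment clamp below the whole-register fast path, so the clamp only runs when a vector must be assembled from scalar loads. Restated as plain arithmetic (hvxLoadCost is an illustrative name; widths are in bits and Alignment in bytes, as in the hunk):

  #include <algorithm>

  unsigned hvxLoadCost(unsigned VecWidth, unsigned RegWidth, unsigned Alignment) {
    if (VecWidth % RegWidth == 0)
      return VecWidth / RegWidth;              // whole HVX register loads
    Alignment = std::min(Alignment, RegWidth / 8);
    unsigned AlignWidth = 8 * std::max(1u, Alignment);
    unsigned NumLoads = (VecWidth + AlignWidth - 1) / AlignWidth; // alignTo
    return 3 * NumLoads;                       // vector built from scalar loads
  }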
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This pass will replace multiple occurrences of V6_extractw from the same diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 722699907ca0..3619e4c239d7 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -1,9 +1,8 @@ //===- HexagonPacketizer.cpp - VLIW packetizer ----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h index ca70cf967a46..daa86b6f5393 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h @@ -1,9 +1,8 @@ //===- HexagonPacketizer.h - VLIW packetizer --------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp index 9d1073346c72..e5df1d456c1e 100644 --- a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp +++ b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp @@ -1,9 +1,8 @@ //===- HexagonVectorLoopCarriedReuse.cpp ----------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -239,10 +238,17 @@ namespace { // used over the backedge. This is the value that gets reused from a // previous iteration.
Instruction *BackedgeInst = nullptr; + std::map<Instruction *, DepChain *> DepChains; + int Iterations = -1; ReuseValue() = default; - void reset() { Inst2Replace = nullptr; BackedgeInst = nullptr; } + void reset() { + Inst2Replace = nullptr; + BackedgeInst = nullptr; + DepChains.clear(); + Iterations = -1; + } bool isDefined() { return Inst2Replace != nullptr; } }; @@ -289,10 +295,10 @@ namespace { void findDepChainFromPHI(Instruction *I, DepChain &D); void reuseValue(); Value *findValueInBlock(Value *Op, BasicBlock *BB); - bool isDepChainBtwn(Instruction *I1, Instruction *I2, int Iters); - DepChain *getDepChainBtwn(Instruction *I1, Instruction *I2); + DepChain *getDepChainBtwn(Instruction *I1, Instruction *I2, int Iters); bool isEquivalentOperation(Instruction *I1, Instruction *I2); bool canReplace(Instruction *I); + bool isCallInstCommutative(CallInst *C); }; } // end anonymous namespace @@ -327,6 +333,70 @@ bool HexagonVectorLoopCarriedReuse::runOnLoop(Loop *L, LPPassManager &LPM) { return doVLCR(); } +bool HexagonVectorLoopCarriedReuse::isCallInstCommutative(CallInst *C) { + switch (C->getCalledFunction()->getIntrinsicID()) { + case Intrinsic::hexagon_V6_vaddb: + case Intrinsic::hexagon_V6_vaddb_128B: + case Intrinsic::hexagon_V6_vaddh: + case Intrinsic::hexagon_V6_vaddh_128B: + case Intrinsic::hexagon_V6_vaddw: + case Intrinsic::hexagon_V6_vaddw_128B: + case Intrinsic::hexagon_V6_vaddubh: + case Intrinsic::hexagon_V6_vaddubh_128B: + case Intrinsic::hexagon_V6_vadduhw: + case Intrinsic::hexagon_V6_vadduhw_128B: + case Intrinsic::hexagon_V6_vaddhw: + case Intrinsic::hexagon_V6_vaddhw_128B: + case Intrinsic::hexagon_V6_vmaxb: + case Intrinsic::hexagon_V6_vmaxb_128B: + case Intrinsic::hexagon_V6_vmaxh: + case Intrinsic::hexagon_V6_vmaxh_128B: + case Intrinsic::hexagon_V6_vmaxw: + case Intrinsic::hexagon_V6_vmaxw_128B: + case Intrinsic::hexagon_V6_vmaxub: + case Intrinsic::hexagon_V6_vmaxub_128B: + case Intrinsic::hexagon_V6_vmaxuh: + case Intrinsic::hexagon_V6_vmaxuh_128B: + case Intrinsic::hexagon_V6_vminub: + case Intrinsic::hexagon_V6_vminub_128B: + case Intrinsic::hexagon_V6_vminuh: + case Intrinsic::hexagon_V6_vminuh_128B: + case Intrinsic::hexagon_V6_vminb: + case Intrinsic::hexagon_V6_vminb_128B: + case Intrinsic::hexagon_V6_vminh: + case Intrinsic::hexagon_V6_vminh_128B: + case Intrinsic::hexagon_V6_vminw: + case Intrinsic::hexagon_V6_vminw_128B: + case Intrinsic::hexagon_V6_vmpyub: + case Intrinsic::hexagon_V6_vmpyub_128B: + case Intrinsic::hexagon_V6_vmpyuh: + case Intrinsic::hexagon_V6_vmpyuh_128B: + case Intrinsic::hexagon_V6_vavgub: + case Intrinsic::hexagon_V6_vavgub_128B: + case Intrinsic::hexagon_V6_vavgh: + case Intrinsic::hexagon_V6_vavgh_128B: + case Intrinsic::hexagon_V6_vavguh: + case Intrinsic::hexagon_V6_vavguh_128B: + case Intrinsic::hexagon_V6_vavgw: + case Intrinsic::hexagon_V6_vavgw_128B: + case Intrinsic::hexagon_V6_vavgb: + case Intrinsic::hexagon_V6_vavgb_128B: + case Intrinsic::hexagon_V6_vavguw: + case Intrinsic::hexagon_V6_vavguw_128B: + case Intrinsic::hexagon_V6_vabsdiffh: + case Intrinsic::hexagon_V6_vabsdiffh_128B: + case Intrinsic::hexagon_V6_vabsdiffub: + case Intrinsic::hexagon_V6_vabsdiffub_128B: + case Intrinsic::hexagon_V6_vabsdiffuh: + case Intrinsic::hexagon_V6_vabsdiffuh_128B: + case Intrinsic::hexagon_V6_vabsdiffw: + case Intrinsic::hexagon_V6_vabsdiffw_128B: + return true; + default: + return false; + } +} + bool HexagonVectorLoopCarriedReuse::isEquivalentOperation(Instruction *I1, Instruction *I2) { if (!I1->isSameOperationAs(I2)) @@ -361,13 +431,19 @@ bool
HexagonVectorLoopCarriedReuse::isEquivalentOperation(Instruction *I1, bool HexagonVectorLoopCarriedReuse::canReplace(Instruction *I) { const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); - if (II && - (II->getIntrinsicID() == Intrinsic::hexagon_V6_hi || - II->getIntrinsicID() == Intrinsic::hexagon_V6_lo)) { + if (!II) + return true; + + switch (II->getIntrinsicID()) { + case Intrinsic::hexagon_V6_hi: + case Intrinsic::hexagon_V6_lo: + case Intrinsic::hexagon_V6_hi_128B: + case Intrinsic::hexagon_V6_lo_128B: LLVM_DEBUG(dbgs() << "Not considering for reuse: " << *II << "\n"); return false; + default: + return true; } - return true; } void HexagonVectorLoopCarriedReuse::findValueToReuse() { for (auto *D : Dependences) { @@ -428,27 +504,85 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() { int NumOperands = I->getNumOperands(); - for (int OpNo = 0; OpNo < NumOperands; ++OpNo) { - Value *Op = I->getOperand(OpNo); - Instruction *OpInst = dyn_cast<Instruction>(Op); - if (!OpInst) - continue; - - Value *BEOp = BEUser->getOperand(OpNo); - Instruction *BEOpInst = dyn_cast<Instruction>(BEOp); - - if (!isDepChainBtwn(OpInst, BEOpInst, Iters)) { - BEUser = nullptr; - break; + // Take the operands of each PNUser one by one and try to find a DepChain + // with every operand of the BEUser. If any operand of the BEUser has a + // DepChain with the current operand of the PNUser, break the matcher + // loop. Keep doing this for every PNUser operand. If a PNUser operand + // does not have a DepChain with any of the BEUser operands, break the + // outer matcher loop, mark the BEUser as null and reset the ReuseCandidate. + // This ensures that a DepChain exists for every PNUser operand with some + // BEUser operand, and that the DepChains are independent of the operand + // positions in PNUser and BEUser. + std::map<Instruction *, DepChain *> DepChains; + CallInst *C1 = dyn_cast<CallInst>(I); + if ((I && I->isCommutative()) || (C1 && isCallInstCommutative(C1))) { + bool Found = false; + for (int OpNo = 0; OpNo < NumOperands; ++OpNo) { + Value *Op = I->getOperand(OpNo); + Instruction *OpInst = dyn_cast<Instruction>(Op); + Found = false; + for (int T = 0; T < NumOperands; ++T) { + Value *BEOp = BEUser->getOperand(T); + Instruction *BEOpInst = dyn_cast<Instruction>(BEOp); + if (!OpInst && !BEOpInst) { + if (Op == BEOp) { + Found = true; + break; + } + } + + if ((OpInst && !BEOpInst) || (!OpInst && BEOpInst)) + continue; + + DepChain *D = getDepChainBtwn(OpInst, BEOpInst, Iters); + + if (D) { + Found = true; + DepChains[OpInst] = D; + break; + } + } + if (!Found) { + BEUser = nullptr; + break; + } + } + } else { + + for (int OpNo = 0; OpNo < NumOperands; ++OpNo) { + Value *Op = I->getOperand(OpNo); + Value *BEOp = BEUser->getOperand(OpNo); + + Instruction *OpInst = dyn_cast<Instruction>(Op); + if (!OpInst) { + if (Op == BEOp) + continue; + // Do not allow reuse to occur when the operands may be different + // values.
+ BEUser = nullptr; + break; + } + + Instruction *BEOpInst = dyn_cast(BEOp); + DepChain *D = getDepChainBtwn(OpInst, BEOpInst, Iters); + + if (D) { + DepChains[OpInst] = D; + } else { + BEUser = nullptr; + break; + } } } if (BEUser) { LLVM_DEBUG(dbgs() << "Found Value for reuse.\n"); ReuseCandidate.Inst2Replace = I; ReuseCandidate.BackedgeInst = BEUser; + ReuseCandidate.DepChains = DepChains; + ReuseCandidate.Iterations = Iters; return; - } else - ReuseCandidate.reset(); + } + ReuseCandidate.reset(); } } } @@ -468,27 +602,10 @@ void HexagonVectorLoopCarriedReuse::reuseValue() { Instruction *Inst2Replace = ReuseCandidate.Inst2Replace; Instruction *BEInst = ReuseCandidate.BackedgeInst; int NumOperands = Inst2Replace->getNumOperands(); - std::map DepChains; - int Iterations = -1; + std::map &DepChains = ReuseCandidate.DepChains; + int Iterations = ReuseCandidate.Iterations; BasicBlock *LoopPH = CurLoop->getLoopPreheader(); - - for (int i = 0; i < NumOperands; ++i) { - Instruction *I = dyn_cast(Inst2Replace->getOperand(i)); - if(!I) - continue; - else { - Instruction *J = cast(BEInst->getOperand(i)); - DepChain *D = getDepChainBtwn(I, J); - - assert(D && - "No DepChain between corresponding operands in ReuseCandidate\n"); - if (Iterations == -1) - Iterations = D->iterations(); - assert(Iterations == D->iterations() && "Iterations mismatch"); - DepChains[I] = D; - } - } - + assert(!DepChains.empty() && "No DepChains"); LLVM_DEBUG(dbgs() << "reuseValue is making the following changes\n"); SmallVector InstsInPreheader; @@ -597,20 +714,11 @@ void HexagonVectorLoopCarriedReuse::findDepChainFromPHI(Instruction *I, } } -bool HexagonVectorLoopCarriedReuse::isDepChainBtwn(Instruction *I1, - Instruction *I2, - int Iters) { - for (auto *D : Dependences) { - if (D->front() == I1 && D->back() == I2 && D->iterations() == Iters) - return true; - } - return false; -} - DepChain *HexagonVectorLoopCarriedReuse::getDepChainBtwn(Instruction *I1, - Instruction *I2) { + Instruction *I2, + int Iters) { for (auto *D : Dependences) { - if (D->front() == I1 && D->back() == I2) + if (D->front() == I1 && D->back() == I2 && D->iterations() == Iters) return D; } return nullptr; diff --git a/lib/Target/Hexagon/HexagonVectorPrint.cpp b/lib/Target/Hexagon/HexagonVectorPrint.cpp index 18d2f2f4acde..65a8dcd75bdc 100644 --- a/lib/Target/Hexagon/HexagonVectorPrint.cpp +++ b/lib/Target/Hexagon/HexagonVectorPrint.cpp @@ -1,9 +1,8 @@ //===- HexagonVectorPrint.cpp - Generate vector printing instructions -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index af1e5429d0c2..7c0770926abe 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -1,13 +1,11 @@ //===-- HexagonAsmBackend.cpp - Hexagon Assembler Backend -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "Hexagon.h" #include "HexagonFixupKinds.h" #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCChecker.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index 6543d8313900..3c64893bae45 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -1,9 +1,8 @@ //===- HexagonBaseInfo.h - Top level definitions for Hexagon ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index e82e6b559f62..f678bf49322e 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -1,14 +1,13 @@ //===-- HexagonELFObjectWriter.cpp - Hexagon Target Descriptions ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "Hexagon.h" #include "MCTargetDesc/HexagonFixupKinds.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h b/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h index 347327669ad9..8b0ddbcb949f 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h @@ -1,9 +1,8 @@ //===-- HexagonFixupKinds.h - Hexagon Specific Fixup Entries --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp index 687e79a7dbab..6b9e63f5ac9e 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp @@ -1,9 +1,8 @@ //===- HexagonInstPrinter.cpp - Convert Hexagon MCInst to assembly syntax -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "HexagonInstPrinter.h" -#include "HexagonAsmPrinter.h" #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "llvm/MC/MCAsmInfo.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h index 17af046ce090..ca32c3c1f50f 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h @@ -1,9 +1,8 @@ //===-- HexagonInstPrinter.h - Convert Hexagon MCInst to assembly syntax --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp index 446b3b2ce668..f3da67562320 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- HexagonMCAsmInfo.cpp - Hexagon asm properties ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h index efeff2436234..e1f0a26cf858 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h @@ -1,9 +1,8 @@ //===-- HexagonTargetAsmInfo.h - Hexagon asm properties --------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 53f3cba052bc..fcd3758600c1 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -1,9 +1,8 @@ //===----- HexagonMCChecker.cpp - Instruction bundle checking -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/HexagonMCChecker.h" -#include "Hexagon.h" #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCShuffler.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index 7577baace20c..bc55ade9ccd7 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -1,9 +1,8 @@ //===- HexagonMCChecker.h - Instruction bundle checking ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 3382684803aa..95e23c99868a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -1,14 +1,12 @@ //===- HexagonMCCodeEmitter.cpp - Hexagon Target Descriptions -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "MCTargetDesc/HexagonMCCodeEmitter.h" -#include "Hexagon.h" #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonFixupKinds.h" #include "MCTargetDesc/HexagonMCExpr.h" @@ -378,7 +376,7 @@ void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, State.Bundle = &MI; State.Index = 0; size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1; - uint64_t Features = computeAvailableFeatures(STI.getFeatureBits()); + FeatureBitset Features = computeAvailableFeatures(STI.getFeatureBits()); for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) { MCInst &HMI = const_cast(*I.getInst()); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h index fcea63db23a3..9e86dc8e4989 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h @@ -1,9 +1,8 @@ //===- HexagonMCCodeEmitter.h - Hexagon Target Descriptions -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -83,9 +82,10 @@ private: // Return parse bits for instruction `MCI' inside bundle `MCB' uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const; - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp index 3eaef9ac7410..ed571188c1e8 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp @@ -1,9 +1,8 @@ //=== HexagonMCCompound.cpp - Hexagon Compound checker -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
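The HexagonMCCodeEmitter hunks above widen computeAvailableFeatures and verifyInstructionPredicates from a uint64_t mask to a FeatureBitset. A self-contained sketch of the motivation, under assumed numbers (the feature count and index below are illustrative, not Hexagon's actual values): a plain 64-bit mask silently drops any subtarget feature whose index is 64 or higher, while a wide bitset preserves it.

#include <bitset>
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::size_t kNumFeatures = 80; // assumed: a target with >64 features

int main() {
  std::bitset<kNumFeatures> FB;
  FB.set(70); // a feature index a uint64_t mask cannot represent
  // Emulate the old representation: only the low 64 bits survive.
  std::uint64_t Narrow = 0;
  for (std::size_t I = 0; I < 64; ++I)
    if (FB.test(I))
      Narrow |= std::uint64_t(1) << I;
  std::printf("bitset keeps bit 70: %d, 64-bit mask kept: 0x%llx\n",
              int(FB.test(70)), (unsigned long long)Narrow);
}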
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "Hexagon.h" #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCShuffler.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index f0654d612b4b..3cbb8600ce7a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -1,9 +1,8 @@ //===- HexagonMCDuplexInfo.cpp - Instruction bundle checking --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index f304bc50530f..f2432883af6f 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -1,9 +1,8 @@ //=== HexagonMCELFStreamer.cpp - Hexagon subclass of MCELFStreamer -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -60,7 +59,7 @@ HexagonMCELFStreamer::HexagonMCELFStreamer( MCII(createHexagonMCInstrInfo()) {} void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB, - const MCSubtargetInfo &STI, bool) { + const MCSubtargetInfo &STI) { assert(MCB.getOpcode() == Hexagon::BUNDLE); assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE); assert(HexagonMCInstrInfo::bundleSize(MCB) > 0); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h index c02bef8f06f7..6248bd25d433 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h @@ -1,9 +1,8 @@ //===- HexagonMCELFStreamer.h - Hexagon subclass of MCElfStreamer ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -31,8 +30,7 @@ public: std::unique_ptr Emitter, MCAssembler *Assembler); - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - bool) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; void EmitSymbol(const MCInst &Inst); void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment, diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp index f0689252b396..1e708ba1bcd3 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp @@ -1,10 +1,9 @@ //===-- HexagonMCExpr.cpp - Hexagon specific MC expression classes //----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h index acfd996ccf82..59b1326adf0c 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h @@ -1,9 +1,8 @@ //==- HexagonMCExpr.h - Hexagon specific MC expression classes --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index a11aa92ccbe1..0750bfe74f76 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -1,9 +1,8 @@ //===- HexagonMCInstrInfo.cpp - Hexagon sub-class of MCInst ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/HexagonMCInstrInfo.h" -#include "Hexagon.h" #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCChecker.h" #include "MCTargetDesc/HexagonMCExpr.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index d040bea23b6d..829f872c453e 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -1,9 +1,8 @@ //===- HexagonMCInstrInfo.cpp - Utility functions on Hexagon MCInsts ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp index 4281144acaee..7d45b4fcfdde 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp @@ -1,9 +1,8 @@ //===----- HexagonMCShuffler.cpp - MC bundle shuffling --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,7 +14,6 @@ #define DEBUG_TYPE "hexagon-shuffle" #include "MCTargetDesc/HexagonMCShuffler.h" -#include "Hexagon.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonShuffler.h" #include "llvm/MC/MCInst.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h index 59658999d24d..3410c0ddbd84 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h @@ -1,9 +1,8 @@ //===- HexagonMCShuffler.h --------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 92ce7345f358..9c50b25156c3 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- HexagonMCTargetDesc.cpp - Hexagon Target Descriptions -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,13 +11,13 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/HexagonMCTargetDesc.h" -#include "Hexagon.h" #include "HexagonDepArch.h" #include "HexagonTargetStreamer.h" #include "MCTargetDesc/HexagonInstPrinter.h" #include "MCTargetDesc/HexagonMCAsmInfo.h" #include "MCTargetDesc/HexagonMCELFStreamer.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" +#include "TargetInfo/HexagonTargetInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/ELF.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index d6ea664222d3..7b42460a2a1c 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- HexagonMCTargetDesc.h - Hexagon Target Descriptions -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -64,7 +63,6 @@ class StringRef; class raw_ostream; class raw_pwrite_stream; -Target &getTheHexagonTarget(); extern cl::opt HexagonDisableCompound; extern cl::opt HexagonDisableDuplex; extern const InstrStage HexagonStages[]; diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index f4ee2bbfaaaa..18c7790a17cc 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -1,9 +1,8 @@ //===- HexagonShuffler.cpp - Instruction bundle shuffling -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,7 +14,6 @@ #define DEBUG_TYPE "hexagon-shuffle" #include "MCTargetDesc/HexagonShuffler.h" -#include "Hexagon.h" #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" @@ -23,6 +21,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index ef50c5bebbfb..bf3bad36dfe5 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -1,9 +1,8 @@ //===- HexagonShuffler.h - Instruction bundle shuffling ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,8 +14,8 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONSHUFFLER_H #define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONSHUFFLER_H -#include "Hexagon.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp index 4339fa2089d9..7702024f87bd 100644 --- a/lib/Target/Hexagon/RDFCopy.cpp +++ b/lib/Target/Hexagon/RDFCopy.cpp @@ -1,9 +1,8 @@ //===- RDFCopy.cpp --------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/RDFCopy.h b/lib/Target/Hexagon/RDFCopy.h index 7b2e78bdf633..1450ab884849 100644 --- a/lib/Target/Hexagon/RDFCopy.h +++ b/lib/Target/Hexagon/RDFCopy.h @@ -1,9 +1,8 @@ //===- RDFCopy.h ------------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/RDFDeadCode.cpp b/lib/Target/Hexagon/RDFDeadCode.cpp index 8dcd485d65e9..52178931aa6d 100644 --- a/lib/Target/Hexagon/RDFDeadCode.cpp +++ b/lib/Target/Hexagon/RDFDeadCode.cpp @@ -1,9 +1,8 @@ //===--- RDFDeadCode.cpp --------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/RDFDeadCode.h b/lib/Target/Hexagon/RDFDeadCode.h index 8977e730b855..7f91977e1d6c 100644 --- a/lib/Target/Hexagon/RDFDeadCode.h +++ b/lib/Target/Hexagon/RDFDeadCode.h @@ -1,9 +1,8 @@ //===--- RDFDeadCode.h ----------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp index d8ca08e70505..9d8f706b8a0f 100644 --- a/lib/Target/Hexagon/RDFGraph.cpp +++ b/lib/Target/Hexagon/RDFGraph.cpp @@ -1,9 +1,8 @@ //===- RDFGraph.cpp -------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -55,7 +54,6 @@ raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P) { return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print &P) { auto &TRI = P.G.getTRI(); if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs()) @@ -66,7 +64,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print &P) { return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print &P) { auto NA = P.G.addr(P.Obj); uint16_t Attrs = NA.Addr->getAttrs(); @@ -116,7 +113,6 @@ static void printRefHeader(raw_ostream &OS, const NodeAddr RA, OS << '!'; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { printRefHeader(OS, P.Obj, P.G); OS << '('; @@ -134,7 +130,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { printRefHeader(OS, P.Obj, P.G); OS << '('; @@ -146,7 +141,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { printRefHeader(OS, P.Obj, P.G); @@ -162,7 +156,6 @@ raw_ostream &operator<< (raw_ostream &OS, return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { switch (P.Obj.Addr->getKind()) { case NodeAttrs::Def: @@ -178,7 +171,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print &P) { unsigned N = P.Obj.size(); for (auto I : P.Obj) { @@ -189,7 +181,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print &P) { return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print &P) { unsigned N = P.Obj.size(); for (auto I : P.Obj) { @@ -224,16 +215,13 @@ namespace { } // end anonymous namespace -template<> raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { OS << Print(P.Obj.Id, P.G) << ": phi [" << PrintListV(P.Obj.Addr->members(P.G), P.G) << ']'; return OS; } -template<> -raw_ostream &operator<< (raw_ostream &OS, - const Print> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print> &P) { const MachineInstr &MI = *P.Obj.Addr->getCode(); unsigned Opc = MI.getOpcode(); OS << Print(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc); @@ -258,7 +246,6 @@ raw_ostream &operator<< (raw_ostream &OS, return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { switch (P.Obj.Addr->getKind()) { @@ -275,7 +262,6 @@ raw_ostream &operator<< (raw_ostream &OS, return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print> &P) { MachineBasicBlock *BB = P.Obj.Addr->getCode(); @@ -309,9 +295,7 @@ raw_ostream &operator<< (raw_ostream &OS, return OS; } -template<> -raw_ostream &operator<< (raw_ostream &OS, - const Print> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print> &P) { OS << "DFG dump:[\n" << Print(P.Obj.Id, P.G) << ": Function: " << P.Obj.Addr->getCode()->getName() << '\n'; for (auto I : P.Obj.Addr->members(P.G)) @@ -320,7 +304,6 @@ raw_ostream &operator<< (raw_ostream &OS, return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print &P) { OS << '{'; for (auto I : P.Obj) @@ -329,13 +312,11 @@ raw_ostream &operator<< (raw_ostream &OS, const Print &P) { return OS; } -template<> raw_ostream &operator<< (raw_ostream &OS, const Print &P) { P.Obj.print(OS); return OS; } -template<> raw_ostream &operator<< 
(raw_ostream &OS, const Print &P) { for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) { diff --git a/lib/Target/Hexagon/RDFGraph.h b/lib/Target/Hexagon/RDFGraph.h index e3abb0e22f76..585f43e116f9 100644 --- a/lib/Target/Hexagon/RDFGraph.h +++ b/lib/Target/Hexagon/RDFGraph.h @@ -1,9 +1,8 @@ //===- RDFGraph.h -----------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -925,10 +924,6 @@ namespace rdf { return MM; } - template struct Print; - template - raw_ostream &operator<< (raw_ostream &OS, const Print &P); - template struct Print { Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {} @@ -943,6 +938,29 @@ namespace rdf { : Print>(x, g) {} }; + raw_ostream &operator<<(raw_ostream &OS, const Print &P); + raw_ostream &operator<<(raw_ostream &OS, const Print &P); + raw_ostream &operator<<(raw_ostream &OS, const Print> &P); + raw_ostream &operator<<(raw_ostream &OS, const Print> &P); + raw_ostream &operator<<(raw_ostream &OS, + const Print> &P); + raw_ostream &operator<<(raw_ostream &OS, const Print> &P); + raw_ostream &operator<<(raw_ostream &OS, const Print &P); + raw_ostream &operator<<(raw_ostream &OS, const Print &P); + raw_ostream &operator<<(raw_ostream &OS, const Print> &P); + raw_ostream &operator<<(raw_ostream &OS, + const Print> &P); + raw_ostream &operator<<(raw_ostream &OS, + const Print> &P); + raw_ostream &operator<<(raw_ostream &OS, + const Print> &P); + raw_ostream &operator<<(raw_ostream &OS, + const Print> &P); + raw_ostream &operator<<(raw_ostream &OS, const Print &P); + raw_ostream &operator<<(raw_ostream &OS, const Print &P); + raw_ostream &operator<<(raw_ostream &OS, + const Print &P); + } // end namespace rdf } // end namespace llvm diff --git a/lib/Target/Hexagon/RDFLiveness.cpp b/lib/Target/Hexagon/RDFLiveness.cpp index 9ff48d25a026..9cd304aa10bc 100644 --- a/lib/Target/Hexagon/RDFLiveness.cpp +++ b/lib/Target/Hexagon/RDFLiveness.cpp @@ -1,9 +1,8 @@ //===- RDFLiveness.cpp ----------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -58,7 +57,6 @@ static cl::opt MaxRecNest("rdf-liveness-max-rec", cl::init(25), namespace llvm { namespace rdf { - template<> raw_ostream &operator<< (raw_ostream &OS, const Print &P) { OS << '{'; for (auto &I : P.Obj) { diff --git a/lib/Target/Hexagon/RDFLiveness.h b/lib/Target/Hexagon/RDFLiveness.h index eaeb4ea115b3..ea4890271726 100644 --- a/lib/Target/Hexagon/RDFLiveness.h +++ b/lib/Target/Hexagon/RDFLiveness.h @@ -1,9 +1,8 @@ //===- RDFLiveness.h --------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -143,6 +142,8 @@ namespace rdf { unsigned Nest, unsigned MaxNest); }; + raw_ostream &operator<<(raw_ostream &OS, const Print &P); + } // end namespace rdf } // end namespace llvm diff --git a/lib/Target/Hexagon/RDFRegisters.cpp b/lib/Target/Hexagon/RDFRegisters.cpp index 9408c5dc3952..6e0f33695f0e 100644 --- a/lib/Target/Hexagon/RDFRegisters.cpp +++ b/lib/Target/Hexagon/RDFRegisters.cpp @@ -1,9 +1,8 @@ //===- RDFRegisters.cpp ---------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/RDFRegisters.h b/lib/Target/Hexagon/RDFRegisters.h index 459850d87df1..646233bacda5 100644 --- a/lib/Target/Hexagon/RDFRegisters.h +++ b/lib/Target/Hexagon/RDFRegisters.h @@ -1,9 +1,8 @@ //===- RDFRegisters.h -------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp index 78e2f2b2ddb3..d77b235d0077 100644 --- a/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp +++ b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp @@ -1,14 +1,12 @@ //===-- HexagonTargetInfo.cpp - Hexagon Target Implementation ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "Hexagon.h" -#include "llvm/IR/Module.h" +#include "TargetInfo/HexagonTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.h b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.h new file mode 100644 index 000000000000..902b61cb5b6c --- /dev/null +++ b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.h @@ -0,0 +1,20 @@ +//===-- HexagonTargetInfo.h - Hexagon Target Implementation -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_HEXAGON_TARGETINFO_HEXAGONTARGETINFO_H +#define LLVM_LIB_TARGET_HEXAGON_TARGETINFO_HEXAGONTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheHexagonTarget(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_HEXAGON_TARGETINFO_HEXAGONTARGETINFO_H diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index a77b2b8f15ca..9af8a0b35b2f 100644 --- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -1,16 +1,16 @@ //===-- LanaiAsmParser.cpp - Parse Lanai assembly to MCInst instructions --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "Lanai.h" #include "LanaiAluCode.h" #include "LanaiCondCode.h" +#include "LanaiInstrInfo.h" #include "MCTargetDesc/LanaiMCExpr.h" +#include "TargetInfo/LanaiTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" diff --git a/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp index 609b650e5d32..25ae7c521706 100644 --- a/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp +++ b/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp @@ -1,9 +1,8 @@ //===- LanaiDisassembler.cpp - Disassembler for Lanai -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
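The new TargetInfo/HexagonTargetInfo.h above shows the layering this import applies across targets: getTheHexagonTarget() moves out of Hexagon.h and the MCTargetDesc header so that low-level libraries can name the Target singleton without depending on the whole backend. A minimal sketch of that singleton-accessor pattern (names are illustrative, not the LLVM API):

namespace demo {
class Target {}; // stand-in for llvm::Target

Target &getTheDemoTarget() { // defined once, in the TargetInfo-style library
  static Target TheTarget;
  return TheTarget;
}
} // namespace demo

int main() {
  demo::Target &T1 = demo::getTheDemoTarget();
  demo::Target &T2 = demo::getTheDemoTarget();
  return &T1 == &T2 ? 0 : 1; // exit 0: every caller sees the same object
}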
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,8 +12,10 @@ #include "LanaiDisassembler.h" -#include "Lanai.h" -#include "LanaiSubtarget.h" +#include "LanaiAluCode.h" +#include "LanaiCondCode.h" +#include "LanaiInstrInfo.h" +#include "TargetInfo/LanaiTargetInfo.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" diff --git a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h index e0c19e8ea644..ae821df303d8 100644 --- a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h +++ b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h @@ -1,9 +1,8 @@ //===- LanaiDisassembler.cpp - Disassembler for Lanai -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp deleted file mode 100644 index 2fa411fcfd87..000000000000 --- a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp +++ /dev/null @@ -1,305 +0,0 @@ -//===-- LanaiInstPrinter.cpp - Convert Lanai MCInst to asm syntax ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an Lanai MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#include "LanaiInstPrinter.h" -#include "Lanai.h" -#include "MCTargetDesc/LanaiMCExpr.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" - -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -// Include the auto-generated portion of the assembly writer. 
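For reference while reading the printer code removed below (this import relocates instruction printers out of the per-target InstPrinter directories): the pre-/post-increment aliases hinge on the ALU code being a pre- or post-op ADD whose immediate is +N or -N. A simplified, self-contained sketch of that test, using a stand-in struct rather than the real LPAC interface:

#include <cstdio>

struct MemInst { bool IsPreOp, IsPostOp; int Imm; }; // stand-in for MCInst + LPAC

static bool usesGivenOffset(const MemInst &MI, int AddOffset) {
  return MI.Imm == AddOffset || MI.Imm == -AddOffset;
}
static bool isPreIncrementForm(const MemInst &MI, int AddOffset) {
  return MI.IsPreOp && usesGivenOffset(MI, AddOffset);
}
static bool isPostIncrementForm(const MemInst &MI, int AddOffset) {
  return MI.IsPostOp && usesGivenOffset(MI, AddOffset);
}

int main() {
  MemInst Ld = {/*IsPreOp=*/true, /*IsPostOp=*/false, /*Imm=*/-4};
  // A word load with a pre-op ADD of -4 would print as: ld [--%rN], %rX
  std::printf("pre=%d post=%d op=%s\n", isPreIncrementForm(Ld, 4),
              isPostIncrementForm(Ld, 4), Ld.Imm < 0 ? "--" : "++");
}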
-#define PRINT_ALIAS_INSTR -#include "LanaiGenAsmWriter.inc" - -void LanaiInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << StringRef(getRegisterName(RegNo)).lower(); -} - -bool LanaiInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Alias, unsigned OpNo0, - unsigned OpNo1) { - OS << "\t" << Alias << " "; - printOperand(MI, OpNo0, OS); - OS << ", "; - printOperand(MI, OpNo1, OS); - return true; -} - -static bool usesGivenOffset(const MCInst *MI, int AddOffset) { - unsigned AluCode = MI->getOperand(3).getImm(); - return LPAC::encodeLanaiAluCode(AluCode) == LPAC::ADD && - (MI->getOperand(2).getImm() == AddOffset || - MI->getOperand(2).getImm() == -AddOffset); -} - -static bool isPreIncrementForm(const MCInst *MI, int AddOffset) { - unsigned AluCode = MI->getOperand(3).getImm(); - return LPAC::isPreOp(AluCode) && usesGivenOffset(MI, AddOffset); -} - -static bool isPostIncrementForm(const MCInst *MI, int AddOffset) { - unsigned AluCode = MI->getOperand(3).getImm(); - return LPAC::isPostOp(AluCode) && usesGivenOffset(MI, AddOffset); -} - -static StringRef decIncOperator(const MCInst *MI) { - if (MI->getOperand(2).getImm() < 0) - return "--"; - return "++"; -} - -bool LanaiInstPrinter::printMemoryLoadIncrement(const MCInst *MI, - raw_ostream &OS, - StringRef Opcode, - int AddOffset) { - if (isPreIncrementForm(MI, AddOffset)) { - OS << "\t" << Opcode << "\t[" << decIncOperator(MI) << "%" - << getRegisterName(MI->getOperand(1).getReg()) << "], %" - << getRegisterName(MI->getOperand(0).getReg()); - return true; - } - if (isPostIncrementForm(MI, AddOffset)) { - OS << "\t" << Opcode << "\t[%" - << getRegisterName(MI->getOperand(1).getReg()) << decIncOperator(MI) - << "], %" << getRegisterName(MI->getOperand(0).getReg()); - return true; - } - return false; -} - -bool LanaiInstPrinter::printMemoryStoreIncrement(const MCInst *MI, - raw_ostream &OS, - StringRef Opcode, - int AddOffset) { - if (isPreIncrementForm(MI, AddOffset)) { - OS << "\t" << Opcode << "\t%" << getRegisterName(MI->getOperand(0).getReg()) - << ", [" << decIncOperator(MI) << "%" - << getRegisterName(MI->getOperand(1).getReg()) << "]"; - return true; - } - if (isPostIncrementForm(MI, AddOffset)) { - OS << "\t" << Opcode << "\t%" << getRegisterName(MI->getOperand(0).getReg()) - << ", [%" << getRegisterName(MI->getOperand(1).getReg()) - << decIncOperator(MI) << "]"; - return true; - } - return false; -} - -bool LanaiInstPrinter::printAlias(const MCInst *MI, raw_ostream &OS) { - switch (MI->getOpcode()) { - case Lanai::LDW_RI: - // ld 4[*%rN], %rX => ld [++imm], %rX - // ld -4[*%rN], %rX => ld [--imm], %rX - // ld 4[%rN*], %rX => ld [imm++], %rX - // ld -4[%rN*], %rX => ld [imm--], %rX - return printMemoryLoadIncrement(MI, OS, "ld", 4); - case Lanai::LDHs_RI: - return printMemoryLoadIncrement(MI, OS, "ld.h", 2); - case Lanai::LDHz_RI: - return printMemoryLoadIncrement(MI, OS, "uld.h", 2); - case Lanai::LDBs_RI: - return printMemoryLoadIncrement(MI, OS, "ld.b", 1); - case Lanai::LDBz_RI: - return printMemoryLoadIncrement(MI, OS, "uld.b", 1); - case Lanai::SW_RI: - // st %rX, 4[*%rN] => st %rX, [++imm] - // st %rX, -4[*%rN] => st %rX, [--imm] - // st %rX, 4[%rN*] => st %rX, [imm++] - // st %rX, -4[%rN*] => st %rX, [imm--] - return printMemoryStoreIncrement(MI, OS, "st", 4); - case Lanai::STH_RI: - return printMemoryStoreIncrement(MI, OS, "st.h", 2); - case Lanai::STB_RI: - return printMemoryStoreIncrement(MI, OS, "st.b", 1); - default: - return false; - } -} - -void LanaiInstPrinter::printInst(const 
MCInst *MI, raw_ostream &OS, - StringRef Annotation, - const MCSubtargetInfo & /*STI*/) { - if (!printAlias(MI, OS) && !printAliasInstr(MI, OS)) - printInstruction(MI, OS); - printAnnotation(OS, Annotation); -} - -void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &OS, const char *Modifier) { - assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) - OS << "%" << getRegisterName(Op.getReg()); - else if (Op.isImm()) - OS << formatHex(Op.getImm()); - else { - assert(Op.isExpr() && "Expected an expression"); - Op.getExpr()->print(OS, &MAI); - } -} - -void LanaiInstPrinter::printMemImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - OS << '[' << formatHex(Op.getImm()) << ']'; - } else { - // Symbolic operand will be lowered to immediate value by linker - assert(Op.isExpr() && "Expected an expression"); - OS << '['; - Op.getExpr()->print(OS, &MAI); - OS << ']'; - } -} - -void LanaiInstPrinter::printHi16ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - OS << formatHex(Op.getImm() << 16); - } else { - // Symbolic operand will be lowered to immediate value by linker - assert(Op.isExpr() && "Expected an expression"); - Op.getExpr()->print(OS, &MAI); - } -} - -void LanaiInstPrinter::printHi16AndImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - OS << formatHex((Op.getImm() << 16) | 0xffff); - } else { - // Symbolic operand will be lowered to immediate value by linker - assert(Op.isExpr() && "Expected an expression"); - Op.getExpr()->print(OS, &MAI); - } -} - -void LanaiInstPrinter::printLo16AndImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - OS << formatHex(0xffff0000 | Op.getImm()); - } else { - // Symbolic operand will be lowered to immediate value by linker - assert(Op.isExpr() && "Expected an expression"); - Op.getExpr()->print(OS, &MAI); - } -} - -static void printMemoryBaseRegister(raw_ostream &OS, const unsigned AluCode, - const MCOperand &RegOp) { - assert(RegOp.isReg() && "Register operand expected"); - OS << "["; - if (LPAC::isPreOp(AluCode)) - OS << "*"; - OS << "%" << LanaiInstPrinter::getRegisterName(RegOp.getReg()); - if (LPAC::isPostOp(AluCode)) - OS << "*"; - OS << "]"; -} - -template -static void printMemoryImmediateOffset(const MCAsmInfo &MAI, - const MCOperand &OffsetOp, - raw_ostream &OS) { - assert((OffsetOp.isImm() || OffsetOp.isExpr()) && "Immediate expected"); - if (OffsetOp.isImm()) { - assert(isInt(OffsetOp.getImm()) && "Constant value truncated"); - OS << OffsetOp.getImm(); - } else - OffsetOp.getExpr()->print(OS, &MAI); -} - -void LanaiInstPrinter::printMemRiOperand(const MCInst *MI, int OpNo, - raw_ostream &OS, - const char * /*Modifier*/) { - const MCOperand &RegOp = MI->getOperand(OpNo); - const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); - const MCOperand &AluOp = MI->getOperand(OpNo + 2); - const unsigned AluCode = AluOp.getImm(); - - // Offset - printMemoryImmediateOffset<16>(MAI, OffsetOp, OS); - - // Register - printMemoryBaseRegister(OS, AluCode, RegOp); -} - -void LanaiInstPrinter::printMemRrOperand(const MCInst *MI, int OpNo, - raw_ostream &OS, - const char * /*Modifier*/) { - const MCOperand &RegOp = 
MI->getOperand(OpNo); - const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); - const MCOperand &AluOp = MI->getOperand(OpNo + 2); - const unsigned AluCode = AluOp.getImm(); - assert(OffsetOp.isReg() && RegOp.isReg() && "Registers expected."); - - // [ Base OP Offset ] - OS << "["; - if (LPAC::isPreOp(AluCode)) - OS << "*"; - OS << "%" << getRegisterName(RegOp.getReg()); - if (LPAC::isPostOp(AluCode)) - OS << "*"; - OS << " " << LPAC::lanaiAluCodeToString(AluCode) << " "; - OS << "%" << getRegisterName(OffsetOp.getReg()); - OS << "]"; -} - -void LanaiInstPrinter::printMemSplsOperand(const MCInst *MI, int OpNo, - raw_ostream &OS, - const char * /*Modifier*/) { - const MCOperand &RegOp = MI->getOperand(OpNo); - const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); - const MCOperand &AluOp = MI->getOperand(OpNo + 2); - const unsigned AluCode = AluOp.getImm(); - - // Offset - printMemoryImmediateOffset<10>(MAI, OffsetOp, OS); - - // Register - printMemoryBaseRegister(OS, AluCode, RegOp); -} - -void LanaiInstPrinter::printCCOperand(const MCInst *MI, int OpNo, - raw_ostream &OS) { - LPCC::CondCode CC = - static_cast(MI->getOperand(OpNo).getImm()); - // Handle the undefined value here for printing so we don't abort(). - if (CC >= LPCC::UNKNOWN) - OS << ""; - else - OS << lanaiCondCodeToString(CC); -} - -void LanaiInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - LPCC::CondCode CC = - static_cast(MI->getOperand(OpNo).getImm()); - // Handle the undefined value here for printing so we don't abort(). - if (CC >= LPCC::UNKNOWN) - OS << ""; - else if (CC != LPCC::ICC_T) - OS << "." << lanaiCondCodeToString(CC); -} diff --git a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h deleted file mode 100644 index 59904fbaa318..000000000000 --- a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h +++ /dev/null @@ -1,66 +0,0 @@ -//= LanaiInstPrinter.h - Convert Lanai MCInst to asm syntax -------*- C++ -*--// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints a Lanai MCInst to a .s file. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H -#define LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H - -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { - -class LanaiInstPrinter : public MCInstPrinter { -public: - LanaiInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = nullptr); - void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemRiOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = nullptr); - void printMemRrOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = nullptr); - void printMemSplsOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = nullptr); - void printCCOperand(const MCInst *MI, int OpNo, raw_ostream &O); - void printAluOperand(const MCInst *MI, int OpNo, raw_ostream &O); - void printHi16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printHi16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printLo16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - - // Autogenerated by tblgen. - void printInstruction(const MCInst *MI, raw_ostream &O); - bool printAliasInstr(const MCInst *MI, raw_ostream &OS); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - -private: - bool printAlias(const MCInst *MI, raw_ostream &Ostream); - bool printInst(const MCInst *MI, raw_ostream &Ostream, StringRef Alias, - unsigned OpNo0, unsigned OpnNo1); - bool printMemoryLoadIncrement(const MCInst *MI, raw_ostream &Ostream, - StringRef Opcode, int AddOffset); - bool printMemoryStoreIncrement(const MCInst *MI, raw_ostream &Ostream, - StringRef Opcode, int AddOffset); -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H diff --git a/lib/Target/Lanai/Lanai.h b/lib/Target/Lanai/Lanai.h index c1fdf793305b..2f06ea91ab03 100644 --- a/lib/Target/Lanai/Lanai.h +++ b/lib/Target/Lanai/Lanai.h @@ -1,9 +1,8 @@ //===-- Lanai.h - Top-level interface for Lanai representation --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,12 +14,7 @@ #ifndef LLVM_LIB_TARGET_LANAI_LANAI_H #define LLVM_LIB_TARGET_LANAI_LANAI_H -#include "LanaiAluCode.h" -#include "LanaiCondCode.h" -#include "MCTargetDesc/LanaiBaseInfo.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" -#include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Pass.h" namespace llvm { class FunctionPass; @@ -45,7 +39,6 @@ FunctionPass *createLanaiMemAluCombinerPass(); // operations. FunctionPass *createLanaiSetflagAluCombinerPass(); -Target &getTheLanaiTarget(); } // namespace llvm #endif // LLVM_LIB_TARGET_LANAI_LANAI_H diff --git a/lib/Target/Lanai/Lanai.td b/lib/Target/Lanai/Lanai.td index 73d080457034..c6d949f42047 100644 --- a/lib/Target/Lanai/Lanai.td +++ b/lib/Target/Lanai/Lanai.td @@ -1,9 +1,8 @@ //===- Lanai.td - Describe the Lanai Target Machine --------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/LanaiAluCode.h b/lib/Target/Lanai/LanaiAluCode.h index d5145694fe46..728332bff00b 100644 --- a/lib/Target/Lanai/LanaiAluCode.h +++ b/lib/Target/Lanai/LanaiAluCode.h @@ -1,9 +1,8 @@ //===-- LanaiAluCode.h - ALU operator encoding ----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiAsmPrinter.cpp b/lib/Target/Lanai/LanaiAsmPrinter.cpp index 607b2a97b29f..64d963475e1a 100644 --- a/lib/Target/Lanai/LanaiAsmPrinter.cpp +++ b/lib/Target/Lanai/LanaiAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- LanaiAsmPrinter.cpp - Lanai LLVM assembly writer ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,11 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "InstPrinter/LanaiInstPrinter.h" -#include "Lanai.h" +#include "MCTargetDesc/LanaiInstPrinter.h" +#include "LanaiAluCode.h" +#include "LanaiCondCode.h" #include "LanaiInstrInfo.h" #include "LanaiMCInstLower.h" #include "LanaiTargetMachine.h" +#include "TargetInfo/LanaiTargetInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -49,8 +50,7 @@ public: void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; void EmitInstruction(const MachineInstr *MI) override; bool isBlockOnlyReachableByFallthrough( const MachineBasicBlock *MBB) const override; @@ -109,7 +109,6 @@ void LanaiAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, // PrintAsmOperand - Print out an operand for an inline asm expression. bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned /*AsmVariant*/, const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { @@ -139,7 +138,7 @@ bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return false; } default: - return true; // Unknown modifier. + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); } } printOperand(MI, OpNo, O); diff --git a/lib/Target/Lanai/LanaiCallingConv.td b/lib/Target/Lanai/LanaiCallingConv.td index 056b329c33c5..e2306725290a 100644 --- a/lib/Target/Lanai/LanaiCallingConv.td +++ b/lib/Target/Lanai/LanaiCallingConv.td @@ -1,9 +1,8 @@ //===- LanaiCallingConv.td - Calling Conventions Lanai -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp index ea76a1128373..09c63dca23e2 100644 --- a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp +++ b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp @@ -1,9 +1,8 @@ //===-- LanaiDelaySlotFiller.cpp - Lanai delay slot filler ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
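The PrintAsmOperand hunk above replaces the hard failure on unknown inline-asm modifiers (return true) with a fallback to the generic AsmPrinter implementation. A standalone analogy of that dispatch pattern, with illustrative names rather than the real AsmPrinter interface:

// Standalone analogy of the fallback: the target handler serves its own
// modifiers and defers anything it does not recognize to the base class
// instead of reporting an error. Names are illustrative only.
#include <iostream>

struct BasePrinter {
  // Follows the AsmPrinter convention: returns true on error.
  virtual bool printOperand(char Modifier, std::ostream &OS) {
    OS << "<generic '" << Modifier << "'>";
    return false;
  }
  virtual ~BasePrinter() = default;
};

struct TargetPrinter : BasePrinter {
  bool printOperand(char Modifier, std::ostream &OS) override {
    if (Modifier == 'H') { // target-specific modifier
      OS << "<high half>";
      return false;
    }
    // Previously: return true (unknown modifier treated as an error).
    return BasePrinter::printOperand(Modifier, OS);
  }
};

int main() {
  TargetPrinter P;
  P.printOperand('H', std::cout); // handled by the target
  std::cout << '\n';
  P.printOperand('c', std::cout); // now handled generically
  std::cout << '\n';
  return 0;
}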
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiFrameLowering.cpp b/lib/Target/Lanai/LanaiFrameLowering.cpp index 0723668c743e..142c09c504cc 100644 --- a/lib/Target/Lanai/LanaiFrameLowering.cpp +++ b/lib/Target/Lanai/LanaiFrameLowering.cpp @@ -1,9 +1,8 @@ //===-- LanaiFrameLowering.cpp - Lanai Frame Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,8 +12,8 @@ #include "LanaiFrameLowering.h" +#include "LanaiAluCode.h" #include "LanaiInstrInfo.h" -#include "LanaiMachineFunctionInfo.h" #include "LanaiSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h index ca690d513fc2..5fe4535543ec 100644 --- a/lib/Target/Lanai/LanaiFrameLowering.h +++ b/lib/Target/Lanai/LanaiFrameLowering.h @@ -1,9 +1,8 @@ //===-- LanaiFrameLowering.h - Define frame lowering for Lanai --*- C++-*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,7 +13,6 @@ #ifndef LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H #define LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H -#include "Lanai.h" #include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/Lanai/LanaiISelDAGToDAG.cpp b/lib/Target/Lanai/LanaiISelDAGToDAG.cpp index 5081cfbe4922..aadcdc43f560 100644 --- a/lib/Target/Lanai/LanaiISelDAGToDAG.cpp +++ b/lib/Target/Lanai/LanaiISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- LanaiISelDAGToDAG.cpp - A dag to dag inst selector for Lanai ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "Lanai.h" +#include "LanaiAluCode.h" #include "LanaiMachineFunctionInfo.h" #include "LanaiRegisterInfo.h" #include "LanaiSubtarget.h" diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp index 0411704be6fb..1ed078bb433f 100644 --- a/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/lib/Target/Lanai/LanaiISelLowering.cpp @@ -1,9 +1,8 @@ //===-- LanaiISelLowering.cpp - Lanai DAG Lowering Implementation ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h index 0cde633cb41a..e7b5755e9041 100644 --- a/lib/Target/Lanai/LanaiISelLowering.h +++ b/lib/Target/Lanai/LanaiISelLowering.h @@ -1,9 +1,8 @@ //===-- LanaiISelLowering.h - Lanai DAG Lowering Interface -....-*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiInstrFormats.td b/lib/Target/Lanai/LanaiInstrFormats.td index 1bb6b3d26a49..4101aa912ade 100644 --- a/lib/Target/Lanai/LanaiInstrFormats.td +++ b/lib/Target/Lanai/LanaiInstrFormats.td @@ -1,9 +1,8 @@ //===- LanaiInstrFormats.td - Lanai Instruction Formats ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp index 196768fdc56a..700a86069102 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- LanaiInstrInfo.cpp - Lanai Instruction Information ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,10 +10,10 @@ // //===----------------------------------------------------------------------===// -#include "Lanai.h" #include "LanaiInstrInfo.h" -#include "LanaiMachineFunctionInfo.h" -#include "LanaiTargetMachine.h" +#include "LanaiAluCode.h" +#include "LanaiCondCode.h" +#include "MCTargetDesc/LanaiBaseInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -87,7 +86,8 @@ void LanaiInstrInfo::loadRegFromStackSlot( } bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint( - MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis * /*AA*/) const { + const MachineInstr &MIa, const MachineInstr &MIb, + AliasAnalysis * /*AA*/) const { assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); @@ -101,7 +101,7 @@ bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint( // the width doesn't overlap the offset of a higher memory access, // then the memory accesses are different. const TargetRegisterInfo *TRI = &getRegisterInfo(); - MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; + const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; int64_t OffsetA = 0, OffsetB = 0; unsigned int WidthA = 0, WidthB = 0; if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) && @@ -756,7 +756,7 @@ unsigned LanaiInstrInfo::isStoreToStackSlot(const MachineInstr &MI, } bool LanaiInstrInfo::getMemOperandWithOffsetWidth( - MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset, + const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, unsigned &Width, const TargetRegisterInfo * /*TRI*/) const { // Handle only loads/stores with base register followed by immediate offset // and with add as ALU op. @@ -794,8 +794,8 @@ bool LanaiInstrInfo::getMemOperandWithOffsetWidth( return true; } -bool LanaiInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, - MachineOperand *&BaseOp, +bool LanaiInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const { switch (LdSt.getOpcode()) { diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h index bdcf9a361b5f..d71424aeb0b1 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.h +++ b/lib/Target/Lanai/LanaiInstrInfo.h @@ -1,9 +1,8 @@ //===- LanaiInstrInfo.h - Lanai Instruction Information ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
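The getMemOperandWithOffsetWidth and areMemAccessesTriviallyDisjoint hunks above implement the rule stated in their comment: two accesses off the same base register are provably disjoint when their [Offset, Offset + Width) byte ranges do not overlap. A minimal standalone sketch of that interval test (stand-in types, not the MachineInstr API):

// Minimal sketch of the disjointness test: same base register and
// non-overlapping [Offset, Offset + Width) byte ranges. Stand-in types.
#include <cassert>
#include <cstdint>

struct MemAccess {
  unsigned BaseReg; // register number of the base
  int64_t Offset;   // byte offset from the base
  unsigned Width;   // access width in bytes
};

static bool triviallyDisjoint(const MemAccess &A, const MemAccess &B) {
  if (A.BaseReg != B.BaseReg)
    return false; // different bases: nothing can be proven
  int64_t LowOffset = A.Offset < B.Offset ? A.Offset : B.Offset;
  int64_t HighOffset = A.Offset < B.Offset ? B.Offset : A.Offset;
  unsigned LowWidth = (LowOffset == A.Offset) ? A.Width : B.Width;
  // The lower access must end at or before the higher one begins.
  return LowOffset + LowWidth <= HighOffset;
}

int main() {
  assert(triviallyDisjoint({5, 0, 4}, {5, 4, 4}));  // [0,4) vs [4,8)
  assert(!triviallyDisjoint({5, 0, 4}, {5, 2, 4})); // [0,4) vs [2,6) overlap
  assert(!triviallyDisjoint({5, 0, 4}, {6, 8, 4})); // different base regs
  return 0;
}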
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -36,7 +35,8 @@ public: return RegisterInfo; } - bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, + bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, AliasAnalysis *AA) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, @@ -68,11 +68,13 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; - bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + bool getMemOperandWithOffset(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const override; - bool getMemOperandWithOffsetWidth(MachineInstr &LdSt, MachineOperand *&BaseOp, + bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const; diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td index 66192b4a4704..fcf89a0b52f6 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.td +++ b/lib/Target/Lanai/LanaiInstrInfo.td @@ -1,9 +1,8 @@ //===-- LanaiInstrInfo.td - Target Description for Lanai Target -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiMCInstLower.cpp b/lib/Target/Lanai/LanaiMCInstLower.cpp index 90ede6566acf..743f4f7c6e2f 100644 --- a/lib/Target/Lanai/LanaiMCInstLower.cpp +++ b/lib/Target/Lanai/LanaiMCInstLower.cpp @@ -1,9 +1,8 @@ //=-- LanaiMCInstLower.cpp - Convert Lanai MachineInstr to an MCInst --------=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiMCInstLower.h b/lib/Target/Lanai/LanaiMCInstLower.h index 6d7818d63d87..00d3ebb05045 100644 --- a/lib/Target/Lanai/LanaiMCInstLower.h +++ b/lib/Target/Lanai/LanaiMCInstLower.h @@ -1,9 +1,8 @@ //===-- LanaiMCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp b/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp index c72271b67790..7b4e0750ba08 100644 --- a/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp +++ b/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- LanaiMachineFuctionInfo.cpp - Lanai machine function info ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/LanaiMachineFunctionInfo.h b/lib/Target/Lanai/LanaiMachineFunctionInfo.h index 3bd9112a9e13..2c97c619c246 100644 --- a/lib/Target/Lanai/LanaiMachineFunctionInfo.h +++ b/lib/Target/Lanai/LanaiMachineFunctionInfo.h @@ -1,9 +1,8 @@ //===- LanaiMachineFuctionInfo.h - Lanai machine func info -------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/lib/Target/Lanai/LanaiMemAluCombiner.cpp index 54500b0e52e3..67443b771d3d 100644 --- a/lib/Target/Lanai/LanaiMemAluCombiner.cpp +++ b/lib/Target/Lanai/LanaiMemAluCombiner.cpp @@ -1,9 +1,8 @@ //===-- LanaiMemAluCombiner.cpp - Pass to combine memory & ALU operations -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Simple pass to combine memory and ALU operations @@ -23,7 +22,7 @@ // in the same machine basic block into one machine instruction. 
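The LanaiMemAluCombiner hunks just below retarget the includes and, more substantively, tighten isNonVolatileMemoryOp so atomic accesses are treated as conservatively as volatile ones (the accompanying TODO notes this may be stricter than required). A one-function sketch of the tightened predicate, using stand-in types rather than MachineMemOperand:

// Sketch of the tightened predicate with stand-in types (not the
// MachineMemOperand API): refuse to move volatile and atomic accesses.
struct MemOp {
  bool Volatile;
  bool Atomic;
};

static bool isMovableMemoryOp(const MemOp &M) {
  // Don't move volatile accesses; be conservative about atomics too.
  return !M.Volatile && !M.Atomic;
}

int main() {
  MemOp Plain{false, false}, Atomic{false, true};
  return (isMovableMemoryOp(Plain) && !isMovableMemoryOp(Atomic)) ? 0 : 1;
}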
//===----------------------------------------------------------------------===// -#include "Lanai.h" +#include "LanaiAluCode.h" #include "LanaiTargetMachine.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" @@ -159,7 +158,8 @@ bool isNonVolatileMemoryOp(const MachineInstr &MI) { const MachineMemOperand *MemOperand = *MI.memoperands_begin(); // Don't move volatile memory accesses - if (MemOperand->isVolatile()) + // TODO: unclear if we need to be as conservative about atomics + if (MemOperand->isVolatile() || MemOperand->isAtomic()) return false; return true; diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp index 56a5e0ea2def..d3056a1eba8e 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.cpp +++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- LanaiRegisterInfo.cpp - Lanai Register Information ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,8 +11,10 @@ //===----------------------------------------------------------------------===// #include "LanaiRegisterInfo.h" -#include "Lanai.h" -#include "LanaiSubtarget.h" +#include "LanaiAluCode.h" +#include "LanaiCondCode.h" +#include "LanaiFrameLowering.h" +#include "LanaiInstrInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -257,12 +258,12 @@ bool LanaiRegisterInfo::hasBasePointer(const MachineFunction &MF) const { unsigned LanaiRegisterInfo::getRARegister() const { return Lanai::RCA; } -unsigned +Register LanaiRegisterInfo::getFrameRegister(const MachineFunction & /*MF*/) const { return Lanai::FP; } -unsigned LanaiRegisterInfo::getBaseRegister() const { return Lanai::R14; } +Register LanaiRegisterInfo::getBaseRegister() const { return Lanai::R14; } const uint32_t * LanaiRegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/, diff --git a/lib/Target/Lanai/LanaiRegisterInfo.h b/lib/Target/Lanai/LanaiRegisterInfo.h index 35f4788b2886..4e4da619d366 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.h +++ b/lib/Target/Lanai/LanaiRegisterInfo.h @@ -1,9 +1,8 @@ //===- LanaiRegisterInfo.h - Lanai Register Information Impl ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -43,8 +42,8 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { // Debug information queries. 
unsigned getRARegister() const; - unsigned getFrameRegister(const MachineFunction &MF) const override; - unsigned getBaseRegister() const; + Register getFrameRegister(const MachineFunction &MF) const override; + Register getBaseRegister() const; bool hasBasePointer(const MachineFunction &MF) const; int getDwarfRegNum(unsigned RegNum, bool IsEH) const; diff --git a/lib/Target/Lanai/LanaiRegisterInfo.td b/lib/Target/Lanai/LanaiRegisterInfo.td index cf8cfe30cce9..5879dfca8d65 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.td +++ b/lib/Target/Lanai/LanaiRegisterInfo.td @@ -1,9 +1,8 @@ //===- LanaiRegisterInfo.td - Lanai Register defs ------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Declarations that describe the Lanai register file diff --git a/lib/Target/Lanai/LanaiSchedule.td b/lib/Target/Lanai/LanaiSchedule.td index 7f931c4be8bb..32763c7fdf49 100644 --- a/lib/Target/Lanai/LanaiSchedule.td +++ b/lib/Target/Lanai/LanaiSchedule.td @@ -1,9 +1,8 @@ //=-LanaiSchedule.td - Lanai Scheduling Definitions --*- tablegen -*-=========// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp b/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp index b71c30fe3e05..dff87a3e264d 100644 --- a/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp +++ b/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp @@ -1,9 +1,8 @@ //===-- LanaiSelectionDAGInfo.cpp - Lanai SelectionDAG Info -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiSelectionDAGInfo.h b/lib/Target/Lanai/LanaiSelectionDAGInfo.h index bfd2be2ede09..c5650a7c1f53 100644 --- a/lib/Target/Lanai/LanaiSelectionDAGInfo.h +++ b/lib/Target/Lanai/LanaiSelectionDAGInfo.h @@ -1,9 +1,8 @@ //===-- LanaiSelectionDAGInfo.h - Lanai SelectionDAG Info -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
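The getFrameRegister and getBaseRegister hunks above migrate the return type from a raw unsigned to Register. A toy sketch of why such a wrapper helps: it keeps register identities type-distinct while converting implicitly where callers still expect unsigned. llvm::Register itself is richer, and the FP encoding below is hypothetical:

// Toy sketch of the unsigned -> Register migration; not llvm::Register.
class Register {
  unsigned Reg;

public:
  constexpr Register(unsigned R = 0) : Reg(R) {}
  constexpr operator unsigned() const { return Reg; } // interop with old code
  constexpr bool isValid() const { return Reg != 0; }
};

namespace Lanai {
constexpr unsigned FP = 13; // hypothetical encoding; the real one is tblgen'd
}

static Register getFrameRegister() { return Lanai::FP; }

int main() {
  unsigned Old = getFrameRegister(); // implicit conversion keeps callers working
  return (getFrameRegister().isValid() && Old == Lanai::FP) ? 0 : 1;
}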
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiSubtarget.cpp b/lib/Target/Lanai/LanaiSubtarget.cpp index 0fa5e82a7a66..9a872c789bcc 100644 --- a/lib/Target/Lanai/LanaiSubtarget.cpp +++ b/lib/Target/Lanai/LanaiSubtarget.cpp @@ -1,9 +1,8 @@ //===- LanaiSubtarget.cpp - Lanai Subtarget Information -----------*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiSubtarget.h b/lib/Target/Lanai/LanaiSubtarget.h index 4bfa19920239..116c83a4df91 100644 --- a/lib/Target/Lanai/LanaiSubtarget.h +++ b/lib/Target/Lanai/LanaiSubtarget.h @@ -1,9 +1,8 @@ //=====-- LanaiSubtarget.h - Define Subtarget for the Lanai -----*- C++ -*--==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp index 10bd9e2c65d2..8ae0225629ab 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- LanaiTargetMachine.cpp - Define TargetMachine for Lanai ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,6 +15,7 @@ #include "Lanai.h" #include "LanaiTargetObjectFile.h" #include "LanaiTargetTransformInfo.h" +#include "TargetInfo/LanaiTargetInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h index 0db286ec13e7..d2ac40007e24 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.h +++ b/lib/Target/Lanai/LanaiTargetMachine.h @@ -1,9 +1,8 @@ //===-- LanaiTargetMachine.h - Define TargetMachine for Lanai --- C++ ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/lib/Target/Lanai/LanaiTargetObjectFile.cpp index 7d165e9c5f8c..b0f7c090bb8e 100644 --- a/lib/Target/Lanai/LanaiTargetObjectFile.cpp +++ b/lib/Target/Lanai/LanaiTargetObjectFile.cpp @@ -1,8 +1,7 @@ // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/LanaiTargetObjectFile.h b/lib/Target/Lanai/LanaiTargetObjectFile.h index 99ec1956da4b..938a1e675b6a 100644 --- a/lib/Target/Lanai/LanaiTargetObjectFile.h +++ b/lib/Target/Lanai/LanaiTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- LanaiTargetObjectFile.h - Lanai Object Info -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/LanaiTargetTransformInfo.h b/lib/Target/Lanai/LanaiTargetTransformInfo.h index 3b5a1b88326b..63cc47dedce3 100644 --- a/lib/Target/Lanai/LanaiTargetTransformInfo.h +++ b/lib/Target/Lanai/LanaiTargetTransformInfo.h @@ -1,9 +1,8 @@ //===-- LanaiTargetTransformInfo.h - Lanai specific TTI ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index 82fa93ea5e5e..a6ce3d5eb4ff 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -1,9 +1,8 @@ //===-- LanaiAsmBackend.cpp - Lanai Assembler Backend ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h b/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h index ce7f83509c9b..1bc84014e736 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h +++ b/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h @@ -1,9 +1,8 @@ //===-- LanaiBaseInfo.h - Top level definitions for Lanai MC ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp index 7676891ef981..4313fa5a82b5 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- LanaiELFObjectWriter.cpp - Lanai ELF Writer -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -35,7 +34,7 @@ protected: LanaiELFObjectWriter::LanaiELFObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit_=*/false, OSABI, ELF::EM_LANAI, - /*HasRelocationAddend=*/true) {} + /*HasRelocationAddend_=*/true) {} unsigned LanaiELFObjectWriter::getRelocType(MCContext & /*Ctx*/, const MCValue & /*Target*/, diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h b/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h index 9ff8340d2922..1e692f8d31cb 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h +++ b/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h @@ -1,9 +1,8 @@ //===-- LanaiFixupKinds.h - Lanai Specific Fixup Entries --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp new file mode 100644 index 000000000000..0d42612824b4 --- /dev/null +++ b/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp @@ -0,0 +1,307 @@ +//===-- LanaiInstPrinter.cpp - Convert Lanai MCInst to asm syntax ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an Lanai MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "LanaiInstPrinter.h" +#include "LanaiMCExpr.h" +#include "LanaiAluCode.h" +#include "LanaiCondCode.h" +#include "MCTargetDesc/LanaiMCTargetDesc.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// Include the auto-generated portion of the assembly writer. +#define PRINT_ALIAS_INSTR +#include "LanaiGenAsmWriter.inc" + +void LanaiInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << StringRef(getRegisterName(RegNo)).lower(); +} + +bool LanaiInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Alias, unsigned OpNo0, + unsigned OpNo1) { + OS << "\t" << Alias << " "; + printOperand(MI, OpNo0, OS); + OS << ", "; + printOperand(MI, OpNo1, OS); + return true; +} + +static bool usesGivenOffset(const MCInst *MI, int AddOffset) { + unsigned AluCode = MI->getOperand(3).getImm(); + return LPAC::encodeLanaiAluCode(AluCode) == LPAC::ADD && + (MI->getOperand(2).getImm() == AddOffset || + MI->getOperand(2).getImm() == -AddOffset); +} + +static bool isPreIncrementForm(const MCInst *MI, int AddOffset) { + unsigned AluCode = MI->getOperand(3).getImm(); + return LPAC::isPreOp(AluCode) && usesGivenOffset(MI, AddOffset); +} + +static bool isPostIncrementForm(const MCInst *MI, int AddOffset) { + unsigned AluCode = MI->getOperand(3).getImm(); + return LPAC::isPostOp(AluCode) && usesGivenOffset(MI, AddOffset); +} + +static StringRef decIncOperator(const MCInst *MI) { + if (MI->getOperand(2).getImm() < 0) + return "--"; + return "++"; +} + +bool LanaiInstPrinter::printMemoryLoadIncrement(const MCInst *MI, + raw_ostream &OS, + StringRef Opcode, + int AddOffset) { + if (isPreIncrementForm(MI, AddOffset)) { + OS << "\t" << Opcode << "\t[" << decIncOperator(MI) << "%" + << getRegisterName(MI->getOperand(1).getReg()) << "], %" + << getRegisterName(MI->getOperand(0).getReg()); + return true; + } + if (isPostIncrementForm(MI, AddOffset)) { + OS << "\t" << Opcode << "\t[%" + << getRegisterName(MI->getOperand(1).getReg()) << decIncOperator(MI) + << "], %" << getRegisterName(MI->getOperand(0).getReg()); + return true; + } + return false; +} + +bool LanaiInstPrinter::printMemoryStoreIncrement(const MCInst *MI, + raw_ostream &OS, + StringRef Opcode, + int AddOffset) { + if (isPreIncrementForm(MI, AddOffset)) { + OS << "\t" << Opcode << "\t%" << getRegisterName(MI->getOperand(0).getReg()) + << ", [" << decIncOperator(MI) << "%" + << getRegisterName(MI->getOperand(1).getReg()) << "]"; + return true; + } + if (isPostIncrementForm(MI, AddOffset)) { + OS << "\t" << Opcode << "\t%" << getRegisterName(MI->getOperand(0).getReg()) + << ", [%" << getRegisterName(MI->getOperand(1).getReg()) + << decIncOperator(MI) << "]"; + return true; + } + return false; +} + +bool LanaiInstPrinter::printAlias(const MCInst *MI, raw_ostream &OS) { + switch (MI->getOpcode()) { + case Lanai::LDW_RI: + // ld 4[*%rN], %rX => ld [++imm], %rX + // ld -4[*%rN], %rX => ld [--imm], %rX + // ld 4[%rN*], %rX => ld [imm++], %rX + // ld -4[%rN*], %rX => ld [imm--], %rX + return 
printMemoryLoadIncrement(MI, OS, "ld", 4); + case Lanai::LDHs_RI: + return printMemoryLoadIncrement(MI, OS, "ld.h", 2); + case Lanai::LDHz_RI: + return printMemoryLoadIncrement(MI, OS, "uld.h", 2); + case Lanai::LDBs_RI: + return printMemoryLoadIncrement(MI, OS, "ld.b", 1); + case Lanai::LDBz_RI: + return printMemoryLoadIncrement(MI, OS, "uld.b", 1); + case Lanai::SW_RI: + // st %rX, 4[*%rN] => st %rX, [++imm] + // st %rX, -4[*%rN] => st %rX, [--imm] + // st %rX, 4[%rN*] => st %rX, [imm++] + // st %rX, -4[%rN*] => st %rX, [imm--] + return printMemoryStoreIncrement(MI, OS, "st", 4); + case Lanai::STH_RI: + return printMemoryStoreIncrement(MI, OS, "st.h", 2); + case Lanai::STB_RI: + return printMemoryStoreIncrement(MI, OS, "st.b", 1); + default: + return false; + } +} + +void LanaiInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annotation, + const MCSubtargetInfo & /*STI*/) { + if (!printAlias(MI, OS) && !printAliasInstr(MI, OS)) + printInstruction(MI, OS); + printAnnotation(OS, Annotation); +} + +void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &OS, const char *Modifier) { + assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) + OS << "%" << getRegisterName(Op.getReg()); + else if (Op.isImm()) + OS << formatHex(Op.getImm()); + else { + assert(Op.isExpr() && "Expected an expression"); + Op.getExpr()->print(OS, &MAI); + } +} + +void LanaiInstPrinter::printMemImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) { + OS << '[' << formatHex(Op.getImm()) << ']'; + } else { + // Symbolic operand will be lowered to immediate value by linker + assert(Op.isExpr() && "Expected an expression"); + OS << '['; + Op.getExpr()->print(OS, &MAI); + OS << ']'; + } +} + +void LanaiInstPrinter::printHi16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) { + OS << formatHex(Op.getImm() << 16); + } else { + // Symbolic operand will be lowered to immediate value by linker + assert(Op.isExpr() && "Expected an expression"); + Op.getExpr()->print(OS, &MAI); + } +} + +void LanaiInstPrinter::printHi16AndImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) { + OS << formatHex((Op.getImm() << 16) | 0xffff); + } else { + // Symbolic operand will be lowered to immediate value by linker + assert(Op.isExpr() && "Expected an expression"); + Op.getExpr()->print(OS, &MAI); + } +} + +void LanaiInstPrinter::printLo16AndImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) { + OS << formatHex(0xffff0000 | Op.getImm()); + } else { + // Symbolic operand will be lowered to immediate value by linker + assert(Op.isExpr() && "Expected an expression"); + Op.getExpr()->print(OS, &MAI); + } +} + +static void printMemoryBaseRegister(raw_ostream &OS, const unsigned AluCode, + const MCOperand &RegOp) { + assert(RegOp.isReg() && "Register operand expected"); + OS << "["; + if (LPAC::isPreOp(AluCode)) + OS << "*"; + OS << "%" << LanaiInstPrinter::getRegisterName(RegOp.getReg()); + if (LPAC::isPostOp(AluCode)) + OS << "*"; + OS << "]"; +} + +template <unsigned SizeInBits> +static void printMemoryImmediateOffset(const MCAsmInfo &MAI, + const MCOperand &OffsetOp, + raw_ostream &OS) { + assert((OffsetOp.isImm() || OffsetOp.isExpr()) && 
"Immediate expected"); + if (OffsetOp.isImm()) { + assert(isInt<SizeInBits>(OffsetOp.getImm()) && "Constant value truncated"); + OS << OffsetOp.getImm(); + } else + OffsetOp.getExpr()->print(OS, &MAI); +} + +void LanaiInstPrinter::printMemRiOperand(const MCInst *MI, int OpNo, + raw_ostream &OS, + const char * /*Modifier*/) { + const MCOperand &RegOp = MI->getOperand(OpNo); + const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); + const MCOperand &AluOp = MI->getOperand(OpNo + 2); + const unsigned AluCode = AluOp.getImm(); + + // Offset + printMemoryImmediateOffset<16>(MAI, OffsetOp, OS); + + // Register + printMemoryBaseRegister(OS, AluCode, RegOp); +} + +void LanaiInstPrinter::printMemRrOperand(const MCInst *MI, int OpNo, + raw_ostream &OS, + const char * /*Modifier*/) { + const MCOperand &RegOp = MI->getOperand(OpNo); + const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); + const MCOperand &AluOp = MI->getOperand(OpNo + 2); + const unsigned AluCode = AluOp.getImm(); + assert(OffsetOp.isReg() && RegOp.isReg() && "Registers expected."); + + // [ Base OP Offset ] + OS << "["; + if (LPAC::isPreOp(AluCode)) + OS << "*"; + OS << "%" << getRegisterName(RegOp.getReg()); + if (LPAC::isPostOp(AluCode)) + OS << "*"; + OS << " " << LPAC::lanaiAluCodeToString(AluCode) << " "; + OS << "%" << getRegisterName(OffsetOp.getReg()); + OS << "]"; +} + +void LanaiInstPrinter::printMemSplsOperand(const MCInst *MI, int OpNo, + raw_ostream &OS, + const char * /*Modifier*/) { + const MCOperand &RegOp = MI->getOperand(OpNo); + const MCOperand &OffsetOp = MI->getOperand(OpNo + 1); + const MCOperand &AluOp = MI->getOperand(OpNo + 2); + const unsigned AluCode = AluOp.getImm(); + + // Offset + printMemoryImmediateOffset<10>(MAI, OffsetOp, OS); + + // Register + printMemoryBaseRegister(OS, AluCode, RegOp); +} + +void LanaiInstPrinter::printCCOperand(const MCInst *MI, int OpNo, + raw_ostream &OS) { + LPCC::CondCode CC = + static_cast<LPCC::CondCode>(MI->getOperand(OpNo).getImm()); + // Handle the undefined value here for printing so we don't abort(). + if (CC >= LPCC::UNKNOWN) + OS << "<und>"; + else + OS << lanaiCondCodeToString(CC); +} + +void LanaiInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + LPCC::CondCode CC = + static_cast<LPCC::CondCode>(MI->getOperand(OpNo).getImm()); + // Handle the undefined value here for printing so we don't abort(). + if (CC >= LPCC::UNKNOWN) + OS << "<und>"; + else if (CC != LPCC::ICC_T) + OS << "." << lanaiCondCodeToString(CC); +} diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h b/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h new file mode 100644 index 000000000000..721a129a859e --- /dev/null +++ b/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h @@ -0,0 +1,65 @@ +//= LanaiInstPrinter.h - Convert Lanai MCInst to asm syntax -------*- C++ -*--// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a Lanai MCInst to a .s file. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIINSTPRINTER_H +#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIINSTPRINTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class LanaiInstPrinter : public MCInstPrinter { +public: + LanaiInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, + const char *Modifier = nullptr); + void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMemRiOperand(const MCInst *MI, int OpNo, raw_ostream &O, + const char *Modifier = nullptr); + void printMemRrOperand(const MCInst *MI, int OpNo, raw_ostream &O, + const char *Modifier = nullptr); + void printMemSplsOperand(const MCInst *MI, int OpNo, raw_ostream &O, + const char *Modifier = nullptr); + void printCCOperand(const MCInst *MI, int OpNo, raw_ostream &O); + void printAluOperand(const MCInst *MI, int OpNo, raw_ostream &O); + void printHi16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printHi16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printLo16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + +private: + bool printAlias(const MCInst *MI, raw_ostream &Ostream); + bool printInst(const MCInst *MI, raw_ostream &Ostream, StringRef Alias, + unsigned OpNo0, unsigned OpnNo1); + bool printMemoryLoadIncrement(const MCInst *MI, raw_ostream &Ostream, + StringRef Opcode, int AddOffset); + bool printMemoryStoreIncrement(const MCInst *MI, raw_ostream &Ostream, + StringRef Opcode, int AddOffset); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIINSTPRINTER_H diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp index 7e2705e67b6d..14d3dac26d1f 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- LanaiMCAsmInfo.cpp - Lanai asm properties -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
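The printAlias and printMemoryLoadIncrement code above folds loads whose ALU operand is a pre- or post-add of plus or minus the access width into the compact [++%rN] / [%rN--] forms. A standalone sketch of the strings produced, with example register names (not the printer itself):

// Standalone sketch of the pre/post-increment load syntax assembled by
// printMemoryLoadIncrement above. Plain C++; register names are examples.
#include <iostream>
#include <string>

static std::string loadAlias(const std::string &Opcode, const std::string &Base,
                             const std::string &Dest, int Offset, bool PreOp) {
  const char *IncDec = Offset < 0 ? "--" : "++"; // matches decIncOperator
  if (PreOp) // update the base before the access: ld [++%r6], %r7
    return "\t" + Opcode + "\t[" + IncDec + "%" + Base + "], %" + Dest;
  // update the base after the access: ld [%r6++], %r7
  return "\t" + Opcode + "\t[%" + Base + IncDec + "], %" + Dest;
}

int main() {
  std::cout << loadAlias("ld", "r6", "r7", 4, /*PreOp=*/true) << '\n';
  std::cout << loadAlias("ld.h", "r6", "r7", -2, /*PreOp=*/false) << '\n';
  return 0;
}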
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h index 3eef0592d2fa..265af425d037 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h @@ -1,9 +1,8 @@ //=====-- LanaiMCAsmInfo.h - Lanai asm properties -----------*- C++ -*--====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp index 21f4005aaf83..df4ee297155f 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- LanaiMCCodeEmitter.cpp - Convert Lanai code to machine code -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "Lanai.h" +#include "LanaiAluCode.h" #include "MCTargetDesc/LanaiBaseInfo.h" #include "MCTargetDesc/LanaiFixupKinds.h" #include "MCTargetDesc/LanaiMCExpr.h" diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp index 201c95de07f4..56d5fbf40360 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp @@ -1,9 +1,8 @@ //===-- LanaiMCExpr.cpp - Lanai specific MC expression classes ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h index 5004d541ff70..c99af32d9102 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h @@ -1,9 +1,8 @@ //===-- LanaiMCExpr.h - Lanai specific MC expression classes ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp index ddb01cdd2d8f..a9de0416fcac 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- LanaiMCTargetDesc.cpp - Lanai Target Descriptions -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,8 +11,9 @@ //===----------------------------------------------------------------------===// #include "LanaiMCTargetDesc.h" -#include "InstPrinter/LanaiInstPrinter.h" +#include "LanaiInstPrinter.h" #include "LanaiMCAsmInfo.h" +#include "TargetInfo/LanaiTargetInfo.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCInst.h" diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h index 2d8828ea4fa9..cf66d3226659 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- LanaiMCTargetDesc.h - Lanai Target Descriptions ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -32,8 +31,6 @@ class Triple; class StringRef; class raw_pwrite_stream; -Target &getTheLanaiTarget(); - MCCodeEmitter *createLanaiMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); diff --git a/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp index ccf47b08fcff..93deb891dec5 100644 --- a/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp +++ b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp @@ -1,23 +1,20 @@ //===-- LanaiTargetInfo.cpp - Lanai Target Implementation -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "llvm/IR/Module.h" +#include "TargetInfo/LanaiTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; -namespace llvm { -Target &getTheLanaiTarget() { +Target &llvm::getTheLanaiTarget() { static Target TheLanaiTarget; return TheLanaiTarget; } -} // namespace llvm extern "C" void LLVMInitializeLanaiTargetInfo() { RegisterTarget<Triple::lanai> X(getTheLanaiTarget(), "lanai", "Lanai", "Lanai"); } diff --git a/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h new file mode 100644 index 000000000000..429cf0234a60 --- /dev/null +++ b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h @@ -0,0 +1,20 @@ +//===-- LanaiTargetInfo.h - Lanai Target Implementation ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_LANAI_TARGETINFO_LANAITARGETINFO_H +#define LLVM_LIB_TARGET_LANAI_TARGETINFO_LANAITARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheLanaiTarget(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_LANAI_TARGETINFO_LANAITARGETINFO_H diff --git a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index 1ad70ac72c73..a0ec14ae2381 100644 --- a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -1,15 +1,15 @@ //===- MSP430AsmParser.cpp - Parse MSP430 assembly to MCInst instructions -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "MSP430.h" #include "MSP430RegisterInfo.h" #include "MCTargetDesc/MSP430MCTargetDesc.h" +#include "TargetInfo/MSP430TargetInfo.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/StringSwitch.h" diff --git a/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp index e5da130f9bbb..59c12e24e8bf 100644 --- a/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp +++ b/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp @@ -1,9 +1,8 @@ //===-- MSP430Disassembler.cpp - Disassembler for MSP430 ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,6 +12,7 @@ #include "MSP430.h" #include "MCTargetDesc/MSP430MCTargetDesc.h" +#include "TargetInfo/MSP430TargetInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp deleted file mode 100644 index 4d62547bc65b..000000000000 --- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp +++ /dev/null @@ -1,138 +0,0 @@ -//===-- MSP430InstPrinter.cpp - Convert MSP430 MCInst to assembly syntax --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an MSP430 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#include "MSP430InstPrinter.h" -#include "MSP430.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -// Include the auto-generated portion of the assembly writer. -#define PRINT_ALIAS_INSTR -#include "MSP430GenAsmWriter.inc" - -void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { - if (!printAliasInstr(MI, O)) - printInstruction(MI, O); - printAnnotation(O, Annot); -} - -void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - int64_t Imm = Op.getImm() * 2 + 2; - O << "$"; - if (Imm >= 0) - O << '+'; - O << Imm; - } else { - assert(Op.isExpr() && "unknown pcrel immediate operand"); - Op.getExpr()->print(O, &MAI); - } -} - -void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier) { - assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported"); - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - O << getRegisterName(Op.getReg()); - } else if (Op.isImm()) { - O << '#' << Op.getImm(); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << '#'; - Op.getExpr()->print(O, &MAI); - } -} - -void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, - const char *Modifier) { - const MCOperand &Base = MI->getOperand(OpNo); - const MCOperand &Disp = MI->getOperand(OpNo+1); - - // Print displacement first - - // If the global address expression is a part of displacement field with a - // register base, we should not emit any prefix symbol here, e.g. - // mov.w &foo, r1 - // vs - // mov.w glb(r1), r2 - // Otherwise (!) 
msp430-as will silently miscompile the output :( - if (Base.getReg() == MSP430::SR) - O << '&'; - - if (Disp.isExpr()) - Disp.getExpr()->print(O, &MAI); - else { - assert(Disp.isImm() && "Expected immediate in displacement field"); - O << Disp.getImm(); - } - - // Print register base field - if ((Base.getReg() != MSP430::SR) && - (Base.getReg() != MSP430::PC)) - O << '(' << getRegisterName(Base.getReg()) << ')'; -} - -void MSP430InstPrinter::printIndRegOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Base = MI->getOperand(OpNo); - O << "@" << getRegisterName(Base.getReg()); -} - -void MSP430InstPrinter::printPostIndRegOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Base = MI->getOperand(OpNo); - O << "@" << getRegisterName(Base.getReg()) << "+"; -} - -void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned CC = MI->getOperand(OpNo).getImm(); - - switch (CC) { - default: - llvm_unreachable("Unsupported CC code"); - case MSP430CC::COND_E: - O << "eq"; - break; - case MSP430CC::COND_NE: - O << "ne"; - break; - case MSP430CC::COND_HS: - O << "hs"; - break; - case MSP430CC::COND_LO: - O << "lo"; - break; - case MSP430CC::COND_GE: - O << "ge"; - break; - case MSP430CC::COND_L: - O << 'l'; - break; - case MSP430CC::COND_N: - O << 'n'; - break; - } -} diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h deleted file mode 100644 index cd02c4fa645a..000000000000 --- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h +++ /dev/null @@ -1,50 +0,0 @@ -//= MSP430InstPrinter.h - Convert MSP430 MCInst to assembly syntax -*- C++ -*-// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints a MSP430 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H -#define LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H - -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { - class MSP430InstPrinter : public MCInstPrinter { - public: - MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - - // Autogenerated by tblgen. 
- void printInstruction(const MCInst *MI, raw_ostream &O); - bool printAliasInstr(const MCInst *MI, raw_ostream &O); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - -private: - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = nullptr); - void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = nullptr); - void printIndRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIndRegOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O); - void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - - }; -} - -#endif diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp index bd69a9d8d795..365e5da74de0 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp @@ -1,9 +1,8 @@ //===-- MSP430AsmBackend.cpp - MSP430 Assembler Backend -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp index e47db2400a05..38b7da32c246 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- MSP430ELFObjectWriter.cpp - MSP430 ELF Writer ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp index 9449cb278024..4e054f85ccc3 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp @@ -1,9 +1,8 @@ //===-- MSP430ELFStreamer.cpp - MSP430 ELF Target Streamer Methods --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h b/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h index 1eb6a2759423..68e41b0fb874 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h +++ b/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h @@ -1,9 +1,8 @@ //===-- MSP430FixupKinds.h - MSP430 Specific Fixup Entries ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp new file mode 100644 index 000000000000..2f3c6ed3c17e --- /dev/null +++ b/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp @@ -0,0 +1,137 @@ +//===-- MSP430InstPrinter.cpp - Convert MSP430 MCInst to assembly syntax --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an MSP430 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "MSP430InstPrinter.h" +#include "MSP430.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// Include the auto-generated portion of the assembly writer. 
+#define PRINT_ALIAS_INSTR +#include "MSP430GenAsmWriter.inc" + +void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + if (!printAliasInstr(MI, O)) + printInstruction(MI, O); + printAnnotation(O, Annot); +} + +void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) { + int64_t Imm = Op.getImm() * 2 + 2; + O << "$"; + if (Imm >= 0) + O << '+'; + O << Imm; + } else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + Op.getExpr()->print(O, &MAI); + } +} + +void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { + assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported"); + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + O << getRegisterName(Op.getReg()); + } else if (Op.isImm()) { + O << '#' << Op.getImm(); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << '#'; + Op.getExpr()->print(O, &MAI); + } +} + +void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, + const char *Modifier) { + const MCOperand &Base = MI->getOperand(OpNo); + const MCOperand &Disp = MI->getOperand(OpNo+1); + + // Print displacement first + + // If the global address expression is a part of displacement field with a + // register base, we should not emit any prefix symbol here, e.g. + // mov.w &foo, r1 + // vs + // mov.w glb(r1), r2 + // Otherwise (!) msp430-as will silently miscompile the output :( + if (Base.getReg() == MSP430::SR) + O << '&'; + + if (Disp.isExpr()) + Disp.getExpr()->print(O, &MAI); + else { + assert(Disp.isImm() && "Expected immediate in displacement field"); + O << Disp.getImm(); + } + + // Print register base field + if ((Base.getReg() != MSP430::SR) && + (Base.getReg() != MSP430::PC)) + O << '(' << getRegisterName(Base.getReg()) << ')'; +} + +void MSP430InstPrinter::printIndRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Base = MI->getOperand(OpNo); + O << "@" << getRegisterName(Base.getReg()); +} + +void MSP430InstPrinter::printPostIndRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Base = MI->getOperand(OpNo); + O << "@" << getRegisterName(Base.getReg()) << "+"; +} + +void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned CC = MI->getOperand(OpNo).getImm(); + + switch (CC) { + default: + llvm_unreachable("Unsupported CC code"); + case MSP430CC::COND_E: + O << "eq"; + break; + case MSP430CC::COND_NE: + O << "ne"; + break; + case MSP430CC::COND_HS: + O << "hs"; + break; + case MSP430CC::COND_LO: + O << "lo"; + break; + case MSP430CC::COND_GE: + O << "ge"; + break; + case MSP430CC::COND_L: + O << 'l'; + break; + case MSP430CC::COND_N: + O << 'n'; + break; + } +} diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h b/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h new file mode 100644 index 000000000000..25451033236e --- /dev/null +++ b/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h @@ -0,0 +1,49 @@ +//= MSP430InstPrinter.h - Convert MSP430 MCInst to assembly syntax -*- C++ -*-// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a MSP430 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430INSTPRINTER_H +#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430INSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + class MSP430InstPrinter : public MCInstPrinter { + public: + MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + bool printAliasInstr(const MCInst *MI, raw_ostream &O); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + +private: + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, + const char *Modifier = nullptr); + void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, + const char *Modifier = nullptr); + void printIndRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printPostIndRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O); + void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + }; +} + +#endif diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp index 36e9a9c31075..db5a49dd22a7 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- MSP430MCAsmInfo.cpp - MSP430 asm properties -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -24,4 +23,5 @@ MSP430MCAsmInfo::MSP430MCAsmInfo(const Triple &TT) { AlignmentIsInBytes = false; UsesELFSectionDirectiveForBSS = true; + UseIntegratedAssembler = true; } diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h index de486ec4b7bd..93979df037e6 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h @@ -1,9 +1,8 @@ //===-- MSP430MCAsmInfo.h - MSP430 asm properties --------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp index 06f9f307cb1a..cf57e87a073d 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- MSP430MCCodeEmitter.cpp - Convert MSP430 code to machine code -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index b21145d3904a..da928733015f 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- MSP430MCTargetDesc.cpp - MSP430 Target Descriptions ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,8 +11,9 @@ //===----------------------------------------------------------------------===// #include "MSP430MCTargetDesc.h" -#include "InstPrinter/MSP430InstPrinter.h" +#include "MSP430InstPrinter.h" #include "MSP430MCAsmInfo.h" +#include "TargetInfo/MSP430TargetInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h index e484c79c9ee9..02bfbe40c6bf 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h @@ -1,9 +1,8 @@ //===-- MSP430MCTargetDesc.h - MSP430 Target Descriptions -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -30,8 +29,6 @@ class MCObjectTargetWriter; class MCStreamer; class MCTargetStreamer; -Target &getTheMSP430Target(); - /// Creates a machine code emitter for MSP430. 
MCCodeEmitter *createMSP430MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, diff --git a/lib/Target/MSP430/MSP430.h b/lib/Target/MSP430/MSP430.h index 7a5314a10844..67f35b8034d9 100644 --- a/lib/Target/MSP430/MSP430.h +++ b/lib/Target/MSP430/MSP430.h @@ -1,9 +1,8 @@ //==-- MSP430.h - Top-level interface for MSP430 representation --*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td index 8fa99dc13dd5..38aa30fcf4dd 100644 --- a/lib/Target/MSP430/MSP430.td +++ b/lib/Target/MSP430/MSP430.td @@ -1,9 +1,8 @@ //===-- MSP430.td - Describe the MSP430 Target Machine -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This is the top level entry point for the MSP430 target. diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp index f39c21fc8aa2..3a71a084d1af 100644 --- a/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- MSP430AsmPrinter.cpp - MSP430 LLVM assembly writer ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,11 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "InstPrinter/MSP430InstPrinter.h" +#include "MCTargetDesc/MSP430InstPrinter.h" #include "MSP430.h" #include "MSP430InstrInfo.h" #include "MSP430MCInstLower.h" #include "MSP430TargetMachine.h" +#include "TargetInfo/MSP430TargetInfo.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -28,6 +29,7 @@ #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/TargetRegistry.h" @@ -44,20 +46,34 @@ namespace { StringRef getPassName() const override { return "MSP430 Assembly Printer"; } + bool runOnMachineFunction(MachineFunction &MF) override; + + void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override; void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, const char* Modifier = nullptr); void printSrcMemOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - bool PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNo, unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override; void EmitInstruction(const MachineInstr *MI) override; + + void EmitInterruptVectorSection(MachineFunction &ISR); }; } // end of anonymous namespace +void MSP430AsmPrinter::PrintSymbolOperand(const MachineOperand &MO, + raw_ostream &O) { + uint64_t Offset = MO.getOffset(); + if (Offset) + O << '(' << Offset << '+'; + + getSymbol(MO.getGlobal())->print(O, MAI); + + if (Offset) + O << ')'; +} void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, const char *Modifier) { @@ -76,25 +92,13 @@ void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum, MO.getMBB()->getSymbol()->print(O, MAI); return; case MachineOperand::MO_GlobalAddress: { - bool isMemOp = Modifier && !strcmp(Modifier, "mem"); - uint64_t Offset = MO.getOffset(); - // If the global address expression is a part of displacement field with a // register base, we should not emit any prefix symbol here, e.g. - // mov.w &foo, r1 - // vs // mov.w glb(r1), r2 // Otherwise (!) msp430-as will silently miscompile the output :( if (!Modifier || strcmp(Modifier, "nohash")) - O << (isMemOp ? '&' : '#'); - if (Offset) - O << '(' << Offset << '+'; - - getSymbol(MO.getGlobal())->print(O, MAI); - - if (Offset) - O << ')'; - + O << '#'; + PrintSymbolOperand(MO, O); return; } } @@ -108,12 +112,12 @@ void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum, // Print displacement first // Imm here is in fact global address - print extra modifier. 
- if (Disp.isImm() && !Base.getReg()) + if (Disp.isImm() && Base.getReg() == MSP430::SR) O << '&'; printOperand(MI, OpNum+1, O, "nohash"); // Print register base field - if (Base.getReg()) { + if (Base.getReg() != MSP430::SR && Base.getReg() != MSP430::PC) { O << '('; printOperand(MI, OpNum, O); O << ')'; @@ -123,18 +127,17 @@ void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum, /// PrintAsmOperand - Print out an operand for an inline asm expression. /// bool MSP430AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); printOperand(MI, OpNo, O); return false; } bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNo, unsigned AsmVariant, + unsigned OpNo, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0]) { @@ -153,6 +156,32 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } +void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) { + MCSection *Cur = OutStreamer->getCurrentSectionOnly(); + const auto *F = &ISR.getFunction(); + assert(F->hasFnAttribute("interrupt") && + "Functions with MSP430_INTR CC should have 'interrupt' attribute"); + StringRef IVIdx = F->getFnAttribute("interrupt").getValueAsString(); + MCSection *IV = OutStreamer->getContext().getELFSection( + "__interrupt_vector_" + IVIdx, + ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR); + OutStreamer->SwitchSection(IV); + + const MCSymbol *FunctionSymbol = getSymbol(F); + OutStreamer->EmitSymbolValue(FunctionSymbol, TM.getProgramPointerSize()); + OutStreamer->SwitchSection(Cur); +} + +bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + // Emit separate section for an interrupt vector if ISR + if (MF.getFunction().getCallingConv() == CallingConv::MSP430_INTR) + EmitInterruptVectorSection(MF); + + SetupMachineFunction(MF); + EmitFunctionBody(); + return false; +} + // Force static initialization. extern "C" void LLVMInitializeMSP430AsmPrinter() { RegisterAsmPrinter<MSP430AsmPrinter> X(getTheMSP430Target()); } diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp index 2b3495405545..45e7c26e4d30 100644 --- a/lib/Target/MSP430/MSP430BranchSelector.cpp +++ b/lib/Target/MSP430/MSP430BranchSelector.cpp @@ -1,9 +1,8 @@ //===-- MSP430BranchSelector.cpp - Emit long conditional branches ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td index 0434f8abfbf4..49191fa5dd5f 100644 --- a/lib/Target/MSP430/MSP430CallingConv.td +++ b/lib/Target/MSP430/MSP430CallingConv.td @@ -1,9 +1,8 @@ //==- MSP430CallingConv.td - Calling Conventions for MSP430 -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This describes the calling conventions for MSP430 architecture. diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp index 2421f09fbf59..de60ad9bd7e6 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -1,9 +1,8 @@ //===-- MSP430FrameLowering.cpp - MSP430 Frame Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index 8807101f37ca..33ce3c70a2a3 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -1,9 +1,8 @@ //==- MSP430FrameLowering.h - Define frame lowering for MSP430 --*- C++ -*--==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index 7a1998ad355d..23449585505e 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- MSP430ISelDAGToDAG.cpp - A dag to dag inst selector for MSP430 ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 3e706134afc5..fedfb857bd0f 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -1,9 +1,8 @@ //===-- MSP430ISelLowering.cpp - MSP430 DAG Lowering Implementation ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index 731bc1406711..ee6b6316d7a9 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -1,9 +1,8 @@ //===-- MSP430ISelLowering.h - MSP430 DAG Lowering Interface ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430InstrFormats.td b/lib/Target/MSP430/MSP430InstrFormats.td index e2e4503db20c..36f40d6fc89d 100644 --- a/lib/Target/MSP430/MSP430InstrFormats.td +++ b/lib/Target/MSP430/MSP430InstrFormats.td @@ -1,9 +1,8 @@ //===-- MSP430InstrFormats.td - MSP430 Instruction Formats -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index c136933a51bc..5c3a3fc69266 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -1,9 +1,8 @@ //===-- MSP430InstrInfo.cpp - MSP430 Instruction Information --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -308,7 +307,8 @@ unsigned MSP430InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { case TargetOpcode::KILL: case TargetOpcode::DBG_VALUE: return 0; - case TargetOpcode::INLINEASM: { + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: { const MachineFunction *MF = MI.getParent()->getParent(); const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(), diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index fee3bea9b8d6..13c50ad23adc 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -1,9 +1,8 @@ //===-- MSP430InstrInfo.h - MSP430 Instruction Information ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index 25c81d94f75b..aaca3504822d 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -1,9 +1,8 @@ //===-- MSP430InstrInfo.td - MSP430 Instruction defs -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp index 860c0006f782..1e57f33386e6 100644 --- a/lib/Target/MSP430/MSP430MCInstLower.cpp +++ b/lib/Target/MSP430/MSP430MCInstLower.cpp @@ -1,9 +1,8 @@ //===-- MSP430MCInstLower.cpp - Convert MSP430 MachineInstr to an MCInst --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430MCInstLower.h b/lib/Target/MSP430/MSP430MCInstLower.h index ebd639744bcc..910ad4bb12d5 100644 --- a/lib/Target/MSP430/MSP430MCInstLower.h +++ b/lib/Target/MSP430/MSP430MCInstLower.h @@ -1,9 +1,8 @@ //===-- MSP430MCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp index b442fc03b257..1d3a6d118bd6 100644 --- a/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp +++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- MSP430MachineFunctionInfo.cpp - MSP430 machine function info ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h index fcaa8a1d6c72..2b2c8967a749 100644 --- a/lib/Target/MSP430/MSP430MachineFunctionInfo.h +++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h @@ -1,9 +1,8 @@ //=== MSP430MachineFunctionInfo.h - MSP430 machine function info -*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 54e53e19eb54..afbb2f213b45 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- MSP430RegisterInfo.cpp - MSP430 Register Information --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -155,7 +154,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); } -unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const MSP430FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? MSP430::FP : MSP430::SP; } diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 47a5e147953e..c3eff93f55d2 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -1,9 +1,8 @@ //===-- MSP430RegisterInfo.h - MSP430 Register Information Impl -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -38,7 +37,7 @@ public: RegScavenger *RS = nullptr) const override; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td index 1e86bdf34a0b..11003dba383f 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.td +++ b/lib/Target/MSP430/MSP430RegisterInfo.td @@ -1,9 +1,8 @@ //===-- MSP430RegisterInfo.td - MSP430 Register defs -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index 776a9dcb11d4..20168773cd53 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -1,9 +1,8 @@ //===-- MSP430Subtarget.cpp - MSP430 Subtarget Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h index 01a428056377..ab2b71e3bb1a 100644 --- a/lib/Target/MSP430/MSP430Subtarget.h +++ b/lib/Target/MSP430/MSP430Subtarget.h @@ -1,9 +1,8 @@ //===-- MSP430Subtarget.h - Define Subtarget for the MSP430 ----*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 9f6ebba75ec6..8c4ca982c966 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -1,9 +1,8 @@ //===-- MSP430TargetMachine.cpp - Define TargetMachine for MSP430 ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,6 +12,7 @@ #include "MSP430TargetMachine.h" #include "MSP430.h" +#include "TargetInfo/MSP430TargetInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h index 4935b80cfdd9..96fbc3ba0377 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.h +++ b/lib/Target/MSP430/MSP430TargetMachine.h @@ -1,9 +1,8 @@ //===-- MSP430TargetMachine.h - Define TargetMachine for MSP430 -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp index dfa21f580cb7..5da7d588079f 100644 --- a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp +++ b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp @@ -1,14 +1,12 @@ //===-- MSP430TargetInfo.cpp - MSP430 Target Implementation ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "MSP430.h" -#include "llvm/IR/Module.h" +#include "TargetInfo/MSP430TargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.h b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.h new file mode 100644 index 000000000000..17854244f28b --- /dev/null +++ b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.h @@ -0,0 +1,20 @@ +//===-- MSP430TargetInfo.h - MSP430 Target Implementation -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_MSP430_TARGETINFO_MSP430TARGETINFO_H +#define LLVM_LIB_TARGET_MSP430_TARGETINFO_MSP430TARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheMSP430Target(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_MSP430_TARGETINFO_MSP430TARGETINFO_H diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index d2fed6861477..1f7d095bf49b 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -1,9 +1,8 @@ //===-- MipsAsmParser.cpp - Parse Mips assembly to MCInst instructions ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -13,6 +12,7 @@ #include "MCTargetDesc/MipsMCExpr.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "MipsTargetStreamer.h" +#include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -29,6 +29,7 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCAsmParserExtension.h" +#include "llvm/MC/MCParser/MCAsmParserUtils.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCSectionELF.h" @@ -65,10 +66,7 @@ class MCInstrInfo; } // end namespace llvm -static cl::opt<bool> -EmitJalrReloc("mips-jalr-reloc", cl::Hidden, - cl::desc("MIPS: Emit R_{MICRO}MIPS_JALR relocation with jalr"), - cl::init(true)); +extern cl::opt<bool> EmitJalrReloc; namespace { @@ -148,6 +146,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool IsPicEnabled; bool IsCpRestoreSet; int CpRestoreOffset; + unsigned GPReg; unsigned CpSaveLocation; /// If true, then CpSaveLocation is a register, otherwise it's an offset. bool CpSaveLocationIsRegister; @@ -277,6 +276,15 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); + bool expandSge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + + bool expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + + bool expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, @@ -304,6 +312,9 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI, bool IsLoad); + bool expandStoreDM1Macro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); @@ -324,6 +335,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseSetFeature(uint64_t Feature); bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup. bool parseDirectiveCpLoad(SMLoc Loc); + bool parseDirectiveCpLocal(SMLoc Loc); bool parseDirectiveCpRestore(SMLoc Loc); bool parseDirectiveCPSetup(); bool parseDirectiveCPReturn(); @@ -517,6 +529,7 @@ public: IsCpRestoreSet = false; CpRestoreOffset = -1; + GPReg = ABI.GetGlobalPtr(); const Triple &TheTriple = sti.getTargetTriple(); IsLittleEndian = TheTriple.isLittleEndian(); @@ -895,14 +908,6 @@ private: .getRegister(RegIdx.Index); } - /// Coerce the register to FGRH32 and return the real register for the current - /// target. - unsigned getFGRH32Reg() const { - assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!"); - return RegIdx.RegInfo->getRegClass(Mips::FGRH32RegClassID) - .getRegister(RegIdx.Index); - } - /// Coerce the register to FCC and return the real register for the current /// target.
unsigned getFCCReg() const { @@ -1100,11 +1105,6 @@ public: "registers"); } - void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(getFGRH32Reg())); - } - void addFCCAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getFCCReg())); @@ -2043,7 +2043,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, const MCExpr *Lo16RelocExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, JalExpr, getContext()); - TOut.emitRRX(Mips::LW, Mips::T9, Mips::GP, + TOut.emitRRX(Mips::LW, Mips::T9, GPReg, MCOperand::createExpr(Got16RelocExpr), IDLoc, STI); TOut.emitRRX(Mips::ADDiu, Mips::T9, Mips::T9, MCOperand::createExpr(Lo16RelocExpr), IDLoc, STI); @@ -2057,7 +2057,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, JalExpr, getContext()); TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, - Mips::GP, MCOperand::createExpr(GotDispRelocExpr), IDLoc, + GPReg, MCOperand::createExpr(GotDispRelocExpr), IDLoc, STI); } } else { @@ -2068,7 +2068,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, const MCExpr *Call16RelocExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, JalExpr, getContext()); - TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP, + TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, GPReg, MCOperand::createExpr(Call16RelocExpr), IDLoc, STI); } @@ -2485,6 +2485,19 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, case Mips::NORImm: case Mips::NORImm64: return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SGE: + case Mips::SGEU: + return expandSge(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SGEImm: + case Mips::SGEUImm: + case Mips::SGEImm64: + case Mips::SGEUImm64: + return expandSgeImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SGTImm: + case Mips::SGTUImm: + case Mips::SGTImm64: + case Mips::SGTUImm64: + return expandSgtImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SLTImm64: if (isInt<16>(Inst.getOperand(2).getImm())) { Inst.setOpcode(Mips::SLTi64); @@ -2553,6 +2566,10 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, Inst.getOpcode() == Mips::LDMacro) ? MER_Fail : MER_Success; + case Mips::SDC1_M1: + return expandStoreDM1Macro(Inst, IDLoc, Out, STI) + ? MER_Fail + : MER_Success; case Mips::SEQMacro: return expandSeq(Inst, IDLoc, Out, STI) ? 
MER_Fail : MER_Success; case Mips::SEQIMacro: @@ -2879,8 +2896,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, ELF::STB_LOCAL))) { const MCExpr *CallExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); - TOut.emitRRX(Mips::LW, DstReg, ABI.GetGlobalPtr(), - MCOperand::createExpr(CallExpr), IDLoc, STI); + TOut.emitRRX(Mips::LW, DstReg, GPReg, MCOperand::createExpr(CallExpr), + IDLoc, STI); return false; } @@ -2919,8 +2936,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, TmpReg = ATReg; } - TOut.emitRRX(Mips::LW, TmpReg, ABI.GetGlobalPtr(), - MCOperand::createExpr(GotExpr), IDLoc, STI); + TOut.emitRRX(Mips::LW, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, + STI); if (LoExpr) TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), @@ -2955,8 +2972,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, ELF::STB_LOCAL))) { const MCExpr *CallExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); - TOut.emitRRX(Mips::LD, DstReg, ABI.GetGlobalPtr(), - MCOperand::createExpr(CallExpr), IDLoc, STI); + TOut.emitRRX(Mips::LD, DstReg, GPReg, MCOperand::createExpr(CallExpr), + IDLoc, STI); return false; } @@ -2998,8 +3015,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, TmpReg = ATReg; } - TOut.emitRRX(Mips::LD, TmpReg, ABI.GetGlobalPtr(), - MCOperand::createExpr(GotExpr), IDLoc, STI); + TOut.emitRRX(Mips::LD, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, + STI); if (LoExpr) TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), @@ -3229,10 +3246,10 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, MipsMCExpr::create(MipsMCExpr::MEK_GOT, GotSym, getContext()); if(isABI_O32() || isABI_N32()) { - TOut.emitRRX(Mips::LW, ATReg, Mips::GP, MCOperand::createExpr(GotExpr), + TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, STI); } else { //isABI_N64() - TOut.emitRRX(Mips::LD, ATReg, Mips::GP, MCOperand::createExpr(GotExpr), + TOut.emitRRX(Mips::LD, ATReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, STI); } } else { //!IsPicEnabled @@ -4293,6 +4310,143 @@ bool MipsAsmParser::expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } +bool MipsAsmParser::expandSge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isReg() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + unsigned OpReg = Inst.getOperand(2).getReg(); + unsigned OpCode; + + warnIfNoMacro(IDLoc); + + switch (Inst.getOpcode()) { + case Mips::SGE: + OpCode = Mips::SLT; + break; + case Mips::SGEU: + OpCode = Mips::SLTu; + break; + default: + llvm_unreachable("unexpected 'sge' opcode"); + } + + // $SrcReg >= $OpReg is equal to (not ($SrcReg < $OpReg)) + TOut.emitRRR(OpCode, DstReg, SrcReg, OpReg, IDLoc, STI); + TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI); + + return false; +} + +bool MipsAsmParser::expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + 
Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
+  unsigned DstReg = Inst.getOperand(0).getReg();
+  unsigned SrcReg = Inst.getOperand(1).getReg();
+  int64_t ImmValue = Inst.getOperand(2).getImm();
+  unsigned OpRegCode, OpImmCode;
+
+  warnIfNoMacro(IDLoc);
+
+  switch (Inst.getOpcode()) {
+  case Mips::SGEImm:
+  case Mips::SGEImm64:
+    OpRegCode = Mips::SLT;
+    OpImmCode = Mips::SLTi;
+    break;
+  case Mips::SGEUImm:
+  case Mips::SGEUImm64:
+    OpRegCode = Mips::SLTu;
+    OpImmCode = Mips::SLTiu;
+    break;
+  default:
+    llvm_unreachable("unexpected 'sge' opcode with immediate");
+  }
+
+  // $SrcReg >= Imm is equal to (not ($SrcReg < Imm))
+  if (isInt<16>(ImmValue)) {
+    // Use immediate version of SLT.
+    TOut.emitRRI(OpImmCode, DstReg, SrcReg, ImmValue, IDLoc, STI);
+    TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
+  } else {
+    unsigned ImmReg = DstReg;
+    if (DstReg == SrcReg) {
+      unsigned ATReg = getATReg(Inst.getLoc());
+      if (!ATReg)
+        return true;
+      ImmReg = ATReg;
+    }
+
+    if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
+                      false, IDLoc, Out, STI))
+      return true;
+
+    TOut.emitRRR(OpRegCode, DstReg, SrcReg, ImmReg, IDLoc, STI);
+    TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
+  }
+
+  return false;
+}
+
+bool MipsAsmParser::expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                 const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+  assert(Inst.getOperand(0).isReg() &&
+         Inst.getOperand(1).isReg() &&
+         Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
+  unsigned DstReg = Inst.getOperand(0).getReg();
+  unsigned SrcReg = Inst.getOperand(1).getReg();
+  unsigned ImmReg = DstReg;
+  int64_t ImmValue = Inst.getOperand(2).getImm();
+  unsigned OpCode;
+
+  warnIfNoMacro(IDLoc);
+
+  switch (Inst.getOpcode()) {
+  case Mips::SGTImm:
+  case Mips::SGTImm64:
+    OpCode = Mips::SLT;
+    break;
+  case Mips::SGTUImm:
+  case Mips::SGTUImm64:
+    OpCode = Mips::SLTu;
+    break;
+  default:
+    llvm_unreachable("unexpected 'sgt' opcode with immediate");
+  }
+
+  if (DstReg == SrcReg) {
+    unsigned ATReg = getATReg(Inst.getLoc());
+    if (!ATReg)
+      return true;
+    ImmReg = ATReg;
+  }
+
+  if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
+                    false, IDLoc, Out, STI))
+    return true;
+
+  // $SrcReg > $ImmReg is equal to $ImmReg < $SrcReg
+  TOut.emitRRR(OpCode, DstReg, ImmReg, SrcReg, IDLoc, STI);
+
+  return false;
+}
+
 bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
                                          MCStreamer &Out,
                                          const MCSubtargetInfo *STI) {
@@ -4859,61 +5013,110 @@ bool MipsAsmParser::expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc,
   return false;
 }
+
+// Expand 's.d $<reg> offset($reg2)' to 'swc1 $<reg+1>, offset($reg2);
+//                                       swc1 $<reg>, offset+4($reg2)'
+// or if little endian to 'swc1 $<reg>, offset($reg2);
+//                         swc1 $<reg+1>, offset+4($reg2)'
+// for Mips1.
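// Illustrative sketch, not part of the patch: a standalone model of how the
// expansion below pairs up the two swc1 halves. MIPS I has no sdc1, so 's.d'
// becomes two 32-bit stores of the even/odd FGR pair, and which register is
// stored at the lower address depends on endianness. The names here
// (HalfStore, splitF64Store, EvenFPR, OddFPR, BaseOff) are hypothetical
// stand-ins for the parsed operands, not the parser's own code.
struct HalfStore { unsigned Reg; int64_t Off; };

static void splitF64Store(unsigned EvenFPR, unsigned OddFPR, int64_t BaseOff,
                          bool IsLittleEndian, HalfStore Halves[2]) {
  // Little endian: the even register (low word of the double) goes to the
  // lower address BaseOff, the odd register to BaseOff + 4; big endian swaps
  // the register order while the offsets stay the same.
  Halves[0] = {IsLittleEndian ? EvenFPR : OddFPR, BaseOff};
  Halves[1] = {IsLittleEndian ? OddFPR : EvenFPR, BaseOff + 4};
}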
+bool MipsAsmParser::expandStoreDM1Macro(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out, + const MCSubtargetInfo *STI) { + if (!isABI_O32()) + return true; + + warnIfNoMacro(IDLoc); + + MipsTargetStreamer &TOut = getTargetStreamer(); + unsigned Opcode = Mips::SWC1; + unsigned FirstReg = Inst.getOperand(0).getReg(); + unsigned SecondReg = nextReg(FirstReg); + unsigned BaseReg = Inst.getOperand(1).getReg(); + if (!SecondReg) + return true; + + warnIfRegIndexIsAT(FirstReg, IDLoc); + + assert(Inst.getOperand(2).isImm() && + "Offset for macro is not immediate!"); + + MCOperand &FirstOffset = Inst.getOperand(2); + signed NextOffset = FirstOffset.getImm() + 4; + MCOperand SecondOffset = MCOperand::createImm(NextOffset); + + if (!isInt<16>(FirstOffset.getImm()) || !isInt<16>(NextOffset)) + return true; + + if (!IsLittleEndian) + std::swap(FirstReg, SecondReg); + + TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI); + TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI); + + return false; +} + bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isReg() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + unsigned OpReg = Inst.getOperand(2).getReg(); warnIfNoMacro(IDLoc); - MipsTargetStreamer &TOut = getTargetStreamer(); - if (Inst.getOperand(1).getReg() != Mips::ZERO && - Inst.getOperand(2).getReg() != Mips::ZERO) { - TOut.emitRRR(Mips::XOR, Inst.getOperand(0).getReg(), - Inst.getOperand(1).getReg(), Inst.getOperand(2).getReg(), - IDLoc, STI); - TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(), - Inst.getOperand(0).getReg(), 1, IDLoc, STI); + if (SrcReg != Mips::ZERO && OpReg != Mips::ZERO) { + TOut.emitRRR(Mips::XOR, DstReg, SrcReg, OpReg, IDLoc, STI); + TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI); return false; } - unsigned Reg = 0; - if (Inst.getOperand(1).getReg() == Mips::ZERO) { - Reg = Inst.getOperand(2).getReg(); - } else { - Reg = Inst.getOperand(1).getReg(); - } - TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(), Reg, 1, IDLoc, STI); + unsigned Reg = SrcReg == Mips::ZERO ? OpReg : SrcReg; + TOut.emitRRI(Mips::SLTiu, DstReg, Reg, 1, IDLoc, STI); return false; } bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { - warnIfNoMacro(IDLoc); MipsTargetStreamer &TOut = getTargetStreamer(); - unsigned Opc; + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isImm() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); int64_t Imm = Inst.getOperand(2).getImm(); - unsigned Reg = Inst.getOperand(1).getReg(); + + warnIfNoMacro(IDLoc); if (Imm == 0) { - TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(), - Inst.getOperand(1).getReg(), 1, IDLoc, STI); + TOut.emitRRI(Mips::SLTiu, DstReg, SrcReg, 1, IDLoc, STI); return false; - } else { + } - if (Reg == Mips::ZERO) { - Warning(IDLoc, "comparison is always false"); - TOut.emitRRR(isGP64bit() ? 
Mips::DADDu : Mips::ADDu, - Inst.getOperand(0).getReg(), Reg, Reg, IDLoc, STI); - return false; - } + if (SrcReg == Mips::ZERO) { + Warning(IDLoc, "comparison is always false"); + TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, + DstReg, SrcReg, SrcReg, IDLoc, STI); + return false; + } - if (Imm > -0x8000 && Imm < 0) { - Imm = -Imm; - Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu; - } else { - Opc = Mips::XORi; - } + unsigned Opc; + if (Imm > -0x8000 && Imm < 0) { + Imm = -Imm; + Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu; + } else { + Opc = Mips::XORi; } + if (!isUInt<16>(Imm)) { unsigned ATReg = getATReg(IDLoc); if (!ATReg) @@ -4923,17 +5126,13 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, Out, STI)) return true; - TOut.emitRRR(Mips::XOR, Inst.getOperand(0).getReg(), - Inst.getOperand(1).getReg(), ATReg, IDLoc, STI); - TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(), - Inst.getOperand(0).getReg(), 1, IDLoc, STI); + TOut.emitRRR(Mips::XOR, DstReg, SrcReg, ATReg, IDLoc, STI); + TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI); return false; } - TOut.emitRRI(Opc, Inst.getOperand(0).getReg(), Inst.getOperand(1).getReg(), - Imm, IDLoc, STI); - TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(), - Inst.getOperand(0).getReg(), 1, IDLoc, STI); + TOut.emitRRI(Opc, DstReg, SrcReg, Imm, IDLoc, STI); + TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI); return false; } @@ -6325,7 +6524,7 @@ bool MipsAsmParser::parseBracketSuffix(StringRef Name, return false; } -static std::string MipsMnemonicSpellCheck(StringRef S, uint64_t FBS, +static std::string MipsMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS, unsigned VariantID = 0); bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -6338,7 +6537,7 @@ bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Check if we have valid mnemonic if (!mnemonicIsValid(Name, 0)) { - uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); std::string Suggestion = MipsMnemonicSpellCheck(Name, FBS); return Error(NameLoc, "unknown instruction" + Suggestion); } @@ -6807,7 +7006,6 @@ bool MipsAsmParser::parseSetHardFloatDirective() { bool MipsAsmParser::parseSetAssignment() { StringRef Name; - const MCExpr *Value; MCAsmParser &Parser = getParser(); if (Parser.parseIdentifier(Name)) @@ -6825,17 +7023,16 @@ bool MipsAsmParser::parseSetAssignment() { RegisterSets[Name] = Parser.getTok(); Parser.Lex(); // Eat identifier. 
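// Illustrative note, not part of the patch: the hunk below routes
//   .set sym, expr        (e.g. ".set diff, $tmp1 - $tmp0")
// through MCParserUtils::parseAssignmentExpression() from
// "llvm/MC/MCParser/MCAsmParserUtils.h" (newly included above) instead of a
// hand-rolled parseExpression() call, so ".set name, expr" and "name = expr"
// share one implementation; allow_redef is passed as true because .set is
// permitted to rebind a symbol that already has a value.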
getContext().getOrCreateSymbol(Name);
-  } else if (!Parser.parseExpression(Value)) {
-    // Parse assignment of an expression including
-    // symbolic registers:
-    //   .set $tmp, $BB0-$BB1
-    //   .set r2, $f2
-    MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
-    Sym->setVariableValue(Value);
-  } else {
-    return reportParseError("expected valid expression after comma");
+    return false;
   }
 
+  MCSymbol *Sym;
+  const MCExpr *Value;
+  if (MCParserUtils::parseAssignmentExpression(Name, /* allow_redef */ true,
+                                               Parser, Sym, Value))
+    return true;
+  Sym->setVariableValue(Value);
+
   return false;
 }
@@ -7047,6 +7244,40 @@ bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) {
   return false;
 }
 
+bool MipsAsmParser::parseDirectiveCpLocal(SMLoc Loc) {
+  if (!isABI_N32() && !isABI_N64()) {
+    reportParseError(".cplocal is allowed only in N32 or N64 mode");
+    return false;
+  }
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg;
+  OperandMatchResultTy ResTy = parseAnyRegister(Reg);
+  if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+    reportParseError("expected register containing global pointer");
+    return false;
+  }
+
+  MipsOperand &RegOpnd = static_cast<MipsOperand &>(*Reg[0]);
+  if (!RegOpnd.isGPRAsmReg()) {
+    reportParseError(RegOpnd.getStartLoc(), "invalid register");
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+  getParser().Lex(); // Consume the EndOfStatement.
+
+  unsigned NewReg = RegOpnd.getGPR32Reg();
+  if (IsPicEnabled)
+    GPReg = NewReg;
+
+  getTargetStreamer().emitDirectiveCpLocal(NewReg);
+  return false;
+}
+
 bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) {
   MCAsmParser &Parser = getParser();
 
@@ -7897,6 +8128,10 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
     parseDirectiveCpRestore(DirectiveID.getLoc());
     return false;
   }
+  if (IDVal == ".cplocal") {
+    parseDirectiveCpLocal(DirectiveID.getLoc());
+    return false;
+  }
   if (IDVal == ".ent") {
     StringRef SymbolName;
 
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 27b27ff1e1e2..ef13507fe63a 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -1,9 +1,8 @@
 //===- MipsDisassembler.cpp - Disassembler for Mips -----------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,6 +12,7 @@ #include "MCTargetDesc/MipsMCTargetDesc.h" #include "Mips.h" +#include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" @@ -541,15 +541,6 @@ static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair, static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -namespace llvm { - -Target &getTheMipselTarget(); -Target &getTheMipsTarget(); -Target &getTheMips64Target(); -Target &getTheMips64elTarget(); - -} // end namespace llvm - static MCDisassembler *createMipsDisassembler( const Target &T, const MCSubtargetInfo &STI, diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp deleted file mode 100644 index 73732a40bb8a..000000000000 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ /dev/null @@ -1,288 +0,0 @@ -//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an Mips MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#include "MipsInstPrinter.h" -#include "MCTargetDesc/MipsMCExpr.h" -#include "MipsInstrInfo.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#define PRINT_ALIAS_INSTR -#include "MipsGenAsmWriter.inc" - -template -static bool isReg(const MCInst &MI, unsigned OpNo) { - assert(MI.getOperand(OpNo).isReg() && "Register operand expected."); - return MI.getOperand(OpNo).getReg() == R; -} - -const char* Mips::MipsFCCToString(Mips::CondCode CC) { - switch (CC) { - case FCOND_F: - case FCOND_T: return "f"; - case FCOND_UN: - case FCOND_OR: return "un"; - case FCOND_OEQ: - case FCOND_UNE: return "eq"; - case FCOND_UEQ: - case FCOND_ONE: return "ueq"; - case FCOND_OLT: - case FCOND_UGE: return "olt"; - case FCOND_ULT: - case FCOND_OGE: return "ult"; - case FCOND_OLE: - case FCOND_UGT: return "ole"; - case FCOND_ULE: - case FCOND_OGT: return "ule"; - case FCOND_SF: - case FCOND_ST: return "sf"; - case FCOND_NGLE: - case FCOND_GLE: return "ngle"; - case FCOND_SEQ: - case FCOND_SNE: return "seq"; - case FCOND_NGL: - case FCOND_GL: return "ngl"; - case FCOND_LT: - case FCOND_NLT: return "lt"; - case FCOND_NGE: - case FCOND_GE: return "nge"; - case FCOND_LE: - case FCOND_NLE: return "le"; - case FCOND_NGT: - case FCOND_GT: return "ngt"; - } - llvm_unreachable("Impossible condition code!"); -} - -void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << '$' << StringRef(getRegisterName(RegNo)).lower(); -} - -void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { - switch (MI->getOpcode()) { - default: - break; - case Mips::RDHWR: - case Mips::RDHWR64: - O << "\t.set\tpush\n"; - O << "\t.set\tmips32r2\n"; - break; - case 
Mips::Save16: - O << "\tsave\t"; - printSaveRestore(MI, O); - O << " # 16 bit inst\n"; - return; - case Mips::SaveX16: - O << "\tsave\t"; - printSaveRestore(MI, O); - O << "\n"; - return; - case Mips::Restore16: - O << "\trestore\t"; - printSaveRestore(MI, O); - O << " # 16 bit inst\n"; - return; - case Mips::RestoreX16: - O << "\trestore\t"; - printSaveRestore(MI, O); - O << "\n"; - return; - } - - // Try to print any aliases first. - if (!printAliasInstr(MI, O) && !printAlias(*MI, O)) - printInstruction(MI, O); - printAnnotation(O, Annot); - - switch (MI->getOpcode()) { - default: - break; - case Mips::RDHWR: - case Mips::RDHWR64: - O << "\n\t.set\tpop"; - } -} - -void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - printRegName(O, Op.getReg()); - return; - } - - if (Op.isImm()) { - O << formatImm(Op.getImm()); - return; - } - - assert(Op.isExpr() && "unknown operand kind in printOperand"); - Op.getExpr()->print(O, &MAI, true); -} - -template -void MipsInstPrinter::printUImm(const MCInst *MI, int opNum, raw_ostream &O) { - const MCOperand &MO = MI->getOperand(opNum); - if (MO.isImm()) { - uint64_t Imm = MO.getImm(); - Imm -= Offset; - Imm &= (1 << Bits) - 1; - Imm += Offset; - O << formatImm(Imm); - return; - } - - printOperand(MI, opNum, O); -} - -void MipsInstPrinter:: -printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) { - // Load/Store memory operands -- imm($reg) - // If PIC target the target is loaded as the - // pattern lw $25,%call16($28) - - // opNum can be invalid if instruction had reglist as operand. - // MemOperand is always last operand of instruction (base + offset). - switch (MI->getOpcode()) { - default: - break; - case Mips::SWM32_MM: - case Mips::LWM32_MM: - case Mips::SWM16_MM: - case Mips::SWM16_MMR6: - case Mips::LWM16_MM: - case Mips::LWM16_MMR6: - opNum = MI->getNumOperands() - 2; - break; - } - - printOperand(MI, opNum+1, O); - O << "("; - printOperand(MI, opNum, O); - O << ")"; -} - -void MipsInstPrinter:: -printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O) { - // when using stack locations for not load/store instructions - // print the same way as all normal 3 operand instructions. 
- printOperand(MI, opNum, O); - O << ", "; - printOperand(MI, opNum+1, O); -} - -void MipsInstPrinter:: -printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) { - const MCOperand& MO = MI->getOperand(opNum); - O << MipsFCCToString((Mips::CondCode)MO.getImm()); -} - -void MipsInstPrinter:: -printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) { - llvm_unreachable("TODO"); -} - -bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI, - unsigned OpNo, raw_ostream &OS) { - OS << "\t" << Str << "\t"; - printOperand(&MI, OpNo, OS); - return true; -} - -bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI, - unsigned OpNo0, unsigned OpNo1, - raw_ostream &OS) { - printAlias(Str, MI, OpNo0, OS); - OS << ", "; - printOperand(&MI, OpNo1, OS); - return true; -} - -bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) { - switch (MI.getOpcode()) { - case Mips::BEQ: - case Mips::BEQ_MM: - // beq $zero, $zero, $L2 => b $L2 - // beq $r0, $zero, $L2 => beqz $r0, $L2 - return (isReg(MI, 0) && isReg(MI, 1) && - printAlias("b", MI, 2, OS)) || - (isReg(MI, 1) && printAlias("beqz", MI, 0, 2, OS)); - case Mips::BEQ64: - // beq $r0, $zero, $L2 => beqz $r0, $L2 - return isReg(MI, 1) && printAlias("beqz", MI, 0, 2, OS); - case Mips::BNE: - case Mips::BNE_MM: - // bne $r0, $zero, $L2 => bnez $r0, $L2 - return isReg(MI, 1) && printAlias("bnez", MI, 0, 2, OS); - case Mips::BNE64: - // bne $r0, $zero, $L2 => bnez $r0, $L2 - return isReg(MI, 1) && printAlias("bnez", MI, 0, 2, OS); - case Mips::BGEZAL: - // bgezal $zero, $L1 => bal $L1 - return isReg(MI, 0) && printAlias("bal", MI, 1, OS); - case Mips::BC1T: - // bc1t $fcc0, $L1 => bc1t $L1 - return isReg(MI, 0) && printAlias("bc1t", MI, 1, OS); - case Mips::BC1F: - // bc1f $fcc0, $L1 => bc1f $L1 - return isReg(MI, 0) && printAlias("bc1f", MI, 1, OS); - case Mips::JALR: - // jalr $ra, $r1 => jalr $r1 - return isReg(MI, 0) && printAlias("jalr", MI, 1, OS); - case Mips::JALR64: - // jalr $ra, $r1 => jalr $r1 - return isReg(MI, 0) && printAlias("jalr", MI, 1, OS); - case Mips::NOR: - case Mips::NOR_MM: - case Mips::NOR_MMR6: - // nor $r0, $r1, $zero => not $r0, $r1 - return isReg(MI, 2) && printAlias("not", MI, 0, 1, OS); - case Mips::NOR64: - // nor $r0, $r1, $zero => not $r0, $r1 - return isReg(MI, 2) && printAlias("not", MI, 0, 1, OS); - case Mips::OR: - // or $r0, $r1, $zero => move $r0, $r1 - return isReg(MI, 2) && printAlias("move", MI, 0, 1, OS); - default: return false; - } -} - -void MipsInstPrinter::printSaveRestore(const MCInst *MI, raw_ostream &O) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - if (i != 0) O << ", "; - if (MI->getOperand(i).isReg()) - printRegName(O, MI->getOperand(i).getReg()); - else - printUImm<16>(MI, i, O); - } -} - -void MipsInstPrinter:: -printRegisterList(const MCInst *MI, int opNum, raw_ostream &O) { - // - 2 because register List is always first operand of instruction and it is - // always followed by memory operand (base + offset). 
- for (int i = opNum, e = MI->getNumOperands() - 2; i != e; ++i) { - if (i != opNum) - O << ", "; - printRegName(O, MI->getOperand(i).getReg()); - } -} diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h deleted file mode 100644 index f02443ee21d3..000000000000 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h +++ /dev/null @@ -1,113 +0,0 @@ -//=== MipsInstPrinter.h - Convert Mips MCInst to assembly syntax -*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints a Mips MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H -#define LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { -// These enumeration declarations were originally in MipsInstrInfo.h but -// had to be moved here to avoid circular dependencies between -// LLVMMipsCodeGen and LLVMMipsAsmPrinter. -namespace Mips { -// Mips Branch Codes -enum FPBranchCode { - BRANCH_F, - BRANCH_T, - BRANCH_FL, - BRANCH_TL, - BRANCH_INVALID -}; - -// Mips Condition Codes -enum CondCode { - // To be used with float branch True - FCOND_F, - FCOND_UN, - FCOND_OEQ, - FCOND_UEQ, - FCOND_OLT, - FCOND_ULT, - FCOND_OLE, - FCOND_ULE, - FCOND_SF, - FCOND_NGLE, - FCOND_SEQ, - FCOND_NGL, - FCOND_LT, - FCOND_NGE, - FCOND_LE, - FCOND_NGT, - - // To be used with float branch False - // This conditions have the same mnemonic as the - // above ones, but are used with a branch False; - FCOND_T, - FCOND_OR, - FCOND_UNE, - FCOND_ONE, - FCOND_UGE, - FCOND_OGE, - FCOND_UGT, - FCOND_OGT, - FCOND_ST, - FCOND_GLE, - FCOND_SNE, - FCOND_GL, - FCOND_NLT, - FCOND_GE, - FCOND_NLE, - FCOND_GT -}; - -const char *MipsFCCToString(Mips::CondCode CC); -} // end namespace Mips - -class MipsInstPrinter : public MCInstPrinter { -public: - MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - // Autogenerated by tblgen. 
- void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - - bool printAliasInstr(const MCInst *MI, raw_ostream &OS); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - -private: - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - template - void printUImm(const MCInst *MI, int opNum, raw_ostream &O); - void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O); - void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O); - void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O); - void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O); - - bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo, - raw_ostream &OS); - bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo0, - unsigned OpNo1, raw_ostream &OS); - bool printAlias(const MCInst &MI, raw_ostream &OS); - void printSaveRestore(const MCInst *MI, raw_ostream &O); - void printRegisterList(const MCInst *MI, int opNum, raw_ostream &O); -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp index 4a2b75b9ae46..fca1149453c9 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp @@ -1,9 +1,8 @@ //===- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h index 68bf3829aab5..239e55495e9d 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h +++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h @@ -1,9 +1,8 @@ //===- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp index 18d7dd99be34..bdd190fc17c9 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp @@ -1,9 +1,8 @@ //===---- MipsABIInfo.cpp - Information about MIPS ABI's ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -15,6 +14,13 @@
 
 using namespace llvm;
 
+// Note: this option is defined here to be visible from libLLVMMipsAsmParser
+// and libLLVMMipsCodeGen
+cl::opt<bool>
+EmitJalrReloc("mips-jalr-reloc", cl::Hidden,
+              cl::desc("MIPS: Emit R_{MICRO}MIPS_JALR relocation with jalr"),
+              cl::init(true));
+
 namespace {
 static const MCPhysReg O32IntRegs[4] = {Mips::A0, Mips::A1, Mips::A2, Mips::A3};
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
index 9372a3c2bb1f..534e6573b63c 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
@@ -1,9 +1,8 @@
 //===---- MipsABIInfo.h - Information about MIPS ABI's --------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 265d1141cb0b..859f9cbbca07 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -1,9 +1,8 @@
 //===-- MipsAsmBackend.cpp - Mips Asm Backend ----------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -303,7 +302,7 @@ void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
 
 Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const {
   return StringSwitch<Optional<MCFixupKind>>(Name)
-      .Case("R_MIPS_NONE", (MCFixupKind)Mips::fixup_Mips_NONE)
+      .Case("R_MIPS_NONE", FK_NONE)
       .Case("R_MIPS_32", FK_Data_4)
       .Case("R_MIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_Mips_GOT_PAGE)
       .Case("R_MIPS_CALL_HI16", (MCFixupKind)Mips::fixup_Mips_CALL_HI16)
@@ -351,7 +350,6 @@ getFixupKindInfo(MCFixupKind Kind) const {
     // MipsFixupKinds.h.
     //
     // name                    offset  bits  flags
-    { "fixup_Mips_NONE",         0,      0,   0 },
     { "fixup_Mips_16",           0,     16,   0 },
     { "fixup_Mips_32",           0,     32,   0 },
     { "fixup_Mips_REL32",        0,     32,   0 },
@@ -431,7 +429,6 @@ getFixupKindInfo(MCFixupKind Kind) const {
     // MipsFixupKinds.h.
// // name offset bits flags - { "fixup_Mips_NONE", 0, 0, 0 }, { "fixup_Mips_16", 16, 16, 0 }, { "fixup_Mips_32", 0, 32, 0 }, { "fixup_Mips_REL32", 0, 32, 0 }, diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 30359132e92b..4d7e36995ae4 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -1,9 +1,8 @@ //===-- MipsAsmBackend.h - Mips Asm Backend ------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h index a90db2384c46..6d8cb264158f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h +++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h @@ -1,9 +1,8 @@ //===-- MipsBaseInfo.h - Top level definitions for MIPS MC ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -89,7 +88,10 @@ namespace MipsII { MO_GOT_HI16, MO_GOT_LO16, MO_CALL_HI16, - MO_CALL_LO16 + MO_CALL_LO16, + + /// Helper operand used to generate R_MIPS_JALR + MO_JALR }; enum { diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 8ace2895d681..cf7bae98a27f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- MipsELFObjectWriter.cpp - Mips ELF Writer -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -223,7 +222,7 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx, unsigned Kind = (unsigned)Fixup.getKind(); switch (Kind) { - case Mips::fixup_Mips_NONE: + case FK_NONE: return ELF::R_MIPS_NONE; case FK_Data_1: Ctx.reportError(Fixup.getLoc(), diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index 21b01e850967..1b83e9445fb5 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -1,9 +1,8 @@ //===-------- MipsELFStreamer.cpp - ELF Object Output ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -35,7 +34,7 @@ MipsELFStreamer::MipsELFStreamer(MCContext &Context, } void MipsELFStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI, bool) { + const MCSubtargetInfo &STI) { MCELFStreamer::EmitInstruction(Inst, STI); MCContext &Context = getContext(); diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index 56a0ff96c7bd..2febfbc69b6f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -1,9 +1,8 @@ //===- MipsELFStreamer.h - ELF Object Output --------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -42,8 +41,7 @@ public: /// \p Inst is actually emitted. For example, we can inspect the operands and /// gather sufficient information that allows us to reason about the register /// usage for the translation unit. - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - bool = false) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; /// Overriding this function allows us to record all labels that should be /// marked as microMIPS. Based on this data marking is done in diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index eedad16dddc3..b83d822bd8d0 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -1,9 +1,8 @@ //===-- MipsFixupKinds.h - Mips Specific Fixup Entries ----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -23,11 +22,8 @@ namespace Mips { // in MipsAsmBackend.cpp. // enum Fixups { - // Branch fixups resulting in R_MIPS_NONE. - fixup_Mips_NONE = FirstTargetFixupKind, - // Branch fixups resulting in R_MIPS_16. - fixup_Mips_16, + fixup_Mips_16 = FirstTargetFixupKind, // Pure 32 bit data fixup resulting in - R_MIPS_32. fixup_Mips_32, diff --git a/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp b/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp new file mode 100644 index 000000000000..fb290a8e3f26 --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp @@ -0,0 +1,287 @@ +//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Mips MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstPrinter.h"
+#include "MipsInstrInfo.h"
+#include "MipsMCExpr.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define PRINT_ALIAS_INSTR
+#include "MipsGenAsmWriter.inc"
+
+template <unsigned R>
+static bool isReg(const MCInst &MI, unsigned OpNo) {
+  assert(MI.getOperand(OpNo).isReg() && "Register operand expected.");
+  return MI.getOperand(OpNo).getReg() == R;
+}
+
+const char* Mips::MipsFCCToString(Mips::CondCode CC) {
+  switch (CC) {
+  case FCOND_F:
+  case FCOND_T:   return "f";
+  case FCOND_UN:
+  case FCOND_OR:  return "un";
+  case FCOND_OEQ:
+  case FCOND_UNE: return "eq";
+  case FCOND_UEQ:
+  case FCOND_ONE: return "ueq";
+  case FCOND_OLT:
+  case FCOND_UGE: return "olt";
+  case FCOND_ULT:
+  case FCOND_OGE: return "ult";
+  case FCOND_OLE:
+  case FCOND_UGT: return "ole";
+  case FCOND_ULE:
+  case FCOND_OGT: return "ule";
+  case FCOND_SF:
+  case FCOND_ST:  return "sf";
+  case FCOND_NGLE:
+  case FCOND_GLE: return "ngle";
+  case FCOND_SEQ:
+  case FCOND_SNE: return "seq";
+  case FCOND_NGL:
+  case FCOND_GL:  return "ngl";
+  case FCOND_LT:
+  case FCOND_NLT: return "lt";
+  case FCOND_NGE:
+  case FCOND_GE:  return "nge";
+  case FCOND_LE:
+  case FCOND_NLE: return "le";
+  case FCOND_NGT:
+  case FCOND_GT:  return "ngt";
+  }
+  llvm_unreachable("Impossible condition code!");
+}
+
+void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << '$' << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                StringRef Annot, const MCSubtargetInfo &STI) {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::RDHWR:
+  case Mips::RDHWR64:
+    O << "\t.set\tpush\n";
+    O << "\t.set\tmips32r2\n";
+    break;
+  case Mips::Save16:
+    O << "\tsave\t";
+    printSaveRestore(MI, O);
+    O << " # 16 bit inst\n";
+    return;
+  case Mips::SaveX16:
+    O << "\tsave\t";
+    printSaveRestore(MI, O);
+    O << "\n";
+    return;
+  case Mips::Restore16:
+    O << "\trestore\t";
+    printSaveRestore(MI, O);
+    O << " # 16 bit inst\n";
+    return;
+  case Mips::RestoreX16:
+    O << "\trestore\t";
+    printSaveRestore(MI, O);
+    O << "\n";
+    return;
+  }
+
+  // Try to print any aliases first.
+  if (!printAliasInstr(MI, O) && !printAlias(*MI, O))
+    printInstruction(MI, O);
+  printAnnotation(O, Annot);
+
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::RDHWR:
+  case Mips::RDHWR64:
+    O << "\n\t.set\tpop";
+  }
+}
+
+void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+    return;
+  }
+
+  if (Op.isImm()) {
+    O << formatImm(Op.getImm());
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  Op.getExpr()->print(O, &MAI, true);
+}
+
+template <unsigned Bits, unsigned Offset>
+void MipsInstPrinter::printUImm(const MCInst *MI, int opNum, raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm()) {
+    uint64_t Imm = MO.getImm();
+    Imm -= Offset;
+    Imm &= (1 << Bits) - 1;
+    Imm += Offset;
+    O << formatImm(Imm);
+    return;
+  }
+
+  printOperand(MI, opNum, O);
+}
+
+void MipsInstPrinter::
+printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  // Load/Store memory operands -- imm($reg)
+  // If PIC target the target is loaded as the
+  // pattern lw $25,%call16($28)
+
+  // opNum can be invalid if instruction had reglist as operand.
+  // MemOperand is always last operand of instruction (base + offset).
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::SWM32_MM:
+  case Mips::LWM32_MM:
+  case Mips::SWM16_MM:
+  case Mips::SWM16_MMR6:
+  case Mips::LWM16_MM:
+  case Mips::LWM16_MMR6:
+    opNum = MI->getNumOperands() - 2;
+    break;
+  }
+
+  printOperand(MI, opNum+1, O);
+  O << "(";
+  printOperand(MI, opNum, O);
+  O << ")";
+}
+
+void MipsInstPrinter::
+printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O) {
+  // When using stack locations for non-load/store instructions, print them
+  // the same way as all normal 3-operand instructions.
+ printOperand(MI, opNum, O); + O << ", "; + printOperand(MI, opNum+1, O); +} + +void MipsInstPrinter:: +printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) { + const MCOperand& MO = MI->getOperand(opNum); + O << MipsFCCToString((Mips::CondCode)MO.getImm()); +} + +void MipsInstPrinter:: +printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) { + llvm_unreachable("TODO"); +} + +bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI, + unsigned OpNo, raw_ostream &OS) { + OS << "\t" << Str << "\t"; + printOperand(&MI, OpNo, OS); + return true; +} + +bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI, + unsigned OpNo0, unsigned OpNo1, + raw_ostream &OS) { + printAlias(Str, MI, OpNo0, OS); + OS << ", "; + printOperand(&MI, OpNo1, OS); + return true; +} + +bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) { + switch (MI.getOpcode()) { + case Mips::BEQ: + case Mips::BEQ_MM: + // beq $zero, $zero, $L2 => b $L2 + // beq $r0, $zero, $L2 => beqz $r0, $L2 + return (isReg(MI, 0) && isReg(MI, 1) && + printAlias("b", MI, 2, OS)) || + (isReg(MI, 1) && printAlias("beqz", MI, 0, 2, OS)); + case Mips::BEQ64: + // beq $r0, $zero, $L2 => beqz $r0, $L2 + return isReg(MI, 1) && printAlias("beqz", MI, 0, 2, OS); + case Mips::BNE: + case Mips::BNE_MM: + // bne $r0, $zero, $L2 => bnez $r0, $L2 + return isReg(MI, 1) && printAlias("bnez", MI, 0, 2, OS); + case Mips::BNE64: + // bne $r0, $zero, $L2 => bnez $r0, $L2 + return isReg(MI, 1) && printAlias("bnez", MI, 0, 2, OS); + case Mips::BGEZAL: + // bgezal $zero, $L1 => bal $L1 + return isReg(MI, 0) && printAlias("bal", MI, 1, OS); + case Mips::BC1T: + // bc1t $fcc0, $L1 => bc1t $L1 + return isReg(MI, 0) && printAlias("bc1t", MI, 1, OS); + case Mips::BC1F: + // bc1f $fcc0, $L1 => bc1f $L1 + return isReg(MI, 0) && printAlias("bc1f", MI, 1, OS); + case Mips::JALR: + // jalr $ra, $r1 => jalr $r1 + return isReg(MI, 0) && printAlias("jalr", MI, 1, OS); + case Mips::JALR64: + // jalr $ra, $r1 => jalr $r1 + return isReg(MI, 0) && printAlias("jalr", MI, 1, OS); + case Mips::NOR: + case Mips::NOR_MM: + case Mips::NOR_MMR6: + // nor $r0, $r1, $zero => not $r0, $r1 + return isReg(MI, 2) && printAlias("not", MI, 0, 1, OS); + case Mips::NOR64: + // nor $r0, $r1, $zero => not $r0, $r1 + return isReg(MI, 2) && printAlias("not", MI, 0, 1, OS); + case Mips::OR: + // or $r0, $r1, $zero => move $r0, $r1 + return isReg(MI, 2) && printAlias("move", MI, 0, 1, OS); + default: return false; + } +} + +void MipsInstPrinter::printSaveRestore(const MCInst *MI, raw_ostream &O) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (i != 0) O << ", "; + if (MI->getOperand(i).isReg()) + printRegName(O, MI->getOperand(i).getReg()); + else + printUImm<16>(MI, i, O); + } +} + +void MipsInstPrinter:: +printRegisterList(const MCInst *MI, int opNum, raw_ostream &O) { + // - 2 because register List is always first operand of instruction and it is + // always followed by memory operand (base + offset). 
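// Illustrative note, not part of the patch: given a hypothetical microMIPS
// load/store-multiple such as
//   swm32 $16, $17, $ra, 8($sp)
// the MCInst operands are ($16, $17, $ra, $sp, 8): the register list comes
// first and the trailing two operands form the memory operand, so the loop
// below stops at getNumOperands() - 2 and prints "$16, $17, $ra", while
// printMemOperand() above prints "8($sp)".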
+ for (int i = opNum, e = MI->getNumOperands() - 2; i != e; ++i) { + if (i != opNum) + O << ", "; + printRegName(O, MI->getOperand(i).getReg()); + } +} diff --git a/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h b/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h new file mode 100644 index 000000000000..a34a5c1d6418 --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h @@ -0,0 +1,112 @@ +//=== MipsInstPrinter.h - Convert Mips MCInst to assembly syntax -*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a Mips MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSINSTPRINTER_H +#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSINSTPRINTER_H +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { +// These enumeration declarations were originally in MipsInstrInfo.h but +// had to be moved here to avoid circular dependencies between +// LLVMMipsCodeGen and LLVMMipsAsmPrinter. +namespace Mips { +// Mips Branch Codes +enum FPBranchCode { + BRANCH_F, + BRANCH_T, + BRANCH_FL, + BRANCH_TL, + BRANCH_INVALID +}; + +// Mips Condition Codes +enum CondCode { + // To be used with float branch True + FCOND_F, + FCOND_UN, + FCOND_OEQ, + FCOND_UEQ, + FCOND_OLT, + FCOND_ULT, + FCOND_OLE, + FCOND_ULE, + FCOND_SF, + FCOND_NGLE, + FCOND_SEQ, + FCOND_NGL, + FCOND_LT, + FCOND_NGE, + FCOND_LE, + FCOND_NGT, + + // To be used with float branch False + // This conditions have the same mnemonic as the + // above ones, but are used with a branch False; + FCOND_T, + FCOND_OR, + FCOND_UNE, + FCOND_ONE, + FCOND_UGE, + FCOND_OGE, + FCOND_UGT, + FCOND_OGT, + FCOND_ST, + FCOND_GLE, + FCOND_SNE, + FCOND_GL, + FCOND_NLT, + FCOND_GE, + FCOND_NLE, + FCOND_GT +}; + +const char *MipsFCCToString(Mips::CondCode CC); +} // end namespace Mips + +class MipsInstPrinter : public MCInstPrinter { +public: + MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + // Autogenerated by tblgen. 
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+
+private:
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  template <unsigned Bits, unsigned Offset = 0>
+  void printUImm(const MCInst *MI, int opNum, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
+  void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
+
+  bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
+                  raw_ostream &OS);
+  bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo0,
+                  unsigned OpNo1, raw_ostream &OS);
+  bool printAlias(const MCInst &MI, raw_ostream &OS);
+  void printSaveRestore(const MCInst *MI, raw_ostream &O);
+  void printRegisterList(const MCInst *MI, int opNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 1506b4a83649..ec78158d387d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -1,9 +1,8 @@
 //===-- MipsMCAsmInfo.cpp - Mips Asm Properties ---------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index d4ccf0349c16..867f4d223de4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -1,9 +1,8 @@
 //===-- MipsMCAsmInfo.h - Mips Asm Info ------------------------*- C++ -*--===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index f43a4d980f92..759a7fdb32b8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
 //===-- MipsMCCodeEmitter.cpp - Convert Mips Code to Machine Code ---------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -186,7 +185,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, // Check for unimplemented opcodes. // Unfortunately in MIPS both NOP and SLL will come in with Binary == 0 // so we have to special check for them. - unsigned Opcode = TmpInst.getOpcode(); + const unsigned Opcode = TmpInst.getOpcode(); if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) && (Opcode != Mips::SLL_MM) && (Opcode != Mips::SLL_MMR6) && !Binary) llvm_unreachable("unimplemented opcode in encodeInstruction()"); @@ -209,7 +208,6 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, if (Fixups.size() > N) Fixups.pop_back(); - Opcode = NewOpcode; TmpInst.setOpcode (NewOpcode); Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI); } @@ -614,8 +612,9 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl &Fixups, llvm_unreachable("Unhandled fixup kind!"); break; case MipsMCExpr::MEK_DTPREL: - llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only"); - break; + // MEK_DTPREL is used for marking TLS DIEExpr only + // and contains a regular sub-expression. + return getExprOpValue(MipsExpr->getSubExpr(), Fixups, STI); case MipsMCExpr::MEK_CALL_HI16: FixupKind = Mips::fixup_Mips_CALL_HI16; break; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h index 09d50d4776ba..ff6e1d62b05f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h @@ -1,9 +1,8 @@ //===- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp index 99857e083c6c..680806c4deb2 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp @@ -1,9 +1,8 @@ //===-- MipsMCExpr.cpp - Mips specific MC expression classes --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -44,8 +43,10 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { llvm_unreachable("MEK_None and MEK_Special are invalid"); break; case MEK_DTPREL: - llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only"); - break; + // MEK_DTPREL is used for marking TLS DIEExpr only + // and contains a regular sub-expression. 
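// Illustrative note, not part of the patch: a MEK_DTPREL node is only built
// while emitting TLS debug info and simply wraps an ordinary expression,
// e.g. (hypothetical construction)
//   const MCExpr *Sym = MCSymbolRefExpr::create(TlsSym, Ctx);
//   const MipsMCExpr *DT =
//       MipsMCExpr::create(MipsMCExpr::MEK_DTPREL, Sym, Ctx);
// so this file's hunks make printing, relocation evaluation, and TLS fixup
// walking forward to the wrapped sub-expression instead of asserting.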
+    getSubExpr()->print(OS, MAI, true);
+    return;
   case MEK_CALL_HI16:
     OS << "%call_hi";
     break;
@@ -161,7 +162,9 @@ MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
   case MEK_Special:
     llvm_unreachable("MEK_None and MEK_Special are invalid");
   case MEK_DTPREL:
-    llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
+    // MEK_DTPREL is used for marking TLS DIEExpr only
+    // and contains a regular sub-expression.
+    return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
   case MEK_DTPREL_HI:
   case MEK_DTPREL_LO:
   case MEK_GOT:
@@ -249,9 +252,6 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
   case MEK_Special:
     llvm_unreachable("MEK_None and MEK_Special are invalid");
     break;
-  case MEK_DTPREL:
-    llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
-    break;
   case MEK_CALL_HI16:
   case MEK_CALL_LO16:
   case MEK_GOT:
@@ -274,6 +274,7 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
     if (const MipsMCExpr *E = dyn_cast<MipsMCExpr>(getSubExpr()))
       E->fixELFSymbolsInTLSFixups(Asm);
     break;
+  case MEK_DTPREL:
   case MEK_DTPREL_HI:
   case MEK_DTPREL_LO:
   case MEK_TLSLDM:
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index bf3274ab5d17..edc12e87e9b6 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -1,9 +1,8 @@
 //===- MipsMCExpr.h - Mips specific MC expression classes -------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
index 988629ed1bca..ad5aff6552f6 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -1,9 +1,8 @@
 //===-- MipsMCNaCl.h - NaCl-related declarations --------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index a8cd7b0d9b03..ddeec03ba784 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -1,9 +1,8 @@
 //===-- MipsMCTargetDesc.cpp - Mips Target Descriptions -------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,12 +11,13 @@ //===----------------------------------------------------------------------===// #include "MipsMCTargetDesc.h" -#include "InstPrinter/MipsInstPrinter.h" #include "MipsAsmBackend.h" #include "MipsELFStreamer.h" +#include "MipsInstPrinter.h" #include "MipsMCAsmInfo.h" #include "MipsMCNaCl.h" #include "MipsTargetStreamer.h" +#include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCELFStreamer.h" @@ -85,7 +85,7 @@ static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI = new MipsMCAsmInfo(TT); unsigned SP = MRI.getDwarfRegNum(Mips::SP, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfaRegister(nullptr, SP); MAI->addInitialFrameState(Inst); return MAI; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index 4fc174ab5871..809be99ff3f4 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- MipsMCTargetDesc.h - Mips Target Descriptions -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,11 +32,6 @@ class Triple; class raw_ostream; class raw_pwrite_stream; -Target &getTheMipsTarget(); -Target &getTheMipselTarget(); -Target &getTheMips64Target(); -Target &getTheMips64elTarget(); - MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index 6bf62ea618b4..c050db8a17fd 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -1,9 +1,8 @@ //===-- MipsNaClELFStreamer.cpp - ELF Object Output for Mips NaCl ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -144,8 +143,8 @@ private: public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to mask dangerous instructions. - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - bool) override { + void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) override { // Sandbox indirect jumps. 
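    // [Editorial sketch: the note below is not part of the upstream patch.
    // Sandboxing conceptually rewrites an indirect jump into a masked
    // bundle, along the lines of:
    //
    //   and $t9, $t9, $mask   # clamp the target into the sandbox
    //   jr  $t9               # branch only after masking
    //
    // Register and mask names here are placeholders; the real choice is
    // made by the NaCl masking helpers in this file.]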
if (isIndirectJump(Inst)) { if (PendingCall) diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp index 2d84528e7469..b4ebb9d18b72 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp @@ -1,9 +1,8 @@ //===- MipsOptionRecord.cpp - Abstraction for storing information ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 58f9717e1cc6..e3bdb3b140a8 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -1,9 +1,8 @@ //===-- MipsTargetStreamer.cpp - Mips Target Streamer Methods -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "MipsTargetStreamer.h" -#include "InstPrinter/MipsInstPrinter.h" +#include "MipsInstPrinter.h" #include "MCTargetDesc/MipsABIInfo.h" #include "MipsELFStreamer.h" #include "MipsMCExpr.h" @@ -36,7 +35,7 @@ static cl::opt RoundSectionSizes( } // end anonymous namespace MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) - : MCTargetStreamer(S), ModuleDirectiveAllowed(true) { + : MCTargetStreamer(S), GPReg(Mips::GP), ModuleDirectiveAllowed(true) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; } void MipsTargetStreamer::emitDirectiveSetMicroMips() {} @@ -107,6 +106,23 @@ void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetDspr2() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {} +void MipsTargetStreamer::emitDirectiveCpLocal(unsigned RegNo) { + // .cplocal $reg + // This directive forces to use the alternate register for context pointer. + // For example + // .cplocal $4 + // jal foo + // expands to + // ld $25, %call16(foo)($4) + // jalr $25 + + if (!getABI().IsN32() && !getABI().IsN64()) + return; + + GPReg = RegNo; + + forbidModuleDirective(); +} bool MipsTargetStreamer::emitDirectiveCpRestore( int Offset, function_ref GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { @@ -258,8 +274,7 @@ void MipsTargetStreamer::emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI) { /// Emit the $gp restore operation for .cprestore. 
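// [Editorial sketch, not part of the upstream patch: with the new GPReg
// member, a preceding ".cplocal $4" redirects the restore emitted below,
// so the streamer produces
//
//   lw $4, <offset>($sp)    # .cplocal $4 in effect
//
// instead of the default
//
//   lw $gp, <offset>($sp)
//
// The register and offset are illustrative only.]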
void MipsTargetStreamer::emitGPRestore(int Offset, SMLoc IDLoc, const MCSubtargetInfo *STI) { - emitLoadWithImmOffset(Mips::LW, Mips::GP, Mips::SP, Offset, Mips::GP, IDLoc, - STI); + emitLoadWithImmOffset(Mips::LW, GPReg, Mips::SP, Offset, GPReg, IDLoc, STI); } /// Emit a store instruction with an immediate offset. @@ -666,6 +681,12 @@ void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) { forbidModuleDirective(); } +void MipsTargetAsmStreamer::emitDirectiveCpLocal(unsigned RegNo) { + OS << "\t.cplocal\t$" + << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; + MipsTargetStreamer::emitDirectiveCpLocal(RegNo); +} + bool MipsTargetAsmStreamer::emitDirectiveCpRestore( int Offset, function_ref GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { @@ -700,8 +721,11 @@ void MipsTargetAsmStreamer::emitDirectiveCpreturn(unsigned SaveLocation, } void MipsTargetAsmStreamer::emitDirectiveModuleFP() { - OS << "\t.module\tfp="; - OS << ABIFlagsSection.getFpABIString(ABIFlagsSection.getFpABI()) << "\n"; + MipsABIFlagsSection::FpABIKind FpABI = ABIFlagsSection.getFpABI(); + if (FpABI == MipsABIFlagsSection::FpABIKind::SOFT) + OS << "\t.module\tsoftfloat\n"; + else + OS << "\t.module\tfp=" << ABIFlagsSection.getFpABIString(FpABI) << "\n"; } void MipsTargetAsmStreamer::emitDirectiveSetFp( @@ -1133,7 +1157,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { MCInst TmpInst; TmpInst.setOpcode(Mips::LUi); - TmpInst.addOperand(MCOperand::createReg(Mips::GP)); + TmpInst.addOperand(MCOperand::createReg(GPReg)); const MCExpr *HiSym = MipsMCExpr::create( MipsMCExpr::MEK_HI, MCSymbolRefExpr::create("_gp_disp", MCSymbolRefExpr::VK_None, @@ -1145,8 +1169,8 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { TmpInst.clear(); TmpInst.setOpcode(Mips::ADDiu); - TmpInst.addOperand(MCOperand::createReg(Mips::GP)); - TmpInst.addOperand(MCOperand::createReg(Mips::GP)); + TmpInst.addOperand(MCOperand::createReg(GPReg)); + TmpInst.addOperand(MCOperand::createReg(GPReg)); const MCExpr *LoSym = MipsMCExpr::create( MipsMCExpr::MEK_LO, MCSymbolRefExpr::create("_gp_disp", MCSymbolRefExpr::VK_None, @@ -1158,14 +1182,19 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { TmpInst.clear(); TmpInst.setOpcode(Mips::ADDu); - TmpInst.addOperand(MCOperand::createReg(Mips::GP)); - TmpInst.addOperand(MCOperand::createReg(Mips::GP)); + TmpInst.addOperand(MCOperand::createReg(GPReg)); + TmpInst.addOperand(MCOperand::createReg(GPReg)); TmpInst.addOperand(MCOperand::createReg(RegNo)); getStreamer().EmitInstruction(TmpInst, STI); forbidModuleDirective(); } +void MipsTargetELFStreamer::emitDirectiveCpLocal(unsigned RegNo) { + if (Pic) + MipsTargetStreamer::emitDirectiveCpLocal(RegNo); +} + bool MipsTargetELFStreamer::emitDirectiveCpRestore( int Offset, function_ref GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { @@ -1182,7 +1211,7 @@ bool MipsTargetELFStreamer::emitDirectiveCpRestore( return true; // Store the $gp on the stack. 
- emitStoreWithImmOffset(Mips::SW, Mips::GP, Mips::SP, Offset, GetATReg, IDLoc, + emitStoreWithImmOffset(Mips::SW, GPReg, Mips::SP, Offset, GetATReg, IDLoc, STI); return true; } @@ -1203,10 +1232,10 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, // Either store the old $gp in a register or on the stack if (IsReg) { // move $save, $gpreg - emitRRR(Mips::OR64, RegOrOffset, Mips::GP, Mips::ZERO, SMLoc(), &STI); + emitRRR(Mips::OR64, RegOrOffset, GPReg, Mips::ZERO, SMLoc(), &STI); } else { // sd $gpreg, offset($sp) - emitRRI(Mips::SD, Mips::GP, Mips::SP, RegOrOffset, SMLoc(), &STI); + emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI); } if (getABI().IsN32()) { @@ -1219,11 +1248,11 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, MCA.getContext()); // lui $gp, %hi(__gnu_local_gp) - emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI); + emitRX(Mips::LUi, GPReg, MCOperand::createExpr(HiExpr), SMLoc(), &STI); // addiu $gp, $gp, %lo(__gnu_local_gp) - emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr), - SMLoc(), &STI); + emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(), + &STI); return; } @@ -1236,14 +1265,14 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, MCA.getContext()); // lui $gp, %hi(%neg(%gp_rel(funcSym))) - emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI); + emitRX(Mips::LUi, GPReg, MCOperand::createExpr(HiExpr), SMLoc(), &STI); // addiu $gp, $gp, %lo(%neg(%gp_rel(funcSym))) - emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr), - SMLoc(), &STI); + emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(), + &STI); // daddu $gp, $gp, $funcreg - emitRRR(Mips::DADDu, Mips::GP, Mips::GP, RegNo, SMLoc(), &STI); + emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); } void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, @@ -1256,12 +1285,12 @@ void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, // Either restore the old $gp from a register or on the stack if (SaveLocationIsRegister) { Inst.setOpcode(Mips::OR); - Inst.addOperand(MCOperand::createReg(Mips::GP)); + Inst.addOperand(MCOperand::createReg(GPReg)); Inst.addOperand(MCOperand::createReg(SaveLocation)); Inst.addOperand(MCOperand::createReg(Mips::ZERO)); } else { Inst.setOpcode(Mips::LD); - Inst.addOperand(MCOperand::createReg(Mips::GP)); + Inst.addOperand(MCOperand::createReg(GPReg)); Inst.addOperand(MCOperand::createReg(Mips::SP)); Inst.addOperand(MCOperand::createImm(SaveLocation)); } diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td index ed5b8dd71a51..dbff0f6200f2 100644 --- a/lib/Target/Mips/MicroMips32r6InstrFormats.td +++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td @@ -1,9 +1,8 @@ //=- MicroMips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td index 814918d25e70..425773dc57f1 100644 --- a/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -1,9 +1,8 @@ //=- MicroMips32r6InstrInfo.td - MicroMips r6 Instruction Information -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -246,6 +245,7 @@ class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>; class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>; class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>; class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>; +class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>; class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>; class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>; class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>; @@ -460,6 +460,7 @@ class JALRC16_MMR6_DESC_BASE let isCall = 1; let hasDelaySlot = 0; let Defs = [RA]; + let hasPostISelHook = 1; } class JALRC16_MMR6_DESC : JALRC16_MMR6_DESC_BASE<"jalr", GPR32Opnd>; @@ -889,6 +890,8 @@ class FMOV_FNEG_MMR6_DESC_BASE; +class FMOV_D_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>; class FNEG_S_MMR6_DESC : FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>; @@ -1039,7 +1042,7 @@ class TRUNC_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.d", FGR64Opnd, class TRUNC_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>; class TRUNC_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.d", FGR32Opnd, - AFGR64Opnd, II_TRUNC>; + FGR64Opnd, II_TRUNC>; class SQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, fsqrt>; class SQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.d", AFGR64Opnd, AFGR64Opnd, @@ -1210,7 +1213,7 @@ class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd, class SWSP_MMR6_DESC : MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset), !strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>, - MMR6Arch<"sw"> { + MMR6Arch<"swsp"> { let DecoderMethod = "DecodeMemMMSPImm5Lsl2"; let mayStore = 1; } @@ -1461,6 +1464,8 @@ def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC, ISA_MICROMIPS32R6; def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC, ISA_MICROMIPS32R6; +def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC, + ISA_MICROMIPS32R6; def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC, ISA_MICROMIPS32R6; def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6; @@ -1749,6 +1754,8 @@ def : MipsPat<(f32 fpimm0), (MTC1_MMR6 ZERO)>, ISA_MICROMIPS32R6; def : MipsPat<(f32 fpimm0neg), (FNEG_S_MMR6 (MTC1_MMR6 ZERO))>, ISA_MICROMIPS32R6; def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src), (TRUNC_W_D_MMR6 FGR64Opnd:$src)>, ISA_MICROMIPS32R6; 
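// [Editorial note, hedged: the existing pattern above covers f64 sources;
// the pattern added below covers f32 sources, so that IR such as
//
//   %i = fptosi float %x to i32
//
// has a matching trunc.w.s selection under microMIPS32R6. The IR line is
// illustrative only.]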
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src), + (TRUNC_W_S_MMR6 FGR32Opnd:$src)>, ISA_MICROMIPS32R6; def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm), (ANDI16_MMR6 GPRMM16:$src, immZExtAndi16:$imm)>, @@ -1767,6 +1774,19 @@ let AddedComplexity = 41 in { def : StoreRegImmPat, FGR_64, ISA_MICROMIPS32R6; } +let isCall=1, hasDelaySlot=0, isCTI=1, Defs = [RA] in { + class JumpLinkMMR6 : + PseudoSE<(outs), (ins calltarget:$target), [], II_JAL>, + PseudoInstExpansion<(JumpInst Opnd:$target)>; +} + +def JAL_MMR6 : JumpLinkMMR6, ISA_MICROMIPS32R6; + +def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)), + (JAL_MMR6 texternalsym:$dst)>, ISA_MICROMIPS32R6; +def : MipsPat<(MipsJmpLink (iPTR tglobaladdr:$dst)), + (JAL_MMR6 tglobaladdr:$dst)>, ISA_MICROMIPS32R6; + def TAILCALL_MMR6 : TailCall, ISA_MICROMIPS32R6; def TAILCALLREG_MMR6 : TailCallReg, ISA_MICROMIPS32R6; diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td index 0d444dfc9fad..26b6cf8994ca 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td +++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td @@ -1,9 +1,8 @@ //===-- MicroMipsDSPInstrFormats.td - Instruction Formats --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td index 132de6be750d..5a12568893af 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td +++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -1,9 +1,8 @@ //===- MicroMipsDSPInstrInfo.td - Micromips DSP instructions -*- tablegen *-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td index 1731afc1961f..5d87068ff407 100644 --- a/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/lib/Target/Mips/MicroMipsInstrFPU.td @@ -1,9 +1,8 @@ //==- MicroMipsInstrFPU.td - microMIPS FPU Instruction Info -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -114,8 +113,7 @@ multiclass ABSS_MMM, + def _D64_MM : StdMMR6Rel, ABSS_FT, ISA_MICROMIPS, FGR_64 { string DecoderNamespace = "MicroMipsFP64"; } @@ -124,7 +122,7 @@ multiclass ABSS_MMM, ROUND_W_FM_MM<1, 0x28>; defm FABS : ABSS_MMM<"abs.d", II_SQRT_D, fabs>, ABS_FM_MM<1, 0xd>; -let DecoderNamespace = "MicroMips" in { +let DecoderNamespace = "MicroMips", AdditionalPredicates = [UseAbs] in { def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>, ABS_FM_MM<0, 0xd>, ISA_MICROMIPS; } @@ -266,7 +264,7 @@ let DecoderNamespace = "MicroMips" in { ROUND_W_FM_MM<0b1, 0b01001000>, ISA_MICROMIPS, FGR_64; def RSQRT_S_MM : MMRel, ABSS_FT<"rsqrt.s", FGR32Opnd, FGR32Opnd, II_RECIP_S>, - ROUND_W_FM_MM<0b0, 0b00001000>; + ROUND_W_FM_MM<0b0, 0b00001000>, ISA_MICROMIPS; def RSQRT_D32_MM : MMRel, ABSS_FT<"rsqrt.d", AFGR64Opnd, AFGR64Opnd, II_RECIP_D>, ROUND_W_FM_MM<0b1, 0b00001000>, ISA_MICROMIPS, FGR_32 { @@ -425,6 +423,11 @@ def : MipsPat<(f64 (fpextend FGR32Opnd:$src)), def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src), (TRUNC_W_MM AFGR64Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32; +def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src), + (CVT_W_D64_MM FGR64Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6, + FGR_64; +def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src), + (TRUNC_W_S_MM FGR32Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6; // Selects defm : MovzPats0, diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td index 2a4cc279ef0d..e9fb9b310e3b 100644 --- a/lib/Target/Mips/MicroMipsInstrFormats.td +++ b/lib/Target/Mips/MicroMipsInstrFormats.td @@ -1,9 +1,8 @@ //===-- MicroMipsInstrFormats.td - microMIPS Inst Formats -*- tablegen -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td index af380a0ec71e..9b7f7b25fa94 100644 --- a/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/lib/Target/Mips/MicroMipsInstrInfo.td @@ -1,9 +1,8 @@ //===--- MicroMipsInstrFormats.td - microMIPS Inst Defs -*- tablegen -*----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -426,6 +425,7 @@ class JumpLinkRegMM16 : let isCall = 1; let hasDelaySlot = 1; let Defs = [RA]; + let hasPostISelHook = 1; } // 16-bit Jump Reg @@ -654,7 +654,7 @@ def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_simm7_lsl2>, LOAD_GP_FM_MM16<0x19>, ISA_MICROMIPS; def LWSP_MM : LoadSPMM16<"lw", GPR32Opnd, II_LW, mem_mm_sp_imm5_lsl2>, LOAD_STORE_SP_FM_MM16<0x12>, ISA_MICROMIPS; -def SWSP_MM : StoreSPMM16<"sw", GPR32Opnd, II_SW, mem_mm_sp_imm5_lsl2>, +def SWSP_MM : StoreSPMM16<"swsp", GPR32Opnd, II_SW, mem_mm_sp_imm5_lsl2>, LOAD_STORE_SP_FM_MM16<0x32>, ISA_MICROMIPS32_NOT_MIPS32R6; def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16, ISA_MICROMIPS; @@ -694,6 +694,10 @@ def BREAK16_MM : BrkSdbbp16MM<"break16", II_BREAK>, BRKSDBBP16_FM_MM<0x28>, def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, BRKSDBBP16_FM_MM<0x2C>, ISA_MICROMIPS32_NOT_MIPS32R6; +class WaitMM : + InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [], + II_WAIT, FrmOther, opstr>; + let DecoderNamespace = "MicroMips" in { /// Load and Store Instructions - multiple def SWM16_MM : StoreMultMM16<"swm16", II_SWM>, LWM_FM_MM16<0x5>, @@ -706,13 +710,7 @@ let DecoderNamespace = "MicroMips" in { def CTC2_MM : InstSE<(outs COP2Opnd:$impl), (ins GPR32Opnd:$rt), "ctc2\t$rt, $impl", [], II_CTC2, FrmFR, "ctc2">, POOL32A_CFTC2_FM_MM<0b1101110100>, ISA_MICROMIPS; -} - -class WaitMM : - InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [], - II_WAIT, FrmOther, opstr>; -let DecoderNamespace = "MicroMips" in { /// Compact Branch Instructions def BEQZC_MM : CompactBranchMM<"beqzc", brtarget_mm, seteq, GPR32Opnd>, COMPACT_BRANCH_FM_MM<0x7>, ISA_MICROMIPS32_NOT_MIPS32R6; @@ -822,8 +820,7 @@ let DecoderNamespace = "MicroMips" in { def SW_MM : Store<"sw", GPR32Opnd, null_frag, II_SW>, MMRel, LW_FM_MM<0x3e>, ISA_MICROMIPS; } -} -let DecoderNamespace = "MicroMips" in { + let DecoderMethod = "DecodeMemMMImm9" in { def LBE_MM : MMRel, Load<"lbe", GPR32Opnd, null_frag, II_LBE>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>, ISA_MICROMIPS, ASE_EVA; @@ -881,8 +878,7 @@ let DecoderNamespace = "MicroMips" in { def SWR_MM : MMRel, StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12, II_SWR>, LWL_FM_MM<0x9>, ISA_MICROMIPS32_NOT_MIPS32R6; -} -let DecoderNamespace = "MicroMips" in { + /// Load and Store Instructions - multiple def SWM32_MM : StoreMultMM<"swm32", II_SWM>, LWM_FM_MM<0xd>, ISA_MICROMIPS; def LWM32_MM : LoadMultMM<"lwm32", II_LWM>, LWM_FM_MM<0x5>, ISA_MICROMIPS; @@ -1125,7 +1121,8 @@ let AdditionalPredicates = [NotDSP] in { ISA_MICROMIPS32_NOT_MIPS32R6; } -def TAILCALL_MM : TailCall, ISA_MIPS1_NOT_32R6_64R6; +def TAILCALL_MM : TailCall, + ISA_MICROMIPS32_NOT_MIPS32R6; def TAILCALLREG_MM : TailCallReg, ISA_MICROMIPS32_NOT_MIPS32R6; @@ -1139,9 +1136,7 @@ let DecoderNamespace = "MicroMips" in { def LWU_MM : MMRel, LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU, mem_simm12>, LL_FM_MM<0xe>, ISA_MICROMIPS32_NOT_MIPS32R6; -} -let DecoderNamespace = "MicroMips" in { def MFGC0_MM : MMRel, MfCop0MM<"mfgc0", GPR32Opnd, COP0Opnd, II_MFGC0>, POOL32A_MFTC0_FM_MM<0b10011, 0b111100>, ISA_MICROMIPS32R5, ASE_VIRT; @@ -1204,7 +1199,7 @@ def : MipsPat<(atomic_load_32 addr:$a), (LW_MM addr:$a)>, ISA_MICROMIPS; def : MipsPat<(i32 immLi16:$imm), (LI16_MM immLi16:$imm)>, ISA_MICROMIPS; -defm : MaterializeImms, ISA_MICROMIPS; +defm : MaterializeImms, ISA_MICROMIPS; def 
: MipsPat<(not GPRMM16:$in), (NOT16_MM GPRMM16:$in)>, ISA_MICROMIPS; @@ -1453,3 +1448,6 @@ def : MipsInstAlias<"mtgc0 $rt, $rs", def : MipsInstAlias<"mthgc0 $rt, $rs", (MTHGC0_MM COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>, ISA_MICROMIPS32R5, ASE_VIRT; +def : MipsInstAlias<"sw $rt, $offset", + (SWSP_MM GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset), 1>, + ISA_MICROMIPS; diff --git a/lib/Target/Mips/MicroMipsSizeReduction.cpp b/lib/Target/Mips/MicroMipsSizeReduction.cpp index f9062cc23da2..70af95592aa5 100644 --- a/lib/Target/Mips/MicroMipsSizeReduction.cpp +++ b/lib/Target/Mips/MicroMipsSizeReduction.cpp @@ -1,9 +1,8 @@ //=== MicroMipsSizeReduction.cpp - MicroMips size reduction pass --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// ///\file diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index 6bb7aecc867a..b3faaab436f0 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -1,9 +1,8 @@ //===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 2f3a1c399d3e..7b83ea8535ae 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -1,9 +1,8 @@ //===-- Mips.td - Describe the Mips Target Machine ---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This is the top level entry point for the Mips target. 
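// [Editorial note, hedged: the hunk below introduces FeatureAbs2008 and
// folds it into the R6 ISA feature sets. Following the SubtargetFeature
// string, it would be toggled from tools roughly as
//
//   llc -march=mips -mattr=+abs2008 ...
//
// The invocation is inferred from the feature name, not from this patch.]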
@@ -83,6 +82,8 @@ def FeatureFPXX : SubtargetFeature<"fpxx", "IsFPXX", "true", "Support for FPXX">; def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true", "IEEE 754-2008 NaN encoding">; +def FeatureAbs2008 : SubtargetFeature<"abs2008", "Abs2008", "true", + "Disable IEEE 754-2008 abs.fmt mode">; def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat", "true", "Only supports single precision float">; def FeatureSoftFloat : SubtargetFeature<"soft-float", "IsSoftFloat", "true", @@ -142,7 +143,7 @@ def FeatureMips32r6 : SubtargetFeature<"mips32r6", "MipsArchVersion", "Mips32r6", "Mips32r6 ISA Support [experimental]", [FeatureMips32r5, FeatureFP64Bit, - FeatureNaN2008]>; + FeatureNaN2008, FeatureAbs2008]>; def FeatureMips64 : SubtargetFeature<"mips64", "MipsArchVersion", "Mips64", "Mips64 ISA Support", [FeatureMips5, FeatureMips32]>; @@ -159,7 +160,7 @@ def FeatureMips64r6 : SubtargetFeature<"mips64r6", "MipsArchVersion", "Mips64r6", "Mips64r6 ISA Support [experimental]", [FeatureMips32r6, FeatureMips64r5, - FeatureNaN2008]>; + FeatureNaN2008, FeatureAbs2008]>; def FeatureSym32 : SubtargetFeature<"sym32", "HasSym32", "true", "Symbols are 32 bit on Mips64">; diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index 122c1f5377b6..5a2a916a6b7a 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -1,9 +1,8 @@ //===- Mips16FrameLowering.cpp - Mips16 Frame Information -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h index f7fa4dc3d86d..6b62453f8dfe 100644 --- a/lib/Target/Mips/Mips16FrameLowering.h +++ b/lib/Target/Mips/Mips16FrameLowering.h @@ -1,9 +1,8 @@ //===-- Mips16FrameLowering.h - Mips16 frame lowering ----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp index f237bb6d4006..e9a3c7ec4b19 100644 --- a/lib/Target/Mips/Mips16HardFloat.cpp +++ b/lib/Target/Mips/Mips16HardFloat.cpp @@ -1,9 +1,8 @@ //===- Mips16HardFloat.cpp for Mips16 Hard Float --------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -415,7 +414,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M, Attribute::ReadNone); A = A.addAttribute(C, AttributeList::FunctionIndex, Attribute::NoInline); - Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T)); + FunctionCallee F = (M->getOrInsertFunction(Name, A, MyVoid, T)); CallInst::Create(F, Params, "", &I); } else if (const CallInst *CI = dyn_cast(&I)) { FunctionType *FT = CI->getFunctionType(); diff --git a/lib/Target/Mips/Mips16HardFloatInfo.cpp b/lib/Target/Mips/Mips16HardFloatInfo.cpp index 2eb6e5ddd2d9..8a02e8156175 100644 --- a/lib/Target/Mips/Mips16HardFloatInfo.cpp +++ b/lib/Target/Mips/Mips16HardFloatInfo.cpp @@ -1,9 +1,8 @@ //===---- Mips16HardFloatInfo.cpp for Mips16 Hard Float -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips16HardFloatInfo.h b/lib/Target/Mips/Mips16HardFloatInfo.h index 7295c287576d..b8c485b7e2e3 100644 --- a/lib/Target/Mips/Mips16HardFloatInfo.h +++ b/lib/Target/Mips/Mips16HardFloatInfo.h @@ -1,9 +1,8 @@ //===---- Mips16HardFloatInfo.h for Mips16 Hard Float --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index a0d5bd9ef305..3ab4f1e064da 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- Mips16ISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips16 ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.h b/lib/Target/Mips/Mips16ISelDAGToDAG.h index bbf8cc36f241..1ef194029f50 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.h +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.h @@ -1,9 +1,8 @@ //===---- Mips16ISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp index 79df622241a0..6d8e5aef2a3f 100644 --- a/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/lib/Target/Mips/Mips16ISelLowering.cpp @@ -1,9 +1,8 @@ //===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -156,11 +155,8 @@ llvm::createMips16TargetLowering(const MipsTargetMachine &TM, return new Mips16TargetLowering(TM, STI); } -bool -Mips16TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned, - unsigned, - bool *Fast) const { +bool Mips16TargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const { return false; } @@ -463,8 +459,7 @@ getOpndList(SmallVectorImpl &Ops, } // one more look at list of intrinsics const Mips16IntrinsicHelperType *Helper = - std::lower_bound(std::begin(Mips16IntrinsicHelper), - std::end(Mips16IntrinsicHelper), IntrinsicFind); + llvm::lower_bound(Mips16IntrinsicHelper, IntrinsicFind); if (Helper != std::end(Mips16IntrinsicHelper) && *Helper == IntrinsicFind) { Mips16HelperFunction = Helper->Helper; diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h index 0ee0b816ef70..200249933577 100644 --- a/lib/Target/Mips/Mips16ISelLowering.h +++ b/lib/Target/Mips/Mips16ISelLowering.h @@ -1,9 +1,8 @@ //===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -24,6 +23,7 @@ namespace llvm { bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, + MachineMemOperand::Flags Flags, bool *Fast) const override; MachineBasicBlock * diff --git a/lib/Target/Mips/Mips16InstrFormats.td b/lib/Target/Mips/Mips16InstrFormats.td index 4ff68bef957e..f4ac160c2ba5 100644 --- a/lib/Target/Mips/Mips16InstrFormats.td +++ b/lib/Target/Mips/Mips16InstrFormats.td @@ -1,9 +1,8 @@ //===- Mips16InstrFormats.td - Mips Instruction Formats ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index efebc99b5dae..c234c309d760 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -1,9 +1,8 @@ //===- Mips16InstrInfo.cpp - Mips16 Instruction Information ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h index 6a802e4cce5d..dadcaa3055b3 100644 --- a/lib/Target/Mips/Mips16InstrInfo.h +++ b/lib/Target/Mips/Mips16InstrInfo.h @@ -1,9 +1,8 @@ //===- Mips16InstrInfo.h - Mips16 Instruction Information -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td index b7a1b9ce41bf..36b6c73d1008 100644 --- a/lib/Target/Mips/Mips16InstrInfo.td +++ b/lib/Target/Mips/Mips16InstrInfo.td @@ -1,9 +1,8 @@ //===- Mips16InstrInfo.td - Target Description for Mips16 -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -484,13 +483,11 @@ class SelT: // // 32 bit constant // -def Constant32: - MipsPseudo16<(outs), (ins simm32:$imm), "\t.word $imm", []>; +def Constant32 : MipsPseudo16<(outs), (ins simm32:$imm), "\t.word $imm", []>; -def LwConstant32: +def LwConstant32 : MipsPseudo16<(outs CPU16Regs:$rx), (ins simm32:$imm, simm32:$constid), - "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>; - + "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>; // // Some general instruction class info diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp index 751afd5ed369..5703f585a6a2 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- Mips16RegisterInfo.cpp - MIPS16 Register Information --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h index d67a79b64033..fca78b43f96b 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.h +++ b/lib/Target/Mips/Mips16RegisterInfo.h @@ -1,9 +1,8 @@ //===-- Mips16RegisterInfo.h - Mips16 Register Information ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td index 623af570a5e6..ccb6d1df777a 100644 --- a/lib/Target/Mips/Mips32r6InstrFormats.td +++ b/lib/Target/Mips/Mips32r6InstrFormats.td @@ -1,9 +1,8 @@ //=- Mips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 2bd0cf2d59a6..2c3048411a5c 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -1,9 +1,8 @@ //=- Mips32r6InstrInfo.td - Mips32r6 Instruction Information -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -150,7 +149,6 @@ class SELEQZ_ENC : SPECIAL_3R_FM<0b00000, 0b110101>; class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>; class LWPC_ENC : PCREL19_FM; -class LWUPC_ENC : PCREL19_FM; class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>; class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>; @@ -326,7 +324,6 @@ class PCREL_DESC_BASE; class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2, II_LWPC>; -class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2, II_LWUPC>; class ALIGN_DESC_BASE @@ -927,7 +924,6 @@ let AdditionalPredicates = [NotInMicroMips] in { } def LWPC : R6MMR6Rel, LWPC_ENC, LWPC_DESC, ISA_MIPS32R6; let AdditionalPredicates = [NotInMicroMips] in { - def LWUPC : R6MMR6Rel, LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; @@ -1105,7 +1101,7 @@ def : MipsPat<(select i32:$cond, immz, i32:$f), // Pseudo instructions let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1, - hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT] in { + hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT], hasPostISelHook = 1 in { class TailCallRegR6 : PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>, PseudoInstExpansion<(JumpInst RT:$rt, RO:$rs)>; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 5729182deafb..7f35280f7936 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -1,9 +1,8 @@ //===- Mips64InstrInfo.td - Mips64 Instruction Information -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -250,7 +249,7 @@ def SC64 : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_64, def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>, PTR_64; } -def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM; +def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM, PTR_64; /// Jump and Branch Instructions let isCodeGenOnly = 1 in { @@ -267,14 +266,15 @@ let isCodeGenOnly = 1 in { def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>, GPR_64; let AdditionalPredicates = [NoIndirectJumpGuards] in - def JALR64Pseudo : JumpLinkRegPseudo; + def JALR64Pseudo : JumpLinkRegPseudo, + PTR_64; } let AdditionalPredicates = [NotInMicroMips], DecoderNamespace = "Mips64" in { - def JR_HB64 : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6; - def JALR_HB64 : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32R2; + def JR_HB64 : JR_HB_DESC, JR_HB_ENC, ISA_MIPS64_NOT_64R6; + def JALR_HB64 : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS64R2; } -def PseudoReturn64 : PseudoReturnBase; +def PseudoReturn64 : PseudoReturnBase, GPR_64; let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, NoIndirectJumpGuards] in { @@ -290,7 +290,7 @@ let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, ISA_MIPS32R2_NOT_32R6_64R6, PTR_64; def PseudoIndirectHazardBranch64 : PseudoIndirectBranchBase, - ISA_MIPS32R2_NOT_32R6_64R6; + ISA_MIPS32R2_NOT_32R6_64R6, PTR_64; } /// Multiply and Divide Instructions. @@ -332,17 +332,17 @@ def PseudoMTLOHI64 : PseudoMTLOHI, ISA_MIPS3_NOT_32R6_64R6; /// Sign Ext In Register Instructions. def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>, - ISA_MIPS32R2; + ISA_MIPS32R2, GPR_64; def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>, - ISA_MIPS32R2; + ISA_MIPS32R2, GPR_64; } /// Count Leading let AdditionalPredicates = [NotInMicroMips] in { def DCLZ : CountLeading0<"dclz", GPR64Opnd, II_DCLZ>, CLO_FM<0x24>, - ISA_MIPS64_NOT_64R6; + ISA_MIPS64_NOT_64R6, GPR_64; def DCLO : CountLeading1<"dclo", GPR64Opnd, II_DCLO>, CLO_FM<0x25>, - ISA_MIPS64_NOT_64R6; + ISA_MIPS64_NOT_64R6, GPR_64; /// Double Word Swap Bytes/HalfWords def DSBH : SubwordSwap<"dsbh", GPR64Opnd, II_DSBH>, SEB_FM<2, 0x24>, @@ -417,17 +417,25 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in { // explanation. // Expands to: lui $dst, %highest/%higher/%hi/%lo($tgt) -def LONG_BRANCH_LUi2Op_64 : PseudoSE<(outs GPR64Opnd:$dst), - (ins brtarget:$tgt), []>, GPR_64; +def LONG_BRANCH_LUi2Op_64 : + PseudoSE<(outs GPR64Opnd:$dst), (ins brtarget:$tgt), []>, GPR_64 { + bit hasNoSchedulingInfo = 1; +} // Expands to: addiu $dst, %highest/%higher/%hi/%lo($tgt) -def LONG_BRANCH_DADDiu2Op : PseudoSE<(outs GPR64Opnd:$dst), - (ins GPR64Opnd:$src, brtarget:$tgt), []>, GPR_64; - +def LONG_BRANCH_DADDiu2Op : + PseudoSE<(outs GPR64Opnd:$dst), (ins GPR64Opnd:$src, brtarget:$tgt), []>, + GPR_64 { + bit hasNoSchedulingInfo = 1; +} // Expands to: daddiu $dst, $src, %PART($tgt - $baltgt) // where %PART may be %hi or %lo, depending on the relocation kind // that $tgt is annotated with. 
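// [Editorial sketch, not part of the upstream patch: taken together these
// pseudos let the long-branch expansion materialize a 64-bit offset in
// the classic four-part form, roughly
//
//   lui    $at, %highest(tgt)
//   daddiu $at, $at, %higher(tgt)
//   dsll   $at, $at, 16
//   daddiu $at, $at, %hi(tgt)
//   dsll   $at, $at, 16
//   daddiu $at, $at, %lo(tgt)
//
// The register and the exact sequence depend on the expansion site in
// MipsBranchExpansion; shown for orientation only.]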
-def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst), - (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>, GPR_64; +def LONG_BRANCH_DADDiu : + PseudoSE<(outs GPR64Opnd:$dst), + (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>, + GPR_64 { + bit hasNoSchedulingInfo = 1; +} // Cavium Octeon cnMIPS instructions let DecoderNamespace = "CnMips", @@ -580,15 +588,15 @@ def DMTC2_OCTEON : MFC2OP<"dmtc2", GPR64Opnd, II_DMTC2>, MFC2OP_FM<0x12, 5>, } /// Move between CPU and coprocessor registers -let DecoderNamespace = "Mips64", Predicates = [HasMips64] in { +let DecoderNamespace = "Mips64" in { def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd, II_DMFC0>, - MFC3OP_FM<0x10, 1, 0>, ISA_MIPS3; + MFC3OP_FM<0x10, 1, 0>, ISA_MIPS3, GPR_64; def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd, II_DMTC0>, - MFC3OP_FM<0x10, 5, 0>, ISA_MIPS3; + MFC3OP_FM<0x10, 5, 0>, ISA_MIPS3, GPR_64; def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd, II_DMFC2>, - MFC3OP_FM<0x12, 1, 0>, ISA_MIPS3; + MFC3OP_FM<0x12, 1, 0>, ISA_MIPS3, GPR_64; def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>, - MFC3OP_FM<0x12, 5, 0>, ISA_MIPS3; + MFC3OP_FM<0x12, 5, 0>, ISA_MIPS3, GPR_64; } /// Move between CPU and guest coprocessor registers (Virtualization ASE) @@ -600,7 +608,7 @@ let DecoderNamespace = "Mips64" in { } let AdditionalPredicates = [UseIndirectJumpsHazard] in - def JALRHB64Pseudo : JumpLinkRegPseudo; + def JALRHB64Pseudo : JumpLinkRegPseudo, PTR_64; //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions @@ -845,7 +853,7 @@ def : MipsPat<(i64 (sext (i32 (sub GPR32:$src, GPR32:$src2)))), (SUBu GPR32:$src, GPR32:$src2), sub_32)>; def : MipsPat<(i64 (sext (i32 (mul GPR32:$src, GPR32:$src2)))), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (MUL GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS3_NOT_32R6_64R6; + (MUL GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS32_NOT_32R6_64R6; def : MipsPat<(i64 (sext (i32 (MipsMFHI ACC64:$src)))), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (PseudoMFHI ACC64:$src), sub_32)>; @@ -1147,5 +1155,33 @@ def SLTUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rs), def : MipsInstAlias<"sltu\t$rs, $imm", (SLTUImm64 GPR64Opnd:$rs, GPR64Opnd:$rs, imm64:$imm)>, GPR_64; +def SGEImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd), + (ins GPR64Opnd:$rs, imm64:$imm), + "sge\t$rd, $rs, $imm">, GPR_64; +def : MipsInstAlias<"sge $rs, $imm", (SGEImm64 GPR64Opnd:$rs, + GPR64Opnd:$rs, + imm64:$imm), 0>, GPR_64; + +def SGEUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd), + (ins GPR64Opnd:$rs, imm64:$imm), + "sgeu\t$rd, $rs, $imm">, GPR_64; +def : MipsInstAlias<"sgeu $rs, $imm", (SGEUImm64 GPR64Opnd:$rs, + GPR64Opnd:$rs, + imm64:$imm), 0>, GPR_64; + +def SGTImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd), + (ins GPR64Opnd:$rs, imm64:$imm), + "sgt\t$rd, $rs, $imm">, GPR_64; +def : MipsInstAlias<"sgt $rs, $imm", (SGTImm64 GPR64Opnd:$rs, + GPR64Opnd:$rs, + imm64:$imm), 0>, GPR_64; + +def SGTUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd), + (ins GPR64Opnd:$rs, imm64:$imm), + "sgtu\t$rd, $rs, $imm">, GPR_64; +def : MipsInstAlias<"sgtu $rs, $imm", (SGTUImm64 GPR64Opnd:$rs, + GPR64Opnd:$rs, + imm64:$imm), 0>, GPR_64; + def : MipsInstAlias<"rdhwr $rt, $rs", (RDHWR64 GPR64Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, GPR_64; diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td index ac223bc77256..d746bb61f824 100644 --- a/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/lib/Target/Mips/Mips64r6InstrInfo.td @@ -1,9 +1,8 @@ //=- 
Mips64r6InstrInfo.td - Mips64r6 Instruction Information -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -37,6 +36,7 @@ class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b011101>; class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011100>; class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b011101>; class LDPC_ENC : PCREL18_FM; +class LWUPC_ENC : PCREL19_FM; class LLD_R6_ENC : SPECIAL3_LL_SC_FM; class SCD_R6_ENC : SPECIAL3_LL_SC_FM; class CRC32D_ENC : SPECIAL3_2R_SZ_CRC<3,0>; @@ -73,6 +73,7 @@ class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU, mulhu>; class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>; class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMUL>; class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3, II_LDPC>; +class LWUPC_DESC : PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2, II_LWUPC>; class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simmptr, II_LLD>; class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd, II_SCD>; class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>; @@ -148,6 +149,7 @@ let AdditionalPredicates = [NotInMicroMips] in { def LLD_R6 : LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS64R6; } def LDPC: LDPC_ENC, LDPC_DESC, ISA_MIPS64R6; +def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS64R6; def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6; let DecoderNamespace = "Mips32r6_64r6_GP64" in { def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64; diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp index 4e17ee327ab6..ae2b83c414db 100644 --- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp +++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp @@ -1,9 +1,8 @@ //===- MipsAnalyzeImmediate.cpp - Analyze Immediates ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.h b/lib/Target/Mips/MipsAnalyzeImmediate.h index 1c520242fb8d..018b9d824526 100644 --- a/lib/Target/Mips/MipsAnalyzeImmediate.h +++ b/lib/Target/Mips/MipsAnalyzeImmediate.h @@ -1,9 +1,8 @@ //===- MipsAnalyzeImmediate.h - Analyze Immediates -------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 362431fd42a6..db83fe49cec0 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -1,9 +1,8 @@ //===- MipsAsmPrinter.cpp - Mips LLVM Assembly Printer --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "MipsAsmPrinter.h" -#include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsABIInfo.h" #include "MCTargetDesc/MipsBaseInfo.h" +#include "MCTargetDesc/MipsInstPrinter.h" #include "MCTargetDesc/MipsMCNaCl.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "Mips.h" @@ -24,6 +23,7 @@ #include "MipsSubtarget.h" #include "MipsTargetMachine.h" #include "MipsTargetStreamer.h" +#include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" @@ -68,6 +68,8 @@ using namespace llvm; #define DEBUG_TYPE "mips-asm-printer" +extern cl::opt EmitJalrReloc; + MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() const { return static_cast(*OutStreamer->getTargetStreamer()); } @@ -148,6 +150,40 @@ void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer, EmitToStreamer(OutStreamer, TmpInst0); } +// If there is an MO_JALR operand, insert: +// +// .reloc tmplabel, R_{MICRO}MIPS_JALR, symbol +// tmplabel: +// +// This is an optimization hint for the linker which may then replace +// an indirect call with a direct branch. +static void emitDirectiveRelocJalr(const MachineInstr &MI, + MCContext &OutContext, + TargetMachine &TM, + MCStreamer &OutStreamer, + const MipsSubtarget &Subtarget) { + for (unsigned int I = MI.getDesc().getNumOperands(), E = MI.getNumOperands(); + I < E; ++I) { + MachineOperand MO = MI.getOperand(I); + if (MO.isMCSymbol() && (MO.getTargetFlags() & MipsII::MO_JALR)) { + MCSymbol *Callee = MO.getMCSymbol(); + if (Callee && !Callee->getName().empty()) { + MCSymbol *OffsetLabel = OutContext.createTempSymbol(); + const MCExpr *OffsetExpr = + MCSymbolRefExpr::create(OffsetLabel, OutContext); + const MCExpr *CaleeExpr = + MCSymbolRefExpr::create(Callee, OutContext); + OutStreamer.EmitRelocDirective + (*OffsetExpr, + Subtarget.inMicroMipsMode() ? 
"R_MICROMIPS_JALR" : "R_MIPS_JALR", + CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo()); + OutStreamer.EmitLabel(OffsetLabel); + return; + } + } + } +} + void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { MipsTargetStreamer &TS = getTargetStreamer(); unsigned Opc = MI->getOpcode(); @@ -207,6 +243,11 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + if (EmitJalrReloc && + (MI->isReturn() || MI->isCall() || MI->isIndirectBranch())) { + emitDirectiveRelocJalr(*MI, OutContext, TM, *OutStreamer, *Subtarget); + } + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); @@ -470,8 +511,7 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* // Print out an operand for an inline asm expression. bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) { + const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) return true; // Unknown modifier. @@ -480,7 +520,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, switch (ExtraCode[0]) { default: // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI,OpNum,AsmVariant,ExtraCode,O); + return AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O); case 'X': // hex const int if ((MO.getType()) != MachineOperand::MO_Immediate) return true; @@ -576,7 +616,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, } bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNum, unsigned AsmVariant, + unsigned OpNum, const char *ExtraCode, raw_ostream &O) { assert(OpNum + 1 < MI->getNumOperands() && "Insufficient operands"); @@ -653,7 +693,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, return; case MachineOperand::MO_GlobalAddress: - getSymbol(MO.getGlobal())->print(O, MAI); + PrintSymbolOperand(MO, O); break; case MachineOperand::MO_BlockAddress: { @@ -772,7 +812,8 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // We should always emit a '.module fp=...' but binutils 2.24 does not accept // it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or // -mfp64) and omit it otherwise. - if (ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit())) + if ((ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit())) || + STI.useSoftFloat()) TS.emitDirectiveModuleFP(); // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index eb58234e3e77..173a1312812e 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -1,9 +1,8 @@ //===- MipsAsmPrinter.h - Mips LLVM Assembly Printer -----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -146,11 +145,9 @@ public: bool isBlockOnlyReachableByFallthrough( const MachineBasicBlock* MBB) const override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O); void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O); void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O); diff --git a/lib/Target/Mips/MipsBranchExpansion.cpp b/lib/Target/Mips/MipsBranchExpansion.cpp index e59267c4fd9b..1523a6c020aa 100644 --- a/lib/Target/Mips/MipsBranchExpansion.cpp +++ b/lib/Target/Mips/MipsBranchExpansion.cpp @@ -1,9 +1,8 @@ //===----------------------- MipsBranchExpansion.cpp ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp index 90cb3f437bd5..ef48c850a1b8 100644 --- a/lib/Target/Mips/MipsCCState.cpp +++ b/lib/Target/Mips/MipsCCState.cpp @@ -1,9 +1,8 @@ //===---- MipsCCState.cpp - CCState with Mips specific extensions ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsCCState.h b/lib/Target/Mips/MipsCCState.h index 27901699480b..fd2fd97c8f13 100644 --- a/lib/Target/Mips/MipsCCState.h +++ b/lib/Target/Mips/MipsCCState.h @@ -1,9 +1,8 @@ //===---- MipsCCState.h - CCState with Mips specific extensions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp index c550fadf6632..da65689ecff5 100644 --- a/lib/Target/Mips/MipsCallLowering.cpp +++ b/lib/Target/Mips/MipsCallLowering.cpp @@ -1,9 +1,8 @@ //===- MipsCallLowering.cpp -------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,6 +14,7 @@ #include "MipsCallLowering.h" #include "MipsCCState.h" +#include "MipsMachineFunction.h" #include "MipsTargetMachine.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -24,10 +24,10 @@ using namespace llvm; MipsCallLowering::MipsCallLowering(const MipsTargetLowering &TLI) : CallLowering(&TLI) {} -bool MipsCallLowering::MipsHandler::assign(unsigned VReg, - const CCValAssign &VA) { +bool MipsCallLowering::MipsHandler::assign(Register VReg, const CCValAssign &VA, + const EVT &VT) { if (VA.isRegLoc()) { - assignValueToReg(VReg, VA); + assignValueToReg(VReg, VA, VT); } else if (VA.isMemLoc()) { assignValueToAddress(VReg, VA); } else { @@ -36,24 +36,25 @@ bool MipsCallLowering::MipsHandler::assign(unsigned VReg, return true; } -bool MipsCallLowering::MipsHandler::assignVRegs(ArrayRef VRegs, +bool MipsCallLowering::MipsHandler::assignVRegs(ArrayRef VRegs, ArrayRef ArgLocs, - unsigned ArgLocsStartIndex) { + unsigned ArgLocsStartIndex, + const EVT &VT) { for (unsigned i = 0; i < VRegs.size(); ++i) - if (!assign(VRegs[i], ArgLocs[ArgLocsStartIndex + i])) + if (!assign(VRegs[i], ArgLocs[ArgLocsStartIndex + i], VT)) return false; return true; } void MipsCallLowering::MipsHandler::setLeastSignificantFirst( - SmallVectorImpl &VRegs) { + SmallVectorImpl &VRegs) { if (!MIRBuilder.getMF().getDataLayout().isLittleEndian()) std::reverse(VRegs.begin(), VRegs.end()); } bool MipsCallLowering::MipsHandler::handle( ArrayRef ArgLocs, ArrayRef Args) { - SmallVector VRegs; + SmallVector VRegs; unsigned SplitLength; const Function &F = MIRBuilder.getMF().getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); @@ -65,6 +66,8 @@ bool MipsCallLowering::MipsHandler::handle( EVT VT = TLI.getValueType(DL, Args[ArgsIndex].Ty); SplitLength = TLI.getNumRegistersForCallingConv(F.getContext(), F.getCallingConv(), VT); + assert(Args[ArgsIndex].Regs.size() == 1 && "Can't handle multple regs yet"); + if (SplitLength > 1) { VRegs.clear(); MVT RegisterVT = TLI.getRegisterTypeForCallingConv( @@ -72,10 +75,11 @@ bool MipsCallLowering::MipsHandler::handle( for (unsigned i = 0; i < SplitLength; ++i) VRegs.push_back(MRI.createGenericVirtualRegister(LLT{RegisterVT})); - if (!handleSplit(VRegs, ArgLocs, ArgLocsIndex, Args[ArgsIndex].Reg)) + if (!handleSplit(VRegs, ArgLocs, ArgLocsIndex, Args[ArgsIndex].Regs[0], + VT)) return false; } else { - if (!assign(Args[ArgsIndex].Reg, ArgLocs[ArgLocsIndex])) + if (!assign(Args[ArgsIndex].Regs[0], ArgLocs[ArgLocsIndex], VT)) return false; } } @@ -89,24 +93,25 @@ public: : MipsHandler(MIRBuilder, MRI) {} private: - void assignValueToReg(unsigned ValVReg, const CCValAssign &VA) override; + void assignValueToReg(Register ValVReg, const CCValAssign &VA, + const EVT &VT) override; - unsigned getStackAddress(const CCValAssign &VA, + Register getStackAddress(const CCValAssign &VA, MachineMemOperand *&MMO) override; - void assignValueToAddress(unsigned ValVReg, const CCValAssign &VA) override; + void assignValueToAddress(Register ValVReg, const CCValAssign &VA) override; - bool handleSplit(SmallVectorImpl &VRegs, + bool handleSplit(SmallVectorImpl &VRegs, ArrayRef ArgLocs, unsigned ArgLocsStartIndex, - unsigned ArgsReg) 
override; + Register ArgsReg, const EVT &VT) override; virtual void markPhysRegUsed(unsigned PhysReg) { MIRBuilder.getMBB().addLiveIn(PhysReg); } - void buildLoad(unsigned Val, const CCValAssign &VA) { + void buildLoad(Register Val, const CCValAssign &VA) { MachineMemOperand *MMO; - unsigned Addr = getStackAddress(VA, MMO); + Register Addr = getStackAddress(VA, MMO); MIRBuilder.buildLoad(Val, Addr, *MMO); } }; @@ -127,59 +132,88 @@ private: } // end anonymous namespace -void IncomingValueHandler::assignValueToReg(unsigned ValVReg, - const CCValAssign &VA) { - unsigned PhysReg = VA.getLocReg(); - switch (VA.getLocInfo()) { - case CCValAssign::LocInfo::SExt: - case CCValAssign::LocInfo::ZExt: - case CCValAssign::LocInfo::AExt: { - auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); - MIRBuilder.buildTrunc(ValVReg, Copy); - break; - } - default: - MIRBuilder.buildCopy(ValVReg, PhysReg); - break; +void IncomingValueHandler::assignValueToReg(Register ValVReg, + const CCValAssign &VA, + const EVT &VT) { + const MipsSubtarget &STI = + static_cast(MIRBuilder.getMF().getSubtarget()); + Register PhysReg = VA.getLocReg(); + if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) { + const MipsSubtarget &STI = + static_cast(MIRBuilder.getMF().getSubtarget()); + + MIRBuilder + .buildInstr(STI.isFP64bit() ? Mips::BuildPairF64_64 + : Mips::BuildPairF64) + .addDef(ValVReg) + .addUse(PhysReg + (STI.isLittle() ? 0 : 1)) + .addUse(PhysReg + (STI.isLittle() ? 1 : 0)) + .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); + markPhysRegUsed(PhysReg); + markPhysRegUsed(PhysReg + 1); + } else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) { + MIRBuilder.buildInstr(Mips::MTC1) + .addDef(ValVReg) + .addUse(PhysReg) + .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); + markPhysRegUsed(PhysReg); + } else { + switch (VA.getLocInfo()) { + case CCValAssign::LocInfo::SExt: + case CCValAssign::LocInfo::ZExt: + case CCValAssign::LocInfo::AExt: { + auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + break; + } + default: + MIRBuilder.buildCopy(ValVReg, PhysReg); + break; + } + markPhysRegUsed(PhysReg); } - markPhysRegUsed(PhysReg); } -unsigned IncomingValueHandler::getStackAddress(const CCValAssign &VA, +Register IncomingValueHandler::getStackAddress(const CCValAssign &VA, MachineMemOperand *&MMO) { + MachineFunction &MF = MIRBuilder.getMF(); unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8; unsigned Offset = VA.getLocMemOffset(); - MachineFrameInfo &MFI = MIRBuilder.getMF().getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); int FI = MFI.CreateFixedObject(Size, Offset, true); MachinePointerInfo MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - MMO = MIRBuilder.getMF().getMachineMemOperand(MPO, MachineMemOperand::MOLoad, - Size, /* Alignment */ 0); - unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 32)); + const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering(); + unsigned Align = MinAlign(TFL->getStackAlignment(), Offset); + MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, Align); + + Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 32)); MIRBuilder.buildFrameIndex(AddrReg, FI); return AddrReg; } -void IncomingValueHandler::assignValueToAddress(unsigned ValVReg, +void IncomingValueHandler::assignValueToAddress(Register ValVReg, const CCValAssign &VA) 
{ if (VA.getLocInfo() == CCValAssign::SExt || VA.getLocInfo() == CCValAssign::ZExt || VA.getLocInfo() == CCValAssign::AExt) { - unsigned LoadReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); + Register LoadReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); buildLoad(LoadReg, VA); MIRBuilder.buildTrunc(ValVReg, LoadReg); } else buildLoad(ValVReg, VA); } -bool IncomingValueHandler::handleSplit(SmallVectorImpl &VRegs, +bool IncomingValueHandler::handleSplit(SmallVectorImpl &VRegs, ArrayRef ArgLocs, unsigned ArgLocsStartIndex, - unsigned ArgsReg) { - if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex)) + Register ArgsReg, const EVT &VT) { + if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT)) return false; setLeastSignificantFirst(VRegs); MIRBuilder.buildMerge(ArgsReg, VRegs); @@ -194,78 +228,111 @@ public: : MipsHandler(MIRBuilder, MRI), MIB(MIB) {} private: - void assignValueToReg(unsigned ValVReg, const CCValAssign &VA) override; + void assignValueToReg(Register ValVReg, const CCValAssign &VA, + const EVT &VT) override; - unsigned getStackAddress(const CCValAssign &VA, + Register getStackAddress(const CCValAssign &VA, MachineMemOperand *&MMO) override; - void assignValueToAddress(unsigned ValVReg, const CCValAssign &VA) override; + void assignValueToAddress(Register ValVReg, const CCValAssign &VA) override; - bool handleSplit(SmallVectorImpl &VRegs, + bool handleSplit(SmallVectorImpl &VRegs, ArrayRef ArgLocs, unsigned ArgLocsStartIndex, - unsigned ArgsReg) override; + Register ArgsReg, const EVT &VT) override; - unsigned extendRegister(unsigned ValReg, const CCValAssign &VA); + Register extendRegister(Register ValReg, const CCValAssign &VA); MachineInstrBuilder &MIB; }; } // end anonymous namespace -void OutgoingValueHandler::assignValueToReg(unsigned ValVReg, - const CCValAssign &VA) { - unsigned PhysReg = VA.getLocReg(); - unsigned ExtReg = extendRegister(ValVReg, VA); - MIRBuilder.buildCopy(PhysReg, ExtReg); - MIB.addUse(PhysReg, RegState::Implicit); +void OutgoingValueHandler::assignValueToReg(Register ValVReg, + const CCValAssign &VA, + const EVT &VT) { + Register PhysReg = VA.getLocReg(); + const MipsSubtarget &STI = + static_cast(MIRBuilder.getMF().getSubtarget()); + + if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) { + MIRBuilder + .buildInstr(STI.isFP64bit() ? Mips::ExtractElementF64_64 + : Mips::ExtractElementF64) + .addDef(PhysReg + (STI.isLittle() ? 1 : 0)) + .addUse(ValVReg) + .addImm(1) + .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); + MIRBuilder + .buildInstr(STI.isFP64bit() ? Mips::ExtractElementF64_64 + : Mips::ExtractElementF64) + .addDef(PhysReg + (STI.isLittle() ? 
0 : 1)) + .addUse(ValVReg) + .addImm(0) + .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); + } else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) { + MIRBuilder.buildInstr(Mips::MFC1) + .addDef(PhysReg) + .addUse(ValVReg) + .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); + } else { + Register ExtReg = extendRegister(ValVReg, VA); + MIRBuilder.buildCopy(PhysReg, ExtReg); + MIB.addUse(PhysReg, RegState::Implicit); + } } -unsigned OutgoingValueHandler::getStackAddress(const CCValAssign &VA, +Register OutgoingValueHandler::getStackAddress(const CCValAssign &VA, MachineMemOperand *&MMO) { + MachineFunction &MF = MIRBuilder.getMF(); + const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering(); + LLT p0 = LLT::pointer(0, 32); LLT s32 = LLT::scalar(32); - unsigned SPReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(SPReg, Mips::SP); + Register SPReg = MRI.createGenericVirtualRegister(p0); + MIRBuilder.buildCopy(SPReg, Register(Mips::SP)); - unsigned OffsetReg = MRI.createGenericVirtualRegister(s32); + Register OffsetReg = MRI.createGenericVirtualRegister(s32); unsigned Offset = VA.getLocMemOffset(); MIRBuilder.buildConstant(OffsetReg, Offset); - unsigned AddrReg = MRI.createGenericVirtualRegister(p0); + Register AddrReg = MRI.createGenericVirtualRegister(p0); MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg); MachinePointerInfo MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8; - MMO = MIRBuilder.getMF().getMachineMemOperand(MPO, MachineMemOperand::MOStore, - Size, /* Alignment */ 0); + unsigned Align = MinAlign(TFL->getStackAlignment(), Offset); + MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size, Align); return AddrReg; } -void OutgoingValueHandler::assignValueToAddress(unsigned ValVReg, +void OutgoingValueHandler::assignValueToAddress(Register ValVReg, const CCValAssign &VA) { MachineMemOperand *MMO; - unsigned Addr = getStackAddress(VA, MMO); - unsigned ExtReg = extendRegister(ValVReg, VA); + Register Addr = getStackAddress(VA, MMO); + Register ExtReg = extendRegister(ValVReg, VA); MIRBuilder.buildStore(ExtReg, Addr, *MMO); } -unsigned OutgoingValueHandler::extendRegister(unsigned ValReg, +Register OutgoingValueHandler::extendRegister(Register ValReg, const CCValAssign &VA) { LLT LocTy{VA.getLocVT()}; switch (VA.getLocInfo()) { case CCValAssign::SExt: { - unsigned ExtReg = MRI.createGenericVirtualRegister(LocTy); + Register ExtReg = MRI.createGenericVirtualRegister(LocTy); MIRBuilder.buildSExt(ExtReg, ValReg); return ExtReg; } case CCValAssign::ZExt: { - unsigned ExtReg = MRI.createGenericVirtualRegister(LocTy); + Register ExtReg = MRI.createGenericVirtualRegister(LocTy); MIRBuilder.buildZExt(ExtReg, ValReg); return ExtReg; } case CCValAssign::AExt: { - unsigned ExtReg = MRI.createGenericVirtualRegister(LocTy); + Register ExtReg = MRI.createGenericVirtualRegister(LocTy); MIRBuilder.buildAnyExt(ExtReg, ValReg); return ExtReg; } @@ -278,13 +345,13 @@ unsigned OutgoingValueHandler::extendRegister(unsigned ValReg, llvm_unreachable("unable to extend register"); } -bool OutgoingValueHandler::handleSplit(SmallVectorImpl &VRegs, +bool OutgoingValueHandler::handleSplit(SmallVectorImpl &VRegs, ArrayRef ArgLocs, unsigned ArgLocsStartIndex, - unsigned ArgsReg) { + Register ArgsReg, const EVT &VT) { MIRBuilder.buildUnmerge(VRegs, ArgsReg); setLeastSignificantFirst(VRegs); - if 
(!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex)) + if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT)) return false; return true; @@ -295,6 +362,8 @@ static bool isSupportedType(Type *T) { return true; if (T->isPointerTy()) return true; + if (T->isFloatingPointTy()) + return true; return false; } @@ -330,7 +399,7 @@ static void setLocInfo(SmallVectorImpl &ArgLocs, bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const { + ArrayRef VRegs) const { MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(Mips::RetRA); @@ -376,9 +445,9 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, return true; } -bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, - const Function &F, - ArrayRef VRegs) const { +bool MipsCallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef> VRegs) const { // Quick exit if there aren't any args. if (F.arg_empty()) @@ -444,7 +513,8 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (Arg.Flags.isByVal() || Arg.Flags.isSRet()) return false; } - if (OrigRet.Reg && !isSupportedType(OrigRet.Ty)) + + if (OrigRet.Regs[0] && !isSupportedType(OrigRet.Ty)) return false; MachineFunction &MF = MIRBuilder.getMF(); @@ -457,14 +527,22 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MachineInstrBuilder CallSeqStart = MIRBuilder.buildInstr(Mips::ADJCALLSTACKDOWN); - // FIXME: Add support for pic calling sequences, long call sequences for O32, - // N32 and N64. First handle the case when Callee.isReg(). - if (Callee.isReg()) - return false; + const bool IsCalleeGlobalPIC = + Callee.isGlobal() && TM.isPositionIndependent(); - MachineInstrBuilder MIB = MIRBuilder.buildInstrNoInsert(Mips::JAL); + MachineInstrBuilder MIB = MIRBuilder.buildInstrNoInsert( + Callee.isReg() || IsCalleeGlobalPIC ? 
Mips::JALRPseudo : Mips::JAL); MIB.addDef(Mips::SP, RegState::Implicit); - MIB.add(Callee); + if (IsCalleeGlobalPIC) { + Register CalleeReg = + MF.getRegInfo().createGenericVirtualRegister(LLT::pointer(0, 32)); + MachineInstr *CalleeGlobalValue = + MIRBuilder.buildGlobalValue(CalleeReg, Callee.getGlobal()); + if (!Callee.getGlobal()->hasLocalLinkage()) + CalleeGlobalValue->getOperand(1).setTargetFlags(MipsII::MO_GOT_CALL); + MIB.addUse(CalleeReg); + } else + MIB.add(Callee); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv())); @@ -507,10 +585,21 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, NextStackOffset = alignTo(NextStackOffset, StackAlignment); CallSeqStart.addImm(NextStackOffset).addImm(0); + if (IsCalleeGlobalPIC) { + MIRBuilder.buildCopy( + Register(Mips::GP), + MF.getInfo()->getGlobalBaseRegForGlobalISel()); + MIB.addDef(Mips::GP, RegState::Implicit); + } MIRBuilder.insertInstr(MIB); + if (MIB->getOpcode() == Mips::JALRPseudo) { + const MipsSubtarget &STI = + static_cast(MIRBuilder.getMF().getSubtarget()); + MIB.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); + } - if (OrigRet.Reg) { - + if (OrigRet.Regs[0]) { ArgInfos.clear(); SmallVector OrigRetIndices; diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h index 9916b04ef50c..11c2d53ad35d 100644 --- a/lib/Target/Mips/MipsCallLowering.h +++ b/lib/Target/Mips/MipsCallLowering.h @@ -1,9 +1,8 @@ //===- MipsCallLowering.h ---------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
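The assignValueToReg handlers above transfer an f64 that the O32 convention has assigned to integer argument registers by moving two 32-bit halves through PhysReg and PhysReg+1 (BuildPairF64 on the incoming side, a pair of ExtractElementF64 on the outgoing side), swapping the halves on big-endian targets. A stand-alone restatement of that half-selection, with assumed names:

#include <utility>

// Returns {GPR holding the low half, GPR holding the high half} of an f64
// passed in PhysReg/PhysReg+1 (e.g. A0/A1 or A2/A3 under O32).
std::pair<unsigned, unsigned> f64Halves(unsigned PhysReg, bool IsLittle) {
  return IsLittle ? std::make_pair(PhysReg, PhysReg + 1)
                  : std::make_pair(PhysReg + 1, PhysReg);
}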
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -35,37 +34,39 @@ public: ArrayRef Args); protected: - bool assignVRegs(ArrayRef VRegs, ArrayRef ArgLocs, - unsigned Index); + bool assignVRegs(ArrayRef VRegs, ArrayRef ArgLocs, + unsigned ArgLocsStartIndex, const EVT &VT); - void setLeastSignificantFirst(SmallVectorImpl &VRegs); + void setLeastSignificantFirst(SmallVectorImpl &VRegs); MachineIRBuilder &MIRBuilder; MachineRegisterInfo &MRI; private: - bool assign(unsigned VReg, const CCValAssign &VA); + bool assign(Register VReg, const CCValAssign &VA, const EVT &VT); - virtual unsigned getStackAddress(const CCValAssign &VA, + virtual Register getStackAddress(const CCValAssign &VA, MachineMemOperand *&MMO) = 0; - virtual void assignValueToReg(unsigned ValVReg, const CCValAssign &VA) = 0; + virtual void assignValueToReg(Register ValVReg, const CCValAssign &VA, + const EVT &VT) = 0; - virtual void assignValueToAddress(unsigned ValVReg, + virtual void assignValueToAddress(Register ValVReg, const CCValAssign &VA) = 0; - virtual bool handleSplit(SmallVectorImpl &VRegs, + virtual bool handleSplit(SmallVectorImpl &VRegs, ArrayRef ArgLocs, - unsigned ArgLocsStartIndex, unsigned ArgsReg) = 0; + unsigned ArgLocsStartIndex, Register ArgsReg, + const EVT &VT) = 0; }; MipsCallLowering(const MipsTargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const override; + ArrayRef VRegs) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef VRegs) const override; + ArrayRef> VRegs) const override; bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, const MachineOperand &Callee, const ArgInfo &OrigRet, diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index b5df78f89a6b..88236d8e9abd 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -1,9 +1,8 @@ //===-- MipsCallingConv.td - Calling Conventions for Mips --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This describes the calling conventions for Mips architecture. diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td index 0d7e3e200b5f..5affbcbc2101 100644 --- a/lib/Target/Mips/MipsCondMov.td +++ b/lib/Target/Mips/MipsCondMov.td @@ -1,9 +1,8 @@ //===-- MipsCondMov.td - Describe Mips Conditional Moves --*- tablegen -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
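The lowerCall changes above route PIC calls to a global through a register: the callee address is first materialized from the GOT (flagged MipsII::MO_GOT_CALL unless the symbol has local linkage), $gp is copied and implicitly defined across the call, and JALRPseudo replaces JAL. The opcode choice alone, restated as an illustrative stand-alone helper (boolean parameters are assumptions):

// PIC calls to globals and register callees go through JALR; only direct,
// non-PIC calls can use the J-format JAL.
const char *pickMipsCallOpcode(bool CalleeIsReg, bool CalleeIsGlobal,
                               bool PositionIndependent) {
  bool ViaRegister = CalleeIsReg || (CalleeIsGlobal && PositionIndependent);
  return ViaRegister ? "JALRPseudo" : "JAL";
}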
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -110,11 +109,11 @@ let AdditionalPredicates = [NotInMicroMips] in { let isCodeGenOnly = 1 in { def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>, - ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>, - ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>, - ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; } def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>, @@ -122,11 +121,11 @@ let AdditionalPredicates = [NotInMicroMips] in { let isCodeGenOnly = 1 in { def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>, - ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>, - ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>, - ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; } def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>, CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6; @@ -156,9 +155,11 @@ let AdditionalPredicates = [NotInMicroMips] in { CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; let isCodeGenOnly = 1 in { def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>, - CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; + CMov_I_F_FM<18, 17>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64, FGR_64; def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>, - CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; + CMov_I_F_FM<19, 17>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64, FGR_64; } } @@ -262,7 +263,7 @@ let AdditionalPredicates = [NotInMicroMips] in { } // For targets that don't have conditional-move instructions // we have to match SELECT nodes with pseudo instructions. -let usesCustomInserter = 1 in { +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { class Select_Pseudo : PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F), [(set RC:$dst, (select GPR32Opnd:$cond, RC:$T, RC:$F))]>, @@ -297,7 +298,7 @@ def PseudoSELECTFP_F_S : SelectFP_Pseudo_F; def PseudoSELECTFP_F_D32 : SelectFP_Pseudo_F, FGR_32; def PseudoSELECTFP_F_D64 : SelectFP_Pseudo_F, FGR_64; -let usesCustomInserter = 1 in { +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { class D_SELECT_CLASS : PseudoSE<(outs RC:$dst1, RC:$dst2), (ins GPR32Opnd:$cond, RC:$a1, RC:$a2, RC:$b1, RC:$b2), []>, diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp index 744523cc6cb9..eea28df7eda1 100644 --- a/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -1,9 +1,8 @@ //===- MipsConstantIslandPass.cpp - Emit Pc Relative loads ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -842,9 +841,7 @@ void MipsConstantIslands::updateForInsertedWaterBlock // Next, update WaterList. Specifically, we need to add NewMBB as having // available water after it. - water_iterator IP = - std::lower_bound(WaterList.begin(), WaterList.end(), NewBB, - CompareMBBNumbers); + water_iterator IP = llvm::lower_bound(WaterList, NewBB, CompareMBBNumbers); WaterList.insert(IP, NewBB); } @@ -894,9 +891,7 @@ MipsConstantIslands::splitBlockBeforeInstr(MachineInstr &MI) { // available water after it (but not if it's already there, which happens // when splitting before a conditional branch that is followed by an // unconditional branch - in that case we want to insert NewBB). - water_iterator IP = - std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB, - CompareMBBNumbers); + water_iterator IP = llvm::lower_bound(WaterList, OrigBB, CompareMBBNumbers); MachineBasicBlock* WaterBB = *IP; if (WaterBB == OrigBB) WaterList.insert(std::next(IP), NewBB); diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td index 5f0763f5ea46..6f062d0f3c25 100644 --- a/lib/Target/Mips/MipsDSPInstrFormats.td +++ b/lib/Target/Mips/MipsDSPInstrFormats.td @@ -1,9 +1,8 @@ //===- MipsDSPInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td index b9824220b558..daca8b907081 100644 --- a/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/lib/Target/Mips/MipsDSPInstrInfo.td @@ -1,9 +1,8 @@ //===- MipsDSPInstrInfo.td - DSP ASE instructions -*- tablegen ------------*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -516,6 +515,7 @@ class MTHI_DESC_BASE class BPOSGE32_PSEUDO_DESC_BASE : MipsPseudo<(outs GPR32Opnd:$dst), (ins), [(set GPR32Opnd:$dst, (OpNode))]> { + bit hasNoSchedulingInfo = 1; bit usesCustomInserter = 1; } @@ -1314,7 +1314,9 @@ def PseudoCMPU_LE_QB : PseudoCMP; def PseudoPICK_PH : PseudoPICK; def PseudoPICK_QB : PseudoPICK; -def PseudoMTLOHI_DSP : PseudoMTLOHI; +let AdditionalPredicates = [HasDSP] in { + def PseudoMTLOHI_DSP : PseudoMTLOHI; +} // Patterns. 
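The MipsConstantIslands changes above are mechanical cleanups: the iterator-pair std::lower_bound calls become the range form llvm::lower_bound, still locating the insertion point that keeps WaterList sorted by block number. The same pattern on a plain container, for illustration only:

#include <algorithm>
#include <vector>

// Insert while keeping the vector sorted, mirroring the WaterList update.
void insertSorted(std::vector<int> &Water, int BlockNumber) {
  auto IP = std::lower_bound(Water.begin(), Water.end(), BlockNumber);
  Water.insert(IP, BlockNumber);
}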
class DSPPat : diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index e3823e0dfdb8..aa07dac86828 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -1,9 +1,8 @@ //===- MipsDelaySlotFiller.cpp - Mips Delay Slot Filler -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -493,14 +492,12 @@ MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_) bool MemDefsUses::hasHazard_(const MachineInstr &MI) { bool HasHazard = false; - SmallVector Objs; // Check underlying object list. + SmallVector Objs; if (getUnderlyingObjects(MI, Objs)) { - for (SmallVectorImpl::const_iterator I = Objs.begin(); - I != Objs.end(); ++I) - HasHazard |= updateDefsUses(*I, MI.mayStore()); - + for (ValueType VT : Objs) + HasHazard |= updateDefsUses(VT, MI.mayStore()); return HasHazard; } @@ -526,33 +523,32 @@ bool MemDefsUses::updateDefsUses(ValueType V, bool MayStore) { bool MemDefsUses:: getUnderlyingObjects(const MachineInstr &MI, SmallVectorImpl &Objects) const { - if (!MI.hasOneMemOperand() || - (!(*MI.memoperands_begin())->getValue() && - !(*MI.memoperands_begin())->getPseudoValue())) + if (!MI.hasOneMemOperand()) return false; - if (const PseudoSourceValue *PSV = - (*MI.memoperands_begin())->getPseudoValue()) { + auto & MMO = **MI.memoperands_begin(); + + if (const PseudoSourceValue *PSV = MMO.getPseudoValue()) { if (!PSV->isAliased(MFI)) return false; Objects.push_back(PSV); return true; } - const Value *V = (*MI.memoperands_begin())->getValue(); + if (const Value *V = MMO.getValue()) { + SmallVector Objs; + GetUnderlyingObjects(V, Objs, DL); - SmallVector Objs; - GetUnderlyingObjects(const_cast(V), Objs, DL); + for (const Value *UValue : Objs) { + if (!isIdentifiedObject(V)) + return false; - for (SmallVectorImpl::iterator I = Objs.begin(), E = Objs.end(); - I != E; ++I) { - if (!isIdentifiedObject(V)) - return false; - - Objects.push_back(*I); + Objects.push_back(UValue); + } + return true; } - return true; + return false; } // Replace Branch with the compact branch instruction. @@ -726,6 +722,7 @@ bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin, // but we don't have enough information to make that decision. if (InMicroMipsMode && TII->getInstSizeInBytes(*CurrI) == 2 && (Opcode == Mips::JR || Opcode == Mips::PseudoIndirectBranch || + Opcode == Mips::PseudoIndirectBranch_MM || Opcode == Mips::PseudoReturn || Opcode == Mips::TAILCALL)) continue; // Instructions LWP/SWP and MOVEP should not be in a delay slot as that diff --git a/lib/Target/Mips/MipsEVAInstrFormats.td b/lib/Target/Mips/MipsEVAInstrFormats.td index 61785d0e891a..9820e4dcfc88 100644 --- a/lib/Target/Mips/MipsEVAInstrFormats.td +++ b/lib/Target/Mips/MipsEVAInstrFormats.td @@ -1,9 +1,8 @@ //===- MipsEVAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
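In the MipsDelaySlotFiller hunk above, hasHazard_ folds updateDefsUses over every underlying object of the candidate instruction and keeps iterating even after a hazard is seen, since each call also records the object in the def/use sets. A simplified stand-alone shape of that loop, with the per-object update reduced to an assumed callback:

#include <functional>
#include <vector>

bool hasHazard(const std::vector<const void *> &Objs, bool MayStore,
               const std::function<bool(const void *, bool)> &UpdateDefsUses) {
  bool HasHazard = false;
  for (const void *Obj : Objs)
    HasHazard |= UpdateDefsUses(Obj, MayStore); // no early exit: keep state
  return HasHazard;
}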
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsEVAInstrInfo.td b/lib/Target/Mips/MipsEVAInstrInfo.td index ff54b1f17877..73cca8cfa5d9 100644 --- a/lib/Target/Mips/MipsEVAInstrInfo.td +++ b/lib/Target/Mips/MipsEVAInstrInfo.td @@ -1,9 +1,8 @@ //===- MipsEVAInstrInfo.td - EVA ASE instructions -*- tablegen ------------*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp index acf66d1fb1b2..65d84a6c44a0 100644 --- a/lib/Target/Mips/MipsExpandPseudo.cpp +++ b/lib/Target/Mips/MipsExpandPseudo.cpp @@ -1,9 +1,8 @@ //===-- MipsExpandPseudoInsts.cpp - Expand pseudo instructions ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 22ade31a72cd..123d3cc242f0 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -1,9 +1,8 @@ //===- MipsFastISel.cpp - Mips FastISel implementation --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -56,6 +55,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSymbol.h" @@ -75,6 +75,8 @@ using namespace llvm; +extern cl::opt EmitJalrReloc; + namespace { class MipsFastISel final : public FastISel { @@ -951,21 +953,34 @@ bool MipsFastISel::selectBranch(const Instruction *I) { // MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - // For now, just try the simplest case where it's fed by a compare. + + // Fold the common case of a conditional branch with a comparison + // in the same block. 
+  unsigned ZExtCondReg = 0;
   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
-    MVT CIMVT =
-        TLI.getValueType(DL, CI->getOperand(0)->getType(), true).getSimpleVT();
-    if (CIMVT == MVT::i1)
+    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+      ZExtCondReg = createResultReg(&Mips::GPR32RegClass);
+      if (!emitCmp(ZExtCondReg, CI))
+        return false;
+    }
+  }
+
+  // For the general case, we need to mask with 1.
+  if (ZExtCondReg == 0) {
+    unsigned CondReg = getRegForValue(BI->getCondition());
+    if (CondReg == 0)
       return false;
-    unsigned CondReg = getRegForValue(CI);
-    BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
-        .addReg(CondReg)
-        .addMBB(TBB);
-    finishCondBranch(BI->getParent(), TBB, FBB);
-    return true;
+    ZExtCondReg = emitIntExt(MVT::i1, CondReg, MVT::i32, true);
+    if (ZExtCondReg == 0)
+      return false;
   }
-  return false;
+
+  BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
+      .addReg(ZExtCondReg)
+      .addMBB(TBB);
+  finishCondBranch(BI->getParent(), TBB, FBB);
+  return true;
 }
 bool MipsFastISel::selectCmp(const Instruction *I) {
@@ -1551,6 +1566,16 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
   CLI.Call = MIB;
+  if (EmitJalrReloc && !Subtarget->inMips16Mode()) {
+    // Attach callee address to the instruction, let asm printer emit
+    // .reloc R_MIPS_JALR.
+    if (Symbol)
+      MIB.addSym(Symbol, MipsII::MO_JALR);
+    else
+      MIB.addSym(FuncInfo.MF->getContext().getOrCreateSymbol(
+                     Addr.getGlobalValue()->getName()), MipsII::MO_JALR);
+  }
+
   // Finish off the call including any return values.
   return finishCall(CLI, RetVT, NumBytes);
 }
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 27a85970da6f..8d5eabf59b71 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -1,9 +1,8 @@
 //===-- MipsFrameLowering.cpp - Mips Frame Information --------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 0ead56eddd2f..0537cfd1cb30 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -1,9 +1,8 @@
 //===-- MipsFrameLowering.h - Define frame lowering for Mips ----*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
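The rewritten selectBranch above now handles any i1 condition: a compare that feeds the branch from its own block with a single use is folded directly via emitCmp, and everything else is zero-extended to 32 bits so one BGTZ against zero decides the branch. The decision, restated as a stand-alone classifier (parameter names are assumptions standing in for the CmpInst tests):

enum class BranchPlan { FoldCompare, ZExtAndBranch };

// Fold only single-use compares defined in the branch's block; otherwise
// materialize the i1, zero-extend it, and branch on "greater than zero".
BranchPlan classifyBranch(bool CondIsCmp, bool SameBlock, bool HasOneUse) {
  if (CondIsCmp && SameBlock && HasOneUse)
    return BranchPlan::FoldCompare;
  return BranchPlan::ZExtAndBranch;
}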
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index f99f3a1b3e0a..9ba54d6bb73c 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- MipsISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h index 09003459d180..bae3bbf71f3b 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.h +++ b/lib/Target/Mips/MipsISelDAGToDAG.h @@ -1,9 +1,8 @@ //===---- MipsISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 8c2a364cdfa9..0ff09007da4b 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -1,9 +1,8 @@ //===- MipsISelLowering.cpp - Mips DAG Lowering Implementation ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "MipsISelLowering.h" -#include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsBaseInfo.h" +#include "MCTargetDesc/MipsInstPrinter.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "MipsCCState.h" #include "MipsInstrInfo.h" @@ -57,6 +56,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" @@ -91,6 +91,8 @@ NoZeroDivCheck("mno-check-zero-division", cl::Hidden, cl::desc("MIPS: Don't trap on integer division by zero."), cl::init(false)); +extern cl::opt EmitJalrReloc; + static const MCPhysReg Mips64DPRegs[8] = { Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64, Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64 @@ -362,6 +364,11 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + if (!(TM.Options.NoNaNsFPMath || Subtarget.inAbs2008Mode())) { + setOperationAction(ISD::FABS, MVT::f32, Custom); + setOperationAction(ISD::FABS, MVT::f64, Custom); + } + if (Subtarget.isGP64bit()) { setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); @@ -1183,14 +1190,22 @@ bool MipsTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasMips32(); } +bool MipsTargetLowering::shouldFoldConstantShiftPairToMask( + const SDNode *N, CombineLevel Level) const { + if (N->getOperand(0).getValueType().isVector()) + return false; + return true; +} + void MipsTargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue Res = LowerOperation(SDValue(N, 0), DAG); - for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I) - Results.push_back(Res.getValue(I)); + if (Res) + for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I) + Results.push_back(Res.getValue(I)); } void @@ -1216,6 +1231,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::VASTART: return lowerVASTART(Op, DAG); case ISD::VAARG: return lowerVAARG(Op, DAG); case ISD::FCOPYSIGN: return lowerFCOPYSIGN(Op, DAG); + case ISD::FABS: return lowerFABS(Op, DAG); case ISD::FRAMEADDR: return lowerFRAMEADDR(Op, DAG); case ISD::RETURNADDR: return lowerRETURNADDR(Op, DAG); case ISD::EH_RETURN: return lowerEH_RETURN(Op, DAG); @@ -1709,7 +1725,7 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI, assert((MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 || MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I64) && - "Unsupported atomic psseudo for EmitAtomicCmpSwap."); + "Unsupported atomic pseudo for EmitAtomicCmpSwap."); const unsigned Size = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ? 4 : 8; @@ -1735,12 +1751,10 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI, // after fast register allocation, the spills will end up outside of the // blocks that their values are defined in, causing livein errors. 
- unsigned DestCopy = MRI.createVirtualRegister(MRI.getRegClass(Dest)); unsigned PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr)); unsigned OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal)); unsigned NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal)); - BuildMI(*BB, II, DL, TII->get(Mips::COPY), DestCopy).addReg(Dest); BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr); BuildMI(*BB, II, DL, TII->get(Mips::COPY), OldValCopy).addReg(OldVal); BuildMI(*BB, II, DL, TII->get(Mips::COPY), NewValCopy).addReg(NewVal); @@ -2293,11 +2307,79 @@ MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { return lowerFCOPYSIGN32(Op, DAG, Subtarget.hasExtractInsert()); } +static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, + bool HasExtractInsert) { + SDLoc DL(Op); + SDValue Res, Const1 = DAG.getConstant(1, DL, MVT::i32); + + // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it + // to i32. + SDValue X = (Op.getValueType() == MVT::f32) + ? DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) + : DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, + Op.getOperand(0), Const1); + + // Clear MSB. + if (HasExtractInsert) + Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32, + DAG.getRegister(Mips::ZERO, MVT::i32), + DAG.getConstant(31, DL, MVT::i32), Const1, X); + else { + // TODO: Provide DAG patterns which transform (and x, cst) + // back to a (shl (srl x (clz cst)) (clz cst)) sequence. + SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1); + Res = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1); + } + + if (Op.getValueType() == MVT::f32) + return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Res); + + // FIXME: For mips32r2, the sequence of (BuildPairF64 (ins (ExtractElementF64 + // Op 1), $zero, 31 1) (ExtractElementF64 Op 0)) and the Op has one use, we + // should be able to drop the usage of mfc1/mtc1 and rewrite the register in + // place. + SDValue LowX = + DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0), + DAG.getConstant(0, DL, MVT::i32)); + return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res); +} + +static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, + bool HasExtractInsert) { + SDLoc DL(Op); + SDValue Res, Const1 = DAG.getConstant(1, DL, MVT::i32); + + // Bitcast to integer node. + SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0)); + + // Clear MSB. 
+ if (HasExtractInsert) + Res = DAG.getNode(MipsISD::Ins, DL, MVT::i64, + DAG.getRegister(Mips::ZERO_64, MVT::i64), + DAG.getConstant(63, DL, MVT::i32), Const1, X); + else { + SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i64, X, Const1); + Res = DAG.getNode(ISD::SRL, DL, MVT::i64, SllX, Const1); + } + + return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Res); +} + +SDValue MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const { + if ((ABI.IsN32() || ABI.IsN64()) && (Op.getValueType() == MVT::f64)) + return lowerFABS64(Op, DAG, Subtarget.hasExtractInsert()); + + return lowerFABS32(Op, DAG, Subtarget.hasExtractInsert()); +} + SDValue MipsTargetLowering:: lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // check the depth - assert((cast(Op.getOperand(0))->getZExtValue() == 0) && - "Frame address can only be determined for current frame."); + if (cast(Op.getOperand(0))->getZExtValue() != 0) { + DAG.getContext()->emitError( + "return address can be determined only for current frame"); + return SDValue(); + } MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setFrameAddressIsTaken(true); @@ -2314,8 +2396,11 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op, return SDValue(); // check the depth - assert((cast(Op.getOperand(0))->getZExtValue() == 0) && - "Return address can be determined only for current frame."); + if (cast(Op.getOperand(0))->getZExtValue() != 0) { + DAG.getContext()->emitError( + "return address can be determined only for current frame"); + return SDValue(); + } MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -2879,6 +2964,54 @@ getOpndList(SmallVectorImpl &Ops, Ops.push_back(InFlag); } +void MipsTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode *Node) const { + switch (MI.getOpcode()) { + default: + return; + case Mips::JALR: + case Mips::JALRPseudo: + case Mips::JALR64: + case Mips::JALR64Pseudo: + case Mips::JALR16_MM: + case Mips::JALRC16_MMR6: + case Mips::TAILCALLREG: + case Mips::TAILCALLREG64: + case Mips::TAILCALLR6REG: + case Mips::TAILCALL64R6REG: + case Mips::TAILCALLREG_MM: + case Mips::TAILCALLREG_MMR6: { + if (!EmitJalrReloc || + Subtarget.inMips16Mode() || + !isPositionIndependent() || + Node->getNumOperands() < 1 || + Node->getOperand(0).getNumOperands() < 2) { + return; + } + // We are after the callee address, set by LowerCall(). + // If added to MI, asm printer will emit .reloc R_MIPS_JALR for the + // symbol. + const SDValue TargetAddr = Node->getOperand(0).getOperand(1); + StringRef Sym; + if (const GlobalAddressSDNode *G = + dyn_cast_or_null(TargetAddr)) { + Sym = G->getGlobal()->getName(); + } + else if (const ExternalSymbolSDNode *ES = + dyn_cast_or_null(TargetAddr)) { + Sym = ES->getSymbol(); + } + + if (Sym.empty()) + return; + + MachineFunction *MF = MI.getParent()->getParent(); + MCSymbol *S = MF->getContext().getOrCreateSymbol(Sym); + MI.addOperand(MachineOperand::CreateMCSymbol(S, MipsII::MO_JALR)); + } + } +} + /// LowerCall - functions arguments are copied from virtual regs to /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. SDValue @@ -2930,7 +3063,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // the maximum out going argument area (including the reserved area), and // preallocates the stack space on entrance to the caller. // - // FIXME: We should do the same for efficency and space. + // FIXME: We should do the same for efficiency and space. 
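lowerFABS32 and lowerFABS64 above implement fabs as a sign-bit clear on the integer image of the float: INS with $zero on cores that have extract/insert, or a shift pair otherwise. The shift-pair fallback on a host integer, purely as an illustration:

#include <cstdint>

// (x << 1) >> 1 on an unsigned value drops the top (sign) bit, which is
// the i64 SHL/SRL-by-one sequence built in lowerFABS64.
uint64_t clearSignBit(uint64_t Bits) {
  return (Bits << 1) >> 1;
}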
// Note: The check on the calling convention below must match // MipsABIInfo::GetCalleeAllocdArgSizeInBytes(). @@ -4007,18 +4140,18 @@ MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { return false; } -EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { +EVT MipsTargetLowering::getOptimalMemOpType( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { if (Subtarget.hasMips64()) return MVT::i64; return MVT::i32; } -bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { +bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { if (VT != MVT::f32 && VT != MVT::f64) return false; if (Imm.isNegZero()) diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index e043f133a09f..2db60e9801f1 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -1,9 +1,8 @@ //===- MipsISelLowering.h - Mips DAG Lowering Interface ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -285,6 +284,8 @@ class TargetRegisterClass; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N, + CombineLevel Level) const override; /// Return the register type for a given MVT, ensuring vectors are treated /// as a series of gpr sized integers. @@ -341,6 +342,9 @@ class TargetRegisterClass; EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; + void AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode *Node) const override; + void HandleByVal(CCState *, unsigned &, unsigned) const override; unsigned getRegisterByName(const char* RegName, EVT VT, @@ -649,9 +653,11 @@ class TargetRegisterClass; unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { + if (ConstraintCode == "o") + return InlineAsm::Constraint_o; if (ConstraintCode == "R") return InlineAsm::Constraint_R; - else if (ConstraintCode == "ZC") + if (ConstraintCode == "ZC") return InlineAsm::Constraint_ZC; return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } @@ -666,12 +672,13 @@ class TargetRegisterClass; unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; + const AttributeList &FuncAttributes) const override; /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. 
-    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+    bool isFPImmLegal(const APFloat &Imm, EVT VT,
+                      bool ForCodeSize) const override;
 
     unsigned getJumpTableEncoding() const override;
     bool useSoftFloat() const override;
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 4cb8574e08f6..e94e107e64c2 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -1,9 +1,8 @@
 //===-- MipsInstrFPU.td - Mips FPU Instruction Information -*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -143,7 +142,7 @@ multiclass ABSS_M<string opstr, InstrItinClass Itin,
   def _D32 : MMRel, ABSS_FT<!strconcat(opstr, ".d"), AFGR64Opnd, AFGR64Opnd,
                             Itin, OpNode>, FGR_32;
-  def _D64 : ABSS_FT<!strconcat(opstr, ".d"), FGR64Opnd, FGR64Opnd,
-             Itin, OpNode>, FGR_64 {
+  def _D64 : StdMMR6Rel, ABSS_FT<!strconcat(opstr, ".d"), FGR64Opnd, FGR64Opnd,
+             Itin, OpNode>, FGR_64 {
     string DecoderNamespace = "MipsFP64";
   }
 }
@@ -487,7 +486,7 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
   def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
 }
 
-let AdditionalPredicates = [NotInMicroMips] in {
+let AdditionalPredicates = [NotInMicroMips, UseAbs] in {
   def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
                ABSS_FM<0x5, 16>, ISA_MIPS1;
   defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>, ISA_MIPS1;
@@ -551,12 +550,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
   let isMoveReg = 1 in {
     def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
                  ABSS_FM<0x6, 16>, ISA_MIPS1;
-    def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
-                   ABSS_FM<0x6, 17>, ISA_MIPS1, FGR_32;
-    def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
-                   ABSS_FM<0x6, 17>, ISA_MIPS1, FGR_64 {
-      let DecoderNamespace = "MipsFP64";
-    }
+    defm FMOV : ABSS_M<"mov.d", II_MOV_D>, ABSS_FM<0x6, 17>, ISA_MIPS1;
   } // isMoveReg
 }
@@ -793,6 +787,11 @@ def LoadImmDoubleFGR : MipsAsmPseudoInst<(outs StrictlyFGR64Opnd:$rd),
                                          "li.d\t$rd, $fpimm">,
                        FGR_64, HARDFLOAT;
 
+def SDC1_M1 : MipsAsmPseudoInst<(outs AFGR64Opnd:$fd),
+                                (ins mem_simm16:$addr),
+                                "s.d\t$fd, $addr">,
+              FGR_32, ISA_MIPS1, HARDFLOAT;
+
 //===----------------------------------------------------------------------===//
 // InstAliases.
 //===----------------------------------------------------------------------===//
@@ -805,6 +804,9 @@ def : MipsInstAlias
 def : MipsInstAlias
   <"s.d $fd, $addr", (SDC164 FGR64Opnd:$fd, mem_simm16:$addr), 0>,
   FGR_64, ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+  <"s.d $fd, $addr", (SDC1_M1 AFGR64Opnd:$fd, mem_simm16:$addr), 0>,
+  FGR_32, ISA_MIPS1, HARDFLOAT;
 
 def : MipsInstAlias
   <"l.s $fd, $addr", (LWC1 FGR32Opnd:$fd, mem_simm16:$addr), 0>,
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index ebbdcdf0df89..14f01514f33f 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -1,9 +1,8 @@
 //===-- MipsInstrFormats.td - Mips Instruction Formats -----*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -146,6 +145,7 @@ class PseudoSE<dag outs, dag ins, list<dag> pattern,
 class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
   MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> {
   let isPseudo = 1;
+  let hasNoSchedulingInfo = 1;
   let Pattern = [];
 }
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index bfb4c775205d..fbd56206b249 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -1,9 +1,8 @@
 //===- MipsInstrInfo.cpp - Mips Instruction Information -------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -578,7 +577,8 @@ unsigned MipsInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   default:
     return MI.getDesc().getSize();
-  case TargetOpcode::INLINEASM: {       // Inline Asm: Variable size.
+  case TargetOpcode::INLINEASM:
+  case TargetOpcode::INLINEASM_BR: {    // Inline Asm: Variable size.
     const MachineFunction *MF = MI.getParent()->getParent();
     const char *AsmStr = MI.getOperand(0).getSymbolName();
     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
@@ -653,6 +653,16 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
 
     MIB.addImm(0);
 
+    // If I has an MCSymbol operand (used by asm printer, to emit R_MIPS_JALR),
+    // add it to the new instruction.
+    for (unsigned J = I->getDesc().getNumOperands(), E = I->getNumOperands();
+         J < E; ++J) {
+      const MachineOperand &MO = I->getOperand(J);
+      if (MO.isMCSymbol() && (MO.getTargetFlags() & MipsII::MO_JALR))
+        MIB.addSym(MO.getMCSymbol(), MipsII::MO_JALR);
+    }
+
+
   } else {
     for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
       if (BranchWithZeroOperand && (unsigned)ZeroOperandPosition == J)
@@ -825,7 +835,8 @@ MipsInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
     {MO_GOT_HI16, "mips-got-hi16"},
     {MO_GOT_LO16, "mips-got-lo16"},
     {MO_CALL_HI16, "mips-call-hi16"},
-    {MO_CALL_LO16, "mips-call-lo16"}
+    {MO_CALL_LO16, "mips-call-lo16"},
+    {MO_JALR, "mips-jalr"}
   };
   return makeArrayRef(Flags);
 }
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 9d27b8f66211..a626c0c3fdb8 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -1,9 +1,8 @@
 //===- MipsInstrInfo.h - Mips Instruction Information -----------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index d9398b7d6024..a4e85a38ab28 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -1,9 +1,8 @@
 //===- MipsInstrInfo.td - Target Description for Mips Target -*- tablegen -*-=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -221,6 +220,8 @@ def IsNotN64 : Predicate<"!Subtarget->isABI_N64()">;
 def RelocNotPIC : Predicate<"!TM.isPositionIndependent()">;
 def RelocPIC : Predicate<"TM.isPositionIndependent()">;
 def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
+def UseAbs : Predicate<"Subtarget->inAbs2008Mode() ||"
+                       "TM.Options.NoNaNsFPMath">;
 def HasStdEnc : Predicate<"Subtarget->hasStandardEncoding()">,
                 AssemblerPredicate<"!FeatureMips16">;
 def NotDSP : Predicate<"!Subtarget->hasDSP()">;
@@ -1623,11 +1624,15 @@ let isCall=1, hasDelaySlot=1, isCTI=1, Defs = [RA] in {
   class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst,
                           Register RetReg, RegisterOperand ResRO = RO>:
     PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], II_JALR>,
-    PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>;
+    PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)> {
+    let hasPostISelHook = 1;
+  }
 
   class JumpLinkReg<string opstr, RegisterOperand RO>:
     InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
-           [], II_JALR, FrmR, opstr>;
+           [], II_JALR, FrmR, opstr> {
+    let hasPostISelHook = 1;
+  }
 
   class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO> :
@@ -1646,7 +1651,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
   class TailCallReg<Instruction JumpInst, RegisterOperand RO> :
     PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>,
-    PseudoInstExpansion<(JumpInst RO:$rs)>;
+    PseudoInstExpansion<(JumpInst RO:$rs)> {
+    let hasPostISelHook = 1;
+  }
 }
 
 class BAL_BR_Pseudo<Instruction RealInst, DAGOperand opnd> :
@@ -1844,7 +1851,9 @@ class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
 
 class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
   PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
-           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]> {
+  let hasNoSchedulingInfo = 1;
+}
 
 class Atomic2OpsPostRA<RegisterClass RC> :
   PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$incr), []> {
@@ -1861,7 +1870,9 @@ class Atomic2OpsSubwordPostRA<RegisterClass RC> :
 // during ISelLowering, which produces the PostRA version of this instruction.
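
On the UseAbs predicate introduced above: IEEE-754(2008) abs is a pure sign-bit operation, while the legacy MIPS abs.[sd] is arithmetic, so the two can only disagree on NaN inputs. That is why NoNaNsFPMath also enables the hardware instruction. A standalone illustration of that difference, under the assumption that the helper names here (bitAbs, signBit) are invented for the sketch:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static double bitAbs(double X) {         // abs2008 behaviour: clear the sign bit
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  B &= ~(1ULL << 63);
  std::memcpy(&X, &B, sizeof(X));
  return X;
}

static bool signBit(double X) {
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  return B >> 63;
}

int main() {
  double NegNaN = -std::nan("");         // quiet NaN with the sign bit set
  assert(!signBit(bitAbs(NegNaN)));      // sign-bit abs clears it
  // An arithmetic compare-and-negate "abs" never fires for NaN:
  double CondAbs = NegNaN < 0 ? -NegNaN : NegNaN;
  assert(signBit(CondAbs));              // sign bit is still set
  return 0;
}
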
 class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
   PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
-           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]> {
+  let hasNoSchedulingInfo = 1;
+}
 
 class AtomicCmpSwapPostRA<RegisterClass RC> :
   PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$cmp, RC:$swap), []> {
@@ -1876,7 +1887,6 @@ class AtomicCmpSwapSubwordPostRA<RegisterClass RC> :
   let mayStore = 1;
 }
 
-
 class LLBase<string opstr, RegisterOperand RO, DAGOperand MO = mem> :
   InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
          [], II_LL, FrmI, opstr> {
@@ -1928,7 +1938,7 @@ let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in {
   def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>;
 }
 
-let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+let Defs = [SP], Uses = [SP], hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
 def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                                   [(callseq_start timm:$amt1, timm:$amt2)]>;
 def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
@@ -2004,17 +2014,25 @@ let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
 // Expands to: lui $dst, %highest/%higher/%hi/%lo($tgt - $baltgt)
 def LONG_BRANCH_LUi : PseudoSE<(outs GPR32Opnd:$dst),
-                               (ins brtarget:$tgt, brtarget:$baltgt), []>;
+                               (ins brtarget:$tgt, brtarget:$baltgt), []> {
+  bit hasNoSchedulingInfo = 1;
+}
 // Expands to: lui $dst, highest/%higher/%hi/%lo($tgt)
 def LONG_BRANCH_LUi2Op : PseudoSE<(outs GPR32Opnd:$dst),
-                                  (ins brtarget:$tgt), []>;
+                                  (ins brtarget:$tgt), []> {
+  bit hasNoSchedulingInfo = 1;
+}
 // Expands to: addiu $dst, $src, %highest/%higher/%hi/%lo($tgt - $baltgt)
 def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
-                                 (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+                                 (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []> {
+  bit hasNoSchedulingInfo = 1;
+}
 // Expands to: addiu $dst, $src, %highest/%higher/%hi/%lo($tgt)
 def LONG_BRANCH_ADDiu2Op : PseudoSE<(outs GPR32Opnd:$dst),
-                                    (ins GPR32Opnd:$src, brtarget:$tgt), []>;
+                                    (ins GPR32Opnd:$src, brtarget:$tgt), []> {
+  bit hasNoSchedulingInfo = 1;
+}
 
 //===----------------------------------------------------------------------===//
 // Instruction definition
@@ -2117,7 +2135,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
                 LW_FM<0x28>, ISA_MIPS1;
   def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel,
                 LW_FM<0x29>, ISA_MIPS1;
-  def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>, ISA_MIPS1;
+  def SW : StdMMR6Rel, Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>, ISA_MIPS1;
 }
 
 /// load/store left/right
@@ -2324,12 +2342,12 @@ def SDT_MipsEHRET : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
 def MIPSehret : SDNode<"MipsISD::EH_RETURN", SDT_MipsEHRET,
                        [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 
-let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1, isCTI = 1 in {
+let Uses = [V0, V1], isTerminator = 1, isReturn = 1,
+    isBarrier = 1, isCTI = 1, hasNoSchedulingInfo = 1 in {
   def MIPSeh_return32 : MipsPseudo<(outs), (ins GPR32:$spoff, GPR32:$dst),
-                                [(MIPSehret GPR32:$spoff, GPR32:$dst)]>;
-  def MIPSeh_return64 : MipsPseudo<(outs), (ins GPR64:$spoff,
-                                                GPR64:$dst),
-                                [(MIPSehret GPR64:$spoff, GPR64:$dst)]>;
+                                   [(MIPSehret GPR32:$spoff, GPR32:$dst)]>;
+  def MIPSeh_return64 : MipsPseudo<(outs), (ins GPR64:$spoff, GPR64:$dst),
+                                   [(MIPSehret GPR64:$spoff, GPR64:$dst)]>;
 }
 
 /// Multiply and Divide Instructions.
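
The AtomicCmpSwap* pseudos above are expanded after register allocation into an ll/sc retry loop. A minimal single-threaded model of the semantics they implement (the sc retry on failure is elided, and the function name is made up):

#include <cassert>
#include <cstdint>

static uint32_t cmpswap(uint32_t *Ptr, uint32_t Cmp, uint32_t Swap) {
  uint32_t Old = *Ptr; // ll   dst, 0($ptr)
  if (Old == Cmp)      // bne  dst, $cmp, done
    *Ptr = Swap;       // sc   $swap, 0($ptr)
  return Old;
}

int main() {
  uint32_t Mem = 7;
  assert(cmpswap(&Mem, 7, 9) == 7 && Mem == 9); // swap succeeded
  assert(cmpswap(&Mem, 7, 1) == 9 && Mem == 9); // comparison failed
  return 0;
}
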
@@ -2675,18 +2693,64 @@ let AdditionalPredicates = [NotInMicroMips] in {
                       (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS1;
   def : MipsInstAlias<"negu $rt",
                       (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>, ISA_MIPS1;
+
+  def SGE : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                              (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+                              "sge\t$rd, $rs, $rt">, ISA_MIPS1;
+  def : MipsInstAlias<"sge $rs, $rt",
+                      (SGE GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+        ISA_MIPS1;
+  def SGEImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                 (ins GPR32Opnd:$rs, simm32:$imm),
+                                 "sge\t$rd, $rs, $imm">, GPR_32;
+  def : MipsInstAlias<"sge $rs, $imm", (SGEImm GPR32Opnd:$rs,
+                                               GPR32Opnd:$rs,
+                                               simm32:$imm), 0>,
+        GPR_32;
+
+  def SGEU : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                               (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+                               "sgeu\t$rd, $rs, $rt">, ISA_MIPS1;
+  def : MipsInstAlias<"sgeu $rs, $rt",
+                      (SGEU GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+        ISA_MIPS1;
+  def SGEUImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                  (ins GPR32Opnd:$rs, uimm32_coerced:$imm),
+                                  "sgeu\t$rd, $rs, $imm">, GPR_32;
+  def : MipsInstAlias<"sgeu $rs, $imm", (SGEUImm GPR32Opnd:$rs,
+                                                 GPR32Opnd:$rs,
+                                                 uimm32_coerced:$imm), 0>,
+        GPR_32;
+
   def : MipsInstAlias<
           "sgt $rd, $rs, $rt",
           (SLT GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
   def : MipsInstAlias<
           "sgt $rs, $rt",
           (SLT GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
+
+  def SGTImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                 (ins GPR32Opnd:$rs, simm32:$imm),
+                                 "sgt\t$rd, $rs, $imm">, GPR_32;
+  def : MipsInstAlias<"sgt $rs, $imm", (SGTImm GPR32Opnd:$rs,
+                                               GPR32Opnd:$rs,
+                                               simm32:$imm), 0>,
+        GPR_32;
   def : MipsInstAlias<
           "sgtu $rd, $rs, $rt",
           (SLTu GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
   def : MipsInstAlias<
           "sgtu $rs, $rt",
           (SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
+
+  def SGTUImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                  (ins GPR32Opnd:$rs, uimm32_coerced:$imm),
+                                  "sgtu\t$rd, $rs, $imm">, GPR_32;
+  def : MipsInstAlias<"sgtu $rs, $imm", (SGTUImm GPR32Opnd:$rs,
+                                                 GPR32Opnd:$rs,
+                                                 uimm32_coerced:$imm), 0>,
+        GPR_32;
+
   def : MipsInstAlias<
           "not $rt, $rs",
           (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>, ISA_MIPS1;
@@ -2737,14 +2801,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
   def : MipsInstAlias<"bnez $rs,$offset",
                       (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
                       ISA_MIPS1;
-  def : MipsInstAlias<"bnezl $rs,$offset",
-                      (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+  def : MipsInstAlias<"bnezl $rs, $offset",
+                      (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 1>,
                       ISA_MIPS2;
   def : MipsInstAlias<"beqz $rs,$offset",
                       (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
                       ISA_MIPS1;
-  def : MipsInstAlias<"beqzl $rs,$offset",
-                      (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+  def : MipsInstAlias<"beqzl $rs, $offset",
+                      (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 1>,
                       ISA_MIPS2;
 
   def : MipsInstAlias<"syscall", (SYSCALL 0), 1>, ISA_MIPS1;
diff --git a/lib/Target/Mips/MipsInstructionSelector.cpp b/lib/Target/Mips/MipsInstructionSelector.cpp
index b041590ee343..45a47ad3c087 100644
--- a/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -1,9 +1,8 @@
 //===- MipsInstructionSelector.cpp ------------------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -12,6 +11,8 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// +#include "MCTargetDesc/MipsInstPrinter.h" +#include "MipsMachineFunction.h" #include "MipsRegisterBankInfo.h" #include "MipsTargetMachine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" @@ -37,6 +38,12 @@ public: private: bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + bool materialize32BitImm(Register DestReg, APInt Imm, + MachineIRBuilder &B) const; + bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; + const TargetRegisterClass * + getRegClassForTypeOnBank(unsigned OpSize, const RegisterBank &RB, + const RegisterBankInfo &RBI) const; const MipsTargetMachine &TM; const MipsSubtarget &STI; @@ -74,15 +81,24 @@ MipsInstructionSelector::MipsInstructionSelector( { } -static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, - MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { - unsigned DstReg = I.getOperand(0).getReg(); +bool MipsInstructionSelector::selectCopy(MachineInstr &I, + MachineRegisterInfo &MRI) const { + Register DstReg = I.getOperand(0).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) return true; - const TargetRegisterClass *RC = &Mips::GPR32RegClass; + const RegisterBank *RegBank = RBI.getRegBank(DstReg, MRI, TRI); + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + const TargetRegisterClass *RC = &Mips::GPR32RegClass; + if (RegBank->getID() == Mips::FPRBRegBankID) { + if (DstSize == 32) + RC = &Mips::FGR32RegClass; + else if (DstSize == 64) + RC = STI.isFP64bit() ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass; + else + llvm_unreachable("Unsupported destination size"); + } if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) << " operand\n"); @@ -91,6 +107,102 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return true; } +const TargetRegisterClass *MipsInstructionSelector::getRegClassForTypeOnBank( + unsigned OpSize, const RegisterBank &RB, + const RegisterBankInfo &RBI) const { + if (RB.getID() == Mips::GPRBRegBankID) + return &Mips::GPR32RegClass; + + if (RB.getID() == Mips::FPRBRegBankID) + return OpSize == 32 + ? &Mips::FGR32RegClass + : STI.hasMips32r6() || STI.isFP64bit() ? &Mips::FGR64RegClass + : &Mips::AFGR64RegClass; + + llvm_unreachable("getRegClassForTypeOnBank can't find register class."); + return nullptr; +} + +bool MipsInstructionSelector::materialize32BitImm(Register DestReg, APInt Imm, + MachineIRBuilder &B) const { + assert(Imm.getBitWidth() == 32 && "Unsupported immediate size."); + // Ori zero extends immediate. Used for values with zeros in high 16 bits. + if (Imm.getHiBits(16).isNullValue()) { + MachineInstr *Inst = B.buildInstr(Mips::ORi, {DestReg}, {Register(Mips::ZERO)}) + .addImm(Imm.getLoBits(16).getLimitedValue()); + return constrainSelectedInstRegOperands(*Inst, TII, TRI, RBI); + } + // Lui places immediate in high 16 bits and sets low 16 bits to zero. 
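
As a standalone model of the strategy materialize32BitImm implements (its remaining cases continue below): ori zero-extends, lui fills the high half, addiu sign-extends, and anything else needs the lui+ori pair. The helper name and the value-level modeling are assumptions of this sketch, not APIs from the patch:

#include <cassert>
#include <cstdint>

// Returns the value a register would hold after the chosen instruction(s).
static uint32_t materialize(uint32_t Imm) {
  uint32_t Hi = Imm >> 16, Lo = Imm & 0xffff;
  if (Hi == 0)
    return Lo;                              // ori   reg, $zero, Lo
  if (Lo == 0)
    return Hi << 16;                        // lui   reg, Hi
  if ((int32_t)Imm == (int16_t)Lo)
    return (uint32_t)(int32_t)(int16_t)Lo;  // addiu reg, $zero, Lo
  return (Hi << 16) | Lo;                   // lui + ori
}

int main() {
  for (uint32_t V : {0x0000abcdu, 0xabcd0000u, 0xffffff80u, 0x12345678u})
    assert(materialize(V) == V);
  return 0;
}
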
+ if (Imm.getLoBits(16).isNullValue()) { + MachineInstr *Inst = B.buildInstr(Mips::LUi, {DestReg}, {}) + .addImm(Imm.getHiBits(16).getLimitedValue()); + return constrainSelectedInstRegOperands(*Inst, TII, TRI, RBI); + } + // ADDiu sign extends immediate. Used for values with 1s in high 17 bits. + if (Imm.isSignedIntN(16)) { + MachineInstr *Inst = B.buildInstr(Mips::ADDiu, {DestReg}, {Register(Mips::ZERO)}) + .addImm(Imm.getLoBits(16).getLimitedValue()); + return constrainSelectedInstRegOperands(*Inst, TII, TRI, RBI); + } + // Values that cannot be materialized with single immediate instruction. + Register LUiReg = B.getMRI()->createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *LUi = B.buildInstr(Mips::LUi, {LUiReg}, {}) + .addImm(Imm.getHiBits(16).getLimitedValue()); + MachineInstr *ORi = B.buildInstr(Mips::ORi, {DestReg}, {LUiReg}) + .addImm(Imm.getLoBits(16).getLimitedValue()); + if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI)) + return false; + if (!constrainSelectedInstRegOperands(*ORi, TII, TRI, RBI)) + return false; + return true; +} + +/// Returning Opc indicates that we failed to select MIPS instruction opcode. +static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes, + unsigned RegBank, bool isFP64) { + bool isStore = Opc == TargetOpcode::G_STORE; + if (RegBank == Mips::GPRBRegBankID) { + if (isStore) + switch (MemSizeInBytes) { + case 4: + return Mips::SW; + case 2: + return Mips::SH; + case 1: + return Mips::SB; + default: + return Opc; + } + else + // Unspecified extending load is selected into zeroExtending load. + switch (MemSizeInBytes) { + case 4: + return Mips::LW; + case 2: + return Opc == TargetOpcode::G_SEXTLOAD ? Mips::LH : Mips::LHu; + case 1: + return Opc == TargetOpcode::G_SEXTLOAD ? Mips::LB : Mips::LBu; + default: + return Opc; + } + } + + if (RegBank == Mips::FPRBRegBankID) { + switch (MemSizeInBytes) { + case 4: + return isStore ? Mips::SWC1 : Mips::LWC1; + case 8: + if (isFP64) + return isStore ? Mips::SDC164 : Mips::LDC164; + else + return isStore ? 
Mips::SDC1 : Mips::LDC1; + default: + return Opc; + } + } + return Opc; +} + bool MipsInstructionSelector::select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { @@ -100,19 +212,52 @@ bool MipsInstructionSelector::select(MachineInstr &I, if (!isPreISelGenericOpcode(I.getOpcode())) { if (I.isCopy()) - return selectCopy(I, TII, MRI, TRI, RBI); + return selectCopy(I, MRI); return true; } - if (selectImpl(I, CoverageInfo)) { + if (I.getOpcode() == Mips::G_MUL) { + MachineInstr *Mul = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::MUL)) + .add(I.getOperand(0)) + .add(I.getOperand(1)) + .add(I.getOperand(2)); + if (!constrainSelectedInstRegOperands(*Mul, TII, TRI, RBI)) + return false; + Mul->getOperand(3).setIsDead(true); + Mul->getOperand(4).setIsDead(true); + + I.eraseFromParent(); return true; } + if (selectImpl(I, CoverageInfo)) + return true; + MachineInstr *MI = nullptr; using namespace TargetOpcode; switch (I.getOpcode()) { + case G_UMULH: { + Register PseudoMULTuReg = MRI.createVirtualRegister(&Mips::ACC64RegClass); + MachineInstr *PseudoMULTu, *PseudoMove; + + PseudoMULTu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoMULTu)) + .addDef(PseudoMULTuReg) + .add(I.getOperand(1)) + .add(I.getOperand(2)); + if (!constrainSelectedInstRegOperands(*PseudoMULTu, TII, TRI, RBI)) + return false; + + PseudoMove = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoMFHI)) + .addDef(I.getOperand(0).getReg()) + .addUse(PseudoMULTuReg); + if (!constrainSelectedInstRegOperands(*PseudoMove, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; + } case G_GEP: { MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu)) .add(I.getOperand(0)) @@ -127,16 +272,46 @@ bool MipsInstructionSelector::select(MachineInstr &I, .addImm(0); break; } + case G_BRCOND: { + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::BNE)) + .add(I.getOperand(0)) + .addUse(Mips::ZERO) + .add(I.getOperand(1)); + break; + } + case G_PHI: { + const Register DestReg = I.getOperand(0).getReg(); + const unsigned OpSize = MRI.getType(DestReg).getSizeInBits(); + + const TargetRegisterClass *DefRC = nullptr; + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + DefRC = TRI.getRegClass(DestReg); + else + DefRC = getRegClassForTypeOnBank(OpSize, + *RBI.getRegBank(DestReg, MRI, TRI), RBI); + + I.setDesc(TII.get(TargetOpcode::PHI)); + return RBI.constrainGenericRegister(DestReg, *DefRC, MRI); + } case G_STORE: - case G_LOAD: { - const unsigned DestReg = I.getOperand(0).getReg(); + case G_LOAD: + case G_ZEXTLOAD: + case G_SEXTLOAD: { + const Register DestReg = I.getOperand(0).getReg(); const unsigned DestRegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID(); const unsigned OpSize = MRI.getType(DestReg).getSizeInBits(); + const unsigned OpMemSizeInBytes = (*I.memoperands_begin())->getSize(); - if (DestRegBank != Mips::GPRBRegBankID || OpSize != 32) + if (DestRegBank == Mips::GPRBRegBankID && OpSize != 32) return false; - const unsigned NewOpc = I.getOpcode() == G_STORE ? 
Mips::SW : Mips::LW; + if (DestRegBank == Mips::FPRBRegBankID && OpSize != 32 && OpSize != 64) + return false; + + const unsigned NewOpc = selectLoadStoreOpCode( + I.getOpcode(), OpMemSizeInBytes, DestRegBank, STI.isFP64bit()); + if (NewOpc == I.getOpcode()) + return false; MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) .add(I.getOperand(0)) @@ -149,7 +324,7 @@ bool MipsInstructionSelector::select(MachineInstr &I, case G_UREM: case G_SDIV: case G_SREM: { - unsigned HILOReg = MRI.createVirtualRegister(&Mips::ACC64RegClass); + Register HILOReg = MRI.createVirtualRegister(&Mips::ACC64RegClass); bool IsSigned = I.getOpcode() == G_SREM || I.getOpcode() == G_SDIV; bool IsDiv = I.getOpcode() == G_UDIV || I.getOpcode() == G_SDIV; @@ -182,58 +357,150 @@ bool MipsInstructionSelector::select(MachineInstr &I, break; } case G_CONSTANT: { - int Imm = I.getOperand(1).getCImm()->getValue().getLimitedValue(); - unsigned LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); - MachineInstr *LUi, *ORi; + MachineIRBuilder B(I); + if (!materialize32BitImm(I.getOperand(0).getReg(), + I.getOperand(1).getCImm()->getValue(), B)) + return false; - LUi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi)) - .addDef(LUiReg) - .addImm(Imm >> 16); + I.eraseFromParent(); + return true; + } + case G_FCONSTANT: { + const APFloat &FPimm = I.getOperand(1).getFPImm()->getValueAPF(); + APInt APImm = FPimm.bitcastToAPInt(); + unsigned Size = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); + + if (Size == 32) { + Register GPRReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineIRBuilder B(I); + if (!materialize32BitImm(GPRReg, APImm, B)) + return false; - ORi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ORi)) - .addDef(I.getOperand(0).getReg()) - .addUse(LUiReg) - .addImm(Imm & 0xFFFF); + MachineInstrBuilder MTC1 = + B.buildInstr(Mips::MTC1, {I.getOperand(0).getReg()}, {GPRReg}); + if (!MTC1.constrainAllUses(TII, TRI, RBI)) + return false; + } + if (Size == 64) { + Register GPRRegHigh = MRI.createVirtualRegister(&Mips::GPR32RegClass); + Register GPRRegLow = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineIRBuilder B(I); + if (!materialize32BitImm(GPRRegHigh, APImm.getHiBits(32).trunc(32), B)) + return false; + if (!materialize32BitImm(GPRRegLow, APImm.getLoBits(32).trunc(32), B)) + return false; + + MachineInstrBuilder PairF64 = B.buildInstr( + STI.isFP64bit() ? Mips::BuildPairF64_64 : Mips::BuildPairF64, + {I.getOperand(0).getReg()}, {GPRRegLow, GPRRegHigh}); + if (!PairF64.constrainAllUses(TII, TRI, RBI)) + return false; + } - if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI)) + I.eraseFromParent(); + return true; + } + case G_FABS: { + unsigned Size = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); + unsigned FABSOpcode = + Size == 32 ? Mips::FABS_S + : STI.isFP64bit() ? Mips::FABS_D64 : Mips::FABS_D32; + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(FABSOpcode)) + .add(I.getOperand(0)) + .add(I.getOperand(1)); + break; + } + case G_FPTOSI: { + unsigned FromSize = MRI.getType(I.getOperand(1).getReg()).getSizeInBits(); + unsigned ToSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); + (void)ToSize; + assert((ToSize == 32) && "Unsupported integer size for G_FPTOSI"); + assert((FromSize == 32 || FromSize == 64) && + "Unsupported floating point size for G_FPTOSI"); + + unsigned Opcode; + if (FromSize == 32) + Opcode = Mips::TRUNC_W_S; + else + Opcode = STI.isFP64bit() ? 
Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32;
+    unsigned ResultInFPR = MRI.createVirtualRegister(&Mips::FGR32RegClass);
+    MachineInstr *Trunc = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Opcode))
+                              .addDef(ResultInFPR)
+                              .addUse(I.getOperand(1).getReg());
+    if (!constrainSelectedInstRegOperands(*Trunc, TII, TRI, RBI))
       return false;
-    if (!constrainSelectedInstRegOperands(*ORi, TII, TRI, RBI))
+
+    MachineInstr *Move = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::MFC1))
+                             .addDef(I.getOperand(0).getReg())
+                             .addUse(ResultInFPR);
+    if (!constrainSelectedInstRegOperands(*Move, TII, TRI, RBI))
       return false;
 
     I.eraseFromParent();
     return true;
   }
   case G_GLOBAL_VALUE: {
-    if (MF.getTarget().isPositionIndependent())
-      return false;
-
     const llvm::GlobalValue *GVal = I.getOperand(1).getGlobal();
-    unsigned LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
-    MachineInstr *LUi, *ADDiu;
+    if (MF.getTarget().isPositionIndependent()) {
+      MachineInstr *LWGOT = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LW))
+                                .addDef(I.getOperand(0).getReg())
+                                .addReg(MF.getInfo<MipsFunctionInfo>()
+                                            ->getGlobalBaseRegForGlobalISel())
+                                .addGlobalAddress(GVal);
+      // Global Values that don't have local linkage are handled differently
+      // when they are part of call sequence. MipsCallLowering::lowerCall
+      // creates G_GLOBAL_VALUE instruction as part of call sequence and adds
+      // MO_GOT_CALL flag when Callee doesn't have local linkage.
+      if (I.getOperand(1).getTargetFlags() == MipsII::MO_GOT_CALL)
+        LWGOT->getOperand(2).setTargetFlags(MipsII::MO_GOT_CALL);
+      else
+        LWGOT->getOperand(2).setTargetFlags(MipsII::MO_GOT);
+      LWGOT->addMemOperand(
+          MF, MF.getMachineMemOperand(MachinePointerInfo::getGOT(MF),
+                                      MachineMemOperand::MOLoad, 4, 4));
+      if (!constrainSelectedInstRegOperands(*LWGOT, TII, TRI, RBI))
+        return false;
-    LUi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi))
-              .addDef(LUiReg)
-              .addGlobalAddress(GVal);
-    LUi->getOperand(1).setTargetFlags(MipsII::MO_ABS_HI);
+      if (GVal->hasLocalLinkage()) {
+        Register LWGOTDef = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+        LWGOT->getOperand(0).setReg(LWGOTDef);
-    ADDiu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
+        MachineInstr *ADDiu =
+            BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
                 .addDef(I.getOperand(0).getReg())
-              .addUse(LUiReg)
+                .addReg(LWGOTDef)
                 .addGlobalAddress(GVal);
-    ADDiu->getOperand(2).setTargetFlags(MipsII::MO_ABS_LO);
-
-    if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI))
-      return false;
-    if (!constrainSelectedInstRegOperands(*ADDiu, TII, TRI, RBI))
-      return false;
+        ADDiu->getOperand(2).setTargetFlags(MipsII::MO_ABS_LO);
+        if (!constrainSelectedInstRegOperands(*ADDiu, TII, TRI, RBI))
+          return false;
+      }
+    } else {
+      Register LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+
+      MachineInstr *LUi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi))
+                              .addDef(LUiReg)
+                              .addGlobalAddress(GVal);
+      LUi->getOperand(1).setTargetFlags(MipsII::MO_ABS_HI);
+      if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI))
+        return false;
+      MachineInstr *ADDiu =
+          BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
+              .addDef(I.getOperand(0).getReg())
+              .addUse(LUiReg)
+              .addGlobalAddress(GVal);
+      ADDiu->getOperand(2).setTargetFlags(MipsII::MO_ABS_LO);
+      if (!constrainSelectedInstRegOperands(*ADDiu, TII, TRI, RBI))
+        return false;
+    }
 
     I.eraseFromParent();
     return true;
   }
   case G_ICMP: {
     struct Instr {
-      unsigned Opcode, Def, LHS, RHS;
-      Instr(unsigned Opcode, unsigned Def, unsigned LHS, unsigned RHS)
+      unsigned Opcode;
+      Register Def, LHS, RHS;
+      Instr(unsigned Opcode, Register Def, Register LHS, Register RHS)
         : Opcode(Opcode), Def(Def), LHS(LHS), RHS(RHS){};
 
       bool hasImm() const {
@@ -244,10 +511,10 @@ bool MipsInstructionSelector::select(MachineInstr &I,
     };
 
     SmallVector<struct Instr, 2> Instructions;
-    unsigned ICMPReg = I.getOperand(0).getReg();
-    unsigned Temp = MRI.createVirtualRegister(&Mips::GPR32RegClass);
-    unsigned LHS = I.getOperand(2).getReg();
-    unsigned RHS = I.getOperand(3).getReg();
+    Register ICMPReg = I.getOperand(0).getReg();
+    Register Temp = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+    Register LHS = I.getOperand(2).getReg();
+    Register RHS = I.getOperand(3).getReg();
     CmpInst::Predicate Cond =
         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
 
@@ -309,6 +576,84 @@ bool MipsInstructionSelector::select(MachineInstr &I,
     I.eraseFromParent();
     return true;
   }
+  case G_FCMP: {
+    unsigned MipsFCMPCondCode;
+    bool isLogicallyNegated;
+    switch (CmpInst::Predicate Cond = static_cast<CmpInst::Predicate>(
+                I.getOperand(1).getPredicate())) {
+    case CmpInst::FCMP_UNO: // Unordered
+    case CmpInst::FCMP_ORD: // Ordered (OR)
+      MipsFCMPCondCode = Mips::FCOND_UN;
+      isLogicallyNegated = Cond != CmpInst::FCMP_UNO;
+      break;
+    case CmpInst::FCMP_OEQ: // Equal
+    case CmpInst::FCMP_UNE: // Not Equal (NEQ)
+      MipsFCMPCondCode = Mips::FCOND_OEQ;
+      isLogicallyNegated = Cond != CmpInst::FCMP_OEQ;
+      break;
+    case CmpInst::FCMP_UEQ: // Unordered or Equal
+    case CmpInst::FCMP_ONE: // Ordered or Greater Than or Less Than (OGL)
+      MipsFCMPCondCode = Mips::FCOND_UEQ;
+      isLogicallyNegated = Cond != CmpInst::FCMP_UEQ;
+      break;
+    case CmpInst::FCMP_OLT: // Ordered or Less Than
+    case CmpInst::FCMP_UGE: // Unordered or Greater Than or Equal (UGE)
+      MipsFCMPCondCode = Mips::FCOND_OLT;
+      isLogicallyNegated = Cond != CmpInst::FCMP_OLT;
+      break;
+    case CmpInst::FCMP_ULT: // Unordered or Less Than
+    case CmpInst::FCMP_OGE: // Ordered or Greater Than or Equal (OGE)
+      MipsFCMPCondCode = Mips::FCOND_ULT;
+      isLogicallyNegated = Cond != CmpInst::FCMP_ULT;
+      break;
+    case CmpInst::FCMP_OLE: // Ordered or Less Than or Equal
+    case CmpInst::FCMP_UGT: // Unordered or Greater Than (UGT)
+      MipsFCMPCondCode = Mips::FCOND_OLE;
+      isLogicallyNegated = Cond != CmpInst::FCMP_OLE;
+      break;
+    case CmpInst::FCMP_ULE: // Unordered or Less Than or Equal
+    case CmpInst::FCMP_OGT: // Ordered or Greater Than (OGT)
+      MipsFCMPCondCode = Mips::FCOND_ULE;
+      isLogicallyNegated = Cond != CmpInst::FCMP_ULE;
+      break;
+    default:
+      return false;
+    }
+
+    // Default compare result in gpr register will be `true`.
+    // We will move `false` (MIPS::Zero) to gpr result when fcmp gives false
+    // using MOVF_I. When original predicate (Cond) is logically negated
+    // MipsFCMPCondCode, result is inverted i.e. MOVT_I is used.
+    unsigned MoveOpcode = isLogicallyNegated ? Mips::MOVT_I : Mips::MOVF_I;
+
+    unsigned TrueInReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+    BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
+        .addDef(TrueInReg)
+        .addUse(Mips::ZERO)
+        .addImm(1);
+
+    unsigned Size = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
+    unsigned FCMPOpcode =
+        Size == 32 ? Mips::FCMP_S32
+                   : STI.isFP64bit() ?
Mips::FCMP_D64 : Mips::FCMP_D32; + MachineInstr *FCMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(FCMPOpcode)) + .addUse(I.getOperand(2).getReg()) + .addUse(I.getOperand(3).getReg()) + .addImm(MipsFCMPCondCode); + if (!constrainSelectedInstRegOperands(*FCMP, TII, TRI, RBI)) + return false; + + MachineInstr *Move = BuildMI(MBB, I, I.getDebugLoc(), TII.get(MoveOpcode)) + .addDef(I.getOperand(0).getReg()) + .addUse(Mips::ZERO) + .addUse(Mips::FCC0) + .addUse(TrueInReg); + if (!constrainSelectedInstRegOperands(*Move, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; + } default: return false; } diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp index c629f02af00e..e442a81837ed 100644 --- a/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -1,9 +1,8 @@ //===- MipsLegalizerInfo.cpp ------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -25,35 +24,65 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { const LLT s64 = LLT::scalar(64); const LLT p0 = LLT::pointer(0, 32); - getActionDefinitionsBuilder(G_ADD) + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({s32}) .clampScalar(0, s32, s32); - getActionDefinitionsBuilder(G_UADDE) + getActionDefinitionsBuilder({G_UADDO, G_UADDE, G_USUBO, G_USUBE, G_UMULO}) .lowerFor({{s32, s1}}); + getActionDefinitionsBuilder(G_UMULH) + .legalFor({s32}) + .maxScalar(0, s32); + getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .legalForCartesianProduct({p0, s32}, {p0}); + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}, + {s32, p0, 32, 8}, + {s64, p0, 64, 8}, + {p0, p0, 32, 8}}) + .minScalar(0, s32); + + getActionDefinitionsBuilder(G_UNMERGE_VALUES) + .legalFor({{s32, s64}}); + + getActionDefinitionsBuilder(G_MERGE_VALUES) + .legalFor({{s64, s32}}); + + getActionDefinitionsBuilder({G_ZEXTLOAD, G_SEXTLOAD}) + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}}) + .minScalar(0, s32); getActionDefinitionsBuilder(G_SELECT) - .legalForCartesianProduct({p0, s32}, {s32}) + .legalForCartesianProduct({p0, s32, s64}, {s32}) .minScalar(0, s32) .minScalar(1, s32); + getActionDefinitionsBuilder(G_BRCOND) + .legalFor({s32}) + .minScalar(0, s32); + + getActionDefinitionsBuilder(G_PHI) + .legalFor({p0, s32, s64}) + .minScalar(0, s32); + getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) .legalFor({s32}) .clampScalar(0, s32, s32); - getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) - .legalFor({s32}); - getActionDefinitionsBuilder({G_SDIV, G_SREM, G_UREM, G_UDIV}) .legalFor({s32}) .minScalar(0, s32) .libcallFor({s64}); + getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) + .legalFor({s32, s32}) + .minScalar(1, s32); + getActionDefinitionsBuilder(G_ICMP) - .legalFor({{s32, s32}}) + .legalForCartesianProduct({s32}, {s32, p0}) + .clampScalar(1, s32, s32) .minScalar(0, s32); getActionDefinitionsBuilder(G_CONSTANT) @@ -69,6 +98,46 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { getActionDefinitionsBuilder(G_GLOBAL_VALUE) .legalFor({p0}); + // FP instructions + 
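
Before the FP legalization rules continue below, the predicate table in the G_FCMP case above deserves a sanity check: the MIPS condition codes cover the "unordered or ..." half of each predicate pair, and the ordered variant is obtained by logically negating the compare result (MOVT_I instead of MOVF_I). A small model of that pairing; the fcond_* helper names are invented for this sketch:

#include <cassert>
#include <cmath>

static bool fcond_un(double A, double B) {  // FCOND_UN
  return std::isnan(A) || std::isnan(B);
}
static bool fcond_ueq(double A, double B) { // FCOND_UEQ
  return fcond_un(A, B) || A == B;
}

int main() {
  double N = std::nan("");
  assert(fcond_un(N, 1.0) && !fcond_un(1.0, 2.0)); // FCMP_ORD == !FCOND_UN
  assert(!fcond_ueq(1.0, 2.0));                    // FCMP_ONE(1, 2) == !UEQ == true
  assert(fcond_ueq(N, 2.0));                       // FCMP_ONE(N, 2) == false
  return 0;
}
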
getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({s32, s64}); + + getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FABS, G_FSQRT}) + .legalFor({s32, s64}); + + getActionDefinitionsBuilder(G_FCMP) + .legalFor({{s32, s32}, {s32, s64}}) + .minScalar(0, s32); + + getActionDefinitionsBuilder({G_FCEIL, G_FFLOOR}) + .libcallFor({s32, s64}); + + getActionDefinitionsBuilder(G_FPEXT) + .legalFor({{s64, s32}}); + + getActionDefinitionsBuilder(G_FPTRUNC) + .legalFor({{s32, s64}}); + + // FP to int conversion instructions + getActionDefinitionsBuilder(G_FPTOSI) + .legalForCartesianProduct({s32}, {s64, s32}) + .libcallForCartesianProduct({s64}, {s64, s32}) + .minScalar(0, s32); + + getActionDefinitionsBuilder(G_FPTOUI) + .libcallForCartesianProduct({s64}, {s64, s32}) + .minScalar(0, s32); + + // Int to FP conversion instructions + getActionDefinitionsBuilder(G_SITOFP) + .legalForCartesianProduct({s64, s32}, {s32}) + .libcallForCartesianProduct({s64, s32}, {s64}) + .minScalar(1, s32); + + getActionDefinitionsBuilder(G_UITOFP) + .libcallForCartesianProduct({s64, s32}, {s64}) + .minScalar(1, s32); + computeTables(); verify(*ST.getInstrInfo()); } diff --git a/lib/Target/Mips/MipsLegalizerInfo.h b/lib/Target/Mips/MipsLegalizerInfo.h index 75fadd6cf613..e5021e081890 100644 --- a/lib/Target/Mips/MipsLegalizerInfo.h +++ b/lib/Target/Mips/MipsLegalizerInfo.h @@ -1,9 +1,8 @@ //===- MipsLegalizerInfo ----------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 46b37ceae391..fd984058a2bf 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -1,9 +1,8 @@ //===- MipsMCInstLower.cpp - Convert Mips MachineInstr to MCInst ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -117,6 +116,8 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, case MipsII::MO_CALL_LO16: TargetKind = MipsMCExpr::MEK_CALL_LO16; break; + case MipsII::MO_JALR: + return MCOperand(); } switch (MOTy) { diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h index e19f21c98839..29af6f21de82 100644 --- a/lib/Target/Mips/MipsMCInstLower.h +++ b/lib/Target/Mips/MipsMCInstLower.h @@ -1,9 +1,8 @@ //===- MipsMCInstLower.h - Lower MachineInstr to MCInst --------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td index d4e225678184..2bfc92c85e96 100644 --- a/lib/Target/Mips/MipsMSAInstrFormats.td +++ b/lib/Target/Mips/MipsMSAInstrFormats.td @@ -1,9 +1,8 @@ //===- MipsMSAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index eecc7c573df1..907ed9ef746f 100644 --- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -1,9 +1,8 @@ //===- MipsMSAInstrInfo.td - MSA ASE instructions -*- tablegen ------------*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -1240,6 +1239,7 @@ class MSA_COPY_PSEUDO_BASE { bit usesCustomInserter = 1; + bit hasNoSchedulingInfo = 1; } class MSA_I5_DESC_BASE { bit usesCustomInserter = 1; + bit hasNoSchedulingInfo = 1; string Constraints = "$wd = $wd_in"; } @@ -2044,7 +2045,7 @@ class FEXDO_W_DESC : MSA_3RF_DESC_BASE<"fexdo.w", int_mips_fexdo_w, // 1.0 when we only need to match ISD::FEXP2. 
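
For context on the mul_fexp2 patterns referenced above: the MSA fexp2 instruction computes ws * 2^wt, so a plain ISD::FEXP2 (2^x) is matched as a multiply by 1.0, which is what the comment describes. The identity is easy to check in isolation:

#include <cassert>
#include <cmath>

int main() {
  for (int W : {-3, 0, 5, 10})
    // fexp2(ws, wt) == ws * 2^wt; with ws == 1.0 this is just 2^wt.
    assert(std::ldexp(1.0, W) == 1.0 * std::exp2(W));
  return 0;
}
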
class FEXP2_W_DESC : MSA_3RF_DESC_BASE<"fexp2.w", mul_fexp2, MSA128WOpnd>; class FEXP2_D_DESC : MSA_3RF_DESC_BASE<"fexp2.d", mul_fexp2, MSA128DOpnd>; -let usesCustomInserter = 1 in { +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { class FEXP2_W_1_PSEUDO_DESC : MSAPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws), [(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>; @@ -3738,6 +3739,7 @@ class MSA_CBRANCH_PSEUDO_DESC_BASE { bit usesCustomInserter = 1; + bit hasNoSchedulingInfo = 1; } def SNZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE { - let usesCustomInserter = 1; - } - - def LD_F16 : MipsPseudo<(outs MSA128F16:$ws), (ins mem_simm10:$addr), - [(set MSA128F16:$ws, (f16 (load addrimm10:$addr)))]> { - let usesCustomInserter = 1; - } - - def MSA_FP_EXTEND_W_PSEUDO : MipsPseudo<(outs FGR32Opnd:$fd), - (ins MSA128F16:$ws), - [(set FGR32Opnd:$fd, - (f32 (fpextend MSA128F16:$ws)))]> { - let usesCustomInserter = 1; - } - - def MSA_FP_ROUND_W_PSEUDO : MipsPseudo<(outs MSA128F16:$wd), - (ins FGR32Opnd:$fs), - [(set MSA128F16:$wd, - (f16 (fpround FGR32Opnd:$fs)))]> { - let usesCustomInserter = 1; - } - - def MSA_FP_EXTEND_D_PSEUDO : MipsPseudo<(outs FGR64Opnd:$fd), - (ins MSA128F16:$ws), - [(set FGR64Opnd:$fd, - (f64 (fpextend MSA128F16:$ws)))]> { - let usesCustomInserter = 1; - } - - def MSA_FP_ROUND_D_PSEUDO : MipsPseudo<(outs MSA128F16:$wd), - (ins FGR64Opnd:$fs), - [(set MSA128F16:$wd, - (f16 (fpround FGR64Opnd:$fs)))]> { - let usesCustomInserter = 1; - } - - def : MipsPat<(MipsTruncIntFP MSA128F16:$ws), - (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>, ISA_MIPS1, - ASE_MSA; - - def : MipsPat<(MipsFPCmp MSA128F16:$ws, MSA128F16:$wt, imm:$cond), - (FCMP_S32 (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$ws), - (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$wt), imm:$cond)>, - ISA_MIPS1_NOT_32R6_64R6, ASE_MSA; + let usesCustomInserter = 1 in { + def ST_F16 : + MipsPseudo<(outs), (ins MSA128F16:$ws, mem_simm10:$addr), + [(store (f16 MSA128F16:$ws), (addrimm10:$addr))]>; + def LD_F16 : + MipsPseudo<(outs MSA128F16:$ws), (ins mem_simm10:$addr), + [(set MSA128F16:$ws, (f16 (load addrimm10:$addr)))]>; + } + + let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def MSA_FP_EXTEND_W_PSEUDO : + MipsPseudo<(outs FGR32Opnd:$fd), (ins MSA128F16:$ws), + [(set FGR32Opnd:$fd, (f32 (fpextend MSA128F16:$ws)))]>; + def MSA_FP_ROUND_W_PSEUDO : + MipsPseudo<(outs MSA128F16:$wd), (ins FGR32Opnd:$fs), + [(set MSA128F16:$wd, (f16 (fpround FGR32Opnd:$fs)))]>; + def MSA_FP_EXTEND_D_PSEUDO : + MipsPseudo<(outs FGR64Opnd:$fd), (ins MSA128F16:$ws), + [(set FGR64Opnd:$fd, (f64 (fpextend MSA128F16:$ws)))]>; + def MSA_FP_ROUND_D_PSEUDO : + MipsPseudo<(outs MSA128F16:$wd), (ins FGR64Opnd:$fs), + [(set MSA128F16:$wd, (f16 (fpround FGR64Opnd:$fs)))]>; + } + + def : MipsPat<(MipsTruncIntFP MSA128F16:$ws), + (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>, + ISA_MIPS1, ASE_MSA; + + def : MipsPat<(MipsFPCmp MSA128F16:$ws, MSA128F16:$wt, imm:$cond), + (FCMP_S32 (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$ws), + (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$wt), imm:$cond)>, + ISA_MIPS1_NOT_32R6_64R6, ASE_MSA; } def vsplati64_imm_eq_63 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{ diff --git a/lib/Target/Mips/MipsMTInstrFormats.td b/lib/Target/Mips/MipsMTInstrFormats.td index c2c22e2ad61c..22c290b1c114 100644 --- a/lib/Target/Mips/MipsMTInstrFormats.td +++ b/lib/Target/Mips/MipsMTInstrFormats.td @@ -1,9 +1,8 @@ //===-- MipsMTInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed 
under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/Mips/MipsMTInstrInfo.td b/lib/Target/Mips/MipsMTInstrInfo.td
index 72e626cbec40..3edeb57b1876 100644
--- a/lib/Target/Mips/MipsMTInstrInfo.td
+++ b/lib/Target/Mips/MipsMTInstrInfo.td
@@ -1,9 +1,8 @@
 //===-- MipsMTInstrInfo.td - Mips MT Instruction Infos -----*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index 81b4352670c0..85b20fc58231 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -1,9 +1,8 @@
 //===-- MipsMachineFunctionInfo.cpp - Private data used for Mips ----------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -45,13 +44,109 @@ static const TargetRegisterClass &getGlobalBaseRegClass(MachineFunction &MF) {
   return Mips::GPR32RegClass;
 }
 
-unsigned MipsFunctionInfo::getGlobalBaseReg() {
+Register MipsFunctionInfo::getGlobalBaseReg() {
   if (!GlobalBaseReg)
     GlobalBaseReg =
         MF.getRegInfo().createVirtualRegister(&getGlobalBaseRegClass(MF));
   return GlobalBaseReg;
 }
 
+Register MipsFunctionInfo::getGlobalBaseRegForGlobalISel() {
+  if (!GlobalBaseReg) {
+    getGlobalBaseReg();
+    initGlobalBaseReg();
+  }
+  return GlobalBaseReg;
+}
+
+void MipsFunctionInfo::initGlobalBaseReg() {
+  if (!GlobalBaseReg)
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  DebugLoc DL;
+  unsigned V0, V1;
+  const TargetRegisterClass *RC;
+  const MipsABIInfo &ABI =
+      static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI();
+  RC = (ABI.IsN64()) ?
&Mips::GPR64RegClass : &Mips::GPR32RegClass; + + V0 = RegInfo.createVirtualRegister(RC); + V1 = RegInfo.createVirtualRegister(RC); + + if (ABI.IsN64()) { + MF.getRegInfo().addLiveIn(Mips::T9_64); + MBB.addLiveIn(Mips::T9_64); + + // lui $v0, %hi(%neg(%gp_rel(fname))) + // daddu $v1, $v0, $t9 + // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname))) + const GlobalValue *FName = &MF.getFunction(); + BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0) + .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); + BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0) + .addReg(Mips::T9_64); + BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1) + .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); + return; + } + + if (!MF.getTarget().isPositionIndependent()) { + // Set global register to __gnu_local_gp. + // + // lui $v0, %hi(__gnu_local_gp) + // addiu $globalbasereg, $v0, %lo(__gnu_local_gp) + BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0) + .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI); + BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0) + .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO); + return; + } + + MF.getRegInfo().addLiveIn(Mips::T9); + MBB.addLiveIn(Mips::T9); + + if (ABI.IsN32()) { + // lui $v0, %hi(%neg(%gp_rel(fname))) + // addu $v1, $v0, $t9 + // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname))) + const GlobalValue *FName = &MF.getFunction(); + BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0) + .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); + BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9); + BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1) + .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); + return; + } + + assert(ABI.IsO32()); + + // For O32 ABI, the following instruction sequence is emitted to initialize + // the global base register: + // + // 0. lui $2, %hi(_gp_disp) + // 1. addiu $2, $2, %lo(_gp_disp) + // 2. addu $globalbasereg, $2, $t9 + // + // We emit only the last instruction here. + // + // GNU linker requires that the first two instructions appear at the beginning + // of a function and no instructions be inserted before or between them. + // The two instructions are emitted during lowering to MC layer in order to + // avoid any reordering. + // + // Register $2 (Mips::V0) is added to the list of live-in registers to ensure + // the value instruction 1 (addiu) defines is valid when instruction 2 (addu) + // reads it. + MF.getRegInfo().addLiveIn(Mips::V0); + MBB.addLiveIn(Mips::V0); + BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg) + .addReg(Mips::V0).addReg(Mips::T9); +} + void MipsFunctionInfo::createEhDataRegsFI() { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); for (int I = 0; I < 4; ++I) { diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index 553a66703b26..aaa1e0e18441 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -1,9 +1,8 @@ //===- MipsMachineFunctionInfo.h - Private data used for Mips ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
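
To make the O32 sequence documented above concrete: $gp ends up holding _gp_disp plus the address of the function entry (which $t9 holds at call time), assembled from a carry-adjusted %hi/%lo pair. A rough standalone model with made-up addresses; hi16/lo16 mirror the usual %hi/%lo split where %lo is sign-extended by addiu:

#include <cassert>
#include <cstdint>

static uint32_t hi16(uint32_t V) { return ((V + 0x8000) >> 16) & 0xffff; }
static int32_t  lo16(uint32_t V) { return (int16_t)(V & 0xffff); }

int main() {
  uint32_t GpDisp = 0x00027ff0;      // hypothetical _gp_disp value
  uint32_t T9     = 0x00400120;      // hypothetical callee entry address
  uint32_t V0 = hi16(GpDisp) << 16;  // lui   $2, %hi(_gp_disp)
  V0 += lo16(GpDisp);                // addiu $2, $2, %lo(_gp_disp)
  uint32_t Gp = V0 + T9;             // addu  $gp, $2, $t9
  assert(Gp == GpDisp + T9);
  return 0;
}
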
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,7 +32,12 @@ public: void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } bool globalBaseRegSet() const; - unsigned getGlobalBaseReg(); + Register getGlobalBaseReg(); + Register getGlobalBaseRegForGlobalISel(); + + // Insert instructions to initialize the global base register in the + // first MBB of the function. + void initGlobalBaseReg(); int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index 27bc4843f410..5ef07a2d283e 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -1,9 +1,8 @@ //===- MipsOptimizePICCall.cpp - Optimize PIC Calls -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsOptionRecord.h b/lib/Target/Mips/MipsOptionRecord.h index 4708784063d3..7897095ef894 100644 --- a/lib/Target/Mips/MipsOptionRecord.h +++ b/lib/Target/Mips/MipsOptionRecord.h @@ -1,9 +1,8 @@ //===- MipsOptionRecord.h - Abstraction for storing information -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp index 4edcb3132ada..ac4e55f8a1f5 100644 --- a/lib/Target/Mips/MipsOs16.cpp +++ b/lib/Target/Mips/MipsOs16.cpp @@ -1,9 +1,8 @@ //===---- MipsOs16.cpp for Mips Option -Os16 --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp index 1cff1c8396ea..85076590d407 100644 --- a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp +++ b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp @@ -1,9 +1,8 @@ //=== lib/CodeGen/GlobalISel/MipsPreLegalizerCombiner.cpp --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,6 +13,7 @@ #include "MipsTargetMachine.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -35,6 +35,16 @@ public: bool MipsPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { + CombinerHelper Helper(Observer, B); + + switch (MI.getOpcode()) { + default: + return false; + case TargetOpcode::G_LOAD: + case TargetOpcode::G_SEXTLOAD: + case TargetOpcode::G_ZEXTLOAD: + return Helper.tryCombineExtendingLoads(MI); + } return false; } diff --git a/lib/Target/Mips/MipsRegisterBankInfo.cpp b/lib/Target/Mips/MipsRegisterBankInfo.cpp index 6af1f10189df..d8bcf16afd50 100644 --- a/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -1,9 +1,8 @@ //===- MipsRegisterBankInfo.cpp ---------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -11,36 +10,55 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// -#include "MipsInstrInfo.h" #include "MipsRegisterBankInfo.h" +#include "MipsInstrInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #define GET_TARGET_REGBANK_IMPL -#define DEBUG_TYPE "registerbankinfo" - #include "MipsGenRegisterBank.inc" namespace llvm { namespace Mips { enum PartialMappingIdx { PMI_GPR, + PMI_SPR, + PMI_DPR, PMI_Min = PMI_GPR, }; RegisterBankInfo::PartialMapping PartMappings[]{ - {0, 32, GPRBRegBank} + {0, 32, GPRBRegBank}, + {0, 32, FPRBRegBank}, + {0, 64, FPRBRegBank} }; -enum ValueMappingIdx { InvalidIdx = 0, GPRIdx = 1 }; +enum ValueMappingIdx { + InvalidIdx = 0, + GPRIdx = 1, + SPRIdx = 4, + DPRIdx = 7 +}; RegisterBankInfo::ValueMapping ValueMappings[] = { // invalid {nullptr, 0}, - // 3 operands in GPRs + // up to 3 operands in GPRs {&PartMappings[PMI_GPR - PMI_Min], 1}, {&PartMappings[PMI_GPR - PMI_Min], 1}, - {&PartMappings[PMI_GPR - PMI_Min], 1}}; + {&PartMappings[PMI_GPR - PMI_Min], 1}, + // up to 3 operands in FPRs - single precision + {&PartMappings[PMI_SPR - PMI_Min], 1}, + {&PartMappings[PMI_SPR - PMI_Min], 1}, + {&PartMappings[PMI_SPR - PMI_Min], 1}, + // up to 3 operands in FPRs - double precision + {&PartMappings[PMI_DPR - PMI_Min], 1}, + {&PartMappings[PMI_DPR - PMI_Min], 1}, + {&PartMappings[PMI_DPR - PMI_Min], 1} +}; } // end namespace Mips } // end namespace llvm @@ -62,30 +80,313 @@ const RegisterBank &MipsRegisterBankInfo::getRegBankFromRegClass( case Mips::GPRMM16MoveP_and_CPU16Regs_and_GPRMM16ZeroRegClassID: case Mips::GPRMM16MovePPairFirst_and_GPRMM16MovePPairSecondRegClassID: case Mips::SP32RegClassID: + case 
Mips::GP32RegClassID: return getRegBank(Mips::GPRBRegBankID); + case Mips::FGRCCRegClassID: + case Mips::FGR32RegClassID: + case Mips::FGR64RegClassID: + case Mips::AFGR64RegClassID: + return getRegBank(Mips::FPRBRegBankID); default: llvm_unreachable("Register class not supported"); } } +// Instructions where all register operands are floating point. +static bool isFloatingPointOpcode(unsigned Opc) { + switch (Opc) { + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_FABS: + case TargetOpcode::G_FSQRT: + case TargetOpcode::G_FCEIL: + case TargetOpcode::G_FFLOOR: + case TargetOpcode::G_FPEXT: + case TargetOpcode::G_FPTRUNC: + return true; + default: + return false; + } +} + +// Instructions where use operands are floating point registers. +// Def operands are general purpose. +static bool isFloatingPointOpcodeUse(unsigned Opc) { + switch (Opc) { + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: + case TargetOpcode::G_FCMP: + case Mips::MFC1: + case Mips::ExtractElementF64: + case Mips::ExtractElementF64_64: + return true; + default: + return isFloatingPointOpcode(Opc); + } +} + +// Instructions where def operands are floating point registers. +// Use operands are general purpose. +static bool isFloatingPointOpcodeDef(unsigned Opc) { + switch (Opc) { + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + case Mips::MTC1: + case Mips::BuildPairF64: + case Mips::BuildPairF64_64: + return true; + default: + return isFloatingPointOpcode(Opc); + } +} + +static bool isAmbiguous(unsigned Opc) { + switch (Opc) { + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: + case TargetOpcode::G_PHI: + case TargetOpcode::G_SELECT: + return true; + default: + return false; + } +} + +void MipsRegisterBankInfo::AmbiguousRegDefUseContainer::addDefUses( + Register Reg, const MachineRegisterInfo &MRI) { + assert(!MRI.getType(Reg).isPointer() && + "Pointers are gprb, they should not be considered as ambiguous.\n"); + for (MachineInstr &UseMI : MRI.use_instructions(Reg)) { + MachineInstr *NonCopyInstr = skipCopiesOutgoing(&UseMI); + // Copy with many uses. 
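// skipCopiesOutgoing only walks through single-use copies, so it can stop at
// a COPY whose def has several users; recursing on that def then collects all
// of them. For example (hypothetical MIR, not taken from a test):
//   %5:_(s32) = COPY %4:_(s32)   <- copy with many uses
//   %6:_(s32) = G_ADD %5, %5     <- ends up in DefUses
//   $f12 = COPY %5               <- ends up in DefUses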
+ if (NonCopyInstr->getOpcode() == TargetOpcode::COPY && + !TargetRegisterInfo::isPhysicalRegister( + NonCopyInstr->getOperand(0).getReg())) + addDefUses(NonCopyInstr->getOperand(0).getReg(), MRI); + else + DefUses.push_back(skipCopiesOutgoing(&UseMI)); + } +} + +void MipsRegisterBankInfo::AmbiguousRegDefUseContainer::addUseDef( + Register Reg, const MachineRegisterInfo &MRI) { + assert(!MRI.getType(Reg).isPointer() && + "Pointers are gprb, they should not be considered as ambiguous.\n"); + MachineInstr *DefMI = MRI.getVRegDef(Reg); + UseDefs.push_back(skipCopiesIncoming(DefMI)); +} + +MachineInstr * +MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesOutgoing( + MachineInstr *MI) const { + const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineInstr *Ret = MI; + while (Ret->getOpcode() == TargetOpcode::COPY && + !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(0).getReg()) && + MRI.hasOneUse(Ret->getOperand(0).getReg())) { + Ret = &(*MRI.use_instr_begin(Ret->getOperand(0).getReg())); + } + return Ret; +} + +MachineInstr * +MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesIncoming( + MachineInstr *MI) const { + const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineInstr *Ret = MI; + while (Ret->getOpcode() == TargetOpcode::COPY && + !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(1).getReg())) + Ret = MRI.getVRegDef(Ret->getOperand(1).getReg()); + return Ret; +} + +MipsRegisterBankInfo::AmbiguousRegDefUseContainer::AmbiguousRegDefUseContainer( + const MachineInstr *MI) { + assert(isAmbiguous(MI->getOpcode()) && + "Not implemented for non-ambiguous opcode.\n"); + + const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + + if (MI->getOpcode() == TargetOpcode::G_LOAD) + addDefUses(MI->getOperand(0).getReg(), MRI); + + if (MI->getOpcode() == TargetOpcode::G_STORE) + addUseDef(MI->getOperand(0).getReg(), MRI); + + if (MI->getOpcode() == TargetOpcode::G_PHI) { + addDefUses(MI->getOperand(0).getReg(), MRI); + + for (unsigned i = 1; i < MI->getNumOperands(); i += 2) + addUseDef(MI->getOperand(i).getReg(), MRI); + } + + if (MI->getOpcode() == TargetOpcode::G_SELECT) { + addDefUses(MI->getOperand(0).getReg(), MRI); + + addUseDef(MI->getOperand(2).getReg(), MRI); + addUseDef(MI->getOperand(3).getReg(), MRI); + } +} + +bool MipsRegisterBankInfo::TypeInfoForMF::visit( + const MachineInstr *MI, const MachineInstr *WaitingForTypeOfMI) { + assert(isAmbiguous(MI->getOpcode()) && "Visiting non-ambiguous opcode.\n"); + if (wasVisited(MI)) + return true; // InstType has already been determined for MI. + + startVisit(MI); + AmbiguousRegDefUseContainer DefUseContainer(MI); + + // Visit instructions where MI's DEF operands are USED. + if (visitAdjacentInstrs(MI, DefUseContainer.getDefUses(), true)) + return true; + + // Visit instructions that DEFINE MI's USE operands. + if (visitAdjacentInstrs(MI, DefUseContainer.getUseDefs(), false)) + return true; + + // All of MI's adjacent instructions are ambiguous. + if (!WaitingForTypeOfMI) { + // This is a chain of ambiguous instructions. + setTypes(MI, InstType::Ambiguous); + return true; + } + // Excluding WaitingForTypeOfMI, MI is either connected to chains of ambiguous + // instructions or has no other adjacent instructions. Either way, InstType could + // not be determined. 
There could be an unexplored path from some of + // WaitingForTypeOfMI's adjacent instructions to an instruction with only one + // mapping available. + // We are done with this branch: add MI to WaitingForTypeOfMI's WaitingQueue; + // this way, when WaitingForTypeOfMI figures out its InstType, the same InstType + // will be assigned to all instructions in this branch. + addToWaitingQueue(WaitingForTypeOfMI, MI); + return false; +} + +bool MipsRegisterBankInfo::TypeInfoForMF::visitAdjacentInstrs( + const MachineInstr *MI, SmallVectorImpl<MachineInstr *> &AdjacentInstrs, + bool isDefUse) { + while (!AdjacentInstrs.empty()) { + MachineInstr *AdjMI = AdjacentInstrs.pop_back_val(); + + if (isDefUse ? isFloatingPointOpcodeUse(AdjMI->getOpcode()) + : isFloatingPointOpcodeDef(AdjMI->getOpcode())) { + setTypes(MI, InstType::FloatingPoint); + return true; + } + + // Determine InstType from register bank of phys register that is + // 'isDefUse ? def : use' of this copy. + if (AdjMI->getOpcode() == TargetOpcode::COPY) { + setTypesAccordingToPhysicalRegister(MI, AdjMI, isDefUse ? 0 : 1); + return true; + } + + // Defaults to integer instruction. Includes G_MERGE_VALUES and + // G_UNMERGE_VALUES. + if (!isAmbiguous(AdjMI->getOpcode())) { + setTypes(MI, InstType::Integer); + return true; + } + + // When AdjMI was visited first, MI has to continue to explore remaining + // adjacent instructions and determine InstType without visiting AdjMI. + if (!wasVisited(AdjMI) || + getRecordedTypeForInstr(AdjMI) != InstType::NotDetermined) { + if (visit(AdjMI, MI)) { + // InstType is successfully determined and is the same as for AdjMI. + setTypes(MI, getRecordedTypeForInstr(AdjMI)); + return true; + } + } + } + return false; +} + +void MipsRegisterBankInfo::TypeInfoForMF::setTypes(const MachineInstr *MI, + InstType InstTy) { + changeRecordedTypeForInstr(MI, InstTy); + for (const MachineInstr *WaitingInstr : getWaitingQueueFor(MI)) { + setTypes(WaitingInstr, InstTy); + } +} + +void MipsRegisterBankInfo::TypeInfoForMF::setTypesAccordingToPhysicalRegister( + const MachineInstr *MI, const MachineInstr *CopyInst, unsigned Op) { + assert((TargetRegisterInfo::isPhysicalRegister( + CopyInst->getOperand(Op).getReg())) && + "Copies of non-physical registers should not be considered here.\n"); + + const MachineFunction &MF = *CopyInst->getMF(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const RegisterBankInfo &RBI = + *CopyInst->getMF()->getSubtarget().getRegBankInfo(); + const RegisterBank *Bank = + RBI.getRegBank(CopyInst->getOperand(Op).getReg(), MRI, TRI); + + if (Bank == &Mips::FPRBRegBank) + setTypes(MI, InstType::FloatingPoint); + else if (Bank == &Mips::GPRBRegBank) + setTypes(MI, InstType::Integer); + else + llvm_unreachable("Unsupported register bank.\n"); +} + +MipsRegisterBankInfo::InstType +MipsRegisterBankInfo::TypeInfoForMF::determineInstType(const MachineInstr *MI) { + visit(MI, nullptr); + return getRecordedTypeForInstr(MI); +} + +void MipsRegisterBankInfo::TypeInfoForMF::cleanupIfNewFunction( + llvm::StringRef FunctionName) { + if (MFName != FunctionName) { + MFName = FunctionName; + WaitingQueues.clear(); + Types.clear(); + } +} + const RegisterBankInfo::InstructionMapping & MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { + static TypeInfoForMF TI; + + // Reset TI internal data when MF changes. 
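// TI is a function-local static, so Types and WaitingQueues would otherwise
// keep entries cached from a previously selected function; comparing function
// names in cleanupIfNewFunction is what drops that stale state.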
+ TI.cleanupIfNewFunction(MI.getMF()->getName()); + unsigned Opc = MI.getOpcode(); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); - const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); - if (Mapping.isValid()) - return Mapping; + if (MI.getOpcode() != TargetOpcode::G_PHI) { + const RegisterBankInfo::InstructionMapping &Mapping = + getInstrMappingImpl(MI); + if (Mapping.isValid()) + return Mapping; + } using namespace TargetOpcode; unsigned NumOperands = MI.getNumOperands(); const ValueMapping *OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx]; + unsigned MappingID = DefaultMappingID; + const unsigned CustomMappingID = 1; switch (Opc) { + case G_TRUNC: case G_ADD: - case G_LOAD: - case G_STORE: + case G_SUB: + case G_MUL: + case G_UMULH: + case G_ZEXTLOAD: + case G_SEXTLOAD: case G_GEP: case G_AND: case G_OR: @@ -99,9 +400,183 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_UREM: OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx]; break; + case G_LOAD: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + InstType InstTy = InstType::Integer; + if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + InstTy = TI.determineInstType(&MI); + } + + if (InstTy == InstType::FloatingPoint || + (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb + OperandsMapping = + getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx], + &Mips::ValueMappings[Mips::GPRIdx]}); + break; + } else { // gprb + OperandsMapping = + getOperandsMapping({Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx], + &Mips::ValueMappings[Mips::GPRIdx]}); + if (Size == 64) + MappingID = CustomMappingID; + } + + break; + } + case G_STORE: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + InstType InstTy = InstType::Integer; + if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + InstTy = TI.determineInstType(&MI); + } + + if (InstTy == InstType::FloatingPoint || + (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb + OperandsMapping = + getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx], + &Mips::ValueMappings[Mips::GPRIdx]}); + break; + } else { // gprb + OperandsMapping = + getOperandsMapping({Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx], + &Mips::ValueMappings[Mips::GPRIdx]}); + if (Size == 64) + MappingID = CustomMappingID; + } + break; + } + case G_PHI: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + InstType InstTy = InstType::Integer; + if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + InstTy = TI.determineInstType(&MI); + } + + // PHI is copylike and should have one regbank in mapping for def register. + if (InstTy == InstType::Integer && Size == 64) { // fprb + OperandsMapping = + getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx]}); + return getInstructionMapping(CustomMappingID, /*Cost=*/1, OperandsMapping, + /*NumOperands=*/1); + } + // Use default handling for PHI, i.e. set reg bank of def operand to match + // register banks of use operands. 
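// (G_PHI is deliberately excluded from the early getInstrMappingImpl call at
// the top of this function so that the s64 case above can return its custom
// mapping; all remaining PHIs take the default path here.)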
+ const RegisterBankInfo::InstructionMapping &Mapping = + getInstrMappingImpl(MI); + return Mapping; + } + case G_SELECT: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + InstType InstTy = InstType::Integer; + if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + InstTy = TI.determineInstType(&MI); + } + + if (InstTy == InstType::FloatingPoint || + (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb + const RegisterBankInfo::ValueMapping *Bank = + Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx]; + OperandsMapping = getOperandsMapping( + {Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank}); + break; + } else { // gprb + const RegisterBankInfo::ValueMapping *Bank = + Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx]; + OperandsMapping = getOperandsMapping( + {Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank}); + if (Size == 64) + MappingID = CustomMappingID; + } + break; + } + case G_UNMERGE_VALUES: { + OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], + &Mips::ValueMappings[Mips::GPRIdx], + &Mips::ValueMappings[Mips::DPRIdx]}); + MappingID = CustomMappingID; + break; + } + case G_MERGE_VALUES: { + OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx], + &Mips::ValueMappings[Mips::GPRIdx], + &Mips::ValueMappings[Mips::GPRIdx]}); + MappingID = CustomMappingID; + break; + } + case G_FADD: + case G_FSUB: + case G_FMUL: + case G_FDIV: + case G_FABS: + case G_FSQRT:{ + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + assert((Size == 32 || Size == 64) && "Unsupported floating point size"); + OperandsMapping = Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx]; + break; + } + case G_FCONSTANT: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + assert((Size == 32 || Size == 64) && "Unsupported floating point size"); + const RegisterBankInfo::ValueMapping *FPRValueMapping = + Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx]; + OperandsMapping = getOperandsMapping({FPRValueMapping, nullptr}); + break; + } + case G_FCMP: { + unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + assert((Size == 32 || Size == 64) && "Unsupported floating point size"); + const RegisterBankInfo::ValueMapping *FPRValueMapping = + Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx]; + OperandsMapping = + getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr, + FPRValueMapping, FPRValueMapping}); + break; + } + case G_FPEXT: + OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx], + &Mips::ValueMappings[Mips::SPRIdx]}); + break; + case G_FPTRUNC: + OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::SPRIdx], + &Mips::ValueMappings[Mips::DPRIdx]}); + break; + case G_FPTOSI: { + unsigned SizeFP = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + assert((MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 32) && + "Unsupported integer size"); + assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size"); + OperandsMapping = getOperandsMapping({ + &Mips::ValueMappings[Mips::GPRIdx], + SizeFP == 32 ? 
&Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx], + }); + break; + } + case G_SITOFP: { + unsigned SizeInt = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned SizeFP = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + (void)SizeInt; + assert((SizeInt == 32) && "Unsupported integer size"); + assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size"); + OperandsMapping = + getOperandsMapping({SizeFP == 32 ? &Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx], + &Mips::ValueMappings[Mips::GPRIdx]}); + break; + } case G_CONSTANT: case G_FRAME_INDEX: case G_GLOBAL_VALUE: + case G_BRCOND: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr}); break; @@ -111,17 +586,92 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { &Mips::ValueMappings[Mips::GPRIdx], &Mips::ValueMappings[Mips::GPRIdx]}); break; - case G_SELECT: - OperandsMapping = - getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], - &Mips::ValueMappings[Mips::GPRIdx], - &Mips::ValueMappings[Mips::GPRIdx], - &Mips::ValueMappings[Mips::GPRIdx]}); - break; default: return getInvalidInstructionMapping(); } - return getInstructionMapping(DefaultMappingID, /*Cost=*/1, OperandsMapping, + return getInstructionMapping(MappingID, /*Cost=*/1, OperandsMapping, NumOperands); } + +using InstListTy = GISelWorkList<4>; +namespace { +class InstManager : public GISelChangeObserver { + InstListTy &InstList; + +public: + InstManager(InstListTy &Insts) : InstList(Insts) {} + + void createdInstr(MachineInstr &MI) override { InstList.insert(&MI); } + void erasingInstr(MachineInstr &MI) override {} + void changingInstr(MachineInstr &MI) override {} + void changedInstr(MachineInstr &MI) override {} +}; +} // end anonymous namespace + +/// Here we have to narrowScalar s64 operands to s32, combine away +/// G_MERGE/G_UNMERGE and erase instructions that became dead in the process. +/// We manually assign 32 bit gprb to register operands of all new instructions +/// that got created in the process since they will not end up in the +/// RegBankSelect loop. Be careful not to delete the instruction after MI, +/// i.e. MI.getIterator()++. +void MipsRegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + MachineInstr &MI = OpdMapper.getMI(); + InstListTy NewInstrs; + MachineIRBuilder B(MI); + MachineFunction *MF = MI.getMF(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + + InstManager NewInstrObserver(NewInstrs); + GISelObserverWrapper WrapperObserver(&NewInstrObserver); + LegalizerHelper Helper(*MF, WrapperObserver, B); + LegalizationArtifactCombiner ArtCombiner( + B, MF->getRegInfo(), *MF->getSubtarget().getLegalizerInfo()); + + switch (MI.getOpcode()) { + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: + case TargetOpcode::G_PHI: + case TargetOpcode::G_SELECT: { + Helper.narrowScalar(MI, 0, LLT::scalar(32)); + // Handle new instructions. + while (!NewInstrs.empty()) { + MachineInstr *NewMI = NewInstrs.pop_back_val(); + // This is a new G_UNMERGE that was created during narrowScalar and will + // not be considered for regbank selection. RegBankSelect for Mips + // visits/maps the corresponding G_MERGE first. Combine them here. 
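// For example (hypothetical MIR), narrowScalar rewrites
//   G_STORE %val:_(s64), %ptr:_(p0)
// into a G_UNMERGE_VALUES of %val feeding two 32-bit stores; tryCombineMerges
// then folds that G_UNMERGE into the G_MERGE_VALUES that built %val, so only
// 32-bit gprb values remain.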
+ if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) { + SmallVector<MachineInstr *, 2> DeadInstrs; + ArtCombiner.tryCombineMerges(*NewMI, DeadInstrs); + for (MachineInstr *DeadMI : DeadInstrs) + DeadMI->eraseFromParent(); + } + // This G_MERGE will be combined away when its corresponding G_UNMERGE + // gets regBankSelected. + else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES) + continue; + else + // Manually set register banks for all register operands to 32 bit gprb. + for (auto Op : NewMI->operands()) { + if (Op.isReg()) { + assert(MRI.getType(Op.getReg()).getSizeInBits() == 32 && + "Only 32 bit gprb is handled here.\n"); + MRI.setRegBank(Op.getReg(), getRegBank(Mips::GPRBRegBankID)); + } + } + } + return; + } + case TargetOpcode::G_UNMERGE_VALUES: { + SmallVector<MachineInstr *, 2> DeadInstrs; + ArtCombiner.tryCombineMerges(MI, DeadInstrs); + for (MachineInstr *DeadMI : DeadInstrs) + DeadMI->eraseFromParent(); + return; + } + default: + break; + } + + return applyDefaultMapping(OpdMapper); +} diff --git a/lib/Target/Mips/MipsRegisterBankInfo.h b/lib/Target/Mips/MipsRegisterBankInfo.h index 64a79abaa74d..176813c031ed 100644 --- a/lib/Target/Mips/MipsRegisterBankInfo.h +++ b/lib/Target/Mips/MipsRegisterBankInfo.h @@ -1,9 +1,8 @@ //===- MipsRegisterBankInfo.h -----------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -38,6 +37,131 @@ public: const InstructionMapping & getInstrMapping(const MachineInstr &MI) const override; + + void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + +private: + /// Some instructions are used with both floating point and integer operands. + /// We assign InstType to such instructions as it helps us to avoid cross bank + /// copies. InstType depends on context. + enum InstType { + /// Temporary type; when visit(..., nullptr) finishes it will convert to one + /// of the remaining types: Integer, FloatingPoint or Ambiguous. + NotDetermined, + /// Connected with an instruction that interprets 'bags of bits' as integers. + /// Select gprb to avoid cross bank copies. + Integer, + /// Connected with an instruction that interprets 'bags of bits' as floating + /// point numbers. Select fprb to avoid cross bank copies. + FloatingPoint, + /// Represents moving 'bags of bits' around. Select the same bank for the + /// entire chain to avoid cross bank copies. Currently we select fprb for s64 + /// and gprb for s32 Ambiguous operands. + Ambiguous + }; + + /// Some generic instructions have operands that can be mapped to either fprb + /// or gprb, e.g. for G_LOAD we consider only operand 0 as ambiguous, operand 1 + /// is always gprb since it is a pointer. + /// This class provides containers for MI's ambiguous operands: + /// DefUses : MachineInstrs that use one of MI's ambiguous def operands. + /// UseDefs : MachineInstrs that define MI's ambiguous use operands. 
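/// For a hypothetical '%v:_(s32) = G_LOAD %p:_(p0)', DefUses would hold the
/// instructions that read %v; for 'G_STORE %v:_(s32), %p:_(p0)', UseDefs would
/// hold the instruction that defines %v.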
+ class AmbiguousRegDefUseContainer { + SmallVector<MachineInstr *, 2> DefUses; + SmallVector<MachineInstr *, 2> UseDefs; + + void addDefUses(Register Reg, const MachineRegisterInfo &MRI); + void addUseDef(Register Reg, const MachineRegisterInfo &MRI); + + /// Skip copy instructions until we get to a non-copy instruction or to a + /// copy with phys register as def. Used during search for DefUses. + /// MI : %5 = COPY %4 + /// %6 = COPY %5 + /// $v0 = COPY %6 <- we want this one. + MachineInstr *skipCopiesOutgoing(MachineInstr *MI) const; + + /// Skip copy instructions until we get to a non-copy instruction or to a + /// copy with phys register as use. Used during search for UseDefs. + /// %1 = COPY $a1 <- we want this one. + /// %2 = COPY %1 + /// MI = %3 = COPY %2 + MachineInstr *skipCopiesIncoming(MachineInstr *MI) const; + + public: + AmbiguousRegDefUseContainer(const MachineInstr *MI); + SmallVectorImpl<MachineInstr *> &getDefUses() { return DefUses; } + SmallVectorImpl<MachineInstr *> &getUseDefs() { return UseDefs; } + }; + + class TypeInfoForMF { + /// MachineFunction name is used to recognise when MF changes. + std::string MFName = ""; + /// <key, value> : value is vector of all MachineInstrs that are waiting for + /// key to figure out type of some of its ambiguous operands. + DenseMap<const MachineInstr *, SmallVector<const MachineInstr *, 2>> + WaitingQueues; + /// Recorded InstTypes for visited instructions. + DenseMap<const MachineInstr *, InstType> Types; + + /// Recursively visit MI's adjacent instructions and find MI's InstType. + bool visit(const MachineInstr *MI, const MachineInstr *WaitingForTypeOfMI); + + /// Visit MI's adjacent UseDefs or DefUses. + bool visitAdjacentInstrs(const MachineInstr *MI, + SmallVectorImpl<MachineInstr *> &AdjacentInstrs, + bool isDefUse); + + /// Set type for MI, and recursively for all instructions that are + /// waiting for MI's type. + void setTypes(const MachineInstr *MI, InstType ITy); + + /// InstType for MI is determined, set it to InstType that corresponds to + /// physical register that is operand number Op in CopyInst. + void setTypesAccordingToPhysicalRegister(const MachineInstr *MI, + const MachineInstr *CopyInst, + unsigned Op); + + /// Set default values for MI in order to start visit. + void startVisit(const MachineInstr *MI) { + Types.try_emplace(MI, InstType::NotDetermined); + WaitingQueues.try_emplace(MI); + } + + /// Returns true if instruction was already visited. Type might not be + /// determined at this point but will be when visit(..., nullptr) finishes. + bool wasVisited(const MachineInstr *MI) const { return Types.count(MI); }; + + /// Returns recorded type for instruction. + const InstType &getRecordedTypeForInstr(const MachineInstr *MI) const { + assert(wasVisited(MI) && "Instruction was not visited!"); + return Types.find(MI)->getSecond(); + }; + + /// Change recorded type for instruction. + void changeRecordedTypeForInstr(const MachineInstr *MI, InstType InstTy) { + assert(wasVisited(MI) && "Instruction was not visited!"); + Types.find(MI)->getSecond() = InstTy; + }; + + /// Returns WaitingQueue for instruction. + const SmallVectorImpl<const MachineInstr *> & + getWaitingQueueFor(const MachineInstr *MI) const { + assert(WaitingQueues.count(MI) && "Instruction was not visited!"); + return WaitingQueues.find(MI)->getSecond(); + }; + + /// Add WaitingForMI to MI's WaitingQueue. 
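/// Instructions end up in a WaitingQueue when their own search could not
/// settle on an InstType; once the queue owner's type is known, setTypes
/// propagates that same InstType to every queued instruction.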
+ void addToWaitingQueue(const MachineInstr *MI, + const MachineInstr *WaitingForMI) { + assert(WaitingQueues.count(MI) && "Instruction was not visited!"); + WaitingQueues.find(MI)->getSecond().push_back(WaitingForMI); + }; + + public: + InstType determineInstType(const MachineInstr *MI); + + void cleanupIfNewFunction(llvm::StringRef FunctionName); + }; }; } // end namespace llvm #endif diff --git a/lib/Target/Mips/MipsRegisterBanks.td b/lib/Target/Mips/MipsRegisterBanks.td index 5f1687048fac..14a0181f8f11 100644 --- a/lib/Target/Mips/MipsRegisterBanks.td +++ b/lib/Target/Mips/MipsRegisterBanks.td @@ -1,9 +1,8 @@ //===- MipsRegisterBank.td ---------------------------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,3 +10,5 @@ //===----------------------------------------------------------------------===// def GPRBRegBank : RegisterBank<"GPRB", [GPR32]>; + +def FPRBRegBank : RegisterBank<"FPRB", [FGR64, AFGR64]>; diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 3c108c2ba9b7..7b02d126eb28 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -1,9 +1,8 @@ //===- MipsRegisterInfo.cpp - MIPS Register Information -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -160,8 +159,6 @@ getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const MipsSubtarget &Subtarget = MF.getSubtarget(); - using RegIter = TargetRegisterClass::const_iterator; - for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I) Reserved.set(ReservedGPR32[I]); @@ -183,14 +180,12 @@ getReservedRegs(const MachineFunction &MF) const { if (Subtarget.isFP64bit()) { // Reserve all registers in AFGR64. - for (RegIter Reg = Mips::AFGR64RegClass.begin(), - EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg) - Reserved.set(*Reg); + for (MCPhysReg Reg : Mips::AFGR64RegClass) + Reserved.set(Reg); } else { // Reserve all registers in FGR64. - for (RegIter Reg = Mips::FGR64RegClass.begin(), - EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg) - Reserved.set(*Reg); + for (MCPhysReg Reg : Mips::FGR64RegClass) + Reserved.set(Reg); } // Reserve FP if this function should have a dedicated frame pointer register. if (Subtarget.getFrameLowering()->hasFP(MF)) { @@ -222,14 +217,8 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(Mips::DSPOutFlag); // Reserve MSA control registers. 
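// Iterating over the register class, rather than naming each register, also
// picks up the fake MSA8-MSA31 control registers that MipsRegisterInfo.td now
// adds to MSACtrl for cfcmsa/ctcmsa.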
- Reserved.set(Mips::MSAIR); - Reserved.set(Mips::MSACSR); - Reserved.set(Mips::MSAAccess); - Reserved.set(Mips::MSASave); - Reserved.set(Mips::MSAModify); - Reserved.set(Mips::MSARequest); - Reserved.set(Mips::MSAMap); - Reserved.set(Mips::MSAUnmap); + for (MCPhysReg Reg : Mips::MSACtrlRegClass) + Reserved.set(Reg); // Reserve RA if in mips16 mode. if (Subtarget.inMips16Mode()) { @@ -248,11 +237,6 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(Mips::GP_64); } - if (Subtarget.isABI_O32() && !Subtarget.useOddSPReg()) { - for (const auto &Reg : Mips::OddSPRegClass) - Reserved.set(Reg); - } - return Reserved; } @@ -293,7 +277,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset); } -unsigned MipsRegisterInfo:: +Register MipsRegisterInfo:: getFrameRegister(const MachineFunction &MF) const { const MipsSubtarget &Subtarget = MF.getSubtarget(); const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); @@ -322,8 +306,8 @@ bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const { unsigned FP = Subtarget.isGP32bit() ? Mips::FP : Mips::FP_64; unsigned BP = Subtarget.isGP32bit() ? Mips::S7 : Mips::S7_64; - // Support dynamic stack realignment only for targets with standard encoding. - if (!Subtarget.hasStandardEncoding()) + // Support dynamic stack realignment for all targets except Mips16. + if (Subtarget.inMips16Mode()) return false; // We can't perform dynamic stack realignment if we can't reserve the diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index b84aaad05eb5..4ed32b09718b 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -1,9 +1,8 @@ //===- MipsRegisterInfo.h - Mips Register Information Impl ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -70,7 +69,7 @@ public: bool canRealignStack(const MachineFunction &MF) const override; /// Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; /// Return GPR register class. virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0; diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index a943a0ad4094..8a6279da46b7 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -1,9 +1,8 @@ //===-- MipsRegisterInfo.td - Mips Register defs -----------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -259,6 +258,11 @@ let Namespace = "Mips" in { def MSARequest : MipsReg<5, "5">; def MSAMap : MipsReg<6, "6">; def MSAUnmap : MipsReg<7, "7">; + // MSA-ASE fake control registers. + // These registers do not exist, but instructions like `cfcmsa` + // and `ctcmsa` allow specifying them. + foreach I = 8-31 in + def MSA#I : MipsReg<#I, ""#I>; // Octeon multiplier and product registers def MPL0 : MipsReg<0, "mpl0">; @@ -383,10 +387,14 @@ def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable; // 32bit fp: // * FGR32 - 16 32-bit even registers // * FGR32 - 32 32-bit registers (single float only mode) -def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>; - -def FGRH32 : RegisterClass<"Mips", [f32], 32, (sequence "F_HI%u", 0, 31)>, - Unallocatable; +def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)> { + // Do not allocate odd registers when given -mattr=+nooddspreg. + let AltOrders = [(decimate FGR32, 2)]; + let AltOrderSelect = [{ + const auto & S = MF.getSubtarget<MipsSubtarget>(); + return S.isABI_O32() && !S.useOddSPReg(); + }]; +} def AFGR64 : RegisterClass<"Mips", [f64], 64, (add // Return Values and Arguments @@ -400,16 +408,14 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64, (add // Callee save D10, D11, D12, D13, D14, D15)>; -def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>; - -// Used to reserve odd registers when given -mattr=+nooddspreg -// FIXME: Remove double precision registers from this set. -def OddSP : RegisterClass<"Mips", [f32], 32, - (add (decimate (sequence "F%u", 1, 31), 2), - (decimate (sequence "F_HI%u", 1, 31), 2), - (decimate (sequence "D%u", 1, 15), 2), - (decimate (sequence "D%u_64", 1, 31), 2))>, - Unallocatable; +def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)> { + // Do not allocate odd registers when given -mattr=+nooddspreg. + let AltOrders = [(decimate FGR64, 2)]; + let AltOrderSelect = [{ + const auto & S = MF.getSubtarget<MipsSubtarget>(); + return S.isABI_O32() && !S.useOddSPReg(); + }]; +} // FP control registers. 
def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>, @@ -437,7 +443,8 @@ def MSA128WEvens: RegisterClass<"Mips", [v4i32, v4f32], 128, (decimate (sequence "W%u", 0, 31), 2)>; def MSACtrl: RegisterClass<"Mips", [i32], 32, (add - MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>; + MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap, + (sequence "MSA%u", 8, 31))>, Unallocatable; // Hi/Lo Registers def LO32 : RegisterClass<"Mips", [i32], 32, (add LO0)>; @@ -591,11 +598,6 @@ def StrictlyFGR32AsmOperand : MipsAsmRegOperand { let PredicateMethod = "isStrictlyFGRAsmReg"; } -def FGRH32AsmOperand : MipsAsmRegOperand { - let Name = "FGRH32AsmReg"; - let PredicateMethod = "isFGRAsmReg"; -} - def FCCRegsAsmOperand : MipsAsmRegOperand { let Name = "FCCAsmReg"; } @@ -703,10 +705,6 @@ def FGRCCOpnd : RegisterOperand { let ParserMatchClass = FGR32AsmOperand; } -def FGRH32Opnd : RegisterOperand { - let ParserMatchClass = FGRH32AsmOperand; -} - def FCCRegsOpnd : RegisterOperand { let ParserMatchClass = FCCRegsAsmOperand; } diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index ef1b3c09bdc4..4c6cc1ef771c 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -1,9 +1,8 @@ //===- MipsSEFrameLowering.cpp - Mips32/64 Frame Information --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h index cb2119d6880b..78ffe161d9c6 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.h +++ b/lib/Target/Mips/MipsSEFrameLowering.h @@ -1,9 +1,8 @@ //===- MipsSEFrameLowering.h - Mips32/64 frame lowering ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index cf196b597278..703f99f37dd1 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- MipsSEISelDAGToDAG.cpp - A Dag to Dag Inst Selector for MipsSE ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -76,18 +75,8 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI, } unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const { - switch (cast(RegIdx)->getZExtValue()) { - default: - llvm_unreachable("Could not map int to register"); - case 0: return Mips::MSAIR; - case 1: return Mips::MSACSR; - case 2: return Mips::MSAAccess; - case 3: return Mips::MSASave; - case 4: return Mips::MSAModify; - case 5: return Mips::MSARequest; - case 6: return Mips::MSAMap; - case 7: return Mips::MSAUnmap; - } + uint64_t RegNum = cast(RegIdx)->getZExtValue(); + return Mips::MSACtrlRegClass.getRegister(RegNum); } bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI, @@ -135,97 +124,8 @@ bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI, return true; } -void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { - MipsFunctionInfo *MipsFI = MF.getInfo(); - - if (!MipsFI->globalBaseRegSet()) - return; - - MachineBasicBlock &MBB = MF.front(); - MachineBasicBlock::iterator I = MBB.begin(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL; - unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg(); - const TargetRegisterClass *RC; - const MipsABIInfo &ABI = static_cast(TM).getABI(); - RC = (ABI.IsN64()) ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - - V0 = RegInfo.createVirtualRegister(RC); - V1 = RegInfo.createVirtualRegister(RC); - - if (ABI.IsN64()) { - MF.getRegInfo().addLiveIn(Mips::T9_64); - MBB.addLiveIn(Mips::T9_64); - - // lui $v0, %hi(%neg(%gp_rel(fname))) - // daddu $v1, $v0, $t9 - // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname))) - const GlobalValue *FName = &MF.getFunction(); - BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0) - .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); - BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0) - .addReg(Mips::T9_64); - BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1) - .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); - return; - } - - if (!MF.getTarget().isPositionIndependent()) { - // Set global register to __gnu_local_gp. - // - // lui $v0, %hi(__gnu_local_gp) - // addiu $globalbasereg, $v0, %lo(__gnu_local_gp) - BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0) - .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI); - BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0) - .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO); - return; - } - - MF.getRegInfo().addLiveIn(Mips::T9); - MBB.addLiveIn(Mips::T9); - - if (ABI.IsN32()) { - // lui $v0, %hi(%neg(%gp_rel(fname))) - // addu $v1, $v0, $t9 - // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname))) - const GlobalValue *FName = &MF.getFunction(); - BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0) - .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); - BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9); - BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1) - .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); - return; - } - - assert(ABI.IsO32()); - - // For O32 ABI, the following instruction sequence is emitted to initialize - // the global base register: - // - // 0. lui $2, %hi(_gp_disp) - // 1. addiu $2, $2, %lo(_gp_disp) - // 2. addu $globalbasereg, $2, $t9 - // - // We emit only the last instruction here. 
- // - // GNU linker requires that the first two instructions appear at the beginning - // of a function and no instructions be inserted before or between them. - // The two instructions are emitted during lowering to MC layer in order to - // avoid any reordering. - // - // Register $2 (Mips::V0) is added to the list of live-in registers to ensure - // the value instruction 1 (addiu) defines is valid when instruction 2 (addu) - // reads it. - MF.getRegInfo().addLiveIn(Mips::V0); - MBB.addLiveIn(Mips::V0); - BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg) - .addReg(Mips::V0).addReg(Mips::T9); -} - void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { - initGlobalBaseReg(MF); + MF.getInfo()->initGlobalBaseReg(); MachineRegisterInfo *MRI = &MF.getRegInfo(); @@ -1337,6 +1237,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32)); return false; case InlineAsm::Constraint_m: + case InlineAsm::Constraint_o: if (selectAddrRegImm16(Op, Base, Offset)) { OutOps.push_back(Base); OutOps.push_back(Offset); diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index eb3657aae050..ce594e1fb4fa 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -1,9 +1,8 @@ //===-- MipsSEISelDAGToDAG.h - A Dag to Dag Inst Selector for MipsSE -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -131,10 +130,6 @@ private: void processFunctionAfterISel(MachineFunction &MF) override; - // Insert instructions to initialize the global base register in the - // first MBB of the function. - void initGlobalBaseReg(MachineFunction &MF); - bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) override; diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index a78e544c35f0..edf57a3840d1 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1,9 +1,8 @@ //===- MipsSEISelLowering.cpp - MipsSE DAG Lowering Interface -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -214,6 +213,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + if (Subtarget.hasMips32r2() && !Subtarget.useSoftFloat() && + !Subtarget.hasMips64()) { + setOperationAction(ISD::BITCAST, MVT::i64, Custom); + } + if (NoDPLoadStore) { setOperationAction(ISD::LOAD, MVT::f64, Custom); setOperationAction(ISD::STORE, MVT::f64, Custom); @@ -415,11 +419,8 @@ SDValue MipsSETargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { Op->getOperand(2)); } -bool -MipsSETargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned, - unsigned, - bool *Fast) const { +bool MipsSETargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const { MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy; if (Subtarget.systemSupportsUnalignedAccess()) { @@ -463,6 +464,7 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op, case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG); case ISD::SELECT: return lowerSELECT(Op, DAG); + case ISD::BITCAST: return lowerBITCAST(Op, DAG); } return MipsTargetLowering::LowerOperation(Op, DAG); @@ -714,8 +716,31 @@ static bool shouldTransformMulToShiftsAddsSubs(APInt C, EVT VT, SelectionDAG &DAG, const MipsSubtarget &Subtarget) { // Estimate the number of operations the below transform will turn a - // constant multiply into. The number is approximately how many powers - // of two summed together that the constant can be broken down into. + // constant multiply into. The number is approximately equal to the minimal + // number of powers of two that the constant can be broken down into by adding + // or subtracting them. + // + // If we have taken more than 12[1] / 8[2] steps to attempt the + // optimization for a native-sized value, it is more than likely that this + // optimization will make things worse. + // + // [1] MIPS64 requires 6 instructions at most to materialize any constant, + // multiplication requires at least 4 cycles, plus another cycle (or two) + // to retrieve the result from the HI/LO registers. + // + // [2] For MIPS32, more than 8 steps is expensive as the constant could be + // materialized in 2 instructions, multiplication requires at least 4 + // cycles, plus another cycle (or two) to retrieve the result from the + // HI/LO registers. + // + // TODO: + // - MaxSteps needs to consider the `VT` of the constant for the current + // target. + // - Consider performing this optimization after type legalization. + // That would allow removing the workaround for types not supported natively. + // - Take into account `-Os, -Oz` flags because this optimization + // increases code size. + unsigned MaxSteps = Subtarget.isABI_O32() ? 8 : 12; SmallVector<APInt, 16> WorkStack(1, C); unsigned Steps = 0; @@ -727,6 +752,9 @@ static bool shouldTransformMulToShiftsAddsSubs(APInt C, EVT VT, if (Val == 0 || Val == 1) continue; + if (Steps >= MaxSteps) + return false; + if (Val.isPowerOf2()) { ++Steps; continue; @@ -735,36 +763,15 @@ static bool shouldTransformMulToShiftsAddsSubs(APInt C, EVT VT, APInt Floor = APInt(BitWidth, 1) << Val.logBase2(); APInt Ceil = Val.isNegative() ? 
APInt(BitWidth, 0) : APInt(BitWidth, 1) << C.ceilLogBase2(); - if ((Val - Floor).ule(Ceil - Val)) { WorkStack.push_back(Floor); WorkStack.push_back(Val - Floor); - ++Steps; - continue; + } else { + WorkStack.push_back(Ceil); + WorkStack.push_back(Ceil - Val); } - WorkStack.push_back(Ceil); - WorkStack.push_back(Ceil - Val); ++Steps; - - // If we have taken more than 12[1] / 8[2] steps to attempt the - // optimization for a native sized value, it is more than likely that this - // optimization will make things worse. - // - // [1] MIPS64 requires 6 instructions at most to materialize any constant, - // multiplication requires at least 4 cycles, but another cycle (or two) - // to retrieve the result from the HI/LO registers. - // - // [2] For MIPS32, more than 8 steps is expensive as the constant could be - // materialized in 2 instructions, multiplication requires at least 4 - // cycles, but another cycle (or two) to retrieve the result from the - // HI/LO registers. - - if (Steps > 12 && (Subtarget.isABI_N32() || Subtarget.isABI_N64())) - return false; - - if (Steps > 8 && Subtarget.isABI_O32()) - return false; } // If the value being multiplied is not supported natively, we have to pay @@ -1221,6 +1228,36 @@ SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { Nd.getMemOperand()->getFlags(), Nd.getAAInfo()); } +SDValue MipsSETargetLowering::lowerBITCAST(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT Src = Op.getOperand(0).getValueType().getSimpleVT(); + MVT Dest = Op.getValueType().getSimpleVT(); + + // Bitcast i64 to double. + if (Src == MVT::i64 && Dest == MVT::f64) { + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, + Op.getOperand(0), DAG.getIntPtrConstant(0, DL)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, + Op.getOperand(0), DAG.getIntPtrConstant(1, DL)); + return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi); + } + + // Bitcast double to i64. + if (Src == MVT::f64 && Dest == MVT::i64) { + SDValue Lo = + DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0), + DAG.getConstant(0, DL, MVT::i32)); + SDValue Hi = + DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0), + DAG.getConstant(1, DL, MVT::i32)); + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + } + + // Skip other cases of bitcast and use default lowering. + return SDValue(); +} + SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi, SelectionDAG &DAG) const { @@ -1379,9 +1416,10 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG, bool IsSigned = false) { + auto *CImm = cast(Op->getOperand(ImmOp)); return DAG.getConstant( APInt(Op->getValueType(0).getScalarType().getSizeInBits(), - Op->getConstantOperandVal(ImmOp), IsSigned), + IsSigned ? 
CImm->getSExtValue() : CImm->getZExtValue(), IsSigned), SDLoc(Op), Op->getValueType(0)); } @@ -3725,8 +3763,8 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Fd = MI.getOperand(0).getReg(); - unsigned Ws = MI.getOperand(1).getReg(); + Register Fd = MI.getOperand(0).getReg(); + Register Ws = MI.getOperand(1).getReg(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *GPRRC = @@ -3734,10 +3772,10 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI, unsigned MTC1Opc = IsFGR64onMips64 ? Mips::DMTC1 : (IsFGR64onMips32 ? Mips::MTC1_D64 : Mips::MTC1); - unsigned COPYOpc = IsFGR64onMips64 ? Mips::COPY_S_D : Mips::COPY_S_W; + Register COPYOpc = IsFGR64onMips64 ? Mips::COPY_S_D : Mips::COPY_S_W; - unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); - unsigned WPHI = Wtemp; + Register Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + Register WPHI = Wtemp; BuildMI(*BB, MI, DL, TII->get(Mips::FEXUPR_W), Wtemp).addReg(Ws); if (IsFGR64) { @@ -3746,15 +3784,15 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI, } // Perform the safety regclass copy mentioned above. - unsigned Rtemp = RegInfo.createVirtualRegister(GPRRC); - unsigned FPRPHI = IsFGR64onMips32 + Register Rtemp = RegInfo.createVirtualRegister(GPRRC); + Register FPRPHI = IsFGR64onMips32 ? RegInfo.createVirtualRegister(&Mips::FGR64RegClass) : Fd; BuildMI(*BB, MI, DL, TII->get(COPYOpc), Rtemp).addReg(WPHI).addImm(0); BuildMI(*BB, MI, DL, TII->get(MTC1Opc), FPRPHI).addReg(Rtemp); if (IsFGR64onMips32) { - unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC); + Register Rtemp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Mips::COPY_S_W), Rtemp2) .addReg(WPHI) .addImm(1); diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h index 761ff3b1fa4d..433d019332cf 100644 --- a/lib/Target/Mips/MipsSEISelLowering.h +++ b/lib/Target/Mips/MipsSEISelLowering.h @@ -1,9 +1,8 @@ //===- MipsSEISelLowering.h - MipsSE DAG Lowering Interface -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -41,9 +40,10 @@ class TargetRegisterClass; void addMSAFloatType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC); - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS = 0, - unsigned Align = 1, - bool *Fast = nullptr) const override; + bool allowsMisalignedMemoryAccesses( + EVT VT, unsigned AS = 0, unsigned Align = 1, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *Fast = nullptr) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -73,6 +73,7 @@ class TargetRegisterClass; SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi, SelectionDAG &DAG) const; diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index c7ab90ed2a3b..4e49f5e7d9d1 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- MipsSEInstrInfo.cpp - Mips32/64 Instruction Information -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "MipsSEInstrInfo.h" -#include "InstPrinter/MipsInstPrinter.h" +#include "MCTargetDesc/MipsInstPrinter.h" #include "MipsAnalyzeImmediate.h" #include "MipsMachineFunction.h" #include "MipsTargetMachine.h" @@ -447,6 +446,9 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case Mips::PseudoMTLOHI_DSP: expandPseudoMTLoHi(MBB, MI, Mips::MTLO_DSP, Mips::MTHI_DSP, true); break; + case Mips::PseudoMTLOHI_MM: + expandPseudoMTLoHi(MBB, MI, Mips::MTLO_MM, Mips::MTHI_MM, false); + break; case Mips::PseudoCVT_S_W: expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false); break; diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h index fce0fe5f58ad..3111d1c21a0a 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.h +++ b/lib/Target/Mips/MipsSEInstrInfo.h @@ -1,9 +1,8 @@ //===-- MipsSEInstrInfo.h - Mips32/64 Instruction Information ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp index e7d720a4b769..f4b164d5c0ab 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- MipsSERegisterInfo.cpp - MIPS32/64 Register Information -== -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h index ebae1909d233..82ddf40f56a7 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.h +++ b/lib/Target/Mips/MipsSERegisterInfo.h @@ -1,9 +1,8 @@ //===-- MipsSERegisterInfo.h - Mips32/64 Register Information ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index 410fa655a225..0c0ddeab22c4 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -1,9 +1,8 @@ //===-- MipsSchedule.td - Mips Scheduling Definitions ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsScheduleGeneric.td b/lib/Target/Mips/MipsScheduleGeneric.td index 80ffe7ada7c8..e8a0a30b8e9b 100644 --- a/lib/Target/Mips/MipsScheduleGeneric.td +++ b/lib/Target/Mips/MipsScheduleGeneric.td @@ -1,9 +1,8 @@ //=- MipsScheduleGeneric.td - Generic Scheduling Definitions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,11 +24,11 @@ def MipsGenericModel : SchedMachineModel { int HighLatency = 37; list<Predicate> UnsupportedFeatures = []; - let CompleteModel = 0; + let CompleteModel = 1; let PostRAScheduler = 1; // FIXME: Remove when all errors have been fixed.
- let FullInstRWOverlapCheck = 0; + let FullInstRWOverlapCheck = 1; } let SchedModel = MipsGenericModel in { @@ -42,35 +41,122 @@ def GenericIssueALU : ProcResource<1> { let Super = GenericALU; } def GenericWriteALU : SchedWriteRes<[GenericIssueALU]>; -// and, lui, nor, or, slti, sltiu, sub, subu, xor -// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu, -// xori -def : ItinRW<[GenericWriteALU], [II_ADD, II_ADDU, II_ADDI, II_ADDIU, II_ANDI, - II_AND, II_ANDI, II_CLO, II_CLZ, II_EXT, - II_INS, II_LUI, II_MULT, II_MULTU, II_NOR, - II_ORI, II_OR, II_ROTR, II_ROTRV, II_SEB, - II_SEH, II_SLTI_SLTIU, II_SLT_SLTU, II_SLL, - II_SRA, II_SRL, II_SLLV, II_SRAV, II_SRLV, - II_SSNOP, II_SUB, II_SUBU, II_WSBH, II_XOR, - II_XORI]>; +// add, addi, addiu, addu, and, andi, clo, clz, ext, ins, lui, nor, or, ori, +// rotr, rotrv, seb, seh, sll, sllv, slt, slti, sltiu, sltu, sra, srav, srl, +// srlv, ssnop, sub, subu, wsbh, xor, xori +def : InstRW<[GenericWriteALU], (instrs ADD, ADDi, ADDiu, ADDu, AND, ANDi, + CLO, CLZ, EXT, INS, LEA_ADDiu, LUi, NOP, + NOR, OR, ORi, ROTR, ROTRV, SEB, SEH, SLL, + SLLV, SLT, SLTi, SLTiu, SLTu, SRA, SRAV, SRL, + SRLV, SSNOP, SUB, SUBu, WSBH, XOR, XORi)>; def : InstRW<[GenericWriteALU], (instrs COPY)>; +// MIPSR6 +// ====== + +// addiupc, align, aluipc, aui, auipc, bitswap, clo, clz, lsa, seleqz, selnez +def : InstRW<[GenericWriteALU], (instrs ADDIUPC, ALIGN, ALUIPC, AUI, + AUIPC, BITSWAP, CLO_R6, CLZ_R6, LSA_R6, + SELEQZ, SELNEZ)>; + +// MIPS16e +// ======= + +def : InstRW<[GenericWriteALU], (instrs AddiuRxImmX16, AddiuRxRxImm16, + AddiuRxRxImmX16, AddiuRxRyOffMemX16, + AddiuRxPcImmX16, AddiuSpImm16, AddiuSpImmX16, + AdduRxRyRz16, AndRxRxRy16, CmpRxRy16, + CmpiRxImm16, CmpiRxImmX16, LiRxImm16, + LiRxImmX16, LiRxImmAlignX16, Move32R16, + MoveR3216, Mfhi16, Mflo16, NegRxRy16, + NotRxRy16, OrRxRxRy16, SebRx16, SehRx16, + SllX16, SllvRxRy16, SltiRxImm16, + SltiRxImmX16, SltiCCRxImmX16, + SltiuRxImm16, SltiuRxImmX16, SltiuCCRxImmX16, + SltRxRy16, SltCCRxRy16, SltuRxRy16, + SltuRxRyRz16, SltuCCRxRy16, SravRxRy16, + SraX16, SrlvRxRy16, SrlX16, SubuRxRyRz16, + XorRxRxRy16)>; + +def : InstRW<[GenericWriteALU], (instrs Constant32, LwConstant32, + GotPrologue16, CONSTPOOL_ENTRY)>; + +// microMIPS +// ========= + +def : InstRW<[GenericWriteALU], (instrs ADDIUPC_MM, ADDIUR1SP_MM, ADDIUR2_MM, + ADDIUS5_MM, ADDIUSP_MM, ADDU16_MM, ADD_MM, + ADDi_MM, ADDiu_MM, ADDu_MM, AND16_MM, + ANDI16_MM, AND_MM, ANDi_MM, CLO_MM, CLZ_MM, + EXT_MM, INS_MM, LEA_ADDiu_MM, LI16_MM, + LUi_MM, MOVE16_MM, MOVEP_MM, NOR_MM, + NOT16_MM, OR16_MM, OR_MM, ORi_MM, ROTRV_MM, + ROTR_MM, SEB_MM, SEH_MM, SLL16_MM, SLLV_MM, + SLL_MM, SLT_MM, SLTi_MM, SLTiu_MM, SLTu_MM, + SRAV_MM, SRA_MM, SRL16_MM, SRLV_MM, SRL_MM, + SSNOP_MM, SUBU16_MM, SUB_MM, SUBu_MM, + WSBH_MM, XOR16_MM, XOR_MM, XORi_MM)>; + +// microMIPS32r6 +// ============= + +def : InstRW<[GenericWriteALU], (instrs ADDIUPC_MMR6, ADDIU_MMR6, ADDU16_MMR6, + ADDU_MMR6, ADD_MMR6, ALIGN_MMR6, ALUIPC_MMR6, + AND16_MMR6, ANDI16_MMR6, ANDI_MMR6, AND_MMR6, + AUIPC_MMR6, AUI_MMR6, BITSWAP_MMR6, CLO_MMR6, + CLZ_MMR6, EXT_MMR6, INS_MMR6, LI16_MMR6, + LSA_MMR6, LUI_MMR6, MOVE16_MMR6, NOR_MMR6, + NOT16_MMR6, OR16_MMR6, ORI_MMR6, OR_MMR6, + SELEQZ_MMR6, SELNEZ_MMR6, SLL16_MMR6, + SLL_MMR6, SRL16_MMR6, SSNOP_MMR6, SUBU16_MMR6, + SUBU_MMR6, SUB_MMR6, WSBH_MMR6, XOR16_MMR6, + XORI_MMR6, XOR_MMR6)>; + +// MIPS64 +// ====== + +def : InstRW<[GenericWriteALU], (instrs AND64, ANDi64, DEXT64_32, DSLL64_32, + ORi64, SEB64, SEH64, SLL64_32, SLL64_64, + SLT64, SLTi64, 
SLTiu64, SLTu64, XOR64, + XORi64)>; + +def : InstRW<[GenericWriteALU], (instrs DADD, DADDi, DADDiu, DADDu, DCLO, + DCLZ, DEXT, DEXTM, DEXTU, DINS, DINSM, DINSU, + DROTR, DROTR32, DROTRV, DSBH, DSHD, DSLL, + DSLL32, DSLLV, DSRA, DSRA32, DSRAV, DSRL, + DSRL32, DSRLV, DSUB, DSUBu, LEA_ADDiu64, + LUi64, NOR64, OR64)>; + +// MIPS64R6 +// ======== + +def : InstRW<[GenericWriteALU], (instrs DALIGN, DAHI, DATI, DAUI, DCLO_R6, + DCLZ_R6, DBITSWAP, DLSA, DLSA_R6, SELEQZ64, + SELNEZ64)>; + + def GenericMDU : ProcResource<1> { let BufferSize = 1; } def GenericIssueMDU : ProcResource<1> { let Super = GenericALU; } def GenericIssueDIV : ProcResource<1> { let Super = GenericMDU; } def GenericWriteHILO : SchedWriteRes<[GenericIssueMDU]>; def GenericWriteALULong : SchedWriteRes<[GenericIssueALU]> { let Latency = 5; } def GenericWriteMove : SchedWriteRes<[GenericIssueALU]> { let Latency = 2; } +def GenericWriteMul : SchedWriteRes<[GenericIssueMDU]> { let Latency = 4; } + +def : InstRW<[GenericWriteHILO], (instrs MADD, MADDU, MSUB, MSUBU)>; -def : ItinRW<[GenericWriteHILO], [II_MADD, II_MADDU, II_MSUB, II_MSUBU]>; +def : InstRW<[GenericWriteHILO], (instrs PseudoMADD_MM, PseudoMADDU_MM, + PseudoMSUB_MM, PseudoMSUBU_MM, + PseudoMULT_MM, PseudoMULTu_MM)>; + +def : InstRW<[GenericWriteHILO], (instrs PseudoMADD, PseudoMADDU, PseudoMSUB, + PseudoMSUBU, PseudoMULT, PseudoMULTu)>; def GenericWriteMDUtoGPR : SchedWriteRes<[GenericIssueMDU]> { let Latency = 5; } -def : ItinRW<[GenericWriteMDUtoGPR], [II_MUL]>; - def GenericWriteDIV : SchedWriteRes<[GenericIssueDIV]> { // Estimated worst case let Latency = 33; @@ -82,63 +168,105 @@ def GenericWriteDIVU : SchedWriteRes<[GenericIssueDIV]> { let ResourceCycles = [31]; } -def : ItinRW<[GenericWriteDIV], [II_DIV]>; +// mul +def : InstRW<[GenericWriteMDUtoGPR], (instrs MUL)>; -def : ItinRW<[GenericWriteDIVU], [II_DIVU]>; +// mult, multu +def : InstRW<[GenericWriteMul], (instrs MULT, MULTu)>; -// MIPS64 -// ====== +// div, sdiv +def : InstRW<[GenericWriteDIV], (instrs PseudoSDIV, SDIV)>; + +def : InstRW<[GenericWriteDIVU], (instrs PseudoUDIV, UDIV)>; + +// mfhi, mflo, movn, mthi, mtlo, rdwhr +def : InstRW<[GenericWriteALULong], (instrs MFHI, MFLO, PseudoMFHI, + PseudoMFLO)>; + +def : InstRW<[GenericWriteALULong], (instrs PseudoMFHI_MM, PseudoMFLO_MM)>; -def : ItinRW<[GenericWriteALU], [II_DADDIU, II_DADDU, II_DADDI, II_DADD, - II_DCLO, II_DCLZ, II_DROTR, II_DROTR32, - II_DROTRV, II_DSBH, II_DSHD, II_DSLL, - II_DSLL32, II_DSLLV, II_DSRA, II_DSRA32, - II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV, - II_DSUBU, II_DSUB]>; +def : InstRW<[GenericWriteMove], (instrs MTHI, MTLO, RDHWR, PseudoMTLOHI)>; +def : InstRW<[GenericWriteMove], (instrs PseudoMTLOHI_MM)>; -def : ItinRW<[GenericWriteDIV], [II_DDIV]>; +def : InstRW<[GenericWriteALU], (instrs MOVN_I_I, MOVZ_I_I)>; -def : ItinRW<[GenericWriteDIVU], [II_DDIVU]>; +// MIPSR6 +// ====== -def : ItinRW<[GenericWriteMDUtoGPR], [II_DMUL]>; +// muh, muhu, mulu, mul +def : InstRW<[GenericWriteMul], (instrs MUH, MUHU, MULU, MUL_R6)>; + +// divu, udiv +def : InstRW<[GenericWriteDIV], (instrs MOD, MODU, DIV, DIVU)>; -def : ItinRW<[GenericWriteHILO], [II_DMULU, II_DMULT, II_DMULTU]>; // MIPS16e // ======= -def : ItinRW<[GenericWriteALU], [IIM16Alu, IIPseudo]>; +def : InstRW<[GenericWriteHILO], (instrs MultRxRy16, MultuRxRy16, + MultRxRyRz16, MultuRxRyRz16)>; + +def : InstRW<[GenericWriteDIV], (instrs DivRxRy16)>; + +def : InstRW<[GenericWriteDIVU], (instrs DivuRxRy16)>; // microMIPS // ========= -def : ItinRW<[GenericWriteALU], [II_MOVE, II_LI, 
II_NOT]>; +def : InstRW<[GenericWriteMul], (instrs MULT_MM, MULTu_MM, MADD_MM, MADDU_MM, + MSUB_MM, MSUBU_MM)>; -// MIPSR6 +def : InstRW<[GenericWriteALULong], (instrs MUL_MM)>; + +def : InstRW<[GenericWriteDIV], (instrs SDIV_MM, SDIV_MM_Pseudo)>; + +def : InstRW<[GenericWriteDIVU], (instrs UDIV_MM, UDIV_MM_Pseudo)>; + +def : InstRW<[GenericWriteMove], (instrs MFHI16_MM, MFLO16_MM, MOVF_I_MM, + MOVT_I_MM, MFHI_MM, MFLO_MM, MTHI_MM, + MTLO_MM)>; + +def : InstRW<[GenericWriteMove], (instrs RDHWR_MM)>; + +// microMIPS32r6 +// ============= + +def : InstRW<[GenericWriteMul], (instrs MUHU_MMR6, MUH_MMR6, MULU_MMR6, + MUL_MMR6)>; + +def : InstRW<[GenericWriteDIV], (instrs MODU_MMR6, MOD_MMR6, DIVU_MMR6, + DIV_MMR6)>; + +def : InstRW<[GenericWriteMove], (instrs RDHWR_MMR6)>; + +// MIPS64 // ====== -def GenericWriteMul : SchedWriteRes<[GenericIssueMDU]> { let Latency = 4; } -def : ItinRW<[GenericWriteMul], [II_MUH, II_MUHU, II_MULU]>; +def : InstRW<[GenericWriteHILO], (instrs DMULU, DMULT, DMULTu, PseudoDMULT, + PseudoDMULTu)>; + +def : InstRW<[GenericWriteDIV], (instrs DSDIV, PseudoDSDIV)>; -def : ItinRW<[GenericWriteDIV], [II_MOD, II_MODU]>; +def : InstRW<[GenericWriteDIVU], (instrs DUDIV, PseudoDUDIV)>; + +def : InstRW<[GenericWriteALULong], (instrs MFHI64, MFLO64, PseudoMFHI64, + PseudoMFLO64, PseudoMTLOHI64)>; + +def : InstRW<[GenericWriteMove], (instrs MTHI64, MTLO64, RDHWR64)>; + +// mov[zn] +def : InstRW<[GenericWriteALU], (instrs MOVN_I_I64, MOVN_I64_I, MOVN_I64_I64, + MOVZ_I_I64, MOVZ_I64_I, MOVZ_I64_I64)>; -def : ItinRW<[GenericWriteALU], [II_ADDIUPC, II_ALIGN, II_ALUIPC, II_AUI, - II_AUIPC, II_BITSWAP, II_LSA, II_SELCCZ]>; // MIPS64R6 // ======== -def : ItinRW<[GenericWriteALU], [II_DALIGN, II_DAHI, II_DATI, II_DAUI, - II_DBITSWAP, II_DLSA]>; - -def : ItinRW<[GenericWriteMDUtoGPR], [II_DMUH, II_DMUHU]>; -def : ItinRW<[GenericWriteDIV], [II_DMOD, II_DMODU]>; +def : InstRW<[GenericWriteMDUtoGPR], (instrs DMUH, DMUHU, DMUL_R6)>; -// clo, clz, di, mfhi, mflo -def : ItinRW<[GenericWriteALULong], [II_MFHI_MFLO]>; -def : ItinRW<[GenericWriteALU], [II_MOVN, II_MOVZ]>; -def : ItinRW<[GenericWriteMove], [II_MTHI_MTLO, II_RDHWR]>; +def : InstRW<[GenericWriteDIV], (instrs DDIV, DMOD)>; +def : InstRW<[GenericWriteDIVU], (instrs DDIVU, DMODU)>; // CTISTD Pipeline // --------------- @@ -155,31 +283,150 @@ def GenericWriteJumpAndLink : SchedWriteRes<[GenericIssueCTISTD]> { // b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal, jalx, // jalr, jr.hb, jr, jalr.hb, jarlc, jialc -def : ItinRW<[GenericWriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J, - II_JR, II_JR_HB, II_ERET, II_ERETNC, - II_DERET]>; +def : InstRW<[GenericWriteJump], (instrs B, BAL, BAL_BR, BEQ, BNE, BGTZ, BGEZ, + BLEZ, BLTZ, BLTZAL, J, JALX, JR, JR_HB, ERET, + ERet, ERETNC, DERET)>; + +def : InstRW<[GenericWriteJump], (instrs BEQL, BNEL, BGEZL, BGTZL, BLEZL, + BLTZL)>; + +def : InstRW<[GenericWriteJump], (instrs TAILCALL, TAILCALLREG, + TAILCALLREGHB, PseudoIndirectBranch, + PseudoIndirectHazardBranch, PseudoReturn, + RetRA)>; + +def : InstRW<[GenericWriteJumpAndLink], (instrs BGEZAL, JAL, JALR, JALR_HB, + JALRHBPseudo, JALRPseudo)>; -def : ItinRW<[GenericWriteJumpAndLink], [II_JAL, II_JALR, II_JALR_HB, - II_BC2CCZ]>; +def : InstRW<[GenericWriteJumpAndLink], (instrs BGEZALL, BLTZALL)>; -def : ItinRW<[GenericWriteJump], [II_JRC, II_JRADDIUSP]>; +def GenericWriteTrap : SchedWriteRes<[GenericIssueCTISTD]>; -def : ItinRW<[GenericWriteJumpAndLink], [II_BCCZALS, II_JALS, II_JALRS]>; +def : InstRW<[GenericWriteTrap], (instrs 
BREAK, SYSCALL, TEQ, TEQI, + TGE, TGEI, TGEIU, TGEU, TNE, + TNEI, TLT, TLTI, TLTU, TTLTIU, + TRAP, SDBBP)>; // MIPSR6 // ====== -def : ItinRW<[GenericWriteJumpAndLink], [II_BALC, II_JALRC, II_JIALC]>; +def : InstRW<[GenericWriteJumpAndLink], (instrs BALC, BEQZALC, BGEZALC, + BGTZALC, BLEZALC, BLTZALC, + BNEZALC, + JIALC)>; -def : ItinRW<[GenericWriteJump], [II_JIC, II_BC, II_BCCC, II_BCCZC]>; +def : InstRW<[GenericWriteJump], (instrs BC, BC2EQZ, BC2NEZ, BEQC, BEQZC, BGEC, + BGEUC, BGEZC, BGTZC, BLEZC, BLTC, BLTUC, + BLTZC, BNEC, BNEZC, BNVC, BOVC, JIC, JR_HB_R6, + SIGRIE, PseudoIndirectBranchR6, + PseudoIndrectHazardBranchR6)>; +def : InstRW<[GenericWriteJump], (instrs TAILCALLR6REG, TAILCALLHBR6REG)>; -def GenericWriteTrap : SchedWriteRes<[GenericIssueCTISTD]>; +def : InstRW<[GenericWriteTrap], (instrs SDBBP_R6)>; + +// MIPS16e +// ======= + +def : InstRW<[GenericWriteJump], (instrs Bimm16, BimmX16, BeqzRxImm16, + BeqzRxImmX16, BnezRxImm16, BnezRxImmX16, + Bteqz16, BteqzX16, BteqzT8CmpX16, + BteqzT8CmpiX16, BteqzT8SltX16, + BteqzT8SltuX16, BteqzT8SltiX16, + BteqzT8SltiuX16, Btnez16, BtnezX16, + BtnezT8CmpX16, BtnezT8CmpiX16, + BtnezT8SltX16, BtnezT8SltuX16, + BtnezT8SltiX16, BtnezT8SltiuX16, JrRa16, + JrcRa16, JrcRx16, RetRA16)>; + +def : InstRW<[GenericWriteJumpAndLink], (instrs Jal16, JalB16, JumpLinkReg16)>; + +def : InstRW<[GenericWriteTrap], (instrs Break16)>; + +def : InstRW<[GenericWriteALULong], (instrs SelBeqZ, SelTBteqZCmp, + SelTBteqZCmpi, SelTBteqZSlt, + SelTBteqZSlti, SelTBteqZSltu, + SelTBteqZSltiu, SelBneZ, SelTBtneZCmp, + SelTBtneZCmpi, SelTBtneZSlt, + SelTBtneZSlti, SelTBtneZSltu, + SelTBtneZSltiu)>; + +// microMIPS +// ========= + +def : InstRW<[GenericWriteJump], (instrs B16_MM, BAL_BR_MM, BC1F_MM, BC1T_MM, + BEQZ16_MM, BEQZC_MM, BEQ_MM, BGEZ_MM, + BGTZ_MM, BLEZ_MM, BLTZ_MM, BNEZ16_MM, + BNEZC_MM, BNE_MM, B_MM, DERET_MM, ERET_MM, + JR16_MM, JR_MM, J_MM, B_MM_Pseudo)>; + +def : InstRW<[GenericWriteJumpAndLink], (instrs BGEZALS_MM, BGEZAL_MM, + BLTZALS_MM, BLTZAL_MM, JALR16_MM, + JALRS16_MM, JALRS_MM, JALR_MM, + JALS_MM, JALX_MM, JAL_MM)>; + +def : InstRW<[GenericWriteJump], (instrs TAILCALLREG_MM, TAILCALL_MM, + PseudoIndirectBranch_MM)>; + +def : InstRW<[GenericWriteTrap], (instrs BREAK16_MM, BREAK_MM, SDBBP16_MM, + SDBBP_MM, SYSCALL_MM, TEQI_MM, TEQ_MM, + TGEIU_MM, TGEI_MM, TGEU_MM, TGE_MM, TLTIU_MM, + TLTI_MM, TLTU_MM, TLT_MM, TNEI_MM, TNE_MM, + TRAP_MM)>; + +// microMIPS32r6 +// ============= -def : ItinRW<[GenericWriteTrap], [II_BREAK, II_SYSCALL, II_TEQ, II_TEQI, - II_TGE, II_TGEI, II_TGEIU, II_TGEU, II_TNE, - II_TNEI, II_TLT, II_TLTI, II_TLTU, II_TTLTIU, - II_TRAP, II_SDBBP, II_SIGRIE]>; +def : InstRW<[GenericWriteJump], (instrs BC16_MMR6, BC1EQZC_MMR6, BC1NEZC_MMR6, + BC2EQZC_MMR6, BC2NEZC_MMR6, BC_MMR6, + BEQC_MMR6, BEQZC16_MMR6, BEQZC_MMR6, + BGEC_MMR6, BGEUC_MMR6, BGEZC_MMR6, + BGTZC_MMR6, BLEZC_MMR6, BLTC_MMR6, + BLTUC_MMR6, BLTZC_MMR6, BNEC_MMR6, + BNEZC16_MMR6, BNEZC_MMR6, BNVC_MMR6, + BOVC_MMR6, DERET_MMR6, ERETNC_MMR6, JAL_MMR6, + ERET_MMR6, JIC_MMR6, JRADDIUSP, JRC16_MM, + JRC16_MMR6, JRCADDIUSP_MMR6, SIGRIE_MMR6, + B_MMR6_Pseudo, PseudoIndirectBranch_MMR6)>; + +def : InstRW<[GenericWriteJumpAndLink], (instrs BALC_MMR6, BEQZALC_MMR6, + BGEZALC_MMR6, BGTZALC_MMR6, + BLEZALC_MMR6, BLTZALC_MMR6, + BNEZALC_MMR6, JALRC16_MMR6, + JALRC_HB_MMR6, JALRC_MMR6, + JIALC_MMR6)>; + +def : InstRW<[GenericWriteJump], (instrs TAILCALLREG_MMR6, TAILCALL_MMR6)>; + +def : InstRW<[GenericWriteTrap], (instrs BREAK16_MMR6, BREAK_MMR6, SDBBP_MMR6, + SDBBP16_MMR6)>; + +// MIPS64 
+// ====== + +def : InstRW<[GenericWriteJump], (instrs BEQ64, BGEZ64, BGTZ64, BLEZ64, + BLTZ64, BNE64, JR64)>; + +def : InstRW<[GenericWriteJumpAndLink], (instrs JALR64, JALR64Pseudo, + JALRHB64Pseudo, JALR_HB64)>; + +def : InstRW<[GenericWriteJump], (instrs JR_HB64, TAILCALLREG64, + TAILCALLREGHB64, PseudoReturn64)>; + +// MIPS64R6 +// ======== + +def : InstRW<[GenericWriteJump], (instrs BEQC64, BEQZC64, BGEC64, BGEUC64, + BGEZC64, BGTZC64, BLEZC64, BLTC64, BLTUC64, + BLTZC64, BNEC64, BNEZC64, JIC64, + PseudoIndirectBranch64, + PseudoIndirectHazardBranch64)>; + +def : InstRW<[GenericWriteJumpAndLink], (instrs JIALC64)>; + +def : InstRW<[GenericWriteJump], (instrs JR_HB64_R6, TAILCALL64R6REG, + TAILCALLHB64R6REG, PseudoIndirectBranch64R6, + PseudoIndrectHazardBranch64R6)>; // COP0 Pipeline // ============= @@ -196,35 +443,100 @@ def GenericReadWriteCOP0Long : SchedWriteRes<[GenericIssueCOP0]> { } def GenericWriteCOP0Short : SchedWriteRes<[GenericIssueCOP0]>; -def : ItinRW<[GenericWriteCOP0TLB], [II_TLBP, II_TLBR, II_TLBWI, II_TLBWR]>; -def : ItinRW<[GenericWriteCOP0TLB], [II_TLBINV, II_TLBINVF]>; +def : InstRW<[GenericWriteCOP0TLB], (instrs TLBP, TLBR, TLBWI, TLBWR)>; +def : InstRW<[GenericWriteCOP0TLB], (instrs TLBINV, TLBINVF)>; -def : ItinRW<[GenericReadCOP0], [II_MFC0]>; -def : ItinRW<[GenericWriteCOP0], [II_MTC0]>; +def : InstRW<[GenericReadCOP0], (instrs MFC0)>; +def : InstRW<[GenericWriteCOP0], (instrs MTC0)>; -def : ItinRW<[GenericWriteCOP0], [II_EVP, II_DVP]>; +def : InstRW<[GenericWriteCOP0], (instrs EVP, DVP)>; -// MIPSR5 -// ====== -def : ItinRW<[GenericReadCOP0], [II_MFHC0]>; -def : ItinRW<[GenericWriteCOP0], [II_MTHC0]>; +def : InstRW<[GenericWriteCOP0], (instrs DI, EI)>; + +def : InstRW<[GenericWriteCOP0], (instrs EHB, PAUSE, WAIT)>; + +// microMIPS +// ========= + +def : InstRW<[GenericWriteCOP0TLB], (instrs TLBP_MM, TLBR_MM, TLBWI_MM, + TLBWR_MM)>; + +def : InstRW<[GenericWriteCOP0], (instrs DI_MM, EI_MM)>; + +def : InstRW<[GenericWriteCOP0], (instrs EHB_MM, PAUSE_MM, WAIT_MM)>; + + +// microMIPS32R6 +// ============= + +def : InstRW<[GenericWriteCOP0], (instrs RDPGPR_MMR6, WRPGPR_MMR6)>; + +def : InstRW<[GenericWriteCOP0TLB], (instrs TLBINV_MMR6, TLBINVF_MMR6)>; + +def : InstRW<[GenericReadCOP0], (instrs MFHC0_MMR6, MFC0_MMR6, MFHC2_MMR6, + MFC2_MMR6)>; + +def : InstRW<[GenericWriteCOP0], (instrs MTHC0_MMR6, MTC0_MMR6, MTHC2_MMR6, + MTC2_MMR6)>; + +def : InstRW<[GenericWriteCOP0], (instrs EVP_MMR6, DVP_MMR6)>; + +def : InstRW<[GenericWriteCOP0], (instrs DI_MMR6, EI_MMR6)>; + +def : InstRW<[GenericWriteCOP0], (instrs EHB_MMR6, PAUSE_MMR6, WAIT_MMR6)>; // MIPS64 // ====== -def : ItinRW<[GenericReadCOP0], [II_DMFC0]>; -def : ItinRW<[GenericWriteCOP0], [II_DMTC0]>; +def : InstRW<[GenericReadCOP0], (instrs DMFC0)>; -def : ItinRW<[GenericWriteCOP0], [II_RDPGPR, II_WRPGPR]>; +def : InstRW<[GenericWriteCOP0], (instrs DMTC0)>; -def : ItinRW<[GenericWriteCOP0], [II_DI, II_EI]>; - -def : ItinRW<[GenericWriteCOP0], [II_EHB, II_PAUSE, II_WAIT]>; def GenericCOP2 : ProcResource<1> { let BufferSize = 1; } def GenericWriteCOPOther : SchedWriteRes<[GenericCOP2]>; -def : ItinRW<[GenericWriteCOPOther], [II_MFC2, II_MTC2, II_DMFC2, II_DMTC2]>; +def : InstRW<[GenericWriteCOPOther], (instrs MFC2, MTC2)>; + +def : InstRW<[GenericWriteCOPOther], (instrs DMFC2, DMTC2)>; + +// microMIPS32R6 +// ============= + +// The latency and repeat rate of these instructions are implementation +// dependant. 
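// [Editor's aside -- illustrative sketch, not part of the imported patch.]
// When a concrete implementation does document these latencies, the usual
// TableGen idiom is a dedicated SchedWriteRes over the COP2 resource instead
// of reusing the two-cycle GenericWriteMove, for example (the latency value
// here is hypothetical):
//
//   def GenericWriteCOP2Move : SchedWriteRes<[GenericCOP2]> { let Latency = 3; }
//   def : InstRW<[GenericWriteCOP2Move], (instrs CFC2_MM, CTC2_MM)>;
//
// The sketch is kept in a comment because mapping CFC2_MM/CTC2_MM a second
// time would trip the model's FullInstRWOverlapCheck; until such numbers
// exist, GenericWriteMove below stands in for them.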
+def : InstRW<[GenericWriteMove], (instrs CFC2_MM, CTC2_MM)>; + + +// MIPS MT ASE - hasMT +// ==================== + +def : InstRW<[GenericWriteMove], (instrs DMT, DVPE, EMT, EVPE, MFTR, + MTTR)>; + +def : InstRW<[GenericReadWriteCOP0Long], (instrs YIELD)>; + +def : InstRW<[GenericWriteCOP0Short], (instrs FORK)>; + +// MIPS Virtualization ASE +// ======================= + +def : InstRW<[GenericWriteCOP0Short], (instrs HYPCALL, TLBGINV, TLBGINVF, TLBGP, + TLBGR, TLBGWI, TLBGWR, MFGC0, MFHGC0, + MTGC0, MTHGC0)>; + +// MIPS64 Virtualization ASE +// ========================= + +def : InstRW<[GenericWriteCOP0Short], (instrs DMFGC0, DMTGC0)>; + +// microMIPS virtualization ASE +// ============================ + +def : InstRW<[GenericWriteCOP0Short], (instrs HYPCALL_MM, TLBGINVF_MM, + TLBGINV_MM, TLBGP_MM, TLBGR_MM, + TLBGWI_MM, TLBGWR_MM, MFGC0_MM, + MFHGC0_MM, MTGC0_MM, MTHGC0_MM)>; // LDST Pipeline // ------------- @@ -250,97 +562,168 @@ def GenericWriteLoadToOtherUnits : SchedWriteRes<[GenericIssueLDST]> { } // l[bhw], l[bh]u, ll -def : ItinRW<[GenericWriteLoad], [II_LB, II_LBU, II_LH, II_LHU, II_LW, II_LL, - II_LWC2, II_LWC3, II_LDC2, II_LDC3]>; +def : InstRW<[GenericWriteLoad], (instrs LB, LBu, LH, LHu, LW, LL, + LWC2, LWC3, LDC2, LDC3)>; // lw[lr] -def : ItinRW<[GenericWriteLoad], [II_LWL, II_LWR]>; +def : InstRW<[GenericWriteLoad], (instrs LWL, LWR)>; -// MIPS64 loads -def : ItinRW<[GenericWriteLoad], [II_LD, II_LLD, II_LWU]>; +// s[bhw], sc, s[dw]c[23] +def : InstRW<[GenericWriteStore], (instrs SB, SH, SW, SWC2, SWC3, + SDC2, SDC3)>; -// ld[lr] -def : ItinRW<[GenericWriteLoad], [II_LDL, II_LDR]>; +// PreMIPSR6 sw[lr] +def : InstRW<[GenericWriteStore], (instrs SWL, SWR)>; -// MIPS32 EVA -def : ItinRW<[GenericWriteLoad], [II_LBE, II_LBUE, II_LHE, II_LHUE, II_LWE, - II_LLE]>; +def : InstRW<[GenericWriteStoreSC], (instrs SC, SC_MMR6)>; -def : ItinRW<[GenericWriteLoad], [II_LWLE, II_LWRE]>; +// pref +def : InstRW<[GenericWritePref], (instrs PREF)>; +// cache +def : InstRW<[GenericWriteCache], (instrs CACHE)>; -// MIPS MT instructions -// ==================== +// sync +def : InstRW<[GenericWriteSync], (instrs SYNC, SYNCI)>; -def : ItinRW<[GenericWriteMove], [II_DMT, II_DVPE, II_EMT, II_EVPE, II_MFTR, - II_MTTR]>; +// MIPSR6 +// ====== -def : ItinRW<[GenericReadWriteCOP0Long], [II_YIELD]>; +def : InstRW<[GenericWriteLoad], (instrs LDC2_R6, LL_R6, LWC2_R6, LWPC)>; -def : ItinRW<[GenericWriteCOP0Short], [II_FORK]>; +def : InstRW<[GenericWriteStore], (instrs SWC2_R6, SDC2_R6)>; -// MIPS32R6 and MIPS16e -// ==================== +def : InstRW<[GenericWriteStoreSC], (instrs SC_R6)>; -def : ItinRW<[GenericWriteLoad], [II_LWPC]>; +def : InstRW<[GenericWritePref], (instrs PREF_R6)>; -// MIPS64R6 -// ==================== +def : InstRW<[GenericWriteCache], (instrs CACHE_R6)>; + +def : InstRW<[GenericWriteSync], (instrs GINVI, GINVT)>; -def : ItinRW<[GenericWriteLoad], [II_LWUPC, II_LDPC]>; +// MIPS32 EVA +// ========== +def : InstRW<[GenericWriteLoad], (instrs LBE, LBuE, LHE, LHuE, LWE, + LLE)>; -// s[bhw], sc, s[dw]c[23] -def : ItinRW<[GenericWriteStore], [II_SB, II_SH, II_SW, II_SWC2, II_SWC3, - II_SDC2, II_SDC3]>; +def : InstRW<[GenericWriteStore], (instrs SBE, SHE, SWE, SCE)>; -def : ItinRW<[GenericWriteStoreSC], [II_SC]>; +def : InstRW<[GenericWriteLoad], (instrs LWLE, LWRE)>; -// PreMIPSR6 sw[lr] -def : ItinRW<[GenericWriteStore], [II_SWL, II_SWR]>; +def : InstRW<[GenericWriteStore], (instrs SWLE, SWRE)>; -// EVA ASE stores -def : ItinRW<[GenericWriteStore], [II_SBE, II_SHE, II_SWE, II_SCE]>; +def : 
InstRW<[GenericWritePref], (instrs PREFE)>; -def : ItinRW<[GenericWriteStore], [II_SWLE, II_SWRE]>; +def : InstRW<[GenericWriteCache], (instrs CACHEE)>; -// MIPS64 -// ====== +// microMIPS EVA ASE - InMicroMipsMode, hasEVA +// =========================================== -def : ItinRW<[GenericWriteStore], [II_SD, II_SCD]>; +def : InstRW<[GenericWriteLoad], (instrs LBE_MM, LBuE_MM, LHE_MM, LHuE_MM, + LWE_MM, LWLE_MM, LWRE_MM, LLE_MM)>; -// PreMIPSR6 stores -// ================ +def : InstRW<[GenericWriteStore], (instrs SBE_MM, SB_MM, SHE_MM, SWE_MM, + SWLE_MM, SWRE_MM, SCE_MM)>; + +def : InstRW<[GenericWritePref], (instrs PREFE_MM)>; +def : InstRW<[GenericWriteCache], (instrs CACHEE_MM)>; -def : ItinRW<[GenericWriteStore], [II_SDL, II_SDR]>; // MIPS16e // ======= -def : ItinRW<[GenericWriteLoad], [II_RESTORE]>; +def : InstRW<[GenericWriteLoad], (instrs Restore16, RestoreX16, + LbRxRyOffMemX16, + LbuRxRyOffMemX16, LhRxRyOffMemX16, + LhuRxRyOffMemX16, LwRxRyOffMemX16, + LwRxSpImmX16, LwRxPcTcp16, LwRxPcTcpX16)>; -def : ItinRW<[GenericWriteStore], [II_SAVE]>; +def : InstRW<[GenericWriteStore], (instrs Save16, SaveX16, SbRxRyOffMemX16, + ShRxRyOffMemX16, SwRxRyOffMemX16, + SwRxSpImmX16)>; // microMIPS // ========= -def : ItinRW<[GenericWriteLoad], [II_LWM, II_LWP, II_LWXS]>; +def : InstRW<[GenericWriteLoad], (instrs LBU16_MM, LB_MM, LBu_MM, LHU16_MM, + LH_MM, LHu_MM, LL_MM, LW16_MM, LWGP_MM, + LWL_MM, LWM16_MM, LWM32_MM, LWP_MM, LWR_MM, + LWSP_MM, LWU_MM, LWXS_MM, LW_MM)>; -def : ItinRW<[GenericWriteStore], [II_SWM, II_SWP]>; +def : InstRW<[GenericWriteStore], (instrs SB16_MM, SC_MM, SH16_MM, SH_MM, + SW16_MM, SWL_MM, SWM16_MM, SWM32_MM, SWM_MM, + SWP_MM, SWR_MM, SWSP_MM, SW_MM)>; -// pref -def : ItinRW<[GenericWritePref], [II_PREF]>; -def : ItinRW<[GenericWritePref], [II_PREFE]>; +def : InstRW<[GenericWritePref], (instrs PREF_MM, PREFX_MM)>; -// cache -def : ItinRW<[GenericWriteCache], [II_CACHE]>; +def : InstRW<[GenericWriteCache], (instrs CACHE_MM)>; -def : ItinRW<[GenericWriteCache], [II_CACHEE]>; +def : InstRW<[GenericWriteSync], (instrs SYNC_MM, SYNCI_MM)>; +def : InstRW<[GenericWriteSync], (instrs GINVI_MMR6, GINVT_MMR6)>; -// sync -def : ItinRW<[GenericWriteSync], [II_SYNC]>; +// microMIPS32r6 +// ============= + +def : InstRW<[GenericWriteLoad], (instrs LBU_MMR6, LB_MMR6, LDC2_MMR6, LL_MMR6, + LWM16_MMR6, LWC2_MMR6, LWPC_MMR6, LW_MMR6)>; + +def : InstRW<[GenericWriteStore], (instrs SB16_MMR6, SB_MMR6, SDC2_MMR6, + SH16_MMR6, SH_MMR6, SW16_MMR6, SWC2_MMR6, + SWM16_MMR6, SWSP_MMR6, SW_MMR6)>; + +def : InstRW<[GenericWriteSync], (instrs SYNC_MMR6, SYNCI_MMR6)>; + +def : InstRW<[GenericWritePref], (instrs PREF_MMR6)>; -def : ItinRW<[GenericWriteSync], [II_SYNCI]>; +def : InstRW<[GenericWriteCache], (instrs CACHE_MMR6)>; + +// MIPS64 +// ====== + +def : InstRW<[GenericWriteLoad], (instrs LD, LL64, LLD, LWu, LB64, LBu64, + LH64, LHu64, LW64)>; + +// l[dw][lr] +def : InstRW<[GenericWriteLoad], (instrs LWL64, LWR64, LDL, LDR)>; + +def : InstRW<[GenericWriteStore], (instrs SD, SC64, SCD, SB64, SH64, SW64, + SWL64, SWR64)>; + +def : InstRW<[GenericWriteStore], (instrs SDL, SDR)>; + +// MIPS64R6 +// ======== + +def : InstRW<[GenericWriteLoad], (instrs LWUPC, LDPC)>; + +def : InstRW<[GenericWriteLoad], (instrs LLD_R6, LL64_R6)>; + +def : InstRW<[GenericWriteStoreSC], (instrs SC64_R6, SCD_R6)>; + +// MIPSR6 CRC ASE - hasCRC +// ======================= + +def : InstRW<[GenericWriteALU], (instrs CRC32B, CRC32H, CRC32W, CRC32CB, + CRC32CH, CRC32CW)>; + +// MIPS64R6 CRC ASE - hasCRC +// 
------------------------- + +def : InstRW<[GenericWriteALU], (instrs CRC32D, CRC32CD)>; + + +// Cavium Networks MIPS (cnMIPS) - Octeon, HasCnMips +// ================================================= + +def : InstRW<[GenericWriteALU], (instrs BADDu, BBIT0, BBIT032, BBIT1, BBIT132, + CINS, CINS32, CINS64_32, CINS_i32, + DMFC2_OCTEON, DMTC2_OCTEON, DPOP, EXTS, + EXTS32, MTM0, MTM1, MTM2, MTP0, MTP1, MTP2, + POP, SEQ, SEQi, SNE, SNEi, V3MULU, VMM0, + VMULU)>; + +def : InstRW<[GenericWriteMDUtoGPR], (instrs DMUL)>; // FPU Pipelines // ============= @@ -408,10 +791,10 @@ def GenericWriteFPUSqrtD : SchedWriteRes<[GenericFPUDivSqrt]> { // --------------------------------- // // c..[ds], bc1[tf], bc1[tf]l -def : ItinRW<[GenericWriteFPUCmp], [II_C_CC_D, II_C_CC_S, II_BC1F, II_BC1T, - II_BC1FL, II_BC1TL]>; +def : InstRW<[GenericWriteFPUCmp], (instrs FCMP_D32, FCMP_D64, FCMP_S32, BC1F, + BC1T, BC1FL, BC1TL)>; -def : ItinRW<[GenericWriteFPUCmp], [II_CMP_CC_D, II_CMP_CC_S]>; +def : InstRW<[GenericWriteFPUCmp], (instregex "C_[A-Z]+_(S|D32|D64)$")>; // Short Pipe // ---------- @@ -419,21 +802,10 @@ def : ItinRW<[GenericWriteFPUCmp], [II_CMP_CC_D, II_CMP_CC_S]>; // abs.[ds], abs.ps, add.[ds], neg.[ds], neg.ps, madd.s, msub.s, nmadd,s // nmsub.s, sub.[ds], mul.s -def : ItinRW<[GenericWriteFPUS], [II_ABS, II_ADD_D, II_ADD_S, II_MADD_S, - II_MSUB_S, II_MUL_S, II_NEG, II_NMADD_S, - II_NMSUB_S, II_SUB_S, II_SUB_D]>; -// mov[tf].[ds] - -def : ItinRW<[GenericWriteFPUS], [II_MOVF_S, II_MOVF_D, II_MOVT_S, II_MOVT_D]>; - -// MIPSR6 -// ------ -// -// sel(eq|ne).[ds], max.[ds], maxa.[ds], min.[ds], mina.[ds], class.[ds] -def : ItinRW<[GenericWriteFPUS], [II_SELCCZ_S, II_SELCCZ_D, II_MAX_S, - II_MAX_D, II_MAXA_S, II_MAXA_D, II_MIN_S, - II_MIN_D, II_MINA_S, II_MINA_D, II_CLASS_S, - II_CLASS_D]>; +def : InstRW<[GenericWriteFPUS], (instrs FABS_S, FABS_D32, FABS_D64, FADD_D32, + FADD_D64, FADD_S, MADD_S, MSUB_S, FMUL_S, + FNEG_S, FNEG_D32, FNEG_D64, NMADD_S, NMSUB_S, + FSUB_S, FSUB_D32, FSUB_D64)>; // Long Pipe // ---------- @@ -445,71 +817,211 @@ def : ItinRW<[GenericWriteFPUS], [II_SELCCZ_S, II_SELCCZ_D, II_MAX_S, // madd.d, msub.dm mul.d, mul.ps, nmadd.d, nmsub.d, ceil.[wl].[sd], cvt.d.[sw], // cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps, round.[lw].[ds], floor.[lw].ds, // trunc.w.[ds], trunc.w.ps, -def : ItinRW<[GenericWriteFPUL], [II_MADD_D, II_MSUB_D, II_MUL_D, II_NMADD_D, - II_NMSUB_D, II_CEIL, II_CVT, - II_FLOOR, II_ROUND, II_TRUNC]>; +def : InstRW<[GenericWriteFPUL], (instrs CEIL_L_D64, CEIL_L_S, CEIL_W_D32, + CEIL_W_D64, CEIL_W_S, CVT_D32_S, CVT_D32_W, + CVT_D64_L, CVT_D64_S, CVT_D64_W, CVT_L_D64, + CVT_L_S, CVT_S_D32, CVT_S_D64, CVT_S_L, + CVT_S_W, CVT_W_D32, CVT_W_D64, CVT_W_S, + CVT_PS_S64, CVT_S_PL64, CVT_S_PU64, + FLOOR_L_D64, FLOOR_L_S, FLOOR_W_D32, + FLOOR_W_D64, FLOOR_W_S, FMUL_D32, FMUL_D64, + MADD_D32, MADD_D64, MSUB_D32, MSUB_D64, + NMADD_D32, NMADD_D64, NMSUB_D32, NMSUB_D64, + PLL_PS64, PLU_PS64, + ROUND_L_D64, ROUND_L_S, ROUND_W_D32, + ROUND_W_D64, ROUND_W_S, TRUNC_L_D64, + TRUNC_L_S, TRUNC_W_D32, TRUNC_W_D64, + TRUNC_W_S, PseudoTRUNC_W_D, + PseudoTRUNC_W_D32, PseudoTRUNC_W_S)>; + +// Pseudo convert instruction +def : InstRW<[GenericWriteFPUL], (instrs PseudoCVT_D32_W, PseudoCVT_D64_L, + PseudoCVT_D64_W, PseudoCVT_S_L, + PseudoCVT_S_W)>; // div.[ds], div.ps -def : ItinRW<[GenericWriteFPUDivS], [II_DIV_S]>; -def : ItinRW<[GenericWriteFPUDivD], [II_DIV_D]>; +def : InstRW<[GenericWriteFPUDivS], (instrs FDIV_S)>; +def : InstRW<[GenericWriteFPUDivD], (instrs FDIV_D32, FDIV_D64)>; // sqrt.[ds], sqrt.ps -def : 
ItinRW<[GenericWriteFPUSqrtS], [II_SQRT_S]>; -def : ItinRW<[GenericWriteFPUSqrtD], [II_SQRT_D]>; +def : InstRW<[GenericWriteFPUSqrtS], (instrs FSQRT_S)>; +def : InstRW<[GenericWriteFPUSqrtD], (instrs FSQRT_D32, FSQRT_D64)>; // rsqrt.[ds], recip.[ds] -def : ItinRW<[GenericWriteFPURcpS], [II_RECIP_S, II_RSQRT_S]>; -def : ItinRW<[GenericWriteFPURcpD], [II_RECIP_D, II_RSQRT_D]>; +def : InstRW<[GenericWriteFPURcpS], (instrs RECIP_S, RSQRT_S)>; +def : InstRW<[GenericWriteFPURcpD], (instrs RECIP_D32, RECIP_D64, + RSQRT_D32, RSQRT_D64)>; -// MIPSR6 -// ====== -// -// rint.[ds] -def : ItinRW<[GenericWriteFPUL], [II_RINT_S, II_RINT_D]>; // Load Pipe // --------- // ctc1, mtc1, mthc1, cfc1, mfc1, mfhc1 -def : ItinRW<[GenericWriteFPUMoveGPRFPU], [II_CFC1, II_CTC1, II_MFC1, II_MFHC1, - II_MTC1, II_MTHC1]>; +def : InstRW<[GenericWriteFPUMoveGPRFPU], (instrs BuildPairF64, + BuildPairF64_64, ExtractElementF64, + ExtractElementF64_64, CFC1, CTC1, + MFC1, MFC1_D64, MFHC1_D32, + MFHC1_D64, MTC1, MTC1_D64, + MTHC1_D32, MTHC1_D64)>; // swc1, swxc1 -def : ItinRW<[GenericWriteFPUStore], [II_SDC1, II_SDXC1, II_SUXC1, II_SWC1, - II_SWXC1]>; +def : InstRW<[GenericWriteFPUStore], (instrs SDC1, SDC164, SDXC1, SDXC164, + SUXC1, SUXC164, SWC1, SWXC1)>; + +def : InstRW<[GenericWriteFPUMoveFP], (instrs FMOV_D32, FMOV_D64, FMOV_S)>; + // movn.[ds], movz.[ds] -def : ItinRW<[GenericWriteFPUMoveFP], [II_MOV_D, II_MOV_S, II_MOVF, II_MOVT, - II_MOVN_D, II_MOVN_S, II_MOVZ_D, - II_MOVZ_S]>; +def : InstRW<[GenericWriteFPUMoveFP], (instrs MOVF_I, MOVF_D32, MOVF_D64, + MOVF_S, MOVT_I, MOVT_D32, MOVT_D64, + MOVT_S, MOVN_I_D32, MOVN_I_D64, + MOVN_I_S, MOVZ_I_D32, MOVZ_I_D64, + MOVZ_I_S)>; + +def : InstRW<[GenericWriteFPUMoveFP], (instrs MOVT_I64, MOVF_I64, MOVZ_I64_S, + MOVN_I64_D64, MOVN_I64_S, + MOVZ_I64_D64)>; // l[dw]x?c1 -def : ItinRW<[GenericWriteFPULoad], [II_LDC1, II_LDXC1, II_LUXC1, II_LWC1, - II_LWXC1]>; +def : InstRW<[GenericWriteFPULoad], (instrs LDC1, LDC164, LDXC1, LDXC164, + LUXC1, LUXC164, LWC1, LWXC1)>; -// MIPS64 +// MIPSR6 // ====== -def : ItinRW<[GenericWriteFPUMoveGPRFPU], [II_DMFC1, II_DMTC1]>; +// sel(eq|ne).[ds], max.[ds], maxa.[ds], min.[ds], mina.[ds], class.[ds] +def : InstRW<[GenericWriteFPUS], (instrs SELEQZ_S, SELNEZ_S, SELEQZ_D, SELNEZ_D, + MAX_S, MAX_D, MAXA_S, MAXA_D, MIN_S, MIN_D, + MINA_S, MINA_D, CLASS_S, CLASS_D)>; -// MIPSR6 -// ====== +def : InstRW<[GenericWriteFPUL], (instrs RINT_S, RINT_D)>; -def : ItinRW<[GenericWriteFPUS], [II_MADDF_S, II_MSUBF_S]>; +def : InstRW<[GenericWriteFPUCmp], (instrs BC1EQZ, BC1NEZ, SEL_D, SEL_S)>; -def : ItinRW<[GenericWriteFPUS], [II_MADDF_D, II_MSUBF_D]>; +def : InstRW<[GenericWriteFPUS], (instrs MADDF_S, MSUBF_S, MADDF_D, MSUBF_D)>; -def : ItinRW<[GenericWriteFPUCmp], [II_BC1CCZ, II_SEL_D, II_SEL_S]>; -// Cavium Networks MIPS (cnMIPS) - Octeon, HasCnMips -// ================================================= +// microMIPS +// ========= + +def : InstRW<[GenericWriteFPUMoveFP], (instrs MOVF_D32_MM, MOVF_S_MM, + MOVN_I_D32_MM, MOVN_I_S_MM, + MOVT_D32_MM, MOVT_S_MM, MOVZ_I_D32_MM, + MOVZ_I_S_MM)>; + + +// cvt.?.?, ceil.?, floor.?, round.?, trunc.? (n)madd.? (n)msub.? 
+def : InstRW<[GenericWriteFPUL], (instrs CVT_D32_S_MM, CVT_D32_W_MM, + CVT_D64_S_MM, CVT_D64_W_MM, CVT_L_D64_MM, + CVT_L_S_MM, CVT_S_D32_MM, CVT_S_D64_MM, + CVT_S_W_MM, CVT_W_D32_MM, CVT_W_D64_MM, + CVT_W_S_MM, CEIL_W_MM, CEIL_W_S_MM, + FLOOR_W_MM, FLOOR_W_S_MM, NMADD_S_MM, + NMADD_D32_MM, NMSUB_S_MM, NMSUB_D32_MM, + MADD_S_MM, MADD_D32_MM, ROUND_W_MM, + ROUND_W_S_MM, TRUNC_W_MM, TRUNC_W_S_MM)>; + +def : InstRW<[GenericWriteFPUCmp], (instregex "^C_[A-Z]_(S|D32|D64)_MM$")>; +def : InstRW<[GenericWriteFPUCmp], (instregex "^C_[A-Z][A-Z]_(S|D32|D64)_MM$")>; +def : InstRW<[GenericWriteFPUCmp], (instregex "^C_[A-Z][A-Z][A-Z]_(S|D32|D64)_MM$")>; +def : InstRW<[GenericWriteFPUCmp], (instregex "^C_NGLE_(S|D32|D64)_MM$")>; +def : InstRW<[GenericWriteFPUCmp], (instrs FCMP_S32_MM, FCMP_D32_MM)>; + +def : InstRW<[GenericWriteFPUS], (instrs MFC1_MM, MFHC1_D32_MM, MFHC1_D64_MM, + MTC1_MM, MTC1_D64_MM, + MTHC1_D32_MM, MTHC1_D64_MM)>; + +def : InstRW<[GenericWriteFPUS], (instrs FABS_D32_MM, FABS_D64_MM, FABS_S_MM, + FNEG_D32_MM, FNEG_D64_MM, FNEG_S_MM, + FADD_D32_MM, FADD_D64_MM, FADD_S_MM, + FMOV_D32_MM, FMOV_D64_MM, FMOV_S_MM, + FMUL_D32_MM, FMUL_D64_MM, FMUL_S_MM, + FSUB_D32_MM, FSUB_D64_MM, FSUB_S_MM, + MSUB_S_MM, MSUB_D32_MM)>; + +def : InstRW<[GenericWriteFPUDivS], (instrs FDIV_S_MM)>; +def : InstRW<[GenericWriteFPUDivD], (instrs FDIV_D32_MM, FDIV_D64_MM)>; + +def : InstRW<[GenericWriteFPUSqrtS], (instrs FSQRT_S_MM)>; +def : InstRW<[GenericWriteFPUSqrtD], (instrs FSQRT_D32_MM, FSQRT_D64_MM)>; + +def : InstRW<[GenericWriteFPURcpS], (instrs RECIP_S_MM, RSQRT_S_MM)>; +def : InstRW<[GenericWriteFPURcpD], (instrs RECIP_D32_MM, RECIP_D64_MM, + RSQRT_D32_MM, RSQRT_D64_MM)>; + +def : InstRW<[GenericWriteFPUStore], (instrs SDC1_MM, SWC1_MM, SUXC1_MM, + SWXC1_MM)>; + +def : InstRW<[GenericWriteFPUMoveGPRFPU], (instrs CFC1_MM, CTC1_MM)>; + +def : InstRW<[GenericWriteFPULoad], (instrs LDC1_MM, LUXC1_MM, LWC1_MM, + LWXC1_MM)>; + +// microMIPS32r6 +// ============= + +def : InstRW<[GenericWriteFPUS], (instrs FNEG_S_MMR6)>; + +def : InstRW<[GenericWriteFPUCmp], (instregex "CMP_[A-Z][A-Z]_(S|D)_MMR6")>; +def : InstRW<[GenericWriteFPUCmp], + (instregex "CMP_[A-Z][A-Z][A-Z]_(S|D)_MMR6")>; +def : InstRW<[GenericWriteFPUCmp], + (instregex "CMP_[A-Z][A-Z][A-Z][A-Z]_(S|D)_MMR6")>; + +def : InstRW<[GenericWriteFPUL], + (instregex "CVT_(L|D|S|W)_(L|D|S|L|W)_MMR6")>; -def : ItinRW<[GenericWriteALU], [II_SEQ_SNE, II_SEQI_SNEI, II_POP, II_BADDU, - II_BBIT]>; +def : InstRW<[GenericWriteFPUL], + (instregex "TRUNC_(L|W)_(D|S)_MMR6")>; + +def : InstRW<[GenericWriteFPUL], + (instregex "ROUND_(L|W)_(D|S)_MMR6")>; + +def : InstRW<[GenericWriteFPUL], + (instregex "FLOOR_(L|W)_(D|S)_MMR6")>; + +def : InstRW<[GenericWriteFPUL], + (instregex "CEIL_(L|W)_(S|D)_MMR6")>; + +def : InstRW<[GenericWriteFPUS], + (instrs MFC1_MMR6, MTC1_MMR6, CLASS_S_MMR6, CLASS_D_MMR6, + FADD_S_MMR6)>; + +def : InstRW<[GenericWriteFPUS], (instregex "M(IN|AX)_(S|D)_MMR6")>; + +def : InstRW<[GenericWriteFPUS], (instregex "M(IN|AX)A_(S|D)_MMR6")>; + +def : InstRW<[GenericWriteFPUS], (instregex "SEL(EQ|NE)Z_(S|D)_MMR6")>; + +def : InstRW<[GenericWriteFPUS], (instregex "SEL_(S|D)_MMR6")>; + +def : InstRW<[GenericWriteFPUL], (instrs RINT_S_MMR6, RINT_D_MMR6)>; + +def : InstRW<[GenericWriteFPUS], (instregex "M(ADD|SUB)F_(S|D)_MMR6")>; + +def : InstRW<[GenericWriteFPUS], (instrs FMOV_S_MMR6, FMUL_S_MMR6, + FSUB_S_MMR6, FMOV_D_MMR6)>; + +def : InstRW<[GenericWriteFPUL], (instrs FDIV_S_MMR6)>; + +def : InstRW<[GenericWriteFPUStore], (instrs SDC1_D64_MMR6)>; + +def : 
InstRW<[GenericWriteFPULoad], (instrs LDC1_D64_MMR6)>; + +// MIPS64 +// ====== + +def : InstRW<[GenericWriteFPUMoveGPRFPU], (instrs DMFC1, DMTC1)>; // MIPS DSP ASE, HasDSP // ==================== +def : InstRW<[GenericWriteStore], (instrs SWDSP)>; + +def : InstRW<[GenericWriteLoad], (instrs LWDSP)>; + +def : InstRW<[GenericWriteMove], (instrs PseudoMTLOHI_DSP)>; + def GenericDSP : ProcResource<1> { let BufferSize = 1; } def GenericDSPShort : SchedWriteRes<[GenericDSP]> { let Latency = 2; } def GenericDSPLong : SchedWriteRes<[GenericDSP]> { let Latency = 6; } @@ -634,6 +1146,11 @@ def : InstRW<[GenericDSPShort], (instregex "^SUBU_QB$")>; def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_QB$")>; def : InstRW<[GenericDSPShort], (instregex "^WRDSP$")>; +def : InstRW<[GenericDSPShort], + (instregex "^Pseudo(CMP|CMPU)_(EQ|LE|LT)_(PH|QB)$")>; +def : InstRW<[GenericDSPShort], + (instregex "^PseudoPICK_(PH|QB)$")>; + // MIPS DSP R2 - hasDSP, HasDSPR2, InMicroMips // =========================================== @@ -687,6 +1204,10 @@ def : InstRW<[GenericDSPShort], (instregex "^SUBUH_R_QB$")>; // microMIPS DSP R1 - HasDSP, InMicroMips // ====================================== +def : InstRW<[GenericWriteLoad], (instrs LWDSP_MM)>; + +def : InstRW<[GenericWriteStore], (instrs SWDSP_MM)>; + def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_PH_MM$")>; def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_W_MM$")>; def : InstRW<[GenericDSPShort], (instregex "^ADDQ_PH_MM$")>; @@ -740,7 +1261,6 @@ def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHR_MM$")>; def : InstRW<[GenericDSPShort], (instregex "^MFHI_DSP_MM$")>; def : InstRW<[GenericDSPShort], (instregex "^MFLO_DSP_MM$")>; def : InstRW<[GenericDSPShort], (instregex "^MODSUB_MM$")>; -def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MM$")>; def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MMR6$")>; def : InstRW<[GenericDSPShort], (instregex "^MOVN_I_MM$")>; def : InstRW<[GenericDSPShort], (instregex "^MOVZ_I_MM$")>; @@ -902,12 +1422,14 @@ def : InstRW<[GenericWriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>; def : InstRW<[GenericWriteMSAShortInt], (instregex "^ASUB_[US].[BHWD]$")>; def : InstRW<[GenericWriteMSAShortInt], (instregex "^AVER?_[US].[BHWD]$")>; -// and.v, andi.b, move.v, ldi.[bhwd], xor.v, nor.v, xori.b, nori.b +// and.v, andi.b, move.v, ldi.[bhwd], xor.v, nor.v, xori.b, nori.b, lsa def : InstRW<[GenericWriteMSAShortLogic], (instregex "^MOVE_V$")>; def : InstRW<[GenericWriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>; +def : InstRW<[GenericWriteMSAShortLogic], (instrs LSA)>; def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>; def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>; -def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>; +def : InstRW<[GenericWriteMSAShortLogic], + (instregex "^(AND|OR|[XN]OR)_V_[DHW]_PSEUDO$")>; // vshf.[bhwd], binsl.[bhwd], binsr.[bhwd], insert.[bhwd], sld?.[bhwd], // bset.[bhwd], bclr.[bhwd], bneg.[bhwd], bsel_v, bseli_b @@ -921,8 +1443,10 @@ def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>; def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>; def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>; def : InstRW<[GenericWriteMSAShortInt], (instregex "^BMN*Z.*$")>; +def : InstRW<[GenericWriteMSAShortInt], + (instregex "^BSEL_(H|W|D|FW|FD)_PSEUDO$")>; -// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd] +// pcnt.[bhwd], sat_s.[bhwd], sat_u.[bhwd] def : 
InstRW<[GenericWriteMSAOther3], (instregex "^PCNT_[BHWD]$")>; def : InstRW<[GenericWriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>; @@ -935,10 +1459,6 @@ def : InstRW<[GenericWriteMSAShortInt], (instregex "^SHF_[BHW]$")>; def : InstRW<[GenericWriteMSAShortInt], (instregex "^FILL_[BHWD]$")>; def : InstRW<[GenericWriteMSAShortInt], (instregex "^(SPLAT|SPLATI)_[BHWD]$")>; -// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd] -def : InstRW<[GenericWriteMSAOther3], (instregex "^PCNT_[BHWD]$")>; -def : InstRW<[GenericWriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>; - // fexp2_w, fexp2_d def : InstRW<[GenericWriteFPUS], (instregex "^FEXP2_(W|D)$")>; @@ -953,6 +1473,15 @@ def : InstRW<[GenericWriteFPUS], (instregex "^CMP_LT_(S|D)$")>; def : InstRW<[GenericWriteFPUS], (instregex "^CMP_ULT_(S|D)$")>; def : InstRW<[GenericWriteFPUS], (instregex "^CMP_LE_(S|D)$")>; def : InstRW<[GenericWriteFPUS], (instregex "^CMP_ULE_(S|D)$")>; +def : InstRW<[GenericWriteFPUS], (instregex "^CMP_F_(D|S)$")>; +def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SAF_(D|S)$")>; +def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SEQ_(D|S)$")>; +def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SLE_(D|S)$")>; +def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SLT_(D|S)$")>; +def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SUEQ_(D|S)$")>; +def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SULE_(D|S)$")>; +def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SULT_(D|S)$")>; +def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SUN_(D|S)$")>; def : InstRW<[GenericWriteFPUS], (instregex "^FS(AF|EQ|LT|LE|NE|OR)_(W|D)$")>; def : InstRW<[GenericWriteFPUS], (instregex "^FSUEQ_(W|D)$")>; def : InstRW<[GenericWriteFPUS], (instregex "^FSULE_(W|D)$")>; @@ -995,7 +1524,6 @@ def : InstRW<[GenericWriteFPUS], (instregex "^FLOG2_(W|D)$")>; // interleave right/left, interleave even/odd, insert def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(ILVR|ILVL)_[BHWD]$")>; def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(ILVEV|ILVOD)_[BHWD]$")>; -def : InstRW<[GenericWriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>; // subs_?.[bhwd], subsus_?.[bhwd], subsuu_?.[bhwd], subvi.[bhwd], subv.[bhwd], def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBS_(S|U)_[BHWD]$")>; @@ -1027,6 +1555,8 @@ def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SLL|SLLI)_[BHWD]$")>; def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(PCKEV|PCKOD)_[BHWD]$")>; def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>; def : InstRW<[GenericWriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>; +def : InstRW<[GenericWriteMSAShortLogic], (instregex "^INSERT_F(D|W)_PSEUDO$")>; +def : InstRW<[GenericWriteMSAShortLogic], (instregex "^FILL_F(D|W)_PSEUDO$")>; // dpadd_?.[bhwd], dpsub_?.[bhwd], dotp_?.[bhwd], msubv.[bhwd], maddv.[bhwd] // mulv.[bhwd]. @@ -1062,5 +1592,23 @@ def : InstRW<[GenericWriteFPUMoveGPRFPU], (instregex "^COPY_U_[BHW]$")>; def : InstRW<[GenericWriteFPUMoveGPRFPU], (instregex "^COPY_S_[BHWD]$")>; def : InstRW<[GenericWriteFPUStore], (instregex "^ST_[BHWD]$")>; +def : InstRW<[GenericWriteFPUStore], (instrs ST_F16)>; def : InstRW<[GenericWriteFPULoad], (instregex "^LD_[BHWD]$")>; +def : InstRW<[GenericWriteFPULoad], (instrs LD_F16)>; + +// Atomic instructions + +// FIXME: Define `WriteAtomic` in the MipsSchedule.td and +// attach it to the Atomic2OpsPostRA, AtomicCmpSwapPostRA, ... +// classes. Then just define resources for the `WriteAtomic` in each +// machine models. 
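// [Editor's aside -- illustrative sketch, not part of the imported patch.]
// The refactoring proposed by the FIXME above would declare the write type
// once in MipsSchedule.td, attach it to the post-RA atomic pseudo classes,
// and let each machine model price it separately, roughly:
//
//   // MipsSchedule.td (shared across models)
//   def WriteAtomic : SchedWrite;
//
//   // inside each SchedMachineModel block, e.g. the generic one
//   def : WriteRes<WriteAtomic, [GenericAtomic]> { let Latency = 2; }
//
// The instregex matching that follows would then become unnecessary.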
+def GenericAtomic : ProcResource<1> { let BufferSize = 1; } +def GenericWriteAtomic : SchedWriteRes<[GenericAtomic]> { let Latency = 2; } + +def : InstRW<[GenericWriteAtomic], + (instregex "^ATOMIC_SWAP_I(8|16|32|64)_POSTRA$")>; +def : InstRW<[GenericWriteAtomic], + (instregex "^ATOMIC_CMP_SWAP_I(8|16|32|64)_POSTRA$")>; +def : InstRW<[GenericWriteAtomic], + (instregex "^ATOMIC_LOAD_(ADD|SUB|AND|OR|XOR|NAND)_I(8|16|32|64)_POSTRA$")>; } diff --git a/lib/Target/Mips/MipsScheduleP5600.td b/lib/Target/Mips/MipsScheduleP5600.td index 846fa11494c7..f97b03bff08e 100644 --- a/lib/Target/Mips/MipsScheduleP5600.td +++ b/lib/Target/Mips/MipsScheduleP5600.td @@ -1,9 +1,8 @@ //==- MipsScheduleP5600.td - P5600 Scheduling Definitions --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -13,12 +12,13 @@ def MipsP5600Model : SchedMachineModel { int LoadLatency = 4; int MispredictPenalty = 8; // TODO: Estimated - let CompleteModel = 0; + let CompleteModel = 1; let FullInstRWOverlapCheck = 1; - list<Predicate> UnsupportedFeatures = [HasMips32r6, HasMips64r6, - HasMips3, HasMips64r2, HasCnMips, - InMicroMips, InMips16Mode, + list<Predicate> UnsupportedFeatures = [HasMips3, HasMips32r6, HasMips64, + HasMips64r2, HasMips64r5, HasMips64r6, + IsGP64bit, IsPTR64bit, + InMicroMips, InMips16Mode, HasCnMips, HasDSP, HasDSPR2, HasMT, HasCRC]; } @@ -59,15 +59,21 @@ def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> { let Latency = 2; } +def P5600Nop : SchedWriteRes<[P5600IssueCTISTD]> { + let Latency = 0; +} + +def : InstRW<[P5600Nop], (instrs SSNOP, NOP)>; + // b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal, // jalr, jr.hb, jr def : InstRW<[P5600WriteJump], (instrs B, BAL, BAL_BR, BEQ, BEQL, BGEZ, BGEZAL, BGEZALL, BGEZL, BGTZ, BGTZL, BLEZ, BLEZL, BLTZ, BLTZAL, BLTZALL, BLTZL, BNE, BNEL, BREAK, - DERET, ERET, ERETNC, J, JR, JR_HB, + DERET, ERET, ERet, ERETNC, J, JR, JR_HB, PseudoIndirectBranch, PseudoIndirectHazardBranch, PseudoReturn, - SDBBP, SSNOP, SYSCALL, TAILCALL, TAILCALLREG, + SDBBP, SYSCALL, RetRA, TAILCALL, TAILCALLREG, TAILCALLREGHB, TEQ, TEQI, TGE, TGEI, TGEIU, TGEU, TLT, TLTI, TLTU, TNE, TNEI, TRAP, TTLTIU, WAIT, PAUSE)>; @@ -90,6 +96,11 @@ def : InstRW<[P5600COP2], (instrs MFC2, MTC2)> { let Unsupported = 1; } +// MIPS Virtualization ASE +// ======================= +def : InstRW<[P5600COP0], (instrs HYPCALL, MFGC0, MFHGC0, MTGC0, MTHGC0, + TLBGINV, TLBGINVF, TLBGP, TLBGR, TLBGWI, TLBGWR)>; + // LDST Pipeline // ------------- @@ -288,6 +299,8 @@ def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>; def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>; def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>; def : InstRW<[P5600WriteMSAShortInt], (instregex "^BMN*Z.*$")>; +def : InstRW<[P5600WriteMSAShortInt], + (instregex "^BSEL_(H|W|D|FW|FD)_PSEUDO$")>; // pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd] def : InstRW<[P5600WriteMSAOther3], (instregex "^PCNT_[BHWD]$")>; def : InstRW<[P5600WriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>; @@ -335,6 +348,10 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^MOVE_V$")>; def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
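// [Editor's aside -- illustrative, not part of the imported patch.] Flipping
// CompleteModel to 1 above makes TableGen reject any opcode that has neither
// a scheduling mapping nor an UnsupportedFeatures exclusion, and
// FullInstRWOverlapCheck = 1 rejects two InstRW definitions that claim the
// same opcode. That is why the instregex patterns in this file stay narrow
// and anchored; an overlapping pair such as the following would be a
// build-time error rather than valid scheduling info:
//
//   def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_.*$")>;
//   def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;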
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>; def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>; +def : InstRW<[P5600WriteMSAShortLogic], + (instregex "^(AND|OR|[XN]OR)_V_[DHW]_PSEUDO$")>; +def : InstRW<[P5600WriteMSAShortLogic], (instregex "^FILL_F(D|W)_PSEUDO$")>; +def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSERT_F(D|W)_PSEUDO$")>; // fexp2_w, fexp2_d def : InstRW<[P5600WriteFPUS], (instregex "^FEXP2_(W|D)$")>; @@ -427,17 +444,19 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>; // ---------- // // add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps, -// cvt.ps.[sw], c..[ds], c..ps, mul.[ds], mul.ps, sub.[ds], sub.ps, -// trunc.w.[ds], trunc.w.ps +// cvt.ps.[sw], cvt.s.(pl|pu), c..[ds], c..ps, mul.[ds], mul.ps, +// pl[lu].ps, sub.[ds], sub.ps, trunc.w.[ds], trunc.w.ps def : InstRW<[P5600WriteFPUL], (instrs FADD_D32, FADD_D64, FADD_S, FMUL_D32, FMUL_D64, FMUL_S, FSUB_D32, FSUB_D64, FSUB_S)>; def : InstRW<[P5600WriteFPUL], (instregex "^TRUNC_(L|W)_(S|D32|D64)$")>; def : InstRW<[P5600WriteFPUL], (instregex "^CVT_(S|D32|D64|L|W)_(S|D32|D64|L|W)$")>; +def : InstRW<[P5600WriteFPUL], (instrs CVT_PS_S64, CVT_S_PL64, CVT_S_PU64)>; def : InstRW<[P5600WriteFPUL], (instregex "^C_[A-Z]+_(S|D32|D64)$")>; def : InstRW<[P5600WriteFPUL], (instregex "^FCMP_(S32|D32|D64)$")>; def : InstRW<[P5600WriteFPUL], (instregex "^PseudoCVT_(S|D32|D64)_(L|W)$")>; +def : InstRW<[P5600WriteFPUL], (instrs PLL_PS64, PLU_PS64)>; // div.[ds], div.ps def : InstRW<[P5600WriteFPUDivS], (instrs FDIV_S)>; @@ -555,16 +574,20 @@ def : InstRW<[P5600WriteMoveFPUToGPR], (instrs BC1F, BC1FL, BC1T, BC1TL, CFC1, ExtractElementF64_64)>; // swc1, swxc1, st.[bhwd] -def : InstRW<[P5600WriteStoreFPUS], (instrs SDC1, SDXC1, SUXC1, SWC1, SWXC1)>; +def : InstRW<[P5600WriteStoreFPUS], (instrs SDC1, SDC164, SDXC1, SDXC164, + SWC1, SWXC1, SUXC1, SUXC164)>; def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>; +def : InstRW<[P5600WriteStoreFPUS], (instrs ST_F16)>; // movn.[ds], movz.[ds] def : InstRW<[P5600WriteStoreFPUL], (instrs MOVN_I_D32, MOVN_I_D64, MOVN_I_S, MOVZ_I_D32, MOVZ_I_D64, MOVZ_I_S)>; // l[dw]x?c1, ld.[bhwd] -def : InstRW<[P5600WriteLoadFPU], (instrs LDC1, LDXC1, LWC1, LWXC1, LUXC1)>; +def : InstRW<[P5600WriteLoadFPU], (instrs LDC1, LDC164, LDXC1, LDXC164, + LWC1, LWXC1, LUXC1, LUXC164)>; def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>; +def : InstRW<[P5600WriteLoadFPU], (instrs LD_F16)>; // Unsupported Instructions // ======================== @@ -593,4 +616,20 @@ def : InstRW<[P5600WriteFPUL], (instregex "^ROUND_(L|W)_(S|D32|D64)$")>; // Reason behind guess: rotr is in the same category and the two register forms // generally follow the immediate forms in this category def : InstRW<[P5600WriteEitherALU], (instrs ROTRV)>; + +// Atomic instructions + +// FIXME: Define `WriteAtomic` in the MipsSchedule.td and +// attach it to the Atomic2OpsPostRA, AtomicCmpSwapPostRA, ... +// classes. Then just define resources for the `WriteAtomic` in each +// machine models. 
+def P5600Atomic : ProcResource<1> { let BufferSize = 1; } +def P5600WriteAtomic : SchedWriteRes<[P5600Atomic]> { let Latency = 2; } + +def : InstRW<[P5600WriteAtomic], + (instregex "^ATOMIC_SWAP_I(8|16|32|64)_POSTRA$")>; +def : InstRW<[P5600WriteAtomic], + (instregex "^ATOMIC_CMP_SWAP_I(8|16|32|64)_POSTRA$")>; +def : InstRW<[P5600WriteAtomic], + (instregex "^ATOMIC_LOAD_(ADD|SUB|AND|OR|XOR|NAND)_I(8|16|32|64)_POSTRA$")>; } diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 0c39a45467c4..d021b3d021b1 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -1,9 +1,8 @@ //===-- MipsSubtarget.cpp - Mips Subtarget Information --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -73,7 +72,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, unsigned StackAlignOverride) : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault), IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false), - NoABICalls(false), IsFP64bit(false), UseOddSPReg(true), + NoABICalls(false), Abs2008(false), IsFP64bit(false), UseOddSPReg(true), IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false), HasCnMips(false), HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false), @@ -109,6 +108,11 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, "See -mattr=+fp64.", false); + if (isFP64bit() && !hasMips64() && hasMips32() && !hasMips32r2()) + report_fatal_error( + "FPU with 64-bit registers is not available on MIPS32 pre revision 2. " + "Use -mcpu=mips32r2 or greater."); + if (!isABI_O32() && !useOddSPReg()) report_fatal_error("-mattr=+nooddspreg requires the O32 ABI.", false); @@ -129,11 +133,18 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, report_fatal_error( "indirect jumps with hazard barriers requires MIPS32R2 or later"); } + if (inAbs2008Mode() && hasMips32() && !hasMips32r2()) { + report_fatal_error("IEEE 754-2008 abs.fmt is not supported for the given " + "architecture.", + false); + } + if (hasMips32r6()) { StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6"; assert(isFP64bit()); assert(isNaN2008()); + assert(inAbs2008Mode()); if (hasDSP()) report_fatal_error(ISA + " is not compatible with the DSP ASE", false); } diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index ad8f4848b870..aa1200579fc8 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -1,9 +1,8 @@ //===-- MipsSubtarget.h - Define Subtarget for the Mips ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -87,6 +86,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // NoABICalls - Disable SVR4-style position-independent code. bool NoABICalls; + // Abs2008 - Use IEEE 754-2008 abs.fmt instruction. + bool Abs2008; + // IsFP64bit - The target processor has 64-bit floating point registers. bool IsFP64bit; @@ -273,6 +275,7 @@ public: bool useOddSPReg() const { return UseOddSPReg; } bool noOddSPReg() const { return !UseOddSPReg; } bool isNaN2008() const { return IsNaN2008bit; } + bool inAbs2008Mode() const { return Abs2008; } bool isGP64bit() const { return IsGP64bit; } bool isGP32bit() const { return !IsGP64bit; } unsigned getGPRSizeInBytes() const { return isGP64bit() ? 8 : 4; } diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 8466298cf36f..c878abb042e4 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -19,6 +18,7 @@ #include "MipsSEISelDAGToDAG.h" #include "MipsSubtarget.h" #include "MipsTargetObjectFile.h" +#include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -205,8 +205,7 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const { void MipsTargetMachine::resetSubtarget(MachineFunction *MF) { LLVM_DEBUG(dbgs() << "resetSubtarget\n"); - Subtarget = const_cast<MipsSubtarget *>(getSubtargetImpl(MF->getFunction())); - MF->setSubtarget(Subtarget); + Subtarget = &MF->getSubtarget<MipsSubtarget>(); } namespace { @@ -240,6 +239,8 @@ public: bool addLegalizeMachineIR() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; + + std::unique_ptr<CSEConfigBase> getCSEConfig() const override; }; } // end anonymous namespace @@ -248,6 +249,10 @@ TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) { return new MipsPassConfig(*this, PM); } +std::unique_ptr<CSEConfigBase> MipsPassConfig::getCSEConfig() const { + return getStandardCSEConfigForOpt(TM->getOptLevel()); +} + void MipsPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); addPass(createAtomicExpandPass()); diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index d9b73d151119..25300504a02d 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -1,9 +1,8 @@ //===- MipsTargetMachine.h - Define TargetMachine for Mips ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index d9b73d151119..25300504a02d 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -1,9 +1,8 @@
 //===- MipsTargetMachine.h - Define TargetMachine for Mips ------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -30,7 +29,7 @@ class MipsTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
   // Selected ABI
   MipsABIInfo ABI;
-  MipsSubtarget *Subtarget;
+  const MipsSubtarget *Subtarget;
   MipsSubtarget DefaultSubtarget;
   MipsSubtarget NoMips16Subtarget;
   MipsSubtarget Mips16Subtarget;
@@ -66,10 +65,6 @@ public:
   bool isLittleEndian() const { return isLittle; }
 
   const MipsABIInfo &getABI() const { return ABI; }
-
-  bool isMachineVerifierClean() const override {
-    return false;
-  }
 };
 
 /// Mips32/64 big endian target machine.
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index f53ee0631b5e..0852b5a18c68 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -1,9 +1,8 @@
 //===-- MipsTargetObjectFile.cpp - Mips Object Files ----------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h
index a37ec154ff79..bdf485f83260 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.h
+++ b/lib/Target/Mips/MipsTargetObjectFile.h
@@ -1,9 +1,8 @@
 //===-- llvm/Target/MipsTargetObjectFile.h - Mips Object Info ---*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index a282366f6d40..1fa8ebadd643 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -1,9 +1,8 @@
 //===-- MipsTargetStreamer.h - Mips Target Streamer ------------*- C++ -*--===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -92,6 +91,7 @@ public:
 
   // PIC support
   virtual void emitDirectiveCpLoad(unsigned RegNo);
+  virtual void emitDirectiveCpLocal(unsigned RegNo);
   virtual bool emitDirectiveCpRestore(int Offset,
                                       function_ref<unsigned()> GetATReg,
                                       SMLoc IDLoc, const MCSubtargetInfo *STI);
@@ -200,6 +200,7 @@ protected:
   bool FrameInfoSet;
   int FrameOffset;
   unsigned FrameReg;
+  unsigned GPReg;
   unsigned ReturnReg;
 
 private:
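The new emitDirectiveCpLocal() hook and the GPReg field back the `.cplocal $reg` assembly directive, which nominates a register other than $gp to hold the GP value for subsequent PIC macro expansions. A hedged sketch of how an assembler-side action would forward the parsed register (callback name and register number illustrative):

    // Hypothetical parser callback: ".cplocal $17" makes $17 the GP register
    // used by later expansions such as .cprestore and call stubs.
    void onCpLocalDirective(MipsTargetStreamer &TS, unsigned ParsedReg) {
      TS.emitDirectiveCpLocal(ParsedReg); // implementations record this in GPReg
    }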
@@ -275,6 +276,7 @@ public:
 
   // PIC support
   void emitDirectiveCpLoad(unsigned RegNo) override;
+  void emitDirectiveCpLocal(unsigned RegNo) override;
 
   /// Emit a .cprestore directive.  If the offset is out of range then it will
   /// be synthesized using the assembler temporary.
@@ -346,6 +348,7 @@ public:
 
   // PIC support
   void emitDirectiveCpLoad(unsigned RegNo) override;
+  void emitDirectiveCpLocal(unsigned RegNo) override;
   bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg,
                               SMLoc IDLoc, const MCSubtargetInfo *STI) override;
   void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
index 22be564b6502..0082ca34cdbd 100644
--- a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
+++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -1,14 +1,12 @@
 //===-- MipsTargetInfo.cpp - Mips Target Implementation -------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
-#include "Mips.h"
-#include "llvm/IR/Module.h"
+#include "TargetInfo/MipsTargetInfo.h"
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.h b/lib/Target/Mips/TargetInfo/MipsTargetInfo.h
new file mode 100644
index 000000000000..d91a2719108d
--- /dev/null
+++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.h
@@ -0,0 +1,23 @@
+//===-- MipsTargetInfo.h - Mips Target Implementation -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_TARGETINFO_MIPSTARGETINFO_H
+#define LLVM_LIB_TARGET_MIPS_TARGETINFO_MIPSTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheMipsTarget();
+Target &getTheMipselTarget();
+Target &getTheMips64Target();
+Target &getTheMips64elTarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_TARGETINFO_MIPSTARGETINFO_H
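The new TargetInfo header gives MC-layer code a way to name the four Mips Target singletons without pulling in the heavyweight Mips.h umbrella (which drags in IR-level headers). A hedged consumer sketch (registration helper hypothetical):

    #include "TargetInfo/MipsTargetInfo.h"
    #include "llvm/Support/TargetRegistry.h"
    using namespace llvm;

    // Register one MCInstPrinter constructor for both 32-bit endiannesses.
    static void registerMipsInstPrinter(Target::MCInstPrinterCtorTy Ctor) {
      TargetRegistry::RegisterMCInstPrinter(getTheMipsTarget(), Ctor);
      TargetRegistry::RegisterMCInstPrinter(getTheMipselTarget(), Ctor);
    }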
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
deleted file mode 100644
index b774fe169d71..000000000000
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ /dev/null
@@ -1,296 +0,0 @@
-//===-- NVPTXInstPrinter.cpp - PTX assembly instruction printing ----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Print MCInst instructions to .ptx format.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstPrinter/NVPTXInstPrinter.h"
-#include "MCTargetDesc/NVPTXBaseInfo.h"
-#include "NVPTX.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-#include <cctype>
-using namespace llvm;
-
-#define DEBUG_TYPE "asm-printer"
-
-#include "NVPTXGenAsmWriter.inc"
-
-NVPTXInstPrinter::NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                                   const MCRegisterInfo &MRI)
-    : MCInstPrinter(MAI, MII, MRI) {}
-
-void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
-  // Decode the virtual register
-  // Must be kept in sync with NVPTXAsmPrinter::encodeVirtualRegister
-  unsigned RCId = (RegNo >> 28);
-  switch (RCId) {
-  default: report_fatal_error("Bad virtual register encoding");
-  case 0:
-    // This is actually a physical register, so defer to the autogenerated
-    // register printer
-    OS << getRegisterName(RegNo);
-    return;
-  case 1:
-    OS << "%p";
-    break;
-  case 2:
-    OS << "%rs";
-    break;
-  case 3:
-    OS << "%r";
-    break;
-  case 4:
-    OS << "%rd";
-    break;
-  case 5:
-    OS << "%f";
-    break;
-  case 6:
-    OS << "%fd";
-    break;
-  case 7:
-    OS << "%h";
-    break;
-  case 8:
-    OS << "%hh";
-    break;
-  }
-
-  unsigned VReg = RegNo & 0x0FFFFFFF;
-  OS << VReg;
-}
-
-void NVPTXInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
-                                 StringRef Annot, const MCSubtargetInfo &STI) {
-  printInstruction(MI, OS);
-
-  // Next always print the annotation.
-  printAnnotation(OS, Annot);
-}
-
-void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                    raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isReg()) {
-    unsigned Reg = Op.getReg();
-    printRegName(O, Reg);
-  } else if (Op.isImm()) {
-    O << markup("<imm:") << formatImm(Op.getImm()) << markup(">");
-  } else {
-    assert(Op.isExpr() && "Unknown operand kind in printOperand");
-    Op.getExpr()->print(O, &MAI);
-  }
-}
-
-void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
-                                    const char *Modifier) {
-  const MCOperand &MO = MI->getOperand(OpNum);
-  int64_t Imm = MO.getImm();
-
-  if (strcmp(Modifier, "ftz") == 0) {
-    // FTZ flag
-    if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG)
-      O << ".ftz";
-  } else if (strcmp(Modifier, "sat") == 0) {
-    // SAT flag
-    if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
-      O << ".sat";
-  } else if (strcmp(Modifier, "base") == 0) {
-    // Default operand
-    switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
-    default:
-      return;
-    case NVPTX::PTXCvtMode::NONE:
-      break;
-    case NVPTX::PTXCvtMode::RNI:
-      O << ".rni";
-      break;
-    case NVPTX::PTXCvtMode::RZI:
-      O << ".rzi";
-      break;
-    case NVPTX::PTXCvtMode::RMI:
-      O << ".rmi";
-      break;
-    case NVPTX::PTXCvtMode::RPI:
-      O << ".rpi";
-      break;
-    case NVPTX::PTXCvtMode::RN:
-      O << ".rn";
-      break;
-    case NVPTX::PTXCvtMode::RZ:
-      O << ".rz";
-      break;
-    case NVPTX::PTXCvtMode::RM:
-      O << ".rm";
-      break;
-    case NVPTX::PTXCvtMode::RP:
-      O << ".rp";
-      break;
-    }
-  } else {
-    llvm_unreachable("Invalid conversion modifier");
-  }
-}
-
-void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
-                                    const char *Modifier) {
-  const MCOperand &MO = MI->getOperand(OpNum);
-  int64_t Imm = MO.getImm();
-
-  if (strcmp(Modifier, "ftz") == 0) {
-    // FTZ flag
-    if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG)
-      O << ".ftz";
-  } else if (strcmp(Modifier, "base") == 0) {
-    switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) {
-    default:
-      return;
-    case NVPTX::PTXCmpMode::EQ:
-      O << ".eq";
-      break;
-    case NVPTX::PTXCmpMode::NE:
-      O << ".ne";
-      break;
-    case NVPTX::PTXCmpMode::LT:
-      O << ".lt";
-      break;
-    case NVPTX::PTXCmpMode::LE:
-      O << ".le";
-      break;
-    case NVPTX::PTXCmpMode::GT:
-      O << ".gt";
-      break;
-    case NVPTX::PTXCmpMode::GE:
-      O << ".ge";
-      break;
-    case NVPTX::PTXCmpMode::LO:
-      O << ".lo";
-      break;
-    case NVPTX::PTXCmpMode::LS:
-      O << ".ls";
-      break;
-    case NVPTX::PTXCmpMode::HI:
-      O << ".hi";
-      break;
-    case NVPTX::PTXCmpMode::HS:
-      O << ".hs";
-      break;
-    case NVPTX::PTXCmpMode::EQU:
-      O << ".equ";
-      break;
-    case NVPTX::PTXCmpMode::NEU:
-      O << ".neu";
-      break;
-    case NVPTX::PTXCmpMode::LTU:
-      O << ".ltu";
-      break;
-    case NVPTX::PTXCmpMode::LEU:
-      O << ".leu";
-      break;
-    case NVPTX::PTXCmpMode::GTU:
-      O << ".gtu";
-      break;
-    case NVPTX::PTXCmpMode::GEU:
-      O << ".geu";
-      break;
-    case NVPTX::PTXCmpMode::NUM:
-      O << ".num";
-      break;
-    case NVPTX::PTXCmpMode::NotANumber:
-      O << ".nan";
-      break;
-    }
-  } else {
-    llvm_unreachable("Empty Modifier");
-  }
-}
-
-void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
-                                     raw_ostream &O, const char *Modifier) {
-  if (Modifier) {
-    const MCOperand &MO = MI->getOperand(OpNum);
-    int Imm = (int) MO.getImm();
-    if (!strcmp(Modifier, "volatile")) {
-      if (Imm)
-        O << ".volatile";
-    } else if (!strcmp(Modifier, "addsp")) {
-      switch (Imm) {
-      case NVPTX::PTXLdStInstCode::GLOBAL:
-        O << ".global";
-        break;
-      case NVPTX::PTXLdStInstCode::SHARED:
-        O << ".shared";
-        break;
-      case NVPTX::PTXLdStInstCode::LOCAL:
-        O << ".local";
-        break;
-      case NVPTX::PTXLdStInstCode::PARAM:
-        O << ".param";
-        break;
-      case NVPTX::PTXLdStInstCode::CONSTANT:
-        O << ".const";
-        break;
-      case NVPTX::PTXLdStInstCode::GENERIC:
-        break;
-      default:
-        llvm_unreachable("Wrong Address Space");
-      }
-    } else if (!strcmp(Modifier, "sign")) {
-      if (Imm == NVPTX::PTXLdStInstCode::Signed)
-        O << "s";
-      else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
-        O << "u";
-      else if (Imm == NVPTX::PTXLdStInstCode::Untyped)
-        O << "b";
-      else if (Imm == NVPTX::PTXLdStInstCode::Float)
-        O << "f";
-      else
-        llvm_unreachable("Unknown register type");
-    } else if (!strcmp(Modifier, "vec")) {
-      if (Imm == NVPTX::PTXLdStInstCode::V2)
-        O << ".v2";
-      else if (Imm == NVPTX::PTXLdStInstCode::V4)
-        O << ".v4";
-    } else
-      llvm_unreachable("Unknown Modifier");
-  } else
-    llvm_unreachable("Empty Modifier");
-}
-
-void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
-                                       raw_ostream &O, const char *Modifier) {
-  printOperand(MI, OpNum, O);
-
-  if (Modifier && !strcmp(Modifier, "add")) {
-    O << ", ";
-    printOperand(MI, OpNum + 1, O);
-  } else {
-    if (MI->getOperand(OpNum + 1).isImm() &&
-        MI->getOperand(OpNum + 1).getImm() == 0)
-      return; // don't print ',0' or '+0'
-    O << "+";
-    printOperand(MI, OpNum + 1, O);
-  }
-}
-
-void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
-                                       raw_ostream &O, const char *Modifier) {
-  const MCOperand &Op = MI->getOperand(OpNum);
-  assert(Op.isExpr() && "Call prototype is not an MCExpr?");
-  const MCExpr *Expr = Op.getExpr();
-  const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol();
-  O << Sym.getName();
-}
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
deleted file mode 100644
index f0f223aa057b..000000000000
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
+++ /dev/null
@@ -1,52 +0,0 @@
-//= NVPTXInstPrinter.h - Convert NVPTX MCInst to assembly syntax --*- C++ -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an NVPTX MCInst to .ptx file syntax.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
-#define LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
-
-#include "llvm/MC/MCInstPrinter.h"
-
-namespace llvm {
-
-class MCSubtargetInfo;
-
-class NVPTXInstPrinter : public MCInstPrinter {
-public:
-  NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                   const MCRegisterInfo &MRI);
-
-  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
-  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
-                 const MCSubtargetInfo &STI) override;
-
-  // Autogenerated by tblgen.
-  void printInstruction(const MCInst *MI, raw_ostream &O);
-  static const char *getRegisterName(unsigned RegNo);
-  // End
-
-  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
-                    const char *Modifier = nullptr);
-  void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
-                    const char *Modifier = nullptr);
-  void printLdStCode(const MCInst *MI, int OpNum,
-                     raw_ostream &O, const char *Modifier = nullptr);
-  void printMemOperand(const MCInst *MI, int OpNum,
-                       raw_ostream &O, const char *Modifier = nullptr);
-  void printProtoIdent(const MCInst *MI, int OpNum,
-                       raw_ostream &O, const char *Modifier = nullptr);
-};
-
-}
-
-#endif
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
index 1cb92005979d..815b600fe93a 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -1,9 +1,8 @@
 //===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
new file mode 100644
index 000000000000..b6eefe206268
--- /dev/null
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -0,0 +1,309 @@
+//===-- NVPTXInstPrinter.cpp - PTX assembly instruction printing ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Print MCInst instructions to .ptx format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTX.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "NVPTXGenAsmWriter.inc"
+
+NVPTXInstPrinter::NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                                   const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  // Decode the virtual register
+  // Must be kept in sync with NVPTXAsmPrinter::encodeVirtualRegister
+  unsigned RCId = (RegNo >> 28);
+  switch (RCId) {
+  default: report_fatal_error("Bad virtual register encoding");
+  case 0:
+    // This is actually a physical register, so defer to the autogenerated
+    // register printer
+    OS << getRegisterName(RegNo);
+    return;
+  case 1:
+    OS << "%p";
+    break;
+  case 2:
+    OS << "%rs";
+    break;
+  case 3:
+    OS << "%r";
+    break;
+  case 4:
+    OS << "%rd";
+    break;
+  case 5:
+    OS << "%f";
+    break;
+  case 6:
+    OS << "%fd";
+    break;
+  case 7:
+    OS << "%h";
+    break;
+  case 8:
+    OS << "%hh";
+    break;
+  }
+
+  unsigned VReg = RegNo & 0x0FFFFFFF;
+  OS << VReg;
+}
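printRegName() is the decode half of a 4-bit-class/28-bit-index packing; the encode half lives in NVPTXAsmPrinter::encodeVirtualRegister. A minimal sketch of that packing (assumed layout, matching the shift and mask above):

    #include <cassert>

    // Pack a register-class id (top nibble) and a virtual register index
    // (low 28 bits) the way printRegName() above unpacks them.
    static unsigned encodeVReg(unsigned RCId, unsigned Index) {
      assert(RCId <= 8 && Index <= 0x0FFFFFFFu && "fields out of range");
      return (RCId << 28) | Index; // e.g. (3u << 28) | 7u prints as "%r7"
    }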
+
+void NVPTXInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                 StringRef Annot, const MCSubtargetInfo &STI) {
+  printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+}
+
+void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    printRegName(O, Reg);
+  } else if (Op.isImm()) {
+    O << markup("<imm:") << formatImm(Op.getImm()) << markup(">");
+  } else {
+    assert(Op.isExpr() && "Unknown operand kind in printOperand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  int64_t Imm = MO.getImm();
+
+  if (strcmp(Modifier, "ftz") == 0) {
+    // FTZ flag
+    if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG)
+      O << ".ftz";
+  } else if (strcmp(Modifier, "sat") == 0) {
+    // SAT flag
+    if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
+      O << ".sat";
+  } else if (strcmp(Modifier, "base") == 0) {
+    // Default operand
+    switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCvtMode::NONE:
+      break;
+    case NVPTX::PTXCvtMode::RNI:
+      O << ".rni";
+      break;
+    case NVPTX::PTXCvtMode::RZI:
+      O << ".rzi";
+      break;
+    case NVPTX::PTXCvtMode::RMI:
+      O << ".rmi";
+      break;
+    case NVPTX::PTXCvtMode::RPI:
+      O << ".rpi";
+      break;
+    case NVPTX::PTXCvtMode::RN:
+      O << ".rn";
+      break;
+    case NVPTX::PTXCvtMode::RZ:
+      O << ".rz";
+      break;
+    case NVPTX::PTXCvtMode::RM:
+      O << ".rm";
+      break;
+    case NVPTX::PTXCvtMode::RP:
+      O << ".rp";
+      break;
+    }
+  } else {
+    llvm_unreachable("Invalid conversion modifier");
+  }
+}
+
+void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  int64_t Imm = MO.getImm();
+
+  if (strcmp(Modifier, "ftz") == 0) {
+    // FTZ flag
+    if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG)
+      O << ".ftz";
+  } else if (strcmp(Modifier, "base") == 0) {
+    switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCmpMode::EQ:
+      O << ".eq";
+      break;
+    case NVPTX::PTXCmpMode::NE:
+      O << ".ne";
+      break;
+    case NVPTX::PTXCmpMode::LT:
+      O << ".lt";
+      break;
+    case NVPTX::PTXCmpMode::LE:
+      O << ".le";
+      break;
+    case NVPTX::PTXCmpMode::GT:
+      O << ".gt";
+      break;
+    case NVPTX::PTXCmpMode::GE:
+      O << ".ge";
+      break;
+    case NVPTX::PTXCmpMode::LO:
+      O << ".lo";
+      break;
+    case NVPTX::PTXCmpMode::LS:
+      O << ".ls";
+      break;
+    case NVPTX::PTXCmpMode::HI:
+      O << ".hi";
+      break;
+    case NVPTX::PTXCmpMode::HS:
+      O << ".hs";
+      break;
+    case NVPTX::PTXCmpMode::EQU:
+      O << ".equ";
+      break;
+    case NVPTX::PTXCmpMode::NEU:
+      O << ".neu";
+      break;
+    case NVPTX::PTXCmpMode::LTU:
+      O << ".ltu";
+      break;
+    case NVPTX::PTXCmpMode::LEU:
+      O << ".leu";
+      break;
+    case NVPTX::PTXCmpMode::GTU:
+      O << ".gtu";
+      break;
+    case NVPTX::PTXCmpMode::GEU:
+      O << ".geu";
+      break;
+    case NVPTX::PTXCmpMode::NUM:
+      O << ".num";
+      break;
+    case NVPTX::PTXCmpMode::NotANumber:
+      O << ".nan";
+      break;
+    }
+  } else {
+    llvm_unreachable("Empty Modifier");
+  }
+}
+
+void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
+                                     raw_ostream &O, const char *Modifier) {
+  if (Modifier) {
+    const MCOperand &MO = MI->getOperand(OpNum);
+    int Imm = (int) MO.getImm();
+    if (!strcmp(Modifier, "volatile")) {
+      if (Imm)
+        O << ".volatile";
+    } else if (!strcmp(Modifier, "addsp")) {
+      switch (Imm) {
+      case NVPTX::PTXLdStInstCode::GLOBAL:
+        O << ".global";
+        break;
+      case NVPTX::PTXLdStInstCode::SHARED:
+        O << ".shared";
+        break;
+      case NVPTX::PTXLdStInstCode::LOCAL:
+        O << ".local";
+        break;
+      case NVPTX::PTXLdStInstCode::PARAM:
+        O << ".param";
+        break;
+      case NVPTX::PTXLdStInstCode::CONSTANT:
+        O << ".const";
+        break;
+      case NVPTX::PTXLdStInstCode::GENERIC:
+        break;
+      default:
+        llvm_unreachable("Wrong Address Space");
+      }
+    } else if (!strcmp(Modifier, "sign")) {
+      if (Imm == NVPTX::PTXLdStInstCode::Signed)
+        O << "s";
+      else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
+        O << "u";
+      else if (Imm == NVPTX::PTXLdStInstCode::Untyped)
+        O << "b";
+      else if (Imm == NVPTX::PTXLdStInstCode::Float)
+        O << "f";
+      else
+        llvm_unreachable("Unknown register type");
+    } else if (!strcmp(Modifier, "vec")) {
+      if (Imm == NVPTX::PTXLdStInstCode::V2)
+        O << ".v2";
+      else if (Imm == NVPTX::PTXLdStInstCode::V4)
+        O << ".v4";
+    } else
+      llvm_unreachable("Unknown Modifier");
+  } else
+    llvm_unreachable("Empty Modifier");
+}
+
+void NVPTXInstPrinter::printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  int Imm = (int)MO.getImm();
+  if (Modifier == nullptr || strcmp(Modifier, "version") == 0) {
+    O << Imm; // Just print out PTX version
+  } else if (strcmp(Modifier, "aligned") == 0) {
+    // PTX63 requires '.aligned' in the name of the instruction.
+    if (Imm >= 63)
+      O << ".aligned";
+  } else
+    llvm_unreachable("Unknown Modifier");
+}
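The effect of printMmaCode() is easiest to see on an instruction name: the operand carries the PTX version, and from PTX 6.3 (Imm >= 63) the mandatory `.aligned` infix is printed, e.g. `wmma.mma.sync.aligned.row.col.m16n16k16.f32.f32`, while a PTX 6.1 target gets the un-suffixed spelling. Called directly (illustrative driver, not in the patch):

    void demoMmaInfix(NVPTXInstPrinter &IP, const MCInst &MI, raw_ostream &OS) {
      // Prints ".aligned" iff operand 0 holds a PTX version of 63 or newer.
      IP.printMmaCode(&MI, /*OpNum=*/0, OS, "aligned");
    }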
+
+void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  printOperand(MI, OpNum, O);
+
+  if (Modifier && !strcmp(Modifier, "add")) {
+    O << ", ";
+    printOperand(MI, OpNum + 1, O);
+  } else {
+    if (MI->getOperand(OpNum + 1).isImm() &&
+        MI->getOperand(OpNum + 1).getImm() == 0)
+      return; // don't print ',0' or '+0'
+    O << "+";
+    printOperand(MI, OpNum + 1, O);
+  }
+}
+
+void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  assert(Op.isExpr() && "Call prototype is not an MCExpr?");
+  const MCExpr *Expr = Op.getExpr();
+  const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol();
+  O << Sym.getName();
+}
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
new file mode 100644
index 000000000000..c38472925a29
--- /dev/null
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -0,0 +1,53 @@
+//= NVPTXInstPrinter.h - Convert NVPTX MCInst to assembly syntax --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an NVPTX MCInst to .ptx file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXINSTPRINTER_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCSubtargetInfo;
+
+class NVPTXInstPrinter : public MCInstPrinter {
+public:
+  NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                   const MCRegisterInfo &MRI);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  // End
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void printLdStCode(const MCInst *MI, int OpNum,
+                     raw_ostream &O, const char *Modifier = nullptr);
+  void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void printMemOperand(const MCInst *MI, int OpNum,
+                       raw_ostream &O, const char *Modifier = nullptr);
+  void printProtoIdent(const MCInst *MI, int OpNum,
+                       raw_ostream &O, const char *Modifier = nullptr);
+};
+
+}
+
+#endif
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index f6cbd23f01c4..556745825a15 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -1,9 +1,8 @@
 //===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -38,12 +37,11 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
   HiddenDeclarationVisibilityAttr = HiddenVisibilityAttr = MCSA_Invalid;
   ProtectedVisibilityAttr = MCSA_Invalid;
 
-  // FIXME: remove comment once debug info is properly supported.
-  Data8bitsDirective = "// .b8 ";
+  Data8bitsDirective = ".b8 ";
   Data16bitsDirective = nullptr; // not supported
-  Data32bitsDirective = "// .b32 ";
-  Data64bitsDirective = "// .b64 ";
-  ZeroDirective = "// .b8";
+  Data32bitsDirective = ".b32 ";
+  Data64bitsDirective = ".b64 ";
+  ZeroDirective = ".b8";
   AsciiDirective = nullptr; // not supported
   AscizDirective = nullptr; // not supported
   SupportsQuotedNames = false;
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 9fd7600cf67f..e888526da898 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -1,9 +1,8 @@
 //===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index b1a77a17ec15..c8b85b2718a6 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -1,9 +1,8 @@
 //===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -11,10 +10,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstPrinter/NVPTXInstPrinter.h"
+#include "NVPTXInstPrinter.h"
 #include "NVPTXMCAsmInfo.h"
 #include "NVPTXMCTargetDesc.h"
 #include "NVPTXTargetStreamer.h"
+#include "TargetInfo/NVPTXTargetInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
index 0c9ad977e7ec..e1691d2384e6 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -1,9 +1,8 @@
 //===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -19,9 +18,6 @@
 namespace llvm {
 class Target;
 
-Target &getTheNVPTXTarget32();
-Target &getTheNVPTXTarget64();
-
 } // End llvm namespace
 
 // Defines symbolic names for PTX registers.
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index f7b4cf3a0f72..17f5ba7d900b 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -1,9 +1,8 @@
 //=====- NVPTXTargetStreamer.cpp - NVPTXTargetStreamer class ------------=====//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -31,6 +30,11 @@ void NVPTXTargetStreamer::outputDwarfFileDirectives() {
   DwarfFiles.clear();
 }
 
+void NVPTXTargetStreamer::closeLastSection() {
+  if (HasSections)
+    getStreamer().EmitRawText("\t}");
+}
+
 void NVPTXTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
   DwarfFiles.emplace_back(Directive);
 }
@@ -82,22 +86,27 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
                                         raw_ostream &OS) {
   assert(!SubSection && "SubSection is not null!");
   const MCObjectFileInfo *FI = getStreamer().getContext().getObjectFileInfo();
-  // FIXME: remove comment once debug info is properly supported.
   // Emit closing brace for DWARF sections only.
   if (isDwarfSection(FI, CurSection))
-    OS << "//\t}\n";
+    OS << "\t}\n";
 
   if (isDwarfSection(FI, Section)) {
     // Emit DWARF .file directives in the outermost scope.
    outputDwarfFileDirectives();
-    OS << "//\t.section";
+    OS << "\t.section";
     Section->PrintSwitchToSection(*getStreamer().getContext().getAsmInfo(),
                                   FI->getTargetTriple(), OS, SubSection);
     // DWARF sections are enclosed into braces - emit the open one.
-    OS << "//\t{\n";
+    OS << "\t{\n";
+    HasSections = true;
   }
 }
 
 void NVPTXTargetStreamer::emitRawBytes(StringRef Data) {
+  MCTargetStreamer::emitRawBytes(Data);
+  // TODO: enable this once the bug in the ptxas with the packed bytes is
+  // resolved. Currently, (it is confirmed by NVidia) it causes a crash in
+  // ptxas.
+#if 0
   const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
   const char *Directive = MAI->getData8bitsDirective();
   unsigned NumElements = Data.size();
@@ -121,5 +130,6 @@ void NVPTXTargetStreamer::emitRawBytes(StringRef Data) {
     }
     Streamer.EmitRawText(OS.str());
   }
+#endif
 }
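Because PTX encloses each DWARF section in braces, changeSection() emits the closing `}` only when switching *away* from a DWARF section; the new closeLastSection() covers the final section, which nothing switches away from. A sketch of the finalization order this enables (mirroring NVPTXAsmPrinter::doFinalization later in this patch; helper name illustrative):

    // Module finalization with debug info (cf. doFinalization below):
    void finishDebugSections(NVPTXTargetStreamer &TS, MCStreamer &OS) {
      TS.closeLastSection();                          // close the last "{ ... }"
      OS.EmitRawText("\t.section\t.debug_loc\t{\t}"); // empty .debug_loc stub
    }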
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -19,6 +18,7 @@ class MCSection; class NVPTXTargetStreamer : public MCTargetStreamer { private: SmallVector DwarfFiles; + bool HasSections = false; public: NVPTXTargetStreamer(MCStreamer &S); @@ -26,6 +26,8 @@ public: /// Outputs the list of the DWARF '.file' directives to the streamer. void outputDwarfFileDirectives(); + /// Close last section. + void closeLastSection(); /// Record DWARF file directives for later output. /// According to PTX ISA, CUDA Toolkit documentation, 11.5.3. Debugging diff --git a/lib/Target/NVPTX/ManagedStringPool.h b/lib/Target/NVPTX/ManagedStringPool.h index 7fc0156216f5..bbcbb4598040 100644 --- a/lib/Target/NVPTX/ManagedStringPool.h +++ b/lib/Target/NVPTX/ManagedStringPool.h @@ -1,9 +1,8 @@ //===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 07bfc58a8da7..6530c40ea100 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -1,9 +1,8 @@ //===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,14 +14,8 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTX_H #define LLVM_LIB_TARGET_NVPTX_NVPTX_H -#include "MCTargetDesc/NVPTXBaseInfo.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetMachine.h" -#include -#include +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" namespace llvm { class NVPTXTargetMachine; @@ -55,9 +48,6 @@ BasicBlockPass *createNVPTXLowerAllocaPass(); MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); -Target &getTheNVPTXTarget32(); -Target &getTheNVPTXTarget64(); - namespace NVPTX { enum DrvInterface { NVCL, diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td index 3731b2f37f6c..1d947ef1ce62 100644 --- a/lib/Target/NVPTX/NVPTX.td +++ b/lib/Target/NVPTX/NVPTX.td @@ -1,9 +1,8 @@ //===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This is the top level entry point for the NVPTX target. @@ -76,6 +75,8 @@ def PTX61 : SubtargetFeature<"ptx61", "PTXVersion", "61", "Use PTX version 6.1">; def PTX63 : SubtargetFeature<"ptx63", "PTXVersion", "63", "Use PTX version 6.3">; +def PTX64 : SubtargetFeature<"ptx64", "PTXVersion", "64", + "Use PTX version 6.4">; //===----------------------------------------------------------------------===// // NVPTX supported processors. diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp index bf922eb8a195..f2c7751df1df 100644 --- a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp +++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp @@ -1,9 +1,8 @@ //===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h index 7a6fc7d9b14d..d7de8e3a2f46 100644 --- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h +++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h @@ -1,9 +1,8 @@ //===-- AllocaHoisting.h - Hosist allocas to the entry block ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 6284ad8b82e8..5f38b4a3c4c5 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "NVPTXAsmPrinter.h" -#include "InstPrinter/NVPTXInstPrinter.h" #include "MCTargetDesc/NVPTXBaseInfo.h" +#include "MCTargetDesc/NVPTXInstPrinter.h" #include "MCTargetDesc/NVPTXMCAsmInfo.h" #include "MCTargetDesc/NVPTXTargetStreamer.h" #include "NVPTX.h" @@ -24,6 +23,7 @@ #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" +#include "TargetInfo/NVPTXTargetInfo.h" #include "cl_common_defines.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -473,6 +473,9 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { // Emit open brace for function body. OutStreamer->EmitRawText(StringRef("{\n")); setAndEmitFunctionVirtualRegisters(*MF); + // Emit initial .loc debug directive for correct relocation symbol data. + if (MMI && MMI->hasDebugInfo()) + emitInitialRawDwarfLocDirective(*MF); } bool NVPTXAsmPrinter::runOnMachineFunction(MachineFunction &F) { @@ -597,36 +600,6 @@ void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, O << getVirtualRegisterName(vr); } -void NVPTXAsmPrinter::printVecModifiedImmediate( - const MachineOperand &MO, const char *Modifier, raw_ostream &O) { - static const char vecelem[] = { '0', '1', '2', '3', '0', '1', '2', '3' }; - int Imm = (int) MO.getImm(); - if (0 == strcmp(Modifier, "vecelem")) - O << "_" << vecelem[Imm]; - else if (0 == strcmp(Modifier, "vecv4comm1")) { - if ((Imm < 0) || (Imm > 3)) - O << "//"; - } else if (0 == strcmp(Modifier, "vecv4comm2")) { - if ((Imm < 4) || (Imm > 7)) - O << "//"; - } else if (0 == strcmp(Modifier, "vecv4pos")) { - if (Imm < 0) - Imm = 0; - O << "_" << vecelem[Imm % 4]; - } else if (0 == strcmp(Modifier, "vecv2comm1")) { - if ((Imm < 0) || (Imm > 1)) - O << "//"; - } else if (0 == strcmp(Modifier, "vecv2comm2")) { - if ((Imm < 2) || (Imm > 3)) - O << "//"; - } else if (0 == strcmp(Modifier, "vecv2pos")) { - if (Imm < 0) - Imm = 0; - O << "_" << vecelem[Imm % 2]; - } else - llvm_unreachable("Unknown Modifier on immediate operand"); -} - void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) { emitLinkageDirective(F, O); if (isKernelFunction(*F)) @@ -899,9 +872,8 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O, if (HasFullDebugInfo) break; } - // FIXME: remove comment once debug info is properly supported. if (MMI && MMI->hasDebugInfo() && HasFullDebugInfo) - O << "//, debug"; + O << ", debug"; O << "\n"; @@ -952,10 +924,13 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { clearAnnotationCache(&M); delete[] gv_array; - // FIXME: remove comment once debug info is properly supported. // Close the last emitted section - if (HasDebugInfo) - OutStreamer->EmitRawText("//\t}"); + if (HasDebugInfo) { + static_cast(OutStreamer->getTargetStreamer()) + ->closeLastSection(); + // Emit empty .debug_loc section for better support of the empty files. + OutStreamer->EmitRawText("\t.section\t.debug_loc\t{\t}"); + } // Output last DWARF .file directives, if any. static_cast(OutStreamer->getTargetStreamer()) @@ -2199,7 +2174,6 @@ void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) { /// PrintAsmOperand - Print out an operand for an inline asm expression. 
/// bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) @@ -2208,7 +2182,7 @@ bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, switch (ExtraCode[0]) { default: // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); case 'r': break; } @@ -2219,9 +2193,10 @@ bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return false; } -bool NVPTXAsmPrinter::PrintAsmMemoryOperand( - const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O) { +bool NVPTXAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + const char *ExtraCode, + raw_ostream &O) { if (ExtraCode && ExtraCode[0]) return true; // Unknown modifier @@ -2233,7 +2208,7 @@ bool NVPTXAsmPrinter::PrintAsmMemoryOperand( } void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, - raw_ostream &O, const char *Modifier) { + raw_ostream &O) { const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { case MachineOperand::MO_Register: @@ -2245,29 +2220,23 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, } else { emitVirtualRegister(MO.getReg(), O); } - return; + break; case MachineOperand::MO_Immediate: - if (!Modifier) - O << MO.getImm(); - else if (strstr(Modifier, "vec") == Modifier) - printVecModifiedImmediate(MO, Modifier, O); - else - llvm_unreachable( - "Don't know how to handle modifier on immediate operand"); - return; + O << MO.getImm(); + break; case MachineOperand::MO_FPImmediate: printFPConstant(MO.getFPImm(), O); break; case MachineOperand::MO_GlobalAddress: - getSymbol(MO.getGlobal())->print(O, MAI); + PrintSymbolOperand(MO, O); break; case MachineOperand::MO_MachineBasicBlock: MO.getMBB()->getSymbol()->print(O, MAI); - return; + break; default: llvm_unreachable("Operand type not supported."); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 44a09f5fe513..43ae57ac1262 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -1,9 +1,8 @@ //===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer ----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
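The `'r'` case services NVPTX inline-assembly register operands: %0 in the template must render as the virtual register name produced by printOperand(). The kind of source this handles, in plain C++ (illustrative; the `__device__` qualifier of real CUDA code is elided):

    // "=r" makes %0 a 32-bit register operand, which reaches
    // NVPTXAsmPrinter::PrintAsmOperand's 'r' case when printed.
    int laneId() {
      int Lane;
      asm("mov.u32 %0, %%laneid;" : "=r"(Lane));
      return Lane;
    }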
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 44a09f5fe513..43ae57ac1262 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -1,9 +1,8 @@
 //===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer ----------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -213,8 +212,6 @@ private:
   MCOperand GetSymbolRef(const MCSymbol *Symbol);
   unsigned encodeVirtualRegister(unsigned Reg);
 
-  void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
-                                 raw_ostream &O);
   void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
                        const char *Modifier = nullptr);
   void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
@@ -231,13 +228,10 @@ private:
   void printReturnValStr(const Function *, raw_ostream &O);
   void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
-                       unsigned AsmVariant, const char *ExtraCode,
-                       raw_ostream &) override;
-  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
-                    const char *Modifier = nullptr);
+                       const char *ExtraCode, raw_ostream &) override;
+  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
   bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
-                             unsigned AsmVariant, const char *ExtraCode,
-                             raw_ostream &) override;
+                             const char *ExtraCode, raw_ostream &) override;
   const MCExpr *lowerConstantForGV(const Constant *CV, bool ProcessingGeneric);
   void printMCExpr(const MCExpr &Expr, raw_ostream &OS);
diff --git a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
index 41e9ae827180..a8a43cee9ab7 100644
--- a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -1,9 +1,8 @@
 //===-- NVPTXAssignValidGlobalNames.cpp - Assign valid names to globals ---===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index e5e6637967b2..46f08b23d31a 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -1,9 +1,8 @@
 //=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 0a7856b9d5de..40269f58f06e 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -1,9 +1,8 @@
 //===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index fd63fdbaced6..b36d9b2e240a 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -1,9 +1,8 @@
 //===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ffc6a59cd6c8..3d2447d75c77 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -13,6 +12,7 @@
 
 #include "NVPTXISelDAGToDAG.h"
 #include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
@@ -702,11 +702,11 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
   // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly
   // because the former looks through phi nodes while the latter does not. We
   // need to look through phi nodes to handle pointer induction variables.
-  SmallVector<Value *, 8> Objs;
-  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
+  SmallVector<const Value *, 8> Objs;
+  GetUnderlyingObjects(N->getMemOperand()->getValue(),
                        Objs, F->getDataLayout());
 
-  return all_of(Objs, [&](Value *V) {
+  return all_of(Objs, [&](const Value *V) {
     if (auto *A = dyn_cast<const Argument>(V))
       return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
     if (auto *GV = dyn_cast<const GlobalVariable>(V))
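The phi-walking in GetUnderlyingObjects() is what lets a pointer induction variable trace back to a read-only, noalias kernel argument, so loads through it can still be turned into ld.global.nc. Source-level shape of that pattern, in plain C++ (illustrative):

    // 'p' becomes a phi of 'in' in IR; every underlying object of the load
    // is the __restrict (noalias), read-only argument, so canLowerToLDG()
    // can still succeed even though the load's pointer is not 'in' itself.
    float sumReadOnly(const float *__restrict in, int n) {
      float acc = 0.0f;
      for (const float *p = in; p != in + n; ++p) // pointer induction variable
        acc += *p;
      return acc;
    }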
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index e911ba0c167d..e4e5069b7a80 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -1,9 +1,8 @@
 //===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -18,6 +17,7 @@
 #include "NVPTXISelLowering.h"
 #include "NVPTXRegisterInfo.h"
 #include "NVPTXTargetMachine.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/Compiler.h"
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bec8ece29050..ae1aa98da0e8 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1,9 +1,8 @@
 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -547,13 +546,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
 
   // These map to conversion instructions for scalar FP types.
   for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
-                         ISD::FROUND, ISD::FTRUNC}) {
+                         ISD::FTRUNC}) {
     setOperationAction(Op, MVT::f16, Legal);
     setOperationAction(Op, MVT::f32, Legal);
     setOperationAction(Op, MVT::f64, Legal);
     setOperationAction(Op, MVT::v2f16, Expand);
   }
 
+  setOperationAction(ISD::FROUND, MVT::f16, Promote);
+  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::f32, Custom);
+  setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
+  // 'Expand' implements FCOPYSIGN without calling an external library.
   setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
@@ -1503,7 +1508,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
       // New store.
       if (VectorInfo[j] & PVF_FIRST) {
-        assert(StoreOperands.empty() && "Unfinished preceeding store.");
+        assert(StoreOperands.empty() && "Unfinished preceding store.");
         StoreOperands.push_back(Chain);
         StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
         StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
@@ -2069,6 +2074,100 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
   }
 }
 
+SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFROUND32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFROUND64(Op, DAG);
+
+  llvm_unreachable("unhandled type");
+}
+
+// This is the rounding method used in CUDA libdevice in C-like code:
+// float roundf(float A)
+// {
+//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
+//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
+//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
+// }
+SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue A = Op.getOperand(0);
+  EVT VT = Op.getValueType();
+
+  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
+
+  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
+  const int SignBitMask = 0x80000000;
+  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
+                             DAG.getConstant(SignBitMask, SL, MVT::i32));
+  const int PointFiveInBits = 0x3F000000;
+  SDValue PointFiveWithSignRaw =
+      DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
+                  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
+  SDValue PointFiveWithSign =
+      DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
+  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
+  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
+
+  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
+  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  SDValue IsLarge =
+      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
+                   ISD::SETOGT);
+  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
+
+  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
+  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
+                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
+  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
+  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
+}
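A plain-C++ reference model of the three regions handled above, useful for sanity-checking the DAG construction (assumes the same libdevice semantics; not part of the patch):

    #include <cmath>

    float refRoundf(float A) {
      // Region 2 (0.5 <= |A| <= 2^23): add signed 0.5, then truncate.
      float RoundedA = truncf(A + copysignf(0.5f, A));
      // Region 3 (|A| > 2^23): already integral, pass A through.
      if (fabsf(A) > 8388608.0f) // 2^23
        RoundedA = A;
      // Region 1 (|A| < 0.5): truncate so the sign of zero survives.
      return fabsf(A) < 0.5f ? truncf(A) : RoundedA;
    }
    // refRoundf(2.5f) == 3.0f and refRoundf(-2.5f) == -3.0f (ties rounded
    // away from zero), refRoundf(-0.3f) == -0.0f (sign of zero preserved).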
+  SDValue IsLarge =
+      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
+                   ISD::SETOGT);
+  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
+}
+
+
 SDValue
 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -2099,6 +2198,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerShiftRightParts(Op, DAG);
   case ISD::SELECT:
     return LowerSelect(Op, DAG);
+  case ISD::FROUND:
+    return LowerFROUND(Op, DAG);
   default:
     llvm_unreachable("Custom lowering not defined for operation");
   }
@@ -2130,7 +2231,7 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   LoadSDNode *Load = cast<LoadSDNode>(Op);
   EVT MemVT = Load->getMemoryVT();
   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
-                          Load->getAddressSpace(), Load->getAlignment())) {
+                          *Load->getMemOperand())) {
     SDValue Ops[2];
     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
     return DAG.getMergeValues(Ops, SDLoc(Op));
@@ -2173,7 +2274,7 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   // stores and have to handle it here.
   if (VT == MVT::v2f16 &&
       !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
-                          Store->getAddressSpace(), Store->getAlignment()))
+                          *Store->getMemOperand()))
     return expandUnalignedStore(Store, DAG);
 
   if (VT.isVector())
@@ -3399,6 +3500,94 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
     Info.align = 16;
     return true;
   }
+  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
+  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::v2i32;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.flags = MachineMemOperand::MOLoad;
+    Info.align = 8;
+    return true;
+  }
+
+  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
+  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
+  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
+  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
+  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
+  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
+  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
+  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
+
+  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
+  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
+  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
+  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
+  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
+  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
+  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
+  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
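
For reference, the three-region scheme built in LowerFROUND32 above is easier to check in scalar form. The following is a minimal standalone C++ sketch of the same logic, my reconstruction rather than part of the patch, and EmulatedRoundf is a made-up name:

#include <cassert>
#include <cmath>

// Scalar model of the DAG built above: copy A's sign onto 0.5, add,
// truncate, then patch up the two special regions.
static float EmulatedRoundf(float A) {
  float RoundedA = std::trunc(A + std::copysign(0.5f, A));
  if (std::fabs(A) > 0x1.0p23f)
    RoundedA = A;             // already integral; the FADD would perturb it
  if (std::fabs(A) < 0.5f)
    RoundedA = std::trunc(A); // keeps the sign of zero, and guards the case
                              // where A + 0.5f rounds up across 1.0
  return RoundedA;
}

int main() {
  assert(EmulatedRoundf(2.5f) == 3.0f);   // ties round away from zero,
  assert(EmulatedRoundf(-2.5f) == -3.0f); // unlike cvt.rni's ties-to-even
  assert(std::signbit(EmulatedRoundf(-0.25f)));
}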
Info.memVT = MVT::v4i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + } + + case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: + + case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: + case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: + case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: + case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: + case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: + case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: + case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: + case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: + case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: + case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: + case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: + case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: + case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 4; + return true; + } case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: @@ -3442,6 +3631,44 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return true; } + case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: + case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: + case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: + case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::v8i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + } + + case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: + case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: + case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: + case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: + case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: + case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: + case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: + case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::v2i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + 
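
In the case blocks above and below, memVT and align follow directly from the per-thread fragment size: integer WMMA fragments travel in i32 registers, so a one-register fragment (s4/u4/b1) is modeled as an i32 access with 4-byte alignment, two registers (for example the s8/u8 A/B fragments of m16n16k16) as v2i32 with 8-byte alignment, four registers as v4i32 with 16-byte alignment, and the eight-register s32 accumulators as v8i32. A hypothetical helper expressing that rule, purely illustrative and not how the switch is actually written:

// Illustrative mapping from i32-register count to the MachineMemOperand
// info used in the cases above; alignment is capped at 16 bytes.
struct FragMemInfo { const char *MemVT; unsigned Align; };
static FragMemInfo infoForI32Frag(unsigned NumRegs) {
  switch (NumRegs) {
  case 1:  return {"MVT::i32", 4};
  case 2:  return {"MVT::v2i32", 8};
  case 4:  return {"MVT::v4i32", 16};
  case 8:  return {"MVT::v8i32", 16};
  default: return {nullptr, 0}; // no such integer fragment
  }
}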
Info.flags = MachineMemOperand::MOLoad; + Info.align = 8; + return true; + } + case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: @@ -3484,8 +3711,44 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return true; } - case Intrinsic::nvvm_atomic_load_add_f32: - case Intrinsic::nvvm_atomic_load_add_f64: + case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: + case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: + case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: + case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::v8i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOStore; + Info.align = 16; + return true; + } + + case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: + case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: + case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: + case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: + case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: + case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: + case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: + case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::v2i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOStore; + Info.align = 8; + return true; + } + case Intrinsic::nvvm_atomic_load_inc_32: case Intrinsic::nvvm_atomic_load_dec_32: diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index 66fab2b6f480..ef645fc1e541 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -1,9 +1,8 @@ //===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -557,6 +556,10 @@ private: SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index ad1d7cbb52fc..74ab2f7b8453 100644 --- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -1,9 +1,8 @@ //===-- NVPTXImageOptimizer.cpp - Image optimization pass -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXInstrFormats.td b/lib/Target/NVPTX/NVPTXInstrFormats.td index ffcb5d5273a2..77961c386827 100644 --- a/lib/Target/NVPTX/NVPTXInstrFormats.td +++ b/lib/Target/NVPTX/NVPTXInstrFormats.td @@ -1,9 +1,8 @@ //===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 50815bff6c67..f928b44c91e0 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -1,9 +1,8 @@ //===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h index 4ab1bb481958..7c0912808f7b 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -1,9 +1,8 @@ //===- NVPTXInstrInfo.h - NVPTX Instruction Information----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the niversity of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 //
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 02a40b9f5262..62da3c79f465 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1,9 +1,8 @@
 //===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -143,9 +142,12 @@ def true : Predicate<"true">;
 def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
 def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
 def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
+def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
 
 def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
 def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
+def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
+def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
 
 def useShortPtr : Predicate<"useShortPointers()">;
 def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
@@ -1549,6 +1551,10 @@ def LdStCode : Operand<i32> {
   let PrintMethod = "printLdStCode";
 }
 
+def MmaCode : Operand<i32> {
+  let PrintMethod = "printMmaCode";
+}
+
 def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
 def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
@@ -3003,15 +3009,6 @@ def : Pat<(ffloor Float32Regs:$a),
 def : Pat<(ffloor Float64Regs:$a),
           (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
-def : Pat<(f16 (fround Float16Regs:$a)),
-          (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
-def : Pat<(fround Float32Regs:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fround Float32Regs:$a)),
-          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(f64 (fround Float64Regs:$a)),
-          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
-
 def : Pat<(ftrunc Float16Regs:$a),
           (CVT_f16_f16 Float16Regs:$a, CvtRZI)>;
 def : Pat<(ftrunc Float32Regs:$a),
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 47dcdcf6e0bd..1752d3e0575e 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1,9 +1,8 @@
 //===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
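
For context on the NVPTXInstrInfo.td hunk above: the removed patterns lowered fround with cvt.rni, which rounds ties to the nearest even value, while ISD::FROUND is defined to round ties away from zero. That mismatch is why f32/f64 now take the custom LowerFROUND path added earlier in this patch. A standalone illustration of the difference, assuming the default FE_TONEAREST environment:

#include <cmath>
#include <cstdio>

int main() {
  // std::round has ISD::FROUND semantics: ties away from zero.
  std::printf("round(2.5)     = %g\n", std::round(2.5));     // 3
  // std::nearbyint models cvt.rni: ties to even.
  std::printf("nearbyint(2.5) = %g\n", std::nearbyint(2.5)); // 2
  std::printf("nearbyint(3.5) = %g\n", std::nearbyint(3.5)); // 4
  return 0;
}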
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -27,7 +26,35 @@ def immDouble1 : PatLeaf<(fpimm), [{ return (d==1.0); }]>; +def AS_match { + code generic = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC); + }]; + code shared = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED); + }]; + code global = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); + }]; +} +// A node that will be replaced with the current PTX version. +class PTX { + SDNodeXForm PTXVerXform = SDNodeXFormgetPTXVersion(), SDLoc(N)); + }]>; + // (i32 0) will be XForm'ed to the currently used PTX version. + dag version = (PTXVerXform (i32 0)); +} +def ptx : PTX; + +// Generates list of n sequential register names. +// E.g. RegNames<3,"r">.ret -> ["r0", "r1", "r2" ] +class RegSeq { + list ret = !if(n, !listconcat(RegSeq.ret, + [prefix # !add(n, -1)]), + []); +} //----------------------------------- // Synchronization and shuffle functions @@ -1007,17 +1034,11 @@ def INT_FNS_iii : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, i32imm:$ //----------------------------------- class ATOMIC_GLOBAL_CHK - : PatFrag; + : PatFrag; class ATOMIC_SHARED_CHK - : PatFrag; + : PatFrag; class ATOMIC_GENERIC_CHK - : PatFrag; + : PatFrag; multiclass F_ATOMIC_2_imp; def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_add_64 node:$a, node:$b)>; -def atomic_load_add_f32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; -def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; -def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; -def atomic_load_add_f64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>; -def atomic_load_add_f64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>; -def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>; +def atomic_load_add_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_fadd node:$a, node:$b)>; +def atomic_load_add_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_fadd node:$a, node:$b)>; +def atomic_load_add_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_fadd node:$a, node:$b)>; defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2; @@ -1145,18 +1160,18 @@ defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2; defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2; + atomic_load_add_g, f32imm, fpimm>; defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2; + atomic_load_add_s, f32imm, fpimm>; defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2; + atomic_load_add_gen, f32imm, fpimm>; defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2; + atomic_load_add_g, f64imm, fpimm, [hasAtomAddF64]>; defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2; + atomic_load_add_s, f64imm, fpimm, [hasAtomAddF64]>; defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2; + atomic_load_add_gen, f64imm, fpimm, [hasAtomAddF64]>; // atom_sub @@ -7381,383 +7396,258 @@ def INT_PTX_SREG_WARPSIZE : NVPTXInst<(outs Int32Regs:$dst), (ins), "mov.u32 \t$dst, WARP_SZ;", [(set Int32Regs:$dst, (int_nvvm_read_ptx_sreg_warpsize))]>; +// Helper class that represents a 'fragment' of an NVPTX *MMA instruction. 
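
A note on the RegSeq helper introduced in the hunk above: its angle-bracketed template arguments were lost in extraction, but upstream it is declared as RegSeq<int n, string prefix> and recursively builds the register-name list the adjacent comment describes. The same expansion written in C++, purely as an illustration:

#include <string>
#include <vector>

// C++ analogue of RegSeq<n, prefix>: regNames(3, "r") == {"r0", "r1", "r2"},
// matching the "RegNames<3,\"r\">.ret" example in the comment above.
static std::vector<std::string> regNames(int N, const std::string &Prefix) {
  std::vector<std::string> Names;
  for (int I = 0; I < N; ++I)
    Names.push_back(Prefix + std::to_string(I));
  return Names;
}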
+// In addition to target-independent fields provided by WMMA_REGS, it adds +// the fields commonly used to implement specific PTX instruction -- register +// types and names, constraints, parts of assembly, etc. +class WMMA_REGINFO + : WMMA_REGS { + // NVPTX register types used to carry fragment data. + NVPTXRegClass regclass = !cond( + !eq(ptx_elt_type, "f16") : Float16x2Regs, + !eq(ptx_elt_type, "f32") : Float32Regs, + !eq(ptx_elt_type, "s32") : Int32Regs, + !eq(ptx_elt_type, "s8") : Int32Regs, + !eq(ptx_elt_type, "u8") : Int32Regs, + !eq(ptx_elt_type, "s4") : Int32Regs, + !eq(ptx_elt_type, "u4") : Int32Regs, + !eq(ptx_elt_type, "b1") : Int32Regs); + + // Instruction input/output arguments for the fragment. + list ptx_regs = !foreach(tmp, regs, regclass); + + // List of register names for the fragment -- ["ra0", "ra1",...] + list reg_names = RegSeq.ret; + + // Generates "{{$r0, $r1,.... $rN-1}}" for use in asm string construction. + string regstring = "{{$" # !head(reg_names) + # !foldl("", !tail(reg_names), a, b, + !strconcat(a, ", $", b)) + # "}}"; + + // Predicates for particular fragment variant. Technically those are + // per-instruction predicates, but currently all fragments that can be used in + // a given instruction are subject to the same constraints, so an instruction + // can use predicates from any of its fragments. If/when this is no + // longer the case, we can concat all per-fragment predicates to enforce that + // all fragments of the instruction are viable. + list Predicates = !cond( + // fp16 -> fp16/fp32 @ m16n16k16 + !and(!eq(geom, "m16n16k16"), + !or(!eq(ptx_elt_type, "f16"), + !eq(ptx_elt_type, "f32"))) : [hasSM70, hasPTX60], + + // fp16 -> fp16/fp32 @ m8n32k16/m32n8k16 + !and(!or(!eq(geom, "m8n32k16"), + !eq(geom, "m32n8k16")), + !or(!eq(ptx_elt_type, "f16"), + !eq(ptx_elt_type, "f32"))) : [hasSM70, hasPTX61], + + // u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16 + !and(!or(!eq(geom,"m16n16k16"), + !eq(geom,"m8n32k16"), + !eq(geom,"m32n8k16")), + !or(!eq(ptx_elt_type, "u8"), + !eq(ptx_elt_type, "s8"), + !eq(ptx_elt_type, "s32"))) : [hasSM72, hasPTX63], + + // u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128(b1) + !or(!eq(geom,"m8n8k128"), + !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63]); + + // template DAGs for instruction inputs/output. + dag Outs = !dag(outs, ptx_regs, reg_names); + dag Ins = !dag(ins, ptx_regs, reg_names); +} + +// Convert dag of arguments into a dag to match given intrinsic. +class BuildPatternI { + // Build a dag pattern that matches the intrinsic call. + dag ret = !foreach(tmp, Ins, + !subst(imem, ADDRvar, + !subst(MEMri64, ADDRri64, + !subst(MEMri, ADDRri, + !subst(ins, Intr, tmp))))); +} + +// Same as above, but uses PatFrag instead of an Intrinsic. +class BuildPatternPF { + // Build a dag pattern that matches the intrinsic call. + dag ret = !foreach(tmp, Ins, + !subst(imem, ADDRvar, + !subst(MEMri64, ADDRri64, + !subst(MEMri, ADDRri, + !subst(ins, Intr, tmp))))); +} + +// Common WMMA-related fields used for building patterns for all MMA instructions. +class WMMA_INSTR _Args> + : NVPTXInst<(outs), (ins), "?", []> { + Intrinsic Intr = !cast(_Intr); + // Concatenate all arguments into a single dag. + dag Args = !foldl((ins), _Args, a, b, !con(a,b)); + // Pre-build the pattern to match (intrinsic arg0, arg1, ...). 
+ dag IntrinsicPattern = BuildPatternI(Intr), Args>.ret; +} + // // wmma.load.[a|b|c].sync.[row|col].m16n16k16[|.global|.shared].[f16|f32] // -class EmptyNVPTXInst : NVPTXInst<(outs), (ins), "?", []>; - -class WMMA_LOAD_GALSTOS - : EmptyNVPTXInst, - Requires<[!if(!eq(Geometry, "m16n16k16"), - hasPTX60, - hasPTX61), - hasSM70]> { - // Pattern (created by WMMA_LOAD_INTR_HELPER below) that matches the intrinsic - // for this function. - PatFrag IntrMatcher = !cast("INT_WMMA_" - # Geometry # "_load_" - # !subst("c", "c_" # Type, Abc) - # "_" # Layout - # !subst(".", "_", Space) - # !if(WithStride,"_stride", "") - # "_Intr"); - dag OutsR03 = (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3); - dag OutsR47 = (outs regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7); - dag Outs = !if(!eq(Abc#Type,"cf16"), OutsR03, !con(OutsR03, OutsR47)); - - dag StrideArg = !if(WithStride, (ins Int32Regs:$ldm), (ins)); - dag Ins = !con((ins SrcOp:$src), StrideArg); - - // Build a dag pattern that matches the intrinsic call. - // We want a dag that looks like this: - // (set , (intrinsic )) where input and - // output arguments are named patterns that would match corresponding - // input/output arguments of the instruction. - // - // First we construct (set ) from instruction's outs dag by - // replacing dag operator 'outs' with 'set'. - dag PatOuts = !foreach(tmp, Outs, !subst(outs, set, tmp)); - // Similarly, construct (intrinsic ) sub-dag from - // instruction's input arguments, only now we also need to replace operands - // with patterns that would match them and the operator 'ins' with the - // intrinsic. - dag PatArgs = !foreach(tmp, Ins, - !subst(imem, ADDRvar, - !subst(MEMri64, ADDRri64, - !subst(MEMri, ADDRri, - !subst(ins, IntrMatcher, tmp))))); - // Finally, consatenate both parts together. !con() requires both dags to have - // the same operator, so we wrap PatArgs in a (set ...) dag. - let Pattern = [!con(PatOuts, (set PatArgs))]; - let OutOperandList = Outs; - let InOperandList = Ins; +class WMMA_LOAD + : WMMA_INSTR.record, + [!con((ins SrcOp:$src), + !if(WithStride, (ins Int32Regs:$ldm), (ins)))]>, + Requires { + // Load/store intrinsics are overloaded on pointer's address space. + // To match the right intrinsic, we need to build AS-constrained PatFrag. + // Operands is a dag equivalent in shape to Args, but using (ops node:$name, .....). + dag PFOperands = !if(WithStride, (ops node:$src, node:$ldm), (ops node:$src)); + // Build PatFrag that only matches particular address space. + PatFrag IntrFrag = PatFrag; + // Build AS-constrained pattern. + let IntrinsicPattern = BuildPatternPF.ret; + + let OutOperandList = Frag.Outs; + let InOperandList = !con(Args, (ins MmaCode:$ptx)); let AsmString = "wmma.load." - # Abc + # Frag.frag # ".sync" + # "${ptx:aligned}" # "." # Layout - # "." # Geometry + # "." # Frag.geom # Space - # "." # Type # " \t" - # !if(!eq(Abc#Type, "cf16"), - "{{$r0, $r1, $r2, $r3}}", - "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}") + # "." # Frag.ptx_elt_type # " \t" + # Frag.regstring # ", [$src]" # !if(WithStride, ", $ldm", "") # ";"; } -class WMMA_LOAD_INTR_HELPER - : PatFrag <(ops),(ops)> { - // Intrinsic that matches this instruction. 
- Intrinsic Intr = !cast("int_nvvm_wmma" - # "_" # Geometry # "_load_" - # Abc # "_" # Type # "_" # Layout - # !if(WithStride,"_stride", "")); - code match_generic = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC); - }]; - code match_shared = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED); - }]; - code match_global = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); - }]; - - let Operands = !if(WithStride, (ops node:$src, node:$ldm), (ops node:$src)); - let Fragments = [!foreach(tmp, Operands, !subst(ops, Intr, tmp))]; - let PredicateCode = !if(!eq(Space, ".shared"), match_shared, - !if(!eq(Space, ".global"), match_global, match_generic)); -} - -multiclass WMMA_LOAD_GALSTS { - def _avar: WMMA_LOAD_GALSTOS; - def _areg: WMMA_LOAD_GALSTOS; - def _areg64: WMMA_LOAD_GALSTOS; - def _ari: WMMA_LOAD_GALSTOS; - def _ari64: WMMA_LOAD_GALSTOS; -} - -multiclass WMMA_LOAD_GALSTSh { - // Define a PatFrag that matches appropriate intrinsic that loads from the - // given address space. - def _Intr: WMMA_LOAD_INTR_HELPER; - defm NAME: WMMA_LOAD_GALSTS; -} - -multiclass WMMA_LOAD_GALST { - defm _stride: WMMA_LOAD_GALSTSh; - defm NAME: WMMA_LOAD_GALSTSh; -} - -multiclass WMMA_LOAD_GALT { - defm _global: WMMA_LOAD_GALST; - defm _shared: WMMA_LOAD_GALST; - defm NAME: WMMA_LOAD_GALST; -} - -multiclass WMMA_LOAD_GAT { - defm _row: WMMA_LOAD_GALT; - defm _col: WMMA_LOAD_GALT; -} - -multiclass WMMA_LOAD_G { - defm _load_a: WMMA_LOAD_GAT; - defm _load_b: WMMA_LOAD_GAT; - defm _load_c_f16: WMMA_LOAD_GAT; - defm _load_c_f32: WMMA_LOAD_GAT; -} - -defm INT_WMMA_m32n8k16: WMMA_LOAD_G<"m32n8k16">; -defm INT_WMMA_m16n16k16: WMMA_LOAD_G<"m16n16k16">; -defm INT_WMMA_m8n32k16: WMMA_LOAD_G<"m8n32k16">; - // // wmma.store.d.sync.[row|col].m16n16k16[|.global|.shared].[f16|f32] // -class WMMA_STORE_D_GLSTSO - : EmptyNVPTXInst, - Requires<[!if(!eq(Geometry, "m16n16k16"), - hasPTX60, - hasPTX61), - hasSM70]> { - PatFrag IntrMatcher = !cast("INT_WMMA" - # "_" # Geometry # "_store_d" - # "_" # Type - # "_" # Layout - # !subst(".", "_", Space) - # !if(WithStride,"_stride", "") - # "_Intr"); - dag InsR03 = (ins DstOp:$src, regclass:$r0, regclass:$r1, - regclass:$r2, regclass:$r3); - dag InsR47 = (ins regclass:$r4, regclass:$r5, - regclass:$r6, regclass:$r7); - dag InsR = !if(!eq(Type,"f16"), InsR03, !con(InsR03, InsR47)); - dag StrideArg = !if(WithStride, (ins Int32Regs:$ldm), (ins)); - dag Ins = !con(InsR, StrideArg); - - // Construct the pattern to match corresponding intrinsic call. See the - // details in the comments in WMMA_LOAD_ALSTOS. - dag PatArgs = !foreach(tmp, Ins, - !subst(imem, ADDRvar, - !subst(MEMri64, ADDRri64, - !subst(MEMri, ADDRri, - !subst(ins, IntrMatcher, tmp))))); - let Pattern = [PatArgs]; +class WMMA_STORE_D + : WMMA_INSTR.record, + [!con((ins DstOp:$dst), + Frag.Ins, + !if(WithStride, (ins Int32Regs:$ldm), (ins)))]>, + Requires { + + // Load/store intrinsics are overloaded on pointer's address space. + // To match the right intrinsic, we need to build AS-constrained PatFrag. + // Operands is a dag equivalent in shape to Args, but using (ops node:$name, .....). + dag PFOperands = !con((ops node:$dst), + !dag(ops, !foreach(tmp, Frag.regs, node), Frag.reg_names), + !if(WithStride, (ops node:$ldm), (ops))); + // Build PatFrag that only matches particular address space. + PatFrag IntrFrag = PatFrag; + // Build AS-constrained pattern. 
+ let IntrinsicPattern = BuildPatternPF.ret; + + let InOperandList = !con(Args, (ins MmaCode:$ptx)); let OutOperandList = (outs); - let InOperandList = Ins; - let AsmString = "wmma.store.d.sync." - # Layout - # "." # Geometry + let AsmString = "wmma.store.d.sync" + # "${ptx:aligned}" + # "." # Layout + # "." # Frag.geom # Space - # "." # Type - # " \t[$src]," - # !if(!eq(Type,"f16"), - "{{$r0, $r1, $r2, $r3}}", - "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}") + # "." # Frag.ptx_elt_type + # " \t[$dst]," + # Frag.regstring # !if(WithStride, ", $ldm", "") # ";"; - -} - -class WMMA_STORE_INTR_HELPER - : PatFrag <(ops),(ops)> { - // Intrinsic that matches this instruction. - Intrinsic Intr = !cast("int_nvvm_wmma_" - # Geometry - # "_store_d" - # "_" # Type - # "_" # Layout - # !if(WithStride, "_stride", "")); - code match_generic = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC); - }]; - code match_shared = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED); - }]; - code match_global = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); - }]; - - dag Args = !if(!eq(Type,"f16"), - (ops node:$dst, node:$r0, node:$r1, node:$r2, node:$r3), - (ops node:$dst, node:$r0, node:$r1, node:$r2, node:$r3, - node:$r4, node:$r5, node:$r6, node:$r7)); - dag StrideArg = !if(WithStride, (ops node:$ldm), (ops)); - let Operands = !con(Args, StrideArg); - let Fragments = [!foreach(tmp, Operands, !subst(ops, Intr, tmp))]; - let PredicateCode = !if(!eq(Space, ".shared"), match_shared, - !if(!eq(Space, ".global"), match_global, match_generic)); -} - -multiclass WMMA_STORE_D_GLSTS { - def _avar: WMMA_STORE_D_GLSTSO; - def _areg: WMMA_STORE_D_GLSTSO; - def _areg64: WMMA_STORE_D_GLSTSO; - def _ari: WMMA_STORE_D_GLSTSO; - def _ari64: WMMA_STORE_D_GLSTSO; } -multiclass WMMA_STORE_D_GLSTSh { - // Define a PatFrag that matches appropriate intrinsic that loads from the - // given address space. 
- def _Intr: WMMA_STORE_INTR_HELPER; - defm NAME: WMMA_STORE_D_GLSTS; -} - -multiclass WMMA_STORE_D_GLST { - defm _stride: WMMA_STORE_D_GLSTSh; - defm NAME: WMMA_STORE_D_GLSTSh; -} - -multiclass WMMA_STORE_D_GLT { - defm _global: WMMA_STORE_D_GLST; - defm _shared: WMMA_STORE_D_GLST; - defm NAME: WMMA_STORE_D_GLST; -} - -multiclass WMMA_STORE_D_GT { - defm _row: WMMA_STORE_D_GLT; - defm _col: WMMA_STORE_D_GLT; -} - -multiclass WMMA_STORE_D_G { - defm _store_d_f16: WMMA_STORE_D_GT; - defm _store_d_f32: WMMA_STORE_D_GT; -} - -defm INT_WMMA_m32n8k16: WMMA_STORE_D_G<"m32n8k16">; -defm INT_WMMA_m16n16k16: WMMA_STORE_D_G<"m16n16k16">; -defm INT_WMMA_m8n32k16: WMMA_STORE_D_G<"m8n32k16">; +// Create all load/store variants +defset list MMA_LDSTs = { + foreach layout = ["row", "col"] in { + foreach stride = [0, 1] in { + foreach space = [".global", ".shared", ""] in { + foreach addr = [imem, Int32Regs, Int64Regs, MEMri, MEMri64] in { + foreach frag = NVVM_MMA_OPS.all_ld_ops in + foreach _ = NVVM_MMA_SUPPORTED<[frag], layout>.ret in + def : WMMA_LOAD, layout, space, stride, addr>; + foreach frag = NVVM_MMA_OPS.all_st_ops in + foreach _ = NVVM_MMA_SUPPORTED<[frag], layout>.ret in + def : WMMA_STORE_D, layout, space, stride, addr>; + } // addr + } // space + } // stride + } // layout +} // defset // WMMA.MMA -class WMMA_MMA_GABDCS - : EmptyNVPTXInst, - Requires<[!if(!eq(Geometry, "m16n16k16"), - hasPTX60, - hasPTX61), - hasSM70]> { - Intrinsic Intr = !cast("int_nvvm_wmma_" - # Geometry - # "_mma" - # "_" # ALayout - # "_" # BLayout - # "_" # DType - # "_" # CType - # !subst(".", "_", Satfinite)); - dag Outs = !if(!eq(DType,"f16"), - (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3), - (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3, - d_reg:$d4, d_reg:$d5, d_reg:$d6, d_reg:$d7)); - dag InsExtraCArgs = !if(!eq(CType,"f16"), - (ins), - (ins c_reg:$c4, c_reg:$c5, c_reg:$c6, c_reg:$c7)); - dag Ins = !con((ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3, - ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7, - ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3, - ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7, - c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3), - InsExtraCArgs); - - // Construct the pattern to match corresponding intrinsic call. See the - // details in the comments in WMMA_LOAD_ALSTOS. - dag PatOuts = !foreach(tmp, Outs, !subst(outs, set, tmp)); - dag PatArgs = !foreach(tmp, Ins, !subst(ins, Intr, tmp)); - let Pattern = [!con(PatOuts, (set PatArgs))]; - let OutOperandList = Outs; - let InOperandList = Ins; - let AsmString = "wmma.mma.sync." - # ALayout +class WMMA_MMA + : WMMA_INSTR.record, + [FragA.Ins, FragB.Ins, FragC.Ins]>, + // Requires does not seem to have effect on Instruction w/o Patterns. + // We set it here anyways and propagate to the Pat<> we construct below. + Requires { + let OutOperandList = FragD.Outs; + let InOperandList = !con(Args, (ins MmaCode:$ptx)); + string TypeList = !cond( + !eq(FragD.ptx_elt_type, "s32") : ".s32" + # "." # FragA.ptx_elt_type + # "." # FragB.ptx_elt_type + # ".s32", + 1: "." # FragD.ptx_elt_type # "." # FragC.ptx_elt_type, + ); + let AsmString = "wmma.mma" + # !if(!eq(FragA.ptx_elt_type, "b1"), ".xor.popc", "") + # ".sync" + # "${ptx:aligned}" + # "." # ALayout # "." # BLayout - # "." # Geometry - # "." # DType - # "." 
# CType - # Satfinite # "\n\t\t" - # !if(!eq(DType,"f16"), - "{{$d0, $d1, $d2, $d3}}, \n\t\t", - "{{$d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7}},\n\t\t") - # "{{$a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7}},\n\t\t" - # "{{$b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7}},\n\t\t" - # !if(!eq(CType,"f16"), - "{{$c0, $c1, $c2, $c3}};", - "{{$c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7}};"); -} - -multiclass WMMA_MMA_GABDC { - def _satfinite: WMMA_MMA_GABDCS; - def NAME: WMMA_MMA_GABDCS; -} - -multiclass WMMA_MMA_GABD { - defm _f16: WMMA_MMA_GABDC; - defm _f32: WMMA_MMA_GABDC; -} - -multiclass WMMA_MMA_GAB { - defm _f16: WMMA_MMA_GABD; - defm _f32: WMMA_MMA_GABD; -} - -multiclass WMMA_MMA_GA { - defm _col: WMMA_MMA_GAB; - defm _row: WMMA_MMA_GAB; -} - -multiclass WMMA_MMA_G { - defm _col: WMMA_MMA_GA; - defm _row: WMMA_MMA_GA; + # "." # FragA.geom + # TypeList + # !if(Satfinite, ".satfinite", "") # "\n\t\t" + # FragD.regstring # ",\n\t\t" + # FragA.regstring # ",\n\t\t" + # FragB.regstring # ",\n\t\t" + # FragC.regstring # ";"; } -defm INT_WMMA_MMA_m32n8k16 : WMMA_MMA_G<"m32n8k16">; -defm INT_WMMA_MMA_m16n16k16 : WMMA_MMA_G<"m16n16k16">; -defm INT_WMMA_MMA_m8n32k16 : WMMA_MMA_G<"m8n32k16">; +defset list MMAs = { + foreach layout_a = ["row", "col"] in { + foreach layout_b = ["row", "col"] in { + foreach satf = [0, 1] in { + foreach op = NVVM_MMA_OPS.all_mma_ops in { + foreach _ = NVVM_MMA_SUPPORTED.ret in { + def : WMMA_MMA, + WMMA_REGINFO, + WMMA_REGINFO, + WMMA_REGINFO, + layout_a, layout_b, satf>; + } + } // op + } // satf + } // layout_b + } // layout_a +} // defset + + +// Constructing non-flat DAGs is still a pain. I can't !subst a dag node with a +// dag, so the ptx.version must be appended *after* foreach replaces 'ins' with +// the instruction record. +class WMMA_PAT + : Pat, + Requires; + +// Build intrinsic->instruction patterns for all MMA instructions. +foreach mma = !listconcat(MMAs, MMA_LDSTs) in + def : WMMA_PAT; diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 52ced266b91c..0743a2986718 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -1,9 +1,8 @@ //===- NVPTXLowerAggrCopies.cpp - ------------------------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h index 3c39f53eb30a..59d5ef40e9ac 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h @@ -1,9 +1,8 @@ //===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index e94c1914029d..76fb9f3fa692 100644 --- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -1,9 +1,8 @@ //===-- NVPTXLowerAlloca.cpp - Make alloca to use local memory =====--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,6 +26,7 @@ #include "NVPTX.h" #include "NVPTXUtilities.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 139dc7fbeeda..c5e02e34e25e 100644 --- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -1,9 +1,8 @@ //===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -92,6 +91,7 @@ #include "NVPTX.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" @@ -170,7 +170,8 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) { Value *ArgInParam = new AddrSpaceCastInst( Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(), FirstInst); - LoadInst *LI = new LoadInst(ArgInParam, Arg->getName(), FirstInst); + LoadInst *LI = + new LoadInst(StructType, ArgInParam, Arg->getName(), FirstInst); new StoreInst(LI, AllocA, FirstInst); } diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp index a754a6a36dab..5ec1b2425e68 100644 --- a/lib/Target/NVPTX/NVPTXMCExpr.cpp +++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp @@ -1,9 +1,8 @@ //===-- NVPTXMCExpr.cpp - NVPTX specific MC expression classes ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
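
The NVPTXLowerArgs hunk above picks up the LoadInst API change: the loaded type is now passed explicitly rather than derived from the pointer operand's pointee type, groundwork for opaque pointers. A minimal sketch of the new form, assuming an IRBuilder and the types come from surrounding context; the function name is hypothetical:

#include "llvm/IR/IRBuilder.h"

// The result type no longer comes from the pointer; it must be spelled out.
llvm::Value *loadParamStruct(llvm::IRBuilder<> &B, llvm::StructType *STy,
                             llvm::Value *ParamPtr) {
  return B.CreateLoad(STy, ParamPtr, "param.load");
}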
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h index 95741d9b0451..440fa1310003 100644 --- a/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/lib/Target/NVPTX/NVPTXMCExpr.h @@ -1,9 +1,8 @@ //===-- NVPTXMCExpr.h - NVPTX specific MC expression classes ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h index 5a9115f6f7f1..cf63fc33e621 100644 --- a/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h +++ b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h @@ -1,9 +1,8 @@ //===-- NVPTXMachineFunctionInfo.h - NVPTX-specific Function Info --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp index 02c32c68ee2c..629757db8707 100644 --- a/lib/Target/NVPTX/NVPTXPeephole.cpp +++ b/lib/Target/NVPTX/NVPTXPeephole.cpp @@ -1,9 +1,8 @@ //===-- NVPTXPeephole.cpp - NVPTX Peephole Optimiztions -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 2ca0ccf2dfa7..4c5a9adf1f65 100644 --- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -1,9 +1,8 @@ //===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -73,8 +72,8 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) { TFI.getFrameIndexReference(MF, MI.getOperand(0).getIndex(), Reg); MI.getOperand(0).ChangeToRegister(Reg, /*isDef=*/false); MI.getOperand(0).setIsDebug(); - auto *DIExpr = DIExpression::prepend(MI.getDebugExpression(), - DIExpression::NoDeref, Offset); + auto *DIExpr = DIExpression::prepend( + MI.getDebugExpression(), DIExpression::ApplyOffset, Offset); MI.getOperand(3).setMetadata(DIExpr); continue; } diff --git a/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp b/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp index f60d841c1683..af50a7465d1a 100644 --- a/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp +++ b/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp @@ -1,9 +1,8 @@ //===- NVPTXProxyRegErasure.cpp - NVPTX Proxy Register Instruction Erasure -==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 755738329881..5cdec0925b26 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -1,9 +1,8 @@ //===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -127,6 +126,6 @@ void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); } -unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return NVPTX::VRFrame; } diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h index 6185a0b54cac..9ef6940daf86 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.h +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h @@ -1,9 +1,8 @@ //===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
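
The NVPTXPrologEpilogPass hunk above tracks a DIExpression::prepend API change: the old NoDeref flag plus offset is now spelled with the explicit ApplyOffset mode, meaning the offset adjusts the address itself without adding a dereference. A sketch of the call shape, with Expr and Offset assumed from context and the wrapper name made up:

#include "llvm/IR/DebugInfoMetadata.h"
#include <cstdint>

// ApplyOffset replaces the old prepend(Expr, DIExpression::NoDeref, Offset).
llvm::DIExpression *addFrameOffset(const llvm::DIExpression *Expr,
                                   int64_t Offset) {
  return llvm::DIExpression::prepend(Expr, llvm::DIExpression::ApplyOffset,
                                     Offset);
}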
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -43,7 +42,7 @@ public: unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; ManagedStringPool *getStrPool() const { return const_cast(&ManagedStrPool); diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td index f04764a9e9a3..4b755dcb55ff 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -1,9 +1,8 @@ //===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index 82befe4b101b..e213089e4085 100644 --- a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -1,9 +1,8 @@ //===-- NVPTXReplaceImageHandles.cpp - Replace image handles for Fermi ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "NVPTXMachineFunctionInfo.h" #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp index acbee86ae386..357826c2d19c 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -1,9 +1,8 @@ //===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h index b02822a099d9..0e9fa1fd3e56 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/lib/Target/NVPTX/NVPTXSubtarget.h @@ -1,9 +1,8 @@ //=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 8ec0ddb9b3d5..11b3fe2fa3d3 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "NVPTXLowerAggrCopies.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" +#include "TargetInfo/NVPTXTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -167,8 +167,16 @@ public: void addMachineSSAOptimization() override; FunctionPass *createTargetRegisterAllocator(bool) override; - void addFastRegAlloc(FunctionPass *RegAllocPass) override; - void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; + void addFastRegAlloc() override; + void addOptimizedRegAlloc() override; + + bool addRegAssignmentFast() override { + llvm_unreachable("should not be used"); + } + + bool addRegAssignmentOptimized() override { + llvm_unreachable("should not be used"); + } private: // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This @@ -323,15 +331,12 @@ FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) { return nullptr; // No reg alloc } -void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { - assert(!RegAllocPass && "NVPTX uses no regalloc!"); +void NVPTXPassConfig::addFastRegAlloc() { addPass(&PHIEliminationID); addPass(&TwoAddressInstructionPassID); } -void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - assert(!RegAllocPass && "NVPTX uses no regalloc!"); - +void NVPTXPassConfig::addOptimizedRegAlloc() { addPass(&ProcessImplicitDefsID); addPass(&LiveVariablesID); addPass(&MachineLoopInfoID); diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index ca540b8e0389..d84600c74e29 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -1,9 +1,8 @@ //===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
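
On the NVPTXTargetMachine.cpp hunk above: NVPTX never runs register allocation, because PTX is consumed by ptxas, which performs the real allocation. createTargetRegisterAllocator therefore returns nullptr, and the new addRegAssignment{Fast,Optimized} hooks can only be reached by mistake, hence llvm_unreachable. A toy model of that control flow, not LLVM API:

#include <cstdio>

// Toy model: the assignment hook only runs if a target supplies an
// allocator, so with a null allocator the unreachable overrides never fire.
struct ToyPassConfig {
  void *createTargetRegisterAllocator() { return nullptr; } // NVPTX-style
  void addRegAlloc() {
    if (!createTargetRegisterAllocator()) {
      std::puts("keeping virtual registers for ptxas");
      return;
    }
    // addRegAssignmentFast()/addRegAssignmentOptimized() would run here.
  }
};

int main() { ToyPassConfig().addRegAlloc(); }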
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h index c706b053ab8f..ab2a93b75922 100644 --- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 307654aed37f..be0416f90fca 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -1,9 +1,8 @@ //===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -39,7 +38,6 @@ static bool readsLaneId(const IntrinsicInst *II) { static bool isNVVMAtomic(const IntrinsicInst *II) { switch (II->getIntrinsicID()) { default: return false; - case Intrinsic::nvvm_atomic_load_add_f32: case Intrinsic::nvvm_atomic_load_inc_32: case Intrinsic::nvvm_atomic_load_dec_32: diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 14e93f7447dd..b179a28fa713 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -1,9 +1,8 @@ //===-- NVPTXTargetTransformInfo.h - NVPTX specific TTI ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
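
The isNVVMAtomic change above pairs with the NVPTXIntrinsics.td hunks earlier in this patch: float and double atomic adds are now selected from the generic atomic_load_fadd node, so front ends emit the atomicrmw fadd instruction instead of llvm.nvvm.atomic.load.add.{f32,f64}. A hedged IRBuilder sketch of the replacement form, with a made-up helper name:

#include "llvm/IR/IRBuilder.h"

// Emits the generic form that now reaches the NVPTX atom.add patterns:
//   %old = atomicrmw fadd float* %ptr, float %v monotonic
llvm::Value *emitAtomicFAdd(llvm::IRBuilder<> &B, llvm::Value *Ptr,
                            llvm::Value *V) {
  return B.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, Ptr, V,
                           llvm::AtomicOrdering::Monotonic);
}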
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -17,8 +16,8 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H #define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H -#include "NVPTX.h" #include "NVPTXTargetMachine.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/TargetLowering.h" diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp index e464f474b1d5..665eb1383253 100644 --- a/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -1,13 +1,13 @@ //===- NVPTXUtilities.cpp - Utility Functions -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains miscellaneous utility functions +// //===----------------------------------------------------------------------===// #include "NVPTXUtilities.h" diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h index a0cc4e78ac21..bf1524194cfb 100644 --- a/lib/Target/NVPTX/NVPTXUtilities.h +++ b/lib/Target/NVPTX/NVPTXUtilities.h @@ -1,9 +1,8 @@ //===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVVMIntrRange.cpp b/lib/Target/NVPTX/NVVMIntrRange.cpp index 11277f5ba596..5cf7b6691e63 100644 --- a/lib/Target/NVPTX/NVVMIntrRange.cpp +++ b/lib/Target/NVPTX/NVVMIntrRange.cpp @@ -1,9 +1,8 @@ //===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp index 64c262664fda..634a052e2ee7 100644 --- a/lib/Target/NVPTX/NVVMReflect.cpp +++ b/lib/Target/NVPTX/NVVMReflect.cpp @@ -1,9 +1,8 @@ //===- NVVMReflect.cpp - NVVM Emulate conditional compilation -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp index 803d643844f8..2c71ec58ec42 100644 --- a/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp +++ b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp @@ -1,14 +1,12 @@ //===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "NVPTX.h" -#include "llvm/IR/Module.h" +#include "TargetInfo/NVPTXTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h new file mode 100644 index 000000000000..5c5691349ae9 --- /dev/null +++ b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h @@ -0,0 +1,21 @@ +//===-- NVPTXTargetInfo.h - NVPTX Target Implementation ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NVPTX_TARGETINFO_NVPTXTARGETINFO_H +#define LLVM_LIB_TARGET_NVPTX_TARGETINFO_NVPTXTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheNVPTXTarget32(); +Target &getTheNVPTXTarget64(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_NVPTX_TARGETINFO_NVPTXTARGETINFO_H diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 8b3480f772e9..c9524da93acd 100644 --- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1,15 +1,15 @@ //===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCMCTargetDesc.h" #include "PPCTargetStreamer.h" +#include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" @@ -147,8 +147,7 @@ public: : MCTargetAsmParser(Options, STI, MII) { // Check for 64-bit vs. 32-bit pointer mode. const Triple &TheTriple = STI.getTargetTriple(); - IsPPC64 = (TheTriple.getArch() == Triple::ppc64 || - TheTriple.getArch() == Triple::ppc64le); + IsPPC64 = TheTriple.isPPC64(); IsDarwin = TheTriple.isMacOSX(); // Initialize the set of available features. 
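// Illustrative aside, not part of the upstream patch: the tblgen-generated
// ComputeAvailableFeatures called below now traffics in llvm::FeatureBitset
// rather than a packed uint64_t (see the PPCMnemonicSpellCheck and
// PPCMCCodeEmitter hunks later in this patch), since a bitset is not limited
// to 64 subtarget features. A minimal usage sketch, assuming the generated
// PPC feature enum:
//   FeatureBitset FBS = ComputeAvailableFeatures(STI.getFeatureBits());
//   if (FBS[PPC::FeatureBookE]) { /* BookE-only mnemonics are valid */ }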
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -1129,7 +1128,7 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, } } -static std::string PPCMnemonicSpellCheck(StringRef S, uint64_t FBS, +static std::string PPCMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS, unsigned VariantID = 0); bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -1148,7 +1147,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_MissingFeature: return Error(IDLoc, "instruction use requires an option to be enabled"); case Match_MnemonicFail: { - uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); std::string Suggestion = PPCMnemonicSpellCheck( ((PPCOperand &)*Operands[0]).getToken(), FBS); return Error(IDLoc, "invalid instruction" + Suggestion, diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 26869f250823..7a8af57961cb 100644 --- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -1,13 +1,13 @@ //===------ PPCDisassembler.cpp - Disassembler for PowerPC ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "MCTargetDesc/PPCMCTargetDesc.h" +#include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" @@ -61,6 +61,14 @@ extern "C" void LLVMInitializePowerPCDisassembler() { createPPCLEDisassembler); } +static DecodeStatus DecodePCRel24BranchTarget(MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + int32_t Offset = SignExtend32<24>(Imm); + Inst.addOperand(MCOperand::createImm(Offset)); + return MCDisassembler::Success; +} + // FIXME: These can be generated by TableGen from the existing register // encoding values! @@ -78,12 +86,6 @@ static DecodeStatus DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo, return decodeRegisterClass(Inst, RegNo, CRRegs); } -static DecodeStatus DecodeCRRC0RegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { - return decodeRegisterClass(Inst, RegNo, CRRegs); -} - static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp deleted file mode 100644 index fc29e4effbb1..000000000000 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ /dev/null @@ -1,532 +0,0 @@ -//===-- PPCInstPrinter.cpp - Convert PPC MCInst to assembly syntax --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an PPC MCInst to a .s file. 
-// -//===----------------------------------------------------------------------===// - -#include "PPCInstPrinter.h" -#include "MCTargetDesc/PPCMCTargetDesc.h" -#include "MCTargetDesc/PPCPredicates.h" -#include "PPCInstrInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -// FIXME: Once the integrated assembler supports full register names, tie this -// to the verbose-asm setting. -static cl::opt -FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false), - cl::desc("Use full register names when printing assembly")); - -// Useful for testing purposes. Prints vs{31-63} as v{0-31} respectively. -static cl::opt -ShowVSRNumsAsVR("ppc-vsr-nums-as-vr", cl::Hidden, cl::init(false), - cl::desc("Prints full register names with vs{31-63} as v{0-31}")); - -// Prints full register names with percent symbol. -static cl::opt -FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden, - cl::init(false), - cl::desc("Prints full register names with percent")); - -#define PRINT_ALIAS_INSTR -#include "PPCGenAsmWriter.inc" - -void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - const char *RegName = getRegisterName(RegNo); - if (RegName[0] == 'q' /* QPX */) { - // The system toolchain on the BG/Q does not understand QPX register names - // in .cfi_* directives, so print the name of the floating-point - // subregister instead. - std::string RN(RegName); - - RN[0] = 'f'; - OS << RN; - - return; - } - - OS << RegName; -} - -void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { - // Check for slwi/srwi mnemonics. - if (MI->getOpcode() == PPC::RLWINM) { - unsigned char SH = MI->getOperand(2).getImm(); - unsigned char MB = MI->getOperand(3).getImm(); - unsigned char ME = MI->getOperand(4).getImm(); - bool useSubstituteMnemonic = false; - if (SH <= 31 && MB == 0 && ME == (31-SH)) { - O << "\tslwi "; useSubstituteMnemonic = true; - } - if (SH <= 31 && MB == (32-SH) && ME == 31) { - O << "\tsrwi "; useSubstituteMnemonic = true; - SH = 32-SH; - } - if (useSubstituteMnemonic) { - printOperand(MI, 0, O); - O << ", "; - printOperand(MI, 1, O); - O << ", " << (unsigned int)SH; - - printAnnotation(O, Annot); - return; - } - } - - if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) && - MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - O << "\tmr "; - printOperand(MI, 0, O); - O << ", "; - printOperand(MI, 1, O); - printAnnotation(O, Annot); - return; - } - - if (MI->getOpcode() == PPC::RLDICR || - MI->getOpcode() == PPC::RLDICR_32) { - unsigned char SH = MI->getOperand(2).getImm(); - unsigned char ME = MI->getOperand(3).getImm(); - // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH - if (63-SH == ME) { - O << "\tsldi "; - printOperand(MI, 0, O); - O << ", "; - printOperand(MI, 1, O); - O << ", " << (unsigned int)SH; - printAnnotation(O, Annot); - return; - } - } - - // dcbt[st] is printed manually here because: - // 1. The assembly syntax is different between embedded and server targets - // 2. 
We must print the short mnemonics for TH == 0 because the - // embedded/server syntax default will not be stable across assemblers - // The syntax for dcbt is: - // dcbt ra, rb, th [server] - // dcbt th, ra, rb [embedded] - // where th can be omitted when it is 0. dcbtst is the same. - if (MI->getOpcode() == PPC::DCBT || MI->getOpcode() == PPC::DCBTST) { - unsigned char TH = MI->getOperand(0).getImm(); - O << "\tdcbt"; - if (MI->getOpcode() == PPC::DCBTST) - O << "st"; - if (TH == 16) - O << "t"; - O << " "; - - bool IsBookE = STI.getFeatureBits()[PPC::FeatureBookE]; - if (IsBookE && TH != 0 && TH != 16) - O << (unsigned int) TH << ", "; - - printOperand(MI, 1, O); - O << ", "; - printOperand(MI, 2, O); - - if (!IsBookE && TH != 0 && TH != 16) - O << ", " << (unsigned int) TH; - - printAnnotation(O, Annot); - return; - } - - if (MI->getOpcode() == PPC::DCBF) { - unsigned char L = MI->getOperand(0).getImm(); - if (!L || L == 1 || L == 3) { - O << "\tdcbf"; - if (L == 1 || L == 3) - O << "l"; - if (L == 3) - O << "p"; - O << " "; - - printOperand(MI, 1, O); - O << ", "; - printOperand(MI, 2, O); - - printAnnotation(O, Annot); - return; - } - } - - if (!printAliasInstr(MI, O)) - printInstruction(MI, O); - printAnnotation(O, Annot); -} - - -void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, - const char *Modifier) { - unsigned Code = MI->getOperand(OpNo).getImm(); - - if (StringRef(Modifier) == "cc") { - switch ((PPC::Predicate)Code) { - case PPC::PRED_LT_MINUS: - case PPC::PRED_LT_PLUS: - case PPC::PRED_LT: - O << "lt"; - return; - case PPC::PRED_LE_MINUS: - case PPC::PRED_LE_PLUS: - case PPC::PRED_LE: - O << "le"; - return; - case PPC::PRED_EQ_MINUS: - case PPC::PRED_EQ_PLUS: - case PPC::PRED_EQ: - O << "eq"; - return; - case PPC::PRED_GE_MINUS: - case PPC::PRED_GE_PLUS: - case PPC::PRED_GE: - O << "ge"; - return; - case PPC::PRED_GT_MINUS: - case PPC::PRED_GT_PLUS: - case PPC::PRED_GT: - O << "gt"; - return; - case PPC::PRED_NE_MINUS: - case PPC::PRED_NE_PLUS: - case PPC::PRED_NE: - O << "ne"; - return; - case PPC::PRED_UN_MINUS: - case PPC::PRED_UN_PLUS: - case PPC::PRED_UN: - O << "un"; - return; - case PPC::PRED_NU_MINUS: - case PPC::PRED_NU_PLUS: - case PPC::PRED_NU: - O << "nu"; - return; - case PPC::PRED_BIT_SET: - case PPC::PRED_BIT_UNSET: - llvm_unreachable("Invalid use of bit predicate code"); - } - llvm_unreachable("Invalid predicate code"); - } - - if (StringRef(Modifier) == "pm") { - switch ((PPC::Predicate)Code) { - case PPC::PRED_LT: - case PPC::PRED_LE: - case PPC::PRED_EQ: - case PPC::PRED_GE: - case PPC::PRED_GT: - case PPC::PRED_NE: - case PPC::PRED_UN: - case PPC::PRED_NU: - return; - case PPC::PRED_LT_MINUS: - case PPC::PRED_LE_MINUS: - case PPC::PRED_EQ_MINUS: - case PPC::PRED_GE_MINUS: - case PPC::PRED_GT_MINUS: - case PPC::PRED_NE_MINUS: - case PPC::PRED_UN_MINUS: - case PPC::PRED_NU_MINUS: - O << "-"; - return; - case PPC::PRED_LT_PLUS: - case PPC::PRED_LE_PLUS: - case PPC::PRED_EQ_PLUS: - case PPC::PRED_GE_PLUS: - case PPC::PRED_GT_PLUS: - case PPC::PRED_NE_PLUS: - case PPC::PRED_UN_PLUS: - case PPC::PRED_NU_PLUS: - O << "+"; - return; - case PPC::PRED_BIT_SET: - case PPC::PRED_BIT_UNSET: - llvm_unreachable("Invalid use of bit predicate code"); - } - llvm_unreachable("Invalid predicate code"); - } - - assert(StringRef(Modifier) == "reg" && - "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!"); - printOperand(MI, OpNo+1, O); -} - -void PPCInstPrinter::printATBitsAsHint(const MCInst *MI, unsigned OpNo, - 
raw_ostream &O) { - unsigned Code = MI->getOperand(OpNo).getImm(); - if (Code == 2) - O << "-"; - else if (Code == 3) - O << "+"; -} - -void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned int Value = MI->getOperand(OpNo).getImm(); - assert(Value <= 1 && "Invalid u1imm argument!"); - O << (unsigned int)Value; -} - -void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned int Value = MI->getOperand(OpNo).getImm(); - assert(Value <= 3 && "Invalid u2imm argument!"); - O << (unsigned int)Value; -} - -void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned int Value = MI->getOperand(OpNo).getImm(); - assert(Value <= 8 && "Invalid u3imm argument!"); - O << (unsigned int)Value; -} - -void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned int Value = MI->getOperand(OpNo).getImm(); - assert(Value <= 15 && "Invalid u4imm argument!"); - O << (unsigned int)Value; -} - -void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int Value = MI->getOperand(OpNo).getImm(); - Value = SignExtend32<5>(Value); - O << (int)Value; -} - -void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned int Value = MI->getOperand(OpNo).getImm(); - assert(Value <= 31 && "Invalid u5imm argument!"); - O << (unsigned int)Value; -} - -void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned int Value = MI->getOperand(OpNo).getImm(); - assert(Value <= 63 && "Invalid u6imm argument!"); - O << (unsigned int)Value; -} - -void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned int Value = MI->getOperand(OpNo).getImm(); - assert(Value <= 127 && "Invalid u7imm argument!"); - O << (unsigned int)Value; -} - -// Operands of BUILD_VECTOR are signed and we use this to print operands -// of XXSPLTIB which are unsigned. So we simply truncate to 8 bits and -// print as unsigned. -void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned char Value = MI->getOperand(OpNo).getImm(); - O << (unsigned int)Value; -} - -void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned short Value = MI->getOperand(OpNo).getImm(); - assert(Value <= 1023 && "Invalid u10imm argument!"); - O << (unsigned short)Value; -} - -void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned short Value = MI->getOperand(OpNo).getImm(); - assert(Value <= 4095 && "Invalid u12imm argument!"); - O << (unsigned short)Value; -} - -void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).isImm()) - O << (short)MI->getOperand(OpNo).getImm(); - else - printOperand(MI, OpNo, O); -} - -void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).isImm()) - O << (unsigned short)MI->getOperand(OpNo).getImm(); - else - printOperand(MI, OpNo, O); -} - -void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (!MI->getOperand(OpNo).isImm()) - return printOperand(MI, OpNo, O); - - // Branches can take an immediate operand. This is used by the branch - // selection pass to print .+8, an eight byte displacement from the PC. 
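// Aside (not in the patch): the immediate here is a word offset; it is
// shifted left by 2 and sign-extended to a byte displacement, so an operand
// of 2 renders as ".+8". The new DecodePCRel24BranchTarget hunk for
// PPCDisassembler.cpp earlier in this patch performs the matching decode
// step with SignExtend32<24>(Imm), e.g. SignExtend32<24>(0xFFFFFF) == -1.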
- O << ".+"; - printAbsBranchOperand(MI, OpNo, O); -} - -void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (!MI->getOperand(OpNo).isImm()) - return printOperand(MI, OpNo, O); - - O << SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2); -} - - -void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned CCReg = MI->getOperand(OpNo).getReg(); - unsigned RegNo; - switch (CCReg) { - default: llvm_unreachable("Unknown CR register"); - case PPC::CR0: RegNo = 0; break; - case PPC::CR1: RegNo = 1; break; - case PPC::CR2: RegNo = 2; break; - case PPC::CR3: RegNo = 3; break; - case PPC::CR4: RegNo = 4; break; - case PPC::CR5: RegNo = 5; break; - case PPC::CR6: RegNo = 6; break; - case PPC::CR7: RegNo = 7; break; - } - O << (0x80 >> RegNo); -} - -void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printS16ImmOperand(MI, OpNo, O); - O << '('; - if (MI->getOperand(OpNo+1).getReg() == PPC::R0) - O << "0"; - else - printOperand(MI, OpNo+1, O); - O << ')'; -} - -void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - // When used as the base register, r0 reads constant zero rather than - // the value contained in the register. For this reason, the darwin - // assembler requires that we print r0 as 0 (no r) when used as the base. - if (MI->getOperand(OpNo).getReg() == PPC::R0) - O << "0"; - else - printOperand(MI, OpNo, O); - O << ", "; - printOperand(MI, OpNo+1, O); -} - -void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - // On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must - // come at the _end_ of the expression. - const MCOperand &Op = MI->getOperand(OpNo); - const MCSymbolRefExpr &refExp = cast(*Op.getExpr()); - O << refExp.getSymbol().getName(); - O << '('; - printOperand(MI, OpNo+1, O); - O << ')'; - if (refExp.getKind() != MCSymbolRefExpr::VK_None) - O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind()); -} - -/// showRegistersWithPercentPrefix - Check if this register name should be -/// printed with a percentage symbol as prefix. -bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const { - if (!FullRegNamesWithPercent || TT.isOSDarwin() || TT.getOS() == Triple::AIX) - return false; - - switch (RegName[0]) { - default: - return false; - case 'r': - case 'f': - case 'q': - case 'v': - case 'c': - return true; - } -} - -/// getVerboseConditionalRegName - This method expands the condition register -/// when requested explicitly or targetting Darwin. -const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum, - unsigned RegEncoding) - const { - if (!TT.isOSDarwin() && !FullRegNames) - return nullptr; - if (RegNum < PPC::CR0EQ || RegNum > PPC::CR7UN) - return nullptr; - const char *CRBits[] = { - "lt", "gt", "eq", "un", - "4*cr1+lt", "4*cr1+gt", "4*cr1+eq", "4*cr1+un", - "4*cr2+lt", "4*cr2+gt", "4*cr2+eq", "4*cr2+un", - "4*cr3+lt", "4*cr3+gt", "4*cr3+eq", "4*cr3+un", - "4*cr4+lt", "4*cr4+gt", "4*cr4+eq", "4*cr4+un", - "4*cr5+lt", "4*cr5+gt", "4*cr5+eq", "4*cr5+un", - "4*cr6+lt", "4*cr6+gt", "4*cr6+eq", "4*cr6+un", - "4*cr7+lt", "4*cr7+gt", "4*cr7+eq", "4*cr7+un" - }; - return CRBits[RegEncoding]; -} - -// showRegistersWithPrefix - This method determines whether registers -// should be number-only or include the prefix. 
-bool PPCInstPrinter::showRegistersWithPrefix() const { - if (TT.getOS() == Triple::AIX) - return false; - return TT.isOSDarwin() || FullRegNamesWithPercent || FullRegNames; -} - -void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - if (!ShowVSRNumsAsVR) - Reg = PPCInstrInfo::getRegNumForOperand(MII.get(MI->getOpcode()), - Reg, OpNo); - - const char *RegName; - RegName = getVerboseConditionRegName(Reg, MRI.getEncodingValue(Reg)); - if (RegName == nullptr) - RegName = getRegisterName(Reg); - if (showRegistersWithPercentPrefix(RegName)) - O << "%"; - if (!showRegistersWithPrefix()) - RegName = PPCRegisterInfo::stripRegisterPrefix(RegName); - - O << RegName; - return; - } - - if (Op.isImm()) { - O << Op.getImm(); - return; - } - - assert(Op.isExpr() && "unknown operand kind in printOperand"); - Op.getExpr()->print(O, &MAI); -} - diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h deleted file mode 100644 index 351ccefa2da2..000000000000 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ /dev/null @@ -1,77 +0,0 @@ -//===- PPCInstPrinter.h - Convert PPC MCInst to assembly syntax -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an PPC MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H -#define LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H - -#include "llvm/ADT/Triple.h" -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { - -class PPCInstPrinter : public MCInstPrinter { - Triple TT; -private: - bool showRegistersWithPercentPrefix(const char *RegName) const; - bool showRegistersWithPrefix() const; - const char *getVerboseConditionRegName(unsigned RegNum, - unsigned RegEncoding) const; - -public: - PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, Triple T) - : MCInstPrinter(MAI, MII, MRI), TT(T) {} - - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - - // Autogenerated by tblgen. 
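// Editorial sketch of what "autogenerated" means here: "llvm-tblgen
// -gen-asm-writer" emits these members from the PPC .td files into
// PPCGenAsmWriter.inc, which PPCInstPrinter.cpp includes under
// #define PRINT_ALIAS_INSTR; printInstruction, getRegisterName and
// printAliasInstr are generated, not hand-written.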
- void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - bool printAliasInstr(const MCInst *MI, raw_ostream &OS); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, - raw_ostream &OS); - - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier = nullptr); - void printATBitsAsHint(const MCInst *MI, unsigned OpNo, raw_ostream &O); - - void printU1ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU3ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU7ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU10ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O); - - void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O); - - void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index a405dd70c307..8778e916f7e4 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -1,9 +1,8 @@ //===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -29,6 +28,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { switch (Kind) { default: llvm_unreachable("Unknown fixup kind!"); + case FK_NONE: case FK_Data_1: case FK_Data_2: case FK_Data_4: @@ -52,6 +52,8 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { switch (Kind) { default: llvm_unreachable("Unknown fixup kind!"); + case FK_NONE: + return 0; case FK_Data_1: return 1; case FK_Data_2: @@ -74,10 +76,12 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { namespace { class PPCAsmBackend : public MCAsmBackend { - const Target &TheTarget; +protected: + Triple TT; public: - PPCAsmBackend(const Target &T, support::endianness Endian) - : MCAsmBackend(Endian), TheTarget(T) {} + PPCAsmBackend(const Target &T, const Triple &TT) + : MCAsmBackend(TT.isLittleEndian() ? support::little : support::big), + TT(TT) {} unsigned getNumFixupKinds() const override { return PPC::NumTargetFixupKinds; @@ -136,9 +140,11 @@ public: bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) override { - switch ((PPC::Fixups)Fixup.getKind()) { + switch ((unsigned)Fixup.getKind()) { default: return false; + case FK_NONE: + return true; case PPC::fixup_ppc_br24: case PPC::fixup_ppc_br24abs: // If the target symbol has a local entry point we must not attempt @@ -187,59 +193,76 @@ public: return true; } - - unsigned getPointerSize() const { - StringRef Name = TheTarget.getName(); - if (Name == "ppc64" || Name == "ppc64le") return 8; - assert(Name == "ppc32" && "Unknown target name!"); - return 4; - } }; } // end anonymous namespace // FIXME: This should be in a separate file. namespace { - class DarwinPPCAsmBackend : public PPCAsmBackend { - public: - DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, support::big) { } - - std::unique_ptr - createObjectTargetWriter() const override { - bool is64 = getPointerSize() == 8; - return createPPCMachObjectWriter( - /*Is64Bit=*/is64, - (is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC), - MachO::CPU_SUBTYPE_POWERPC_ALL); - } - }; - - class ELFPPCAsmBackend : public PPCAsmBackend { - uint8_t OSABI; - public: - ELFPPCAsmBackend(const Target &T, support::endianness Endian, - uint8_t OSABI) - : PPCAsmBackend(T, Endian), OSABI(OSABI) {} - - std::unique_ptr - createObjectTargetWriter() const override { - bool is64 = getPointerSize() == 8; - return createPPCELFObjectWriter(is64, OSABI); - } - }; + +class DarwinPPCAsmBackend : public PPCAsmBackend { +public: + DarwinPPCAsmBackend(const Target &T, const Triple &TT) + : PPCAsmBackend(T, TT) {} + + std::unique_ptr + createObjectTargetWriter() const override { + bool Is64 = TT.isPPC64(); + return createPPCMachObjectWriter( + /*Is64Bit=*/Is64, + (Is64 ? 
MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC), + MachO::CPU_SUBTYPE_POWERPC_ALL); + } +}; + +class ELFPPCAsmBackend : public PPCAsmBackend { +public: + ELFPPCAsmBackend(const Target &T, const Triple &TT) : PPCAsmBackend(T, TT) {} + + std::unique_ptr + createObjectTargetWriter() const override { + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); + bool Is64 = TT.isPPC64(); + return createPPCELFObjectWriter(Is64, OSABI); + } + + Optional getFixupKind(StringRef Name) const override; +}; + +class XCOFFPPCAsmBackend : public PPCAsmBackend { +public: + XCOFFPPCAsmBackend(const Target &T, const Triple &TT) + : PPCAsmBackend(T, TT) {} + + std::unique_ptr + createObjectTargetWriter() const override { + return createPPCXCOFFObjectWriter(TT.isArch64Bit()); + } +}; } // end anonymous namespace +Optional ELFPPCAsmBackend::getFixupKind(StringRef Name) const { + if (TT.isPPC64()) { + if (Name == "R_PPC64_NONE") + return FK_NONE; + } else { + if (Name == "R_PPC_NONE") + return FK_NONE; + } + return MCAsmBackend::getFixupKind(Name); +} + MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, const MCTargetOptions &Options) { const Triple &TT = STI.getTargetTriple(); if (TT.isOSDarwin()) - return new DarwinPPCAsmBackend(T); + return new DarwinPPCAsmBackend(T, TT); + + if (TT.isOSBinFormatXCOFF()) + return new XCOFFPPCAsmBackend(T, TT); - uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); - bool IsLittleEndian = TT.getArch() == Triple::ppc64le; - return new ELFPPCAsmBackend( - T, IsLittleEndian ? support::little : support::big, OSABI); + return new ELFPPCAsmBackend(T, TT); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index a3caf9a7a5ee..042ddf48d5df 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- PPCELFObjectWriter.cpp - PPC ELF Writer ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -134,6 +133,9 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } else { switch ((unsigned)Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); + case FK_NONE: + Type = ELF::R_PPC_NONE; + break; case PPC::fixup_ppc_br24abs: Type = ELF::R_PPC_ADDR24; break; diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index dce443997ea5..845489788c86 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -1,9 +1,8 @@ //===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp new file mode 100644 index 000000000000..0e64ae55ab1c --- /dev/null +++ b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -0,0 +1,543 @@ +//===-- PPCInstPrinter.cpp - Convert PPC MCInst to assembly syntax --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an PPC MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCInstPrinter.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPCInstrInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// FIXME: Once the integrated assembler supports full register names, tie this +// to the verbose-asm setting. +static cl::opt +FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false), + cl::desc("Use full register names when printing assembly")); + +// Useful for testing purposes. Prints vs{31-63} as v{0-31} respectively. +static cl::opt +ShowVSRNumsAsVR("ppc-vsr-nums-as-vr", cl::Hidden, cl::init(false), + cl::desc("Prints full register names with vs{31-63} as v{0-31}")); + +// Prints full register names with percent symbol. +static cl::opt +FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden, + cl::init(false), + cl::desc("Prints full register names with percent")); + +#define PRINT_ALIAS_INSTR +#include "PPCGenAsmWriter.inc" + +void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + const char *RegName = getRegisterName(RegNo); + if (RegName[0] == 'q' /* QPX */) { + // The system toolchain on the BG/Q does not understand QPX register names + // in .cfi_* directives, so print the name of the floating-point + // subregister instead. + std::string RN(RegName); + + RN[0] = 'f'; + OS << RN; + + return; + } + + OS << RegName; +} + +void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + // Check for slwi/srwi mnemonics. 
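// Aside (not in the patch): the identities behind the check that follows,
// for a shift n < 32, are
//   slwi rA,rS,n  ==  rlwinm rA,rS,n,0,31-n
//   srwi rA,rS,n  ==  rlwinm rA,rS,32-n,n,31
// which is exactly what the SH/MB/ME comparisons below test for.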
+ if (MI->getOpcode() == PPC::RLWINM) { + unsigned char SH = MI->getOperand(2).getImm(); + unsigned char MB = MI->getOperand(3).getImm(); + unsigned char ME = MI->getOperand(4).getImm(); + bool useSubstituteMnemonic = false; + if (SH <= 31 && MB == 0 && ME == (31-SH)) { + O << "\tslwi "; useSubstituteMnemonic = true; + } + if (SH <= 31 && MB == (32-SH) && ME == 31) { + O << "\tsrwi "; useSubstituteMnemonic = true; + SH = 32-SH; + } + if (useSubstituteMnemonic) { + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 1, O); + O << ", " << (unsigned int)SH; + + printAnnotation(O, Annot); + return; + } + } + + if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) && + MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { + O << "\tmr "; + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 1, O); + printAnnotation(O, Annot); + return; + } + + if (MI->getOpcode() == PPC::RLDICR || + MI->getOpcode() == PPC::RLDICR_32) { + unsigned char SH = MI->getOperand(2).getImm(); + unsigned char ME = MI->getOperand(3).getImm(); + // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH + if (63-SH == ME) { + O << "\tsldi "; + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 1, O); + O << ", " << (unsigned int)SH; + printAnnotation(O, Annot); + return; + } + } + + // dcbt[st] is printed manually here because: + // 1. The assembly syntax is different between embedded and server targets + // 2. We must print the short mnemonics for TH == 0 because the + // embedded/server syntax default will not be stable across assemblers + // The syntax for dcbt is: + // dcbt ra, rb, th [server] + // dcbt th, ra, rb [embedded] + // where th can be omitted when it is 0. dcbtst is the same. + if (MI->getOpcode() == PPC::DCBT || MI->getOpcode() == PPC::DCBTST) { + unsigned char TH = MI->getOperand(0).getImm(); + O << "\tdcbt"; + if (MI->getOpcode() == PPC::DCBTST) + O << "st"; + if (TH == 16) + O << "t"; + O << " "; + + bool IsBookE = STI.getFeatureBits()[PPC::FeatureBookE]; + if (IsBookE && TH != 0 && TH != 16) + O << (unsigned int) TH << ", "; + + printOperand(MI, 1, O); + O << ", "; + printOperand(MI, 2, O); + + if (!IsBookE && TH != 0 && TH != 16) + O << ", " << (unsigned int) TH; + + printAnnotation(O, Annot); + return; + } + + if (MI->getOpcode() == PPC::DCBF) { + unsigned char L = MI->getOperand(0).getImm(); + if (!L || L == 1 || L == 3) { + O << "\tdcbf"; + if (L == 1 || L == 3) + O << "l"; + if (L == 3) + O << "p"; + O << " "; + + printOperand(MI, 1, O); + O << ", "; + printOperand(MI, 2, O); + + printAnnotation(O, Annot); + return; + } + } + + if (!printAliasInstr(MI, O)) + printInstruction(MI, O); + printAnnotation(O, Annot); +} + + +void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, + const char *Modifier) { + unsigned Code = MI->getOperand(OpNo).getImm(); + + if (StringRef(Modifier) == "cc") { + switch ((PPC::Predicate)Code) { + case PPC::PRED_LT_MINUS: + case PPC::PRED_LT_PLUS: + case PPC::PRED_LT: + O << "lt"; + return; + case PPC::PRED_LE_MINUS: + case PPC::PRED_LE_PLUS: + case PPC::PRED_LE: + O << "le"; + return; + case PPC::PRED_EQ_MINUS: + case PPC::PRED_EQ_PLUS: + case PPC::PRED_EQ: + O << "eq"; + return; + case PPC::PRED_GE_MINUS: + case PPC::PRED_GE_PLUS: + case PPC::PRED_GE: + O << "ge"; + return; + case PPC::PRED_GT_MINUS: + case PPC::PRED_GT_PLUS: + case PPC::PRED_GT: + O << "gt"; + return; + case PPC::PRED_NE_MINUS: + case PPC::PRED_NE_PLUS: + case PPC::PRED_NE: + O << "ne"; + return; + case PPC::PRED_UN_MINUS: + case PPC::PRED_UN_PLUS: + 
case PPC::PRED_UN: + O << "un"; + return; + case PPC::PRED_NU_MINUS: + case PPC::PRED_NU_PLUS: + case PPC::PRED_NU: + O << "nu"; + return; + case PPC::PRED_BIT_SET: + case PPC::PRED_BIT_UNSET: + llvm_unreachable("Invalid use of bit predicate code"); + } + llvm_unreachable("Invalid predicate code"); + } + + if (StringRef(Modifier) == "pm") { + switch ((PPC::Predicate)Code) { + case PPC::PRED_LT: + case PPC::PRED_LE: + case PPC::PRED_EQ: + case PPC::PRED_GE: + case PPC::PRED_GT: + case PPC::PRED_NE: + case PPC::PRED_UN: + case PPC::PRED_NU: + return; + case PPC::PRED_LT_MINUS: + case PPC::PRED_LE_MINUS: + case PPC::PRED_EQ_MINUS: + case PPC::PRED_GE_MINUS: + case PPC::PRED_GT_MINUS: + case PPC::PRED_NE_MINUS: + case PPC::PRED_UN_MINUS: + case PPC::PRED_NU_MINUS: + O << "-"; + return; + case PPC::PRED_LT_PLUS: + case PPC::PRED_LE_PLUS: + case PPC::PRED_EQ_PLUS: + case PPC::PRED_GE_PLUS: + case PPC::PRED_GT_PLUS: + case PPC::PRED_NE_PLUS: + case PPC::PRED_UN_PLUS: + case PPC::PRED_NU_PLUS: + O << "+"; + return; + case PPC::PRED_BIT_SET: + case PPC::PRED_BIT_UNSET: + llvm_unreachable("Invalid use of bit predicate code"); + } + llvm_unreachable("Invalid predicate code"); + } + + assert(StringRef(Modifier) == "reg" && + "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!"); + printOperand(MI, OpNo+1, O); +} + +void PPCInstPrinter::printATBitsAsHint(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Code = MI->getOperand(OpNo).getImm(); + if (Code == 2) + O << "-"; + else if (Code == 3) + O << "+"; +} + +void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 1 && "Invalid u1imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 3 && "Invalid u2imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 8 && "Invalid u3imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 15 && "Invalid u4imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int Value = MI->getOperand(OpNo).getImm(); + Value = SignExtend32<5>(Value); + O << (int)Value; +} + +void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 31 && "Invalid u5imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 63 && "Invalid u6imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 127 && "Invalid u7imm argument!"); + O << (unsigned int)Value; +} + +// Operands of BUILD_VECTOR are signed and we use this to print operands +// of XXSPLTIB which are unsigned. So we simply truncate to 8 bits and +// print as unsigned. 
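// Equivalent one-liner (illustrative only): O << unsigned(uint8_t(Imm));
// a BUILD_VECTOR operand of -1 therefore prints as 255.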
+void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned char Value = MI->getOperand(OpNo).getImm(); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned short Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 1023 && "Invalid u10imm argument!"); + O << (unsigned short)Value; +} + +void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned short Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 4095 && "Invalid u12imm argument!"); + O << (unsigned short)Value; +} + +void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).isImm()) + O << (short)MI->getOperand(OpNo).getImm(); + else + printOperand(MI, OpNo, O); +} + +void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).isImm()) + O << (unsigned short)MI->getOperand(OpNo).getImm(); + else + printOperand(MI, OpNo, O); +} + +void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (!MI->getOperand(OpNo).isImm()) + return printOperand(MI, OpNo, O); + + // Branches can take an immediate operand. This is used by the branch + // selection pass to print .+8, an eight byte displacement from the PC. + O << "."; + int32_t Imm = SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2); + if (Imm >= 0) + O << "+"; + O << Imm; +} + +void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (!MI->getOperand(OpNo).isImm()) + return printOperand(MI, OpNo, O); + + O << SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2); +} + + +void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned CCReg = MI->getOperand(OpNo).getReg(); + unsigned RegNo; + switch (CCReg) { + default: llvm_unreachable("Unknown CR register"); + case PPC::CR0: RegNo = 0; break; + case PPC::CR1: RegNo = 1; break; + case PPC::CR2: RegNo = 2; break; + case PPC::CR3: RegNo = 3; break; + case PPC::CR4: RegNo = 4; break; + case PPC::CR5: RegNo = 5; break; + case PPC::CR6: RegNo = 6; break; + case PPC::CR7: RegNo = 7; break; + } + O << (0x80 >> RegNo); +} + +void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printS16ImmOperand(MI, OpNo, O); + O << '('; + if (MI->getOperand(OpNo+1).getReg() == PPC::R0) + O << "0"; + else + printOperand(MI, OpNo+1, O); + O << ')'; +} + +void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + // When used as the base register, r0 reads constant zero rather than + // the value contained in the register. For this reason, the darwin + // assembler requires that we print r0 as 0 (no r) when used as the base. + if (MI->getOperand(OpNo).getReg() == PPC::R0) + O << "0"; + else + printOperand(MI, OpNo, O); + O << ", "; + printOperand(MI, OpNo+1, O); +} + +void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + // On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must + // come at the _end_ of the expression. 
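// Aside (not in the patch): the body below now also accepts an MCBinaryExpr
// of the form "sym + constant" and prints the addend last, e.g. with a
// hypothetical symbol and offset:
//   __tls_get_addr(r3)@plt+32768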
+ const MCOperand &Op = MI->getOperand(OpNo); + const MCSymbolRefExpr *RefExp = nullptr; + const MCConstantExpr *ConstExp = nullptr; + if (const MCBinaryExpr *BinExpr = dyn_cast(Op.getExpr())) { + RefExp = cast(BinExpr->getLHS()); + ConstExp = cast(BinExpr->getRHS()); + } else + RefExp = cast(Op.getExpr()); + + O << RefExp->getSymbol().getName(); + O << '('; + printOperand(MI, OpNo+1, O); + O << ')'; + if (RefExp->getKind() != MCSymbolRefExpr::VK_None) + O << '@' << MCSymbolRefExpr::getVariantKindName(RefExp->getKind()); + if (ConstExp != nullptr) + O << '+' << ConstExp->getValue(); +} + +/// showRegistersWithPercentPrefix - Check if this register name should be +/// printed with a percentage symbol as prefix. +bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const { + if (!FullRegNamesWithPercent || TT.isOSDarwin() || TT.getOS() == Triple::AIX) + return false; + + switch (RegName[0]) { + default: + return false; + case 'r': + case 'f': + case 'q': + case 'v': + case 'c': + return true; + } +} + +/// getVerboseConditionalRegName - This method expands the condition register +/// when requested explicitly or targetting Darwin. +const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum, + unsigned RegEncoding) + const { + if (!TT.isOSDarwin() && !FullRegNames) + return nullptr; + if (RegNum < PPC::CR0EQ || RegNum > PPC::CR7UN) + return nullptr; + const char *CRBits[] = { + "lt", "gt", "eq", "un", + "4*cr1+lt", "4*cr1+gt", "4*cr1+eq", "4*cr1+un", + "4*cr2+lt", "4*cr2+gt", "4*cr2+eq", "4*cr2+un", + "4*cr3+lt", "4*cr3+gt", "4*cr3+eq", "4*cr3+un", + "4*cr4+lt", "4*cr4+gt", "4*cr4+eq", "4*cr4+un", + "4*cr5+lt", "4*cr5+gt", "4*cr5+eq", "4*cr5+un", + "4*cr6+lt", "4*cr6+gt", "4*cr6+eq", "4*cr6+un", + "4*cr7+lt", "4*cr7+gt", "4*cr7+eq", "4*cr7+un" + }; + return CRBits[RegEncoding]; +} + +// showRegistersWithPrefix - This method determines whether registers +// should be number-only or include the prefix. +bool PPCInstPrinter::showRegistersWithPrefix() const { + if (TT.getOS() == Triple::AIX) + return false; + return TT.isOSDarwin() || FullRegNamesWithPercent || FullRegNames; +} + +void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (!ShowVSRNumsAsVR) + Reg = PPCInstrInfo::getRegNumForOperand(MII.get(MI->getOpcode()), + Reg, OpNo); + + const char *RegName; + RegName = getVerboseConditionRegName(Reg, MRI.getEncodingValue(Reg)); + if (RegName == nullptr) + RegName = getRegisterName(Reg); + if (showRegistersWithPercentPrefix(RegName)) + O << "%"; + if (!showRegistersWithPrefix()) + RegName = PPCRegisterInfo::stripRegisterPrefix(RegName); + + O << RegName; + return; + } + + if (Op.isImm()) { + O << Op.getImm(); + return; + } + + assert(Op.isExpr() && "unknown operand kind in printOperand"); + Op.getExpr()->print(O, &MAI); +} + diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h new file mode 100644 index 000000000000..725ae2a7081b --- /dev/null +++ b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h @@ -0,0 +1,76 @@ +//===- PPCInstPrinter.h - Convert PPC MCInst to assembly syntax -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an PPC MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCINSTPRINTER_H +#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCINSTPRINTER_H + +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class PPCInstPrinter : public MCInstPrinter { + Triple TT; +private: + bool showRegistersWithPercentPrefix(const char *RegName) const; + bool showRegistersWithPrefix() const; + const char *getVerboseConditionRegName(unsigned RegNum, + unsigned RegEncoding) const; + +public: + PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, Triple T) + : MCInstPrinter(MAI, MII, MRI), TT(T) {} + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, + raw_ostream &OS); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printPredicateOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier = nullptr); + void printATBitsAsHint(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + void printU1ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU3ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU7ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU10ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); +}; +} // end namespace llvm + +#endif diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index fb7bf23509c7..5f0005ea1d7b 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- PPCMCAsmInfo.cpp - PPC asm properties -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// 
This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -82,3 +81,9 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { UseIntegratedAssembler = true; } +void PPCXCOFFMCAsmInfo::anchor() {} + +PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { + assert(!IsLittleEndian && "Little-endian XCOFF not supported."); + CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 8 : 4; +} diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h index e252ac944d40..42cb62ad26a4 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -1,13 +1,12 @@ //===-- PPCMCAsmInfo.h - PPC asm properties --------------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // -// This file contains the declaration of the MCAsmInfoDarwin class. +// This file contains the declarations of the PowerPC MCAsmInfo classes. // //===----------------------------------------------------------------------===// @@ -16,6 +15,7 @@ #include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/MCAsmInfoELF.h" +#include "llvm/MC/MCAsmInfoXCOFF.h" namespace llvm { class Triple; @@ -34,6 +34,13 @@ public: explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &); }; +class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF { + virtual void anchor(); + +public: + explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &); +}; + } // namespace llvm #endif diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 8c15ade6f9c4..676efc500455 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -217,7 +216,7 @@ unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, Fixups.push_back(MCFixup::create(0, MO.getExpr(), (MCFixupKind)PPC::fixup_ppc_nofixup)); const Triple &TT = STI.getTargetTriple(); - bool isPPC64 = TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le; + bool isPPC64 = TT.isPPC64(); return CTX.getRegisterInfo()->getEncodingValue(isPPC64 ? 
PPC::X13 : PPC::R2); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h index a4bcff4b9450..1324faa12553 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h @@ -1,9 +1,8 @@ //===-- PPCMCCodeEmitter.h - Convert PPC code to machine code -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -99,9 +98,10 @@ public: unsigned getInstSizeInBytes(const MCInst &MI) const; private: - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; } // namespace llvm diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp index 32e6a0bdd65f..d467f5c4a439 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -1,9 +1,8 @@ //===-- PPCMCExpr.cpp - PPC specific MC expression classes ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index 8bb4791d13dd..449e2c34f74d 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -1,9 +1,8 @@ //===-- PPCMCExpr.h - PPC specific MC expression classes --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index a1e4e07b25af..90c3c8d20edb 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- PPCMCTargetDesc.cpp - PowerPC Target Descriptions -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,9 +11,11 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/PPCMCTargetDesc.h" -#include "InstPrinter/PPCInstPrinter.h" +#include "MCTargetDesc/PPCInstPrinter.h" #include "MCTargetDesc/PPCMCAsmInfo.h" #include "PPCTargetStreamer.h" +#include "TargetInfo/PowerPCTargetInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/ELF.h" @@ -47,9 +48,9 @@ using namespace llvm; #define GET_REGINFO_MC_DESC #include "PPCGenRegisterInfo.inc" -// Pin the vtable to this file. PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} +// Pin the vtable to this file. PPCTargetStreamer::~PPCTargetStreamer() = default; static MCInstrInfo *createPPCMCInstrInfo() { @@ -82,6 +83,8 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI; if (TheTriple.isOSDarwin()) MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple); + else if (TheTriple.isOSBinFormatXCOFF()) + MAI = new PPCXCOFFMCAsmInfo(isPPC64, TheTriple); else MAI = new PPCELFMCAsmInfo(isPPC64, TheTriple); @@ -182,16 +185,33 @@ public: void emitAssignment(MCSymbol *S, const MCExpr *Value) override { auto *Symbol = cast<MCSymbolELF>(S); + // When encoding an assignment to set symbol A to symbol B, also copy // the st_other bits encoding the local entry point offset. - if (Value->getKind() != MCExpr::SymbolRef) - return; - const auto &RhsSym = cast<MCSymbolELF>( - static_cast<const MCSymbolRefExpr *>(Value)->getSymbol()); - unsigned Other = Symbol->getOther(); + if (copyLocalEntry(Symbol, Value)) + UpdateOther.insert(Symbol); + else + UpdateOther.erase(Symbol); + } + + void finish() override { + for (auto *Sym : UpdateOther) + copyLocalEntry(Sym, Sym->getVariableValue()); + } + +private: + SmallPtrSet<MCSymbolELF *, 32> UpdateOther; + + bool copyLocalEntry(MCSymbolELF *D, const MCExpr *S) { + auto *Ref = dyn_cast<MCSymbolRefExpr>(S); + if (!Ref) + return false; + const auto &RhsSym = cast<MCSymbolELF>(Ref->getSymbol()); + unsigned Other = D->getOther(); Other &= ~ELF::STO_PPC64_LOCAL_MASK; Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK; - Symbol->setOther(Other); + D->setOther(Other); + return true; } }; @@ -217,6 +237,27 @@ public: } }; +class PPCTargetXCOFFStreamer : public PPCTargetStreamer { +public: + PPCTargetXCOFFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} + + void emitTCEntry(const MCSymbol &S) override { + report_fatal_error("TOC entries not supported yet."); + } + + void emitMachine(StringRef CPU) override { + llvm_unreachable("Machine pseudo-ops are invalid for XCOFF."); + } + + void emitAbiVersion(int AbiVersion) override { + llvm_unreachable("ABI-version pseudo-ops are invalid for XCOFF."); + } + + void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { + llvm_unreachable("Local-entry pseudo-ops are invalid for XCOFF."); + } +}; + } // end anonymous namespace static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, @@ -231,6 +272,8 @@ createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { const Triple &TT = STI.getTargetTriple(); if (TT.isOSBinFormatELF()) return new PPCTargetELFStreamer(S); + if (TT.isOSBinFormatXCOFF()) + return new PPCTargetXCOFFStreamer(S); return new PPCTargetMachOStreamer(S); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index d6e450cba0d7..74b67bd2e928 100644 ---
a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- PPCMCTargetDesc.h - PowerPC Target Descriptions ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -37,10 +36,6 @@ class Triple; class StringRef; class raw_pwrite_stream; -Target &getThePPC32Target(); -Target &getThePPC64Target(); -Target &getThePPC64LETarget(); - MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); @@ -56,6 +51,9 @@ std::unique_ptr<MCObjectTargetWriter> createPPCELFObjectWriter(bool Is64Bit, std::unique_ptr<MCObjectTargetWriter> createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype); +/// Construct a PPC XCOFF object writer. +std::unique_ptr<MCObjectTargetWriter> createPPCXCOFFObjectWriter(bool Is64Bit); + /// Returns true iff Val consists of one contiguous run of 1s with any number of /// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so /// 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is not, diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index ff6cf584da23..4cf7fd15fa75 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp index c2987b641c04..284e52c298a2 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp @@ -1,9 +1,8 @@ //===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h index 481ba3f09cc7..d686a8ea2a22 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -1,9 +1,8 @@ //===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp new file mode 100644 index 000000000000..9c661286d455 --- /dev/null +++ b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -0,0 +1,29 @@ +//===-- PPCXCOFFObjectWriter.cpp - PowerPC XCOFF Writer -------------------===// +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PPCMCTargetDesc.h" +#include "llvm/MC/MCXCOFFObjectWriter.h" + +using namespace llvm; + +namespace { +class PPCXCOFFObjectWriter : public MCXCOFFObjectTargetWriter { + +public: + PPCXCOFFObjectWriter(bool Is64Bit); +}; +} // end anonymous namespace + +PPCXCOFFObjectWriter::PPCXCOFFObjectWriter(bool Is64Bit) + : MCXCOFFObjectTargetWriter(Is64Bit) {} + +std::unique_ptr<MCObjectTargetWriter> +llvm::createPPCXCOFFObjectWriter(bool Is64Bit) { + return llvm::make_unique<PPCXCOFFObjectWriter>(Is64Bit); +} diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td index 17c37964c562..2a10322d3f49 100644 --- a/lib/Target/PowerPC/P9InstrResources.td +++ b/lib/Target/PowerPC/P9InstrResources.td @@ -1,22 +1,21 @@ -//===- P9InstrResources.td - P9 Instruction Resource Defs -*- tablegen -*-===// +//===- P9InstrResources.td - P9 Instruction Resource Defs -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // -// This file defines the resources required by P9 instructions. This is part -// P9 processor model used for instruction scheduling. This file should contain -// all of the instructions that may be used on Power 9. This is not just -// instructions that are new on Power 9 but also instructions that were +// This file defines the resources required by P9 instructions. This is part of +// the P9 processor model used for instruction scheduling. This file should +// contain all the instructions that may be used on Power 9.
This is not +// just instructions that are new on Power 9 but also instructions that were // available on earlier architectures and are still used in Power 9. // // The makeup of the P9 CPU is modeled as follows: // - Each CPU is made up of two superslices. // - Each superslice is made up of two slices. Therefore, there are 4 slices -// for each CPU. +// for each CPU. // - Up to 6 instructions can be dispatched to each CPU. Three per superslice. // - Each CPU has: // - One CY (Crypto) unit P9_CY_* @@ -33,9 +32,8 @@ // Two cycle ALU vector operation that uses an entire superslice. // Uses both ALU units (the even ALUE and odd ALUO units), two pipelines -// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. -def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], +// (EXECE, EXECO) and 1 dispatch (DISP) to the given superslice. +def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs (instregex "VADDU(B|H|W|D)M$"), (instregex "VAND(C)?$"), @@ -85,9 +83,9 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, )>; // Restricted Dispatch ALU operation for 3 cycles. The operation runs on a -// slingle slice. However, since it is Restricted it requires all 3 dispatches +// single slice. However, since it is Restricted, it requires all 3 dispatches // (DISP) for that superslice. -def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs (instregex "TABORT(D|W)C(I)?$"), (instregex "MTFSB(0|1)$"), @@ -103,7 +101,7 @@ def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], )>; // Standard Dispatch ALU operation for 3 cycles. Only one slice used. -def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C], (instrs (instregex "XSMAX(C|J)?DP$"), (instregex "XSMIN(C|J)?DP$"), @@ -120,11 +118,11 @@ def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], )>; // Standard Dispatch ALU operation for 2 cycles. Only one slice used. -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], (instrs (instregex "S(L|R)D$"), (instregex "SRAD(I)?$"), - (instregex "EXTSWSLI$"), + (instregex "EXTSWSLI_32_64$"), (instregex "MFV(S)?RD$"), (instregex "MTVSRD$"), (instregex "MTVSRW(A|Z)$"), @@ -160,6 +158,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], XSNEGDP, XSCPSGNDP, MFVSRWZ, + EXTSWSLI, SRADI_32, RLDIC, RFEBB, @@ -171,9 +170,9 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], )>; // Restricted Dispatch ALU operation for 2 cycles. The operation runs on a -// slingle slice. However, since it is Restricted it requires all 3 dispatches -// (DISP) for that superslice. -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// single slice. However, since it is Restricted, it requires all 3 dispatches +// (DISP) for that superslice. +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs (instregex "RLDC(L|R)$"), (instregex "RLWIMI(8)?$"), @@ -200,9 +199,8 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], // Three cycle ALU vector operation that uses an entire superslice. // Uses both ALU units (the even ALUE and odd ALUO units), two pipelines -// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. -def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], +// (EXECE, EXECO) and 1 dispatch (DISP) to the given superslice. 
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs (instregex "M(T|F)VSCR$"), (instregex "VCMPNEZ(B|H|W)$"), @@ -285,10 +283,9 @@ )>; // 7 cycle DP vector operation that uses an entire superslice. -// Uses both DP units (the even DPE and odd DPO units), two pipelines -// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. -def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], +// Uses both DP units (the even DPE and odd DPO units), two pipelines (EXECE, +// EXECO) and all three dispatches (DISP) to the given superslice. +def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs VADDFP, VCTSXS, @@ -395,18 +392,17 @@ VSUMSWS )>; - // 5 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three -// dispatch units for the superslice. -def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// dispatch units for the superslice. +def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs - (instregex "MADD(HD|HDU|LD)$"), + (instregex "MADD(HD|HDU|LD|LD8)$"), (instregex "MUL(HD|HW|LD|LI|LI8|LW)(U)?$") )>; // 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three -// dispatch units for the superslice. -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// dispatch units for the superslice. +def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FRSP, (instregex "FRI(N|P|Z|M)(D|S)$"), @@ -448,26 +444,26 @@ )>; // 7 cycle Restricted DP operation and one 3 cycle ALU operation. -// These operations can be done in parallel. -// The DP is restricted so we need a full 5 dispatches. +// These operations can be done in parallel. The DP is restricted so we need a +// full 4 dispatches. def : InstRW<[P9_DP_7C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "FSEL(D|S)o$") )>; // 5 Cycle Restricted DP operation and one 2 cycle ALU operation. def : InstRW<[P9_DPOpAndALUOp_7C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "MUL(H|L)(D|W)(U)?o$") )>; // 7 cycle Restricted DP operation and one 3 cycle ALU operation. -// These operations must be done sequentially. -// The DP is restricted so we need a full 5 dispatches. +// These operations must be done sequentially. The DP is restricted so we need a +// full 4 dispatches. def : InstRW<[P9_DPOpAndALU2Op_10C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "FRI(N|P|Z|M)(D|S)o$"), (instregex "FRE(S)?o$"), @@ -483,8 +479,8 @@ FRSPo )>; -// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units. -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], +// 7 cycle DP operation. One DP unit, one EXEC pipeline and 1 dispatch unit. +def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C], (instrs XSADDDP, XSADDSP, @@ -520,9 +516,9 @@ )>; // Three Cycle PM operation. Only one PM unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches.
-def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and one +// dispatch. +def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C], (instrs (instregex "LVS(L|R)$"), (instregex "VSPLTIS(W|H|B)$"), @@ -628,9 +624,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], )>; // 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and one +// dispatch. +def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs BCDSRo, XSADDQP, @@ -652,17 +648,17 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], )>; // 23 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and one +// dispatch. +def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs BCDCTSQo )>; // 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and one +// dispatch. +def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs XSMADDQP, XSMADDQPO, @@ -677,39 +673,39 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], )>; // 37 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and one +// dispatch. +def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs BCDCFSQo )>; // 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and one +// dispatch. +def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs XSDIVQP, XSDIVQPO )>; // 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs XSSQRTQP, XSSQRTQPO )>; // 6 Cycle Load uses a single slice. -def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C], (instrs (instregex "LXVL(L)?") )>; // 5 Cycle Load uses a single slice. 
-def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C], (instrs (instregex "LVE(B|H|W)X$"), (instregex "LVX(L)?"), @@ -728,7 +724,7 @@ def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C], )>; // 4 Cycle Load uses a single slice. -def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C], (instrs (instregex "DCB(F|T|ST)(EP)?$"), (instregex "DCBZ(L)?(EP)?$"), @@ -757,8 +753,8 @@ def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C], )>; // 4 Cycle Restricted load uses a single slice but the dispatch for the whole -// superslice. -def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_3SLOTS_1C], (instrs LFIWZX, LFDX, @@ -768,7 +764,7 @@ def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], // Cracked Load Instructions. // Load instructions that can be done in parallel. def : InstRW<[P9_LS_4C, P9_LS_4C, IP_AGEN_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_PAIR_1C], (instrs SLBIA, SLBIE, @@ -782,17 +778,26 @@ def : InstRW<[P9_LS_4C, P9_LS_4C, IP_AGEN_1C, IP_AGEN_1C, // Requires Load and ALU pieces totaling 6 cycles. The Load and ALU // operations can be run in parallel. def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_PAIR_1C, DISP_PAIR_1C], + (instrs + (instregex "L(W|H)ZU(X)?(8)?$") +)>; + +// Cracked TEND Instruction. +// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU +// operations can be run in parallel. +def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C, + DISP_1C, DISP_1C], (instrs - (instregex "L(W|H)ZU(X)?(8)?$"), TEND )>; + // Cracked Store Instruction // Consecutive Store and ALU instructions. The store is restricted and requires // three dispatches. def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "ST(B|H|W|D)CX$") )>; @@ -800,16 +805,16 @@ def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, // Cracked Load Instruction. // Two consecutive load operations for a total of 8 cycles. def : InstRW<[P9_LoadAndLoadOp_8C, IP_AGEN_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs LDMX )>; // Cracked Load instruction. // Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU -// operations cannot be done at the same time and so their latencies are added. +// operations cannot be done at the same time and so their latencies are added. def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs (instregex "LHA(X)?(8)?$"), (instregex "CP_PASTE(8)?o$"), @@ -819,20 +824,19 @@ def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, // Cracked Restricted Load instruction. // Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU -// operations cannot be done at the same time and so their latencies are added. +// operations cannot be done at the same time and so their latencies are added. // Full 6 dispatches are required as this is both cracked and restricted. def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs LFIWAX )>; // Cracked Load instruction. // Requires consecutive Load and ALU pieces totaling 7 cycles. 
The Load and ALU -// operations cannot be done at the same time and so their latencies are added. +// operations cannot be done at the same time and so their latencies are added. // Full 4 dispatches are required as this is a cracked instruction. -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs LXSIWAX, LIWAX @@ -844,7 +848,7 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, // their latencies are added. // Full 6 dispatches are required as this is a restricted instruction. def : InstRW<[P9_LoadAndALU2Op_7C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs LFSX, LFS @@ -852,10 +856,9 @@ def : InstRW<[P9_LoadAndALU2Op_7C, IP_AGEN_1C, IP_EXEC_1C, // Cracked Load instruction. // Requires consecutive Load and ALU pieces totaling 8 cycles. The Load and ALU -// operations cannot be done at the same time and so their latencies are added. +// operations cannot be done at the same time and so their latencies are added. // Full 4 dispatches are required as this is a cracked instruction. -def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs LXSSP, LXSSPX, @@ -866,7 +869,7 @@ def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C, // Cracked 3-Way Load Instruction // Load with two ALU operations that depend on each other def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_PAIR_1C, DISP_PAIR_1C, DISP_1C], (instrs (instregex "LHAU(X)?(8)?$"), LWAUX @@ -874,12 +877,11 @@ def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, // Cracked Load that requires the PM resource. // Since the Load and the PM cannot be done at the same time the latencies are -// added. Requires 8 cycles. -// Since the PM requires the full superslice we need both EXECE, EXECO pipelines -// as well as 3 dispatches for the PM. The Load requires the remaining 2 -// dispatches. +// added. Requires 8 cycles. Since the PM requires the full superslice we need +// both EXECE, EXECO pipelines as well as 1 dispatch for the PM. The Load +// requires the remaining 1 dispatch. def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs LXVH8X, LXVDSX, @@ -887,8 +889,8 @@ def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C, )>; // Single slice Restricted store operation. The restricted operation requires -// all three dispatches for the superslice. -def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], +// all three dispatches for the superslice. +def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_3SLOTS_1C], (instrs (instregex "STF(S|D|IWX|SX|DX)$"), (instregex "STXS(D|DX|SPX|IWX|IBX|IHX|SP)(v)?$"), @@ -905,10 +907,9 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], )>; // Vector Store Instruction -// Requires the whole superslice and therefore requires all three dispatches +// Requires the whole superslice and therefore requires one dispatch // as well as both the Even and Odd exec pipelines. 
-def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, DISP_1C], (instrs (instregex "STVE(B|H|W)X$"), (instregex "STVX(L)?$"), @@ -916,18 +917,18 @@ def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, )>; // 5 Cycle DIV operation. Only one DIV unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and two // dispatches. -def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C], (instrs (instregex "MTCTR(8)?(loop)?$"), (instregex "MTLR(8)?$") )>; // 12 Cycle DIV operation. Only one DIV unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and two // dispatches. -def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C], (instrs (instregex "M(T|F)VRSAVE(v)?$"), (instregex "M(T|F)PMR$"), @@ -938,10 +939,9 @@ def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], )>; // 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, - DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and two +// dispatches. +def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], (instrs DIVW, DIVWU, @@ -949,10 +949,9 @@ def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, )>; // 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, - DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and two +// dispatches. +def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], (instrs DIVWE, DIVD, @@ -964,29 +963,28 @@ def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, )>; // 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, - DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], (instrs DIVDE, DIVDEU )>; // Cracked DIV and ALU operation. Requires one full slice for the ALU operation -// and one full superslice for the DIV operation since there is only one DIV -// per superslice. Latency of DIV plus ALU is 26. +// and one full superslice for the DIV operation since there is only one DIV per +// superslice. Latency of DIV plus ALU is 26. def : InstRW<[P9_IntDivAndALUOp_18C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_EVEN_1C, DISP_1C], (instrs (instregex "DIVW(U)?(O)?o$") )>; // Cracked DIV and ALU operation. Requires one full slice for the ALU operation -// and one full superslice for the DIV operation since there is only one DIV -// per superslice. 
Latency of DIV plus ALU is 26. +// and one full superslice for the DIV operation since there is only one DIV per +// superslice. Latency of DIV plus ALU is 26. def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_EVEN_1C, DISP_1C], (instrs DIVDo, DIVDUo, @@ -995,10 +993,10 @@ def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, )>; // Cracked DIV and ALU operation. Requires one full slice for the ALU operation -// and one full superslice for the DIV operation since there is only one DIV -// per superslice. Latency of DIV plus ALU is 42. +// and one full superslice for the DIV operation since there is only one DIV per +// superslice. Latency of DIV plus ALU is 42. def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_EVEN_1C, DISP_1C], (instrs DIVDEo, DIVDEUo @@ -1008,11 +1006,11 @@ def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, // Cracked, restricted, ALU operations. // Here the two ALU ops can actually be done in parallel and therefore the -// latencies are not added together. Otherwise this is like having two -// instructions running together on two pipelines and 6 dispatches. -// ALU ops are 2 cycles each. +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. ALU ops are +// 2 cycles each. def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs MTCRF, MTCRF8 @@ -1020,11 +1018,11 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, // Cracked ALU operations. // Here the two ALU ops can actually be done in parallel and therefore the -// latencies are not added together. Otherwise this is like having two -// instructions running together on two pipelines and 4 dispatches. -// ALU ops are 2 cycles each. +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 2 dispatches. ALU ops are +// 2 cycles each. def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs (instregex "ADDC(8)?o$"), (instregex "SUBFC(8)?o$") @@ -1036,7 +1034,7 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, // One of the ALU ops is restricted the other is not so we have a total of // 5 dispatches. def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "F(N)?ABS(D|S)o$"), (instregex "FCPSGN(D|S)o$"), @@ -1046,22 +1044,22 @@ def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, // Cracked ALU operations. // Here the two ALU ops can actually be done in parallel and therefore the -// latencies are not added together. Otherwise this is like having two -// instructions running together on two pipelines and 4 dispatches. +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 2 dispatches. // ALU ops are 3 cycles each. def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs MCRFS )>; // Cracked Restricted ALU operations. 
// Here the two ALU ops can actually be done in parallel and therefore the -// latencies are not added together. Otherwise this is like having two -// instructions running together on two pipelines and 6 dispatches. +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. // ALU ops are 3 cycles each. def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs (instregex "MTFSF(b|o)?$"), (instregex "MTFSFI(o)?$") @@ -1071,7 +1069,7 @@ def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, // The two ops cannot be done in parallel. // One of the ALU ops is restricted and takes 3 dispatches. def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "RLD(I)?C(R|L)o$"), (instregex "RLW(IMI|INM|NM)(8)?o$"), @@ -1086,7 +1084,7 @@ def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, // The two ops cannot be done in parallel. // Both of the ALU ops are restricted and take 3 dispatches. def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs (instregex "MFFS(L|CE|o)?$") )>; @@ -1095,143 +1093,141 @@ def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C, // total of 6 cycles. All of the ALU operations are also restricted so each // takes 3 dispatches for a total of 9. def : InstRW<[P9_ALUOpAndALUOpAndALUOp_6C, IP_EXEC_1C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs (instregex "MFCR(8)?$") )>; // Cracked instruction made of two ALU ops. // The two ops cannot be done in parallel. -def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - (instregex "EXTSWSLIo$"), + (instregex "EXTSWSLI_32_64o$"), (instregex "SRAD(I)?o$"), + EXTSWSLIo, SLDo, SRDo, RLDICo )>; // 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. -def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FDIV )>; // 33 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU. def : InstRW<[P9_DPOpAndALU2Op_36C_8, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs FDIVo )>; // 36 Cycle DP Instruction. // Instruction can be done on a single slice. -def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C], (instrs XSSQRTDP )>; // 36 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. -def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FSQRT )>; // 36 Cycle DP Vector Instruction. def : InstRW<[P9_DPE_36C_10, P9_DPO_36C_10, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], + DISP_1C], (instrs XVSQRTDP )>; // 27 Cycle DP Vector Instruction. def : InstRW<[P9_DPE_27C_10, P9_DPO_27C_10, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], + DISP_1C], (instrs XVSQRTSP )>; // 36 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU. 
def : InstRW<[P9_DPOpAndALU2Op_39C_10, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs FSQRTo )>; // 26 Cycle DP Instruction. -def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C], (instrs XSSQRTSP )>; // 26 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. -def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FSQRTS )>; // 26 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU. def : InstRW<[P9_DPOpAndALU2Op_29C_5, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs FSQRTSo )>; -// 33 Cycle DP Instruction. Takes one slice and 2 dispatches. -def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Instruction. Takes one slice and 1 dispatch. +def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C], (instrs XSDIVDP )>; // 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. -def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FDIVS )>; // 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU. def : InstRW<[P9_DPOpAndALU2Op_25C_5, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs FDIVSo )>; -// 22 Cycle DP Instruction. Takes one slice and 2 dispatches. -def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction. Takes one slice and 1 dispatch. +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C], (instrs XSDIVSP )>; // 24 Cycle DP Vector Instruction. Takes one full superslice. -// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given -// superslice. +// Includes both EXECE, EXECO pipelines and 1 dispatch for the given +// superslice. def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], + DISP_1C], (instrs XVDIVSP )>; // 33 Cycle DP Vector Instruction. Takes one full superslice. -// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given -// superslice. +// Includes both EXECE, EXECO pipelines and 1 dispatch for the given +// superslice. def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], + DISP_1C], (instrs XVDIVDP )>; // Instruction cracked into three pieces. One Load and two ALU operations. // The Load and one of the ALU ops cannot be run at the same time and so the -// latencies are added together for 6 cycles. The remainaing ALU is 2 cycles. +// latencies are added together for 6 cycles. The remaining ALU is 2 cycles. // Both the load and the ALU that depends on it are restricted and so they take -// a total of 6 dispatches. The final 2 dispatches come from the second ALU op. +// a total of 7 dispatches. The final 2 dispatches come from the second ALU op. // The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load. def : InstRW<[P9_LoadAndALU2Op_7C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "LF(SU|SUX)$") )>; @@ -1240,7 +1236,7 @@ // the store and so it can be run at the same time as the store. The store is // also restricted.
def : InstRW<[P9_LS_1C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "STF(S|D)U(X)?$"), (instregex "ST(B|H|W|D)U(X)?(8)?$") @@ -1249,20 +1245,19 @@ def : InstRW<[P9_LS_1C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, // Cracked instruction made up of a Load and an ALU. The ALU does not depend on // the load and so it can be run at the same time as the load. def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_PAIR_1C, DISP_PAIR_1C], (instrs (instregex "LBZU(X)?(8)?$"), (instregex "LDU(X)?$") )>; - // Cracked instruction made up of a Load and an ALU. The ALU does not depend on -// the load and so it can be run at the same time as the load. The load is also -// restricted. 3 dispatches are from the restricted load while the other two -// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline -// is required for the ALU. +// the load and so it can be run at the same time as the load. The load is also +// restricted. 3 dispatches are from the restricted load while the other two +// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline +// is required for the ALU. def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "LF(DU|DUX)$") )>; @@ -1270,9 +1265,9 @@ def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, // Crypto Instructions // 6 Cycle CY operation. Only one CY unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and one +// dispatch. +def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C], (instrs (instregex "VPMSUM(B|H|W|D)$"), (instregex "V(N)?CIPHER(LAST)?$"), @@ -1282,14 +1277,14 @@ def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], // Branch Instructions // Two Cycle Branch -def : InstRW<[P9_BR_2C, DISP_1C, DISP_1C], +def : InstRW<[P9_BR_2C, DISP_BR_1C], (instrs (instregex "BCCCTR(L)?(8)?$"), (instregex "BCCL(A|R|RL)?$"), (instregex "BCCTR(L)?(8)?(n)?$"), (instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"), (instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"), - (instregex "BL(_TLS)?$"), + (instregex "BL(_TLS|_NOP)?$"), (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"), (instregex "BLA(8|8_NOP)?$"), (instregex "BLR(8|L)?$"), @@ -1313,8 +1308,7 @@ def : InstRW<[P9_BR_2C, DISP_1C, DISP_1C], // Five Cycle Branch with a 2 Cycle ALU Op // Operations must be done consecutively and not in parallel. 
-def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C, DISP_BR_1C, DISP_1C], (instrs ADDPCIS )>; @@ -1324,17 +1318,15 @@ def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C, // Atomic Load def : InstRW<[P9_LS_1C, P9_LS_1C, P9_LS_4C, P9_LS_4C, P9_LS_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C, IP_AGEN_1C, - IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C], + IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C, + DISP_3SLOTS_1C, DISP_1C, DISP_1C, DISP_1C], (instrs (instregex "L(D|W)AT$") )>; // Atomic Store def : InstRW<[P9_LS_1C, P9_LS_4C, P9_LS_4C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C, - IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C], + IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "ST(D|W)AT$") )>; @@ -1406,6 +1398,7 @@ def : InstRW<[], MBAR, MSYNC, SLBSYNC, + SLBFEEo, NAP, STOP, TRAP, diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index bfc613af3dc0..c6951ab67b08 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -1,9 +1,8 @@ //===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,7 +15,6 @@ #define LLVM_LIB_TARGET_POWERPC_PPC_H #include "llvm/Support/CodeGen.h" -#include "MCTargetDesc/PPCMCTargetDesc.h" // GCC #defines PPC on Linux but we use it as our namespace name #undef PPC @@ -57,12 +55,26 @@ namespace llvm { MCOperand &OutMO, AsmPrinter &AP, bool isDarwin); + void initializePPCCTRLoopsPass(PassRegistry&); +#ifndef NDEBUG + void initializePPCCTRLoopsVerifyPass(PassRegistry&); +#endif + void initializePPCLoopPreIncPrepPass(PassRegistry&); + void initializePPCTOCRegDepsPass(PassRegistry&); + void initializePPCEarlyReturnPass(PassRegistry&); + void initializePPCVSXCopyPass(PassRegistry&); void initializePPCVSXFMAMutatePass(PassRegistry&); + void initializePPCVSXSwapRemovalPass(PassRegistry&); + void initializePPCReduceCRLogicalsPass(PassRegistry&); + void initializePPCBSelPass(PassRegistry&); + void initializePPCBranchCoalescingPass(PassRegistry&); + void initializePPCQPXLoadSplatPass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); void initializePPCPreEmitPeepholePass(PassRegistry &); void initializePPCTLSDynamicCallPass(PassRegistry &); void initializePPCMIPeepholePass(PassRegistry&); + extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index 98e6e98e6974..8e94a2ae15e0 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -1,9 +1,8 @@ //===-- PPC.td - Describe the PowerPC Target Machine -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -136,6 +135,9 @@ def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true", def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true", "Enable VSX instructions", [FeatureAltivec]>; +def FeatureTwoConstNR : + SubtargetFeature<"two-const-nr", "NeedsTwoConstNR", "true", + "Requires two constant Newton-Raphson computation">; def FeatureP8Altivec : SubtargetFeature<"power8-altivec", "HasP8Altivec", "true", "Enable POWER8 Altivec instructions", [FeatureAltivec]>; @@ -162,8 +164,12 @@ def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true", "Enable Hardware Transactional Memory instructions">; def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true", "Implement mftb using the mfspr instruction">; -def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true", - "Target supports add/load integer fusion.">; +def FeaturePPCPreRASched: + SubtargetFeature<"ppc-prera-sched", "UsePPCPreRASchedStrategy", "true", + "Use PowerPC pre-RA scheduling strategy">; +def FeaturePPCPostRASched: + SubtargetFeature<"ppc-postra-sched", "UsePPCPostRASchedStrategy", "true", + "Use PowerPC post-RA scheduling strategy">; def FeatureFloat128 : SubtargetFeature<"float128", "HasFloat128", "true", "Enable the __float128 data type for IEEE-754R Binary128.", @@ -191,6 +197,13 @@ def FeatureP9Vector : SubtargetFeature<"power9-vector", "HasP9Vector", "true", "Enable POWER9 vector instructions", [FeatureISA3_0, FeatureP8Vector, FeatureP9Altivec]>; +// A separate feature for this even though it is equivalent to P9Vector +// because this is a feature of the implementation rather than the architecture +// and may go away with future CPUs. +def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units", + "VectorsUseTwoUnits", + "true", + "Vectors use two units">; // Since new processors generally contain a superset of features of those that // came before them, the idea is to make implementations of new processors @@ -215,15 +228,15 @@ def ProcessorFeatures { FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, FeatureBPERMD, FeatureExtDiv, - FeatureMFTB, DeprecatedDST]; + FeatureMFTB, DeprecatedDST, FeatureTwoConstNR]; list<SubtargetFeature> Power8SpecificFeatures = [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto, - FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic, - FeatureFusion]; + FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic]; list<SubtargetFeature> Power8FeatureList = !listconcat(Power7FeatureList, Power8SpecificFeatures); list<SubtargetFeature> Power9SpecificFeatures = - [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0]; + [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0, + FeatureVectorsUseTwoUnits, FeaturePPCPreRASched, FeaturePPCPostRASched]; list<SubtargetFeature> Power9FeatureList = !listconcat(Power8FeatureList, Power9SpecificFeatures); } @@ -279,10 +292,9 @@ def getNonRecordFormOpcode : InstrMapping { def getAltVSXFMAOpcode : InstrMapping { let FilterClass = "AltVSXFMARel"; - // Instructions with the same BaseName and Interpretation64Bit values - // form a row. + // Instructions with the same BaseName value form a row. let RowFields = ["BaseName"]; - // Instructions with the same RC value form a column. + // Instructions with the same IsVSXFMAAlt value form a column. let ColFields = ["IsVSXFMAAlt"]; // The key column are the (default) addend-killing instructions.
let KeyCol = ["0"]; diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 04aa3c9b1e22..bd87ce06b4fb 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,7 +15,7 @@ // //===----------------------------------------------------------------------===// -#include "InstPrinter/PPCInstPrinter.h" +#include "MCTargetDesc/PPCInstPrinter.h" #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCPredicates.h" @@ -26,6 +25,7 @@ #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "PPCTargetStreamer.h" +#include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" @@ -95,68 +95,102 @@ public: return AsmPrinter::doInitialization(M); } - void EmitInstruction(const MachineInstr *MI) override; + void EmitInstruction(const MachineInstr *MI) override; + + /// This function is for PrintAsmOperand and PrintAsmMemoryOperand, + /// invoked by EmitMSInlineAsmStr and EmitGCCInlineAsmStr only. + /// The \p MI would be INLINEASM ONLY. + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + + void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + const char *ExtraCode, raw_ostream &O) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + const char *ExtraCode, raw_ostream &O) override; + + void EmitEndOfAsmFile(Module &M) override; + + void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI); + void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI); + void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK); + bool runOnMachineFunction(MachineFunction &MF) override { + Subtarget = &MF.getSubtarget<PPCSubtarget>(); + bool Changed = AsmPrinter::runOnMachineFunction(MF); + emitXRayTable(); + return Changed; + } +}; - void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); +/// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux +class PPCLinuxAsmPrinter : public PPCAsmPrinter { +public: + explicit PPCLinuxAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : PPCAsmPrinter(TM, std::move(Streamer)) {} - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + StringRef getPassName() const override { + return "Linux PPC Assembly Printer"; + } - void EmitEndOfAsmFile(Module &M) override; + bool doFinalization(Module &M) override; + void EmitStartOfAsmFile(Module &M) override; - void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI); - void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI); - void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
- bool runOnMachineFunction(MachineFunction &MF) override { - Subtarget = &MF.getSubtarget(); - bool Changed = AsmPrinter::runOnMachineFunction(MF); - emitXRayTable(); - return Changed; - } - }; + void EmitFunctionEntryLabel() override; - /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux - class PPCLinuxAsmPrinter : public PPCAsmPrinter { - public: - explicit PPCLinuxAsmPrinter(TargetMachine &TM, - std::unique_ptr Streamer) - : PPCAsmPrinter(TM, std::move(Streamer)) {} + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + void EmitInstruction(const MachineInstr *MI) override; +}; - StringRef getPassName() const override { - return "Linux PPC Assembly Printer"; - } +/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac +/// OS X +class PPCDarwinAsmPrinter : public PPCAsmPrinter { +public: + explicit PPCDarwinAsmPrinter(TargetMachine &TM, + std::unique_ptr Streamer) + : PPCAsmPrinter(TM, std::move(Streamer)) {} - bool doFinalization(Module &M) override; - void EmitStartOfAsmFile(Module &M) override; + StringRef getPassName() const override { + return "Darwin PPC Assembly Printer"; + } - void EmitFunctionEntryLabel() override; + bool doFinalization(Module &M) override; + void EmitStartOfAsmFile(Module &M) override; +}; - void EmitFunctionBodyStart() override; - void EmitFunctionBodyEnd() override; - void EmitInstruction(const MachineInstr *MI) override; - }; +class PPCAIXAsmPrinter : public PPCAsmPrinter { +public: + PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) + : PPCAsmPrinter(TM, std::move(Streamer)) {} - /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac - /// OS X - class PPCDarwinAsmPrinter : public PPCAsmPrinter { - public: - explicit PPCDarwinAsmPrinter(TargetMachine &TM, - std::unique_ptr Streamer) - : PPCAsmPrinter(TM, std::move(Streamer)) {} + StringRef getPassName() const override { return "AIX PPC Assembly Printer"; } +}; - StringRef getPassName() const override { - return "Darwin PPC Assembly Printer"; - } +} // end anonymous namespace - bool doFinalization(Module &M) override; - void EmitStartOfAsmFile(Module &M) override; - }; +void PPCAsmPrinter::PrintSymbolOperand(const MachineOperand &MO, + raw_ostream &O) { + // Computing the address of a global symbol, not calling it. + const GlobalValue *GV = MO.getGlobal(); + MCSymbol *SymToPrint; + + // External or weakly linked global variables need non-lazily-resolved stubs + if (Subtarget->hasLazyResolverStub(GV)) { + SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo().getGVStubEntry( + SymToPrint); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), + !GV->hasInternalLinkage()); + } else { + SymToPrint = getSymbol(GV); + } -} // end anonymous namespace + SymToPrint->print(O, MAI); + + printOffset(MO.getOffset(), O); +} void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { @@ -165,10 +199,8 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, switch (MO.getType()) { case MachineOperand::MO_Register: { - unsigned Reg = PPCInstrInfo::getRegNumForOperand(MI->getDesc(), - MO.getReg(), OpNo); - - const char *RegName = PPCInstPrinter::getRegisterName(Reg); + // The MI is INLINEASM ONLY and UseVSXReg is always false. + const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg()); // Linux assembler (Others?) 
does not take register mnemonics. // FIXME - What about special registers used in mfspr/mtspr? @@ -192,26 +224,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI); return; case MachineOperand::MO_GlobalAddress: { - // Computing the address of a global symbol, not calling it. - const GlobalValue *GV = MO.getGlobal(); - MCSymbol *SymToPrint; - - // External or weakly linked global variables need non-lazily-resolved stubs - if (Subtarget->hasLazyResolverStub(GV)) { - SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo().getGVStubEntry( - SymToPrint); - if (!StubSym.getPointer()) - StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), - !GV->hasInternalLinkage()); - } else { - SymToPrint = getSymbol(GV); - } - - SymToPrint->print(O, MAI); - - printOffset(MO.getOffset(), O); + PrintSymbolOperand(MO, O); return; } @@ -224,7 +237,6 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, /// PrintAsmOperand - Print out an operand for an inline asm expression. /// bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { @@ -233,9 +245,7 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, switch (ExtraCode[0]) { default: // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); - case 'c': // Don't print "$" before a global var name or constant. - break; // PPC never has a prefix. + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); case 'L': // Write second word of DImode reference. // Verify that this operand has two consecutive registers. if (!MI->getOperand(OpNo).isReg() || @@ -277,7 +287,6 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, // assembler operand. bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0]) { @@ -460,6 +469,7 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, StringRef Name = "__tls_get_addr"; MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol(Name); MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; + const Module *M = MF->getFunction().getParent(); assert(MI->getOperand(0).isReg() && ((Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::X3) || @@ -473,8 +483,14 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, if (!Subtarget->isPPC64() && !Subtarget->isDarwin() && isPositionIndependent()) Kind = MCSymbolRefExpr::VK_PLT; - const MCSymbolRefExpr *TlsRef = + const MCExpr *TlsRef = MCSymbolRefExpr::create(TlsGetAddr, Kind, OutContext); + + // Add 32768 offset to the symbol so we follow up the latest GOT/PLT ABI. 
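// Aside (illustrative, not from the patch): the 32768 bias matches the
// 32-bit secure-PLT ABI. Under -fPIC (PICLevel::BigPIC) the GOT pointer
// in r30 is set to .LTOC, which sits 0x8000 bytes past the start of the
// GOT/.got2 data so that signed 16-bit displacements can reach the whole
// table. A PLT-relative call relocation must therefore carry a matching
// +32768 addend, roughly:
//
//   bl __tls_get_addr(x@tlsgd)@plt+32768
//
// With -fpic (SmallPIC) r30 holds _GLOBAL_OFFSET_TABLE_ itself and no
// bias is needed, which is why the code below checks for BigPIC only.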
+ if (Kind == MCSymbolRefExpr::VK_PLT && Subtarget->isSecurePlt() && + M->getPICLevel() == PICLevel::BigPIC) + TlsRef = MCBinaryExpr::createAdd( + TlsRef, MCConstantExpr::create(32768, OutContext), OutContext); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -576,34 +592,30 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // Into: lwz %rt, .L0$poff - .L0$pb(%ri) // add %rd, %rt, %ri // or into (if secure plt mode is on): - // addis r30, r30, .LTOC - .L0$pb@ha - // addi r30, r30, .LTOC - .L0$pb@l + // addis r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@ha + // addi r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@l // Get the offset from the GOT Base Register to the GOT LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); if (Subtarget->isSecurePlt() && isPositionIndependent() ) { unsigned PICR = TmpInst.getOperand(0).getReg(); - MCSymbol *LTOCSymbol = OutContext.getOrCreateSymbol(StringRef(".LTOC")); + MCSymbol *BaseSymbol = OutContext.getOrCreateSymbol( + M->getPICLevel() == PICLevel::SmallPIC ? "_GLOBAL_OFFSET_TABLE_" + : ".LTOC"); const MCExpr *PB = - MCSymbolRefExpr::create(MF->getPICBaseSymbol(), - OutContext); + MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext); - const MCExpr *LTOCDeltaExpr = - MCBinaryExpr::createSub(MCSymbolRefExpr::create(LTOCSymbol, OutContext), - PB, OutContext); + const MCExpr *DeltaExpr = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(BaseSymbol, OutContext), PB, OutContext); - const MCExpr *LTOCDeltaHi = - PPCMCExpr::createHa(LTOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) - .addReg(PICR) - .addReg(PICR) - .addExpr(LTOCDeltaHi)); + const MCExpr *DeltaHi = PPCMCExpr::createHa(DeltaExpr, false, OutContext); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::ADDIS).addReg(PICR).addReg(PICR).addExpr(DeltaHi)); - const MCExpr *LTOCDeltaLo = - PPCMCExpr::createLo(LTOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) - .addReg(PICR) - .addReg(PICR) - .addExpr(LTOCDeltaLo)); + const MCExpr *DeltaLo = PPCMCExpr::createLo(DeltaExpr, false, OutContext); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::ADDI).addReg(PICR).addReg(PICR).addExpr(DeltaLo)); return; } else { MCSymbol *PICOffset = @@ -1640,6 +1652,9 @@ createPPCAsmPrinterPass(TargetMachine &tm, std::unique_ptr &&Streamer) { if (tm.getTargetTriple().isMacOSX()) return new PPCDarwinAsmPrinter(tm, std::move(Streamer)); + if (tm.getTargetTriple().isOSAIX()) + return new PPCAIXAsmPrinter(tm, std::move(Streamer)); + return new PPCLinuxAsmPrinter(tm, std::move(Streamer)); } diff --git a/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/lib/Target/PowerPC/PPCBoolRetToInt.cpp index 55e105dad0e5..104cf2ba3c00 100644 --- a/lib/Target/PowerPC/PPCBoolRetToInt.cpp +++ b/lib/Target/PowerPC/PPCBoolRetToInt.cpp @@ -1,9 +1,8 @@ //===- PPCBoolRetToInt.cpp ------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp index bbb977f090c5..5e9a661f8f0b 100644 --- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp +++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -1,9 +1,8 @@ //===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -34,10 +33,6 @@ STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced"); STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged"); STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced"); -namespace llvm { - void initializePPCBranchCoalescingPass(PassRegistry&); -} - //===----------------------------------------------------------------------===// // PPCBranchCoalescing //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index 0d1bb9297bcb..793d690baec3 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -1,9 +1,8 @@ //===-- PPCBranchSelector.cpp - Emit long conditional branches ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -26,16 +25,13 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" +#include using namespace llvm; #define DEBUG_TYPE "ppc-branch-select" STATISTIC(NumExpanded, "Number of branches expanded to long format"); -namespace llvm { - void initializePPCBSelPass(PassRegistry&); -} - namespace { struct PPCBSel : public MachineFunctionPass { static char ID; @@ -48,6 +44,17 @@ namespace { // size that is due to potential padding. std::vector> BlockSizes; + // The first block number which has imprecise instruction address. + int FirstImpreciseBlock = -1; + + unsigned GetAlignmentAdjustment(MachineBasicBlock &MBB, unsigned Offset); + unsigned ComputeBlockSizes(MachineFunction &Fn); + void modifyAdjustment(MachineFunction &Fn); + int computeBranchSize(MachineFunction &Fn, + const MachineBasicBlock *Src, + const MachineBasicBlock *Dest, + unsigned BrOffset); + bool runOnMachineFunction(MachineFunction &Fn) override; MachineFunctionProperties getRequiredProperties() const override { @@ -70,43 +77,47 @@ FunctionPass *llvm::createPPCBranchSelectionPass() { return new PPCBSel(); } -bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { - const PPCInstrInfo *TII = - static_cast(Fn.getSubtarget().getInstrInfo()); - // Give the blocks of the function a dense, in-order, numbering. 
- Fn.RenumberBlocks(); - BlockSizes.resize(Fn.getNumBlockIDs()); - - auto GetAlignmentAdjustment = - [](MachineBasicBlock &MBB, unsigned Offset) -> unsigned { - unsigned Align = MBB.getAlignment(); - if (!Align) - return 0; - - unsigned AlignAmt = 1 << Align; - unsigned ParentAlign = MBB.getParent()->getAlignment(); - - if (Align <= ParentAlign) - return OffsetToAlignment(Offset, AlignAmt); - - // The alignment of this MBB is larger than the function's alignment, so we - // can't tell whether or not it will insert nops. Assume that it will. - return AlignAmt + OffsetToAlignment(Offset, AlignAmt); - }; +/// In order to make MBB aligned, we need to add an adjustment value to the +/// original Offset. +unsigned PPCBSel::GetAlignmentAdjustment(MachineBasicBlock &MBB, + unsigned Offset) { + unsigned Align = MBB.getAlignment(); + if (!Align) + return 0; + + unsigned AlignAmt = 1 << Align; + unsigned ParentAlign = MBB.getParent()->getAlignment(); + + if (Align <= ParentAlign) + return OffsetToAlignment(Offset, AlignAmt); + + // The alignment of this MBB is larger than the function's alignment, so we + // can't tell whether or not it will insert nops. Assume that it will. + if (FirstImpreciseBlock < 0) + FirstImpreciseBlock = MBB.getNumber(); + return AlignAmt + OffsetToAlignment(Offset, AlignAmt); +} - // We need to be careful about the offset of the first block in the function - // because it might not have the function's alignment. This happens because, - // under the ELFv2 ABI, for functions which require a TOC pointer, we add a - // two-instruction sequence to the start of the function. - // Note: This needs to be synchronized with the check in - // PPCLinuxAsmPrinter::EmitFunctionBodyStart. +/// We need to be careful about the offset of the first block in the function +/// because it might not have the function's alignment. This happens because, +/// under the ELFv2 ABI, for functions which require a TOC pointer, we add a +/// two-instruction sequence to the start of the function. +/// Note: This needs to be synchronized with the check in +/// PPCLinuxAsmPrinter::EmitFunctionBodyStart. +static inline unsigned GetInitialOffset(MachineFunction &Fn) { unsigned InitialOffset = 0; if (Fn.getSubtarget().isELFv2ABI() && !Fn.getRegInfo().use_empty(PPC::X2)) InitialOffset = 8; + return InitialOffset; +} + +/// Measure each MBB and compute a size for the entire function. +unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) { + const PPCInstrInfo *TII = + static_cast(Fn.getSubtarget().getInstrInfo()); + unsigned FuncSize = GetInitialOffset(Fn); - // Measure each MBB and compute a size for the entire function. - unsigned FuncSize = InitialOffset; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { MachineBasicBlock *MBB = &*MFI; @@ -124,13 +135,145 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { } unsigned BlockSize = 0; - for (MachineInstr &MI : *MBB) + for (MachineInstr &MI : *MBB) { BlockSize += TII->getInstSizeInBytes(MI); + if (MI.isInlineAsm() && (FirstImpreciseBlock < 0)) + FirstImpreciseBlock = MBB->getNumber(); + } BlockSizes[MBB->getNumber()].first = BlockSize; FuncSize += BlockSize; } + return FuncSize; +} + +/// Modify the basic block align adjustment. 
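// Aside: a minimal, self-contained sketch (not LLVM API) of the padding
// arithmetic GetAlignmentAdjustment performs above, for power-of-two
// alignments. The helper name and values are illustrative only.

#include <cassert>
#include <cstdint>

// Bytes of padding needed to raise Offset to the next multiple of AlignAmt.
static uint64_t paddingTo(uint64_t Offset, uint64_t AlignAmt) {
  assert((AlignAmt & (AlignAmt - 1)) == 0 && "expected a power of two");
  return (AlignAmt - (Offset & (AlignAmt - 1))) & (AlignAmt - 1);
}

// Example: a 16-byte-aligned block placed at offset 0x104 needs
// paddingTo(0x104, 16) == 12 bytes of nops. When a block's alignment
// exceeds the function's own alignment, the pass cannot know the true
// start offset, so it conservatively assumes the worst case,
// AlignAmt + paddingTo(Offset, AlignAmt), and records the block as
// imprecise via FirstImpreciseBlock.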
+void PPCBSel::modifyAdjustment(MachineFunction &Fn) {
+  unsigned Offset = GetInitialOffset(Fn);
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = &*MFI;
+
+    if (MBB->getNumber() > 0) {
+      auto &BS = BlockSizes[MBB->getNumber()-1];
+      BS.first -= BS.second;
+      Offset -= BS.second;
+
+      unsigned AlignExtra = GetAlignmentAdjustment(*MBB, Offset);
+
+      BS.first += AlignExtra;
+      BS.second = AlignExtra;
+
+      Offset += AlignExtra;
+    }
+
+    Offset += BlockSizes[MBB->getNumber()].first;
+  }
+}
+
+/// Determine the offset from the branch in Src block to the Dest block.
+/// BrOffset is the offset of the branch instruction inside Src block.
+int PPCBSel::computeBranchSize(MachineFunction &Fn,
+                               const MachineBasicBlock *Src,
+                               const MachineBasicBlock *Dest,
+                               unsigned BrOffset) {
+  int BranchSize;
+  unsigned MaxAlign = 2;
+  bool NeedExtraAdjustment = false;
+  if (Dest->getNumber() <= Src->getNumber()) {
+    // If this is a backwards branch, the delta is the offset from the
+    // start of this block to this branch, plus the sizes of all blocks
+    // from this block to the dest.
+    BranchSize = BrOffset;
+    MaxAlign = std::max(MaxAlign, Src->getAlignment());
+
+    int DestBlock = Dest->getNumber();
+    BranchSize += BlockSizes[DestBlock].first;
+    for (unsigned i = DestBlock+1, e = Src->getNumber(); i < e; ++i) {
+      BranchSize += BlockSizes[i].first;
+      MaxAlign = std::max(MaxAlign,
+                          Fn.getBlockNumbered(i)->getAlignment());
+    }
+
+    NeedExtraAdjustment = (FirstImpreciseBlock >= 0) &&
+                          (DestBlock >= FirstImpreciseBlock);
+  } else {
+    // Otherwise, add the size of the blocks between this block and the
+    // dest to the number of bytes left in this block.
+    unsigned StartBlock = Src->getNumber();
+    BranchSize = BlockSizes[StartBlock].first - BrOffset;
+
+    MaxAlign = std::max(MaxAlign, Dest->getAlignment());
+    for (unsigned i = StartBlock+1, e = Dest->getNumber(); i != e; ++i) {
+      BranchSize += BlockSizes[i].first;
+      MaxAlign = std::max(MaxAlign,
+                          Fn.getBlockNumbered(i)->getAlignment());
+    }
+
+    NeedExtraAdjustment = (FirstImpreciseBlock >= 0) &&
+                          (Src->getNumber() >= FirstImpreciseBlock);
+  }
+
+  // We tend to overestimate code size due to large alignment and inline
+  // assembly. Usually this yields a larger computed branch offset than
+  // the actual one, but sometimes it may also yield a smaller computed
+  // branch offset than the actual branch offset. If the offset is close
+  // to the limit of the encoding, that can cause problems at run time.
+  // The following is a simplified example.
+  //
+  //              actual        estimated
+  //              address       address
+  //    ...
+  //   bne Far      100            10c
+  //   .p2align 4
+  //   Near:        110            110
+  //    ...
+  //   Far:        8108           8108
+  //
+  //   Actual offset:    0x8108 - 0x100 = 0x8008
+  //   Computed offset:  0x8108 - 0x10c = 0x7ffc
+  //
+  // This example also shows when we can get the largest gap between the
+  // estimated offset and the actual offset. If there is an aligned block
+  // ABB between branch and target, assume its alignment is <align> bits.
+  // Now consider the accumulated function size FSIZE till the end of the
+  // previous block PBB. If the estimated FSIZE is a multiple of
+  // 2^<align>, we don't need any padding for the estimated address of
+  // ABB. If the actual FSIZE at the end of PBB is 4 bytes more than a
+  // multiple of 2^<align>, then we need (2^<align> - 4) bytes of
+  // padding, which also means the actual branch offset is (2^<align> - 4)
+  // larger than the computed offset. Any other actual FSIZE needs fewer
+  // padding bytes, so the gap between the actual and computed offsets is
+  // smaller.
+  //
+  // On the other hand, if the inline asm or large alignment occurs
+  // between the branch block and the destination block, the estimated
+  // address can be larger than the actual address. If padding bytes are
+  // needed for a later aligned block, the actual number of padding bytes
+  // is at most <padding> more than the estimated padding bytes. So the
+  // actual aligned block address is less than or equal to the estimated
+  // aligned block address, and therefore the actual branch offset is
+  // less than or equal to the computed branch offset.
+  //
+  // The computed offset is at most ((1 << alignment) - 4) bytes smaller
+  // than the actual offset, so we add this number to the offset for
+  // safety.
+  if (NeedExtraAdjustment)
+    BranchSize += (1 << MaxAlign) - 4;
+
+  return BranchSize;
+}
+
+bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
+  const PPCInstrInfo *TII =
+      static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo());
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+  FirstImpreciseBlock = -1;
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = ComputeBlockSizes(Fn);
+
   // If the entire function is smaller than the displacement of a branch field,
   // we know we don't need to shrink any branches in this function.  This is a
   // common case.
@@ -178,23 +321,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
 
       // Determine the offset from the current branch to the destination
       // block.
-      int BranchSize;
-      if (Dest->getNumber() <= MBB.getNumber()) {
-        // If this is a backwards branch, the delta is the offset from the
-        // start of this block to this branch, plus the sizes of all blocks
-        // from this block to the dest.
-        BranchSize = MBBStartOffset;
-
-        for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
-          BranchSize += BlockSizes[i].first;
-      } else {
-        // Otherwise, add the size of the blocks between this block and the
-        // dest to the number of bytes left in this block.
-        BranchSize = -MBBStartOffset;
-
-        for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
-          BranchSize += BlockSizes[i].first;
-      }
+      int BranchSize = computeBranchSize(Fn, &MBB, Dest, MBBStartOffset);
 
       // If this branch is in range, ignore it.
       if (isInt<16>(BranchSize)) {
@@ -253,26 +380,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
     if (MadeChange) {
       // If we're going to iterate again, make sure we've updated our
       // padding-based contributions to the block sizes.
-      unsigned Offset = InitialOffset;
-      for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
-           ++MFI) {
-        MachineBasicBlock *MBB = &*MFI;
-
-        if (MBB->getNumber() > 0) {
-          auto &BS = BlockSizes[MBB->getNumber()-1];
-          BS.first -= BS.second;
-          Offset -= BS.second;
-
-          unsigned AlignExtra = GetAlignmentAdjustment(*MBB, Offset);
-
-          BS.first += AlignExtra;
-          BS.second = AlignExtra;
-
-          Offset += AlignExtra;
-        }
-
-        Offset += BlockSizes[MBB->getNumber()].first;
-      }
+      modifyAdjustment(Fn);
     }
 
     EverMadeChange |= MadeChange;
diff --git a/lib/Target/PowerPC/PPCCCState.cpp b/lib/Target/PowerPC/PPCCCState.cpp
index 5510a95430f5..5116f0d121f4 100644
--- a/lib/Target/PowerPC/PPCCCState.cpp
+++ b/lib/Target/PowerPC/PPCCCState.cpp
@@ -1,9 +1,8 @@
 //===---- PPCCCState.cpp - CCState with PowerPC specific extensions ---------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCCCState.h b/lib/Target/PowerPC/PPCCCState.h index 9be9f11dbea3..e3499597474c 100644 --- a/lib/Target/PowerPC/PPCCCState.h +++ b/lib/Target/PowerPC/PPCCCState.h @@ -1,9 +1,8 @@ //===---- PPCCCState.h - CCState with PowerPC specific extensions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index 6b9e2383e36f..2b8d9b87724f 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -1,9 +1,8 @@ //===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -72,70 +71,7 @@ using namespace llvm; static cl::opt CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1)); #endif -// The latency of mtctr is only justified if there are more than 4 -// comparisons that will be removed as a result. 
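// Aside (illustrative, not from the patch): the count-register loop this
// pass creates replaces a separate induction update, compare, and branch
// with the hardware bdnz pattern, roughly:
//
//   mtctr r5          # move the trip count into CTR (multi-cycle on many cores)
// .Lbody:
//   ...               # loop body
//   bdnz .Lbody       # decrement CTR, branch while it is nonzero
//
// Because mtctr itself is comparatively slow, the threshold below skips
// loops whose constant trip count is too small for the per-iteration
// savings to pay for it.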
-static cl::opt -SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, - cl::desc("Loops with a constant trip count smaller than " - "this value will not use the count register.")); - -STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops"); - -namespace llvm { - void initializePPCCTRLoopsPass(PassRegistry&); -#ifndef NDEBUG - void initializePPCCTRLoopsVerifyPass(PassRegistry&); -#endif -} - namespace { - struct PPCCTRLoops : public FunctionPass { - -#ifndef NDEBUG - static int Counter; -#endif - - public: - static char ID; - - PPCCTRLoops() : FunctionPass(ID) { - initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } - - private: - bool mightUseCTR(BasicBlock *BB); - bool convertToCTRLoop(Loop *L); - - private: - const PPCTargetMachine *TM; - const PPCSubtarget *STI; - const PPCTargetLowering *TLI; - const DataLayout *DL; - const TargetLibraryInfo *LibInfo; - const TargetTransformInfo *TTI; - LoopInfo *LI; - ScalarEvolution *SE; - DominatorTree *DT; - bool PreserveLCSSA; - TargetSchedModel SchedModel; - }; - - char PPCCTRLoops::ID = 0; -#ifndef NDEBUG - int PPCCTRLoops::Counter = 0; -#endif #ifndef NDEBUG struct PPCCTRLoopsVerify : public MachineFunctionPass { @@ -161,16 +97,6 @@ namespace { #endif // NDEBUG } // end anonymous namespace -INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", - false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", - false, false) - -FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); } - #ifndef NDEBUG INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops Verify", false, false) @@ -183,511 +109,6 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() { } #endif // NDEBUG -bool PPCCTRLoops::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable(); - if (!TPC) - return false; - - TM = &TPC->getTM(); - STI = TM->getSubtargetImpl(F); - TLI = STI->getTargetLowering(); - - LI = &getAnalysis().getLoopInfo(); - SE = &getAnalysis().getSE(); - DT = &getAnalysis().getDomTree(); - TTI = &getAnalysis().getTTI(F); - DL = &F.getParent()->getDataLayout(); - auto *TLIP = getAnalysisIfAvailable(); - LibInfo = TLIP ? &TLIP->getTLI() : nullptr; - PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - - bool MadeChange = false; - - for (LoopInfo::iterator I = LI->begin(), E = LI->end(); - I != E; ++I) { - Loop *L = *I; - if (!L->getParentLoop()) - MadeChange |= convertToCTRLoop(L); - } - - return MadeChange; -} - -static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) { - if (IntegerType *ITy = dyn_cast(Ty)) - return ITy->getBitWidth() > (Is32Bit ? 32U : 64U); - - return false; -} - -// Determining the address of a TLS variable results in a function call in -// certain TLS models. -static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) { - const auto *GV = dyn_cast(MemAddr); - if (!GV) { - // Recurse to check for constants that refer to TLS global variables. 
- if (const auto *CV = dyn_cast(MemAddr)) - for (const auto &CO : CV->operands()) - if (memAddrUsesCTR(TM, CO)) - return true; - - return false; - } - - if (!GV->isThreadLocal()) - return false; - TLSModel::Model Model = TM.getTLSModel(GV); - return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic; -} - -// Loop through the inline asm constraints and look for something that clobbers -// ctr. -static bool asmClobbersCTR(InlineAsm *IA) { - InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); - for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { - InlineAsm::ConstraintInfo &C = CIV[i]; - if (C.Type != InlineAsm::isInput) - for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) - if (StringRef(C.Codes[j]).equals_lower("{ctr}")) - return true; - } - return false; -} - -bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { - for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); - J != JE; ++J) { - if (CallInst *CI = dyn_cast(J)) { - // Inline ASM is okay, unless it clobbers the ctr register. - if (InlineAsm *IA = dyn_cast(CI->getCalledValue())) { - if (asmClobbersCTR(IA)) - return true; - continue; - } - - if (Function *F = CI->getCalledFunction()) { - // Most intrinsics don't become function calls, but some might. - // sin, cos, exp and log are always calls. - unsigned Opcode = 0; - if (F->getIntrinsicID() != Intrinsic::not_intrinsic) { - switch (F->getIntrinsicID()) { - default: continue; - // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr - // we're definitely using CTR. - case Intrinsic::ppc_is_decremented_ctr_nonzero: - case Intrinsic::ppc_mtctr: - return true; - -// VisualStudio defines setjmp as _setjmp -#if defined(_MSC_VER) && defined(setjmp) && \ - !defined(setjmp_undefined_for_msvc) -# pragma push_macro("setjmp") -# undef setjmp -# define setjmp_undefined_for_msvc -#endif - - case Intrinsic::setjmp: - -#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc) - // let's return it to _setjmp state -# pragma pop_macro("setjmp") -# undef setjmp_undefined_for_msvc -#endif - - case Intrinsic::longjmp: - - // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp - // because, although it does clobber the counter register, the - // control can't then return to inside the loop unless there is also - // an eh_sjlj_setjmp. - case Intrinsic::eh_sjlj_setjmp: - - case Intrinsic::memcpy: - case Intrinsic::memmove: - case Intrinsic::memset: - case Intrinsic::powi: - case Intrinsic::log: - case Intrinsic::log2: - case Intrinsic::log10: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::pow: - case Intrinsic::sin: - case Intrinsic::cos: - return true; - case Intrinsic::copysign: - if (CI->getArgOperand(0)->getType()->getScalarType()-> - isPPC_FP128Ty()) - return true; - else - continue; // ISD::FCOPYSIGN is never a library call. 
- case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; - case Intrinsic::floor: Opcode = ISD::FFLOOR; break; - case Intrinsic::ceil: Opcode = ISD::FCEIL; break; - case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; - case Intrinsic::rint: Opcode = ISD::FRINT; break; - case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; - case Intrinsic::round: Opcode = ISD::FROUND; break; - case Intrinsic::minnum: Opcode = ISD::FMINNUM; break; - case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break; - case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break; - case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break; - } - } - - // PowerPC does not use [US]DIVREM or other library calls for - // operations on regular types which are not otherwise library calls - // (i.e. soft float or atomics). If adapting for targets that do, - // additional care is required here. - - LibFunc Func; - if (!F->hasLocalLinkage() && F->hasName() && LibInfo && - LibInfo->getLibFunc(F->getName(), Func) && - LibInfo->hasOptimizedCodeGen(Func)) { - // Non-read-only functions are never treated as intrinsics. - if (!CI->onlyReadsMemory()) - return true; - - // Conversion happens only for FP calls. - if (!CI->getArgOperand(0)->getType()->isFloatingPointTy()) - return true; - - switch (Func) { - default: return true; - case LibFunc_copysign: - case LibFunc_copysignf: - continue; // ISD::FCOPYSIGN is never a library call. - case LibFunc_copysignl: - return true; - case LibFunc_fabs: - case LibFunc_fabsf: - case LibFunc_fabsl: - continue; // ISD::FABS is never a library call. - case LibFunc_sqrt: - case LibFunc_sqrtf: - case LibFunc_sqrtl: - Opcode = ISD::FSQRT; break; - case LibFunc_floor: - case LibFunc_floorf: - case LibFunc_floorl: - Opcode = ISD::FFLOOR; break; - case LibFunc_nearbyint: - case LibFunc_nearbyintf: - case LibFunc_nearbyintl: - Opcode = ISD::FNEARBYINT; break; - case LibFunc_ceil: - case LibFunc_ceilf: - case LibFunc_ceill: - Opcode = ISD::FCEIL; break; - case LibFunc_rint: - case LibFunc_rintf: - case LibFunc_rintl: - Opcode = ISD::FRINT; break; - case LibFunc_round: - case LibFunc_roundf: - case LibFunc_roundl: - Opcode = ISD::FROUND; break; - case LibFunc_trunc: - case LibFunc_truncf: - case LibFunc_truncl: - Opcode = ISD::FTRUNC; break; - case LibFunc_fmin: - case LibFunc_fminf: - case LibFunc_fminl: - Opcode = ISD::FMINNUM; break; - case LibFunc_fmax: - case LibFunc_fmaxf: - case LibFunc_fmaxl: - Opcode = ISD::FMAXNUM; break; - } - } - - if (Opcode) { - EVT EVTy = - TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true); - - if (EVTy == MVT::Other) - return true; - - if (TLI->isOperationLegalOrCustom(Opcode, EVTy)) - continue; - else if (EVTy.isVector() && - TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType())) - continue; - - return true; - } - } - - return true; - } else if (isa(J) && - J->getType()->getScalarType()->isPPC_FP128Ty()) { - // Most operations on ppc_f128 values become calls. 
- return true; - } else if (isa(J) || isa(J) || - isa(J) || isa(J)) { - CastInst *CI = cast(J); - if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() || - CI->getDestTy()->getScalarType()->isPPC_FP128Ty() || - isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) || - isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType())) - return true; - } else if (isLargeIntegerTy(!TM->isPPC64(), - J->getType()->getScalarType()) && - (J->getOpcode() == Instruction::UDiv || - J->getOpcode() == Instruction::SDiv || - J->getOpcode() == Instruction::URem || - J->getOpcode() == Instruction::SRem)) { - return true; - } else if (!TM->isPPC64() && - isLargeIntegerTy(false, J->getType()->getScalarType()) && - (J->getOpcode() == Instruction::Shl || - J->getOpcode() == Instruction::AShr || - J->getOpcode() == Instruction::LShr)) { - // Only on PPC32, for 128-bit integers (specifically not 64-bit - // integers), these might be runtime calls. - return true; - } else if (isa(J) || isa(J)) { - // On PowerPC, indirect jumps use the counter register. - return true; - } else if (SwitchInst *SI = dyn_cast(J)) { - if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries()) - return true; - } - - // FREM is always a call. - if (J->getOpcode() == Instruction::FRem) - return true; - - if (STI->useSoftFloat()) { - switch(J->getOpcode()) { - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FCmp: - return true; - } - } - - for (Value *Operand : J->operands()) - if (memAddrUsesCTR(*TM, Operand)) - return true; - } - - return false; -} -bool PPCCTRLoops::convertToCTRLoop(Loop *L) { - bool MadeChange = false; - - // Do not convert small short loops to CTR loop. - unsigned ConstTripCount = SE->getSmallConstantTripCount(L); - if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) { - SmallPtrSet EphValues; - auto AC = getAnalysis().getAssumptionCache( - *L->getHeader()->getParent()); - CodeMetrics::collectEphemeralValues(L, &AC, EphValues); - CodeMetrics Metrics; - for (BasicBlock *BB : L->blocks()) - Metrics.analyzeBasicBlock(BB, *TTI, EphValues); - // 6 is an approximate latency for the mtctr instruction. - if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth())) - return false; - } - - // Process nested loops first. - for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { - MadeChange |= convertToCTRLoop(*I); - LLVM_DEBUG(dbgs() << "Nested loop converted\n"); - } - - // If a nested loop has been converted, then we can't convert this loop. - if (MadeChange) - return MadeChange; - - // Bail out if the loop has irreducible control flow. - LoopBlocksRPO RPOT(L); - RPOT.perform(LI); - if (containsIrreducibleCFG(RPOT, *LI)) - return false; - -#ifndef NDEBUG - // Stop trying after reaching the limit (if any). - int Limit = CTRLoopLimit; - if (Limit >= 0) { - if (Counter >= CTRLoopLimit) - return false; - Counter++; - } -#endif - - // We don't want to spill/restore the counter register, and so we don't - // want to use the counter register if the loop contains calls. 
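// Aside: CTR is a volatile (caller-saved) register in the PPC ELF ABIs,
// and a callee is free to clobber it, e.g. with its own mtctr loop or a
// bctrl indirect call. A call anywhere in the loop body would therefore
// force a spill and reload of the counter, defeating the optimization,
// which is why the scan below simply rejects such loops.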
- for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); - I != IE; ++I) - if (mightUseCTR(*I)) - return MadeChange; - - SmallVector ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - - // If there is an exit edge known to be frequently taken, - // we should not transform this loop. - for (auto &BB : ExitingBlocks) { - Instruction *TI = BB->getTerminator(); - if (!TI) continue; - - if (BranchInst *BI = dyn_cast(TI)) { - uint64_t TrueWeight = 0, FalseWeight = 0; - if (!BI->isConditional() || - !BI->extractProfMetadata(TrueWeight, FalseWeight)) - continue; - - // If the exit path is more frequent than the loop path, - // we return here without further analysis for this loop. - bool TrueIsExit = !L->contains(BI->getSuccessor(0)); - if (( TrueIsExit && FalseWeight < TrueWeight) || - (!TrueIsExit && FalseWeight > TrueWeight)) - return MadeChange; - } - } - - BasicBlock *CountedExitBlock = nullptr; - const SCEV *ExitCount = nullptr; - BranchInst *CountedExitBranch = nullptr; - for (SmallVectorImpl::iterator I = ExitingBlocks.begin(), - IE = ExitingBlocks.end(); I != IE; ++I) { - const SCEV *EC = SE->getExitCount(L, *I); - LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block " - << (*I)->getName() << ": " << *EC << "\n"); - if (isa(EC)) - continue; - if (const SCEVConstant *ConstEC = dyn_cast(EC)) { - if (ConstEC->getValue()->isZero()) - continue; - } else if (!SE->isLoopInvariant(EC, L)) - continue; - - if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32)) - continue; - - // If this exiting block is contained in a nested loop, it is not eligible - // for insertion of the branch-and-decrement since the inner loop would - // end up messing up the value in the CTR. - if (LI->getLoopFor(*I) != L) - continue; - - // We now have a loop-invariant count of loop iterations (which is not the - // constant zero) for which we know that this loop will not exit via this - // existing block. - - // We need to make sure that this block will run on every loop iteration. - // For this to be true, we must dominate all blocks with backedges. Such - // blocks are in-loop predecessors to the header block. - bool NotAlways = false; - for (pred_iterator PI = pred_begin(L->getHeader()), - PIE = pred_end(L->getHeader()); PI != PIE; ++PI) { - if (!L->contains(*PI)) - continue; - - if (!DT->dominates(*I, *PI)) { - NotAlways = true; - break; - } - } - - if (NotAlways) - continue; - - // Make sure this blocks ends with a conditional branch. - Instruction *TI = (*I)->getTerminator(); - if (!TI) - continue; - - if (BranchInst *BI = dyn_cast(TI)) { - if (!BI->isConditional()) - continue; - - CountedExitBranch = BI; - } else - continue; - - // Note that this block may not be the loop latch block, even if the loop - // has a latch block. - CountedExitBlock = *I; - ExitCount = EC; - break; - } - - if (!CountedExitBlock) - return MadeChange; - - BasicBlock *Preheader = L->getLoopPreheader(); - - // If we don't have a preheader, then insert one. If we already have a - // preheader, then we can use it (except if the preheader contains a use of - // the CTR register because some such uses might be reordered by the - // selection DAG after the mtctr instruction). 
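// Aside (sketch, not from the patch): the rewrite performed below, in IR
// terms. The preheader receives the exit count plus one via the mtctr
// intrinsic, and the counted exit branch is re-pointed at the
// decrement-and-test intrinsic; block names here are illustrative.
//
//   preheader:
//     call void @llvm.ppc.mtctr.i32(i32 %count)
//     br label %body
//   ...
//   latch:
//     %nonzero = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
//     br i1 %nonzero, label %body, label %exit   ; false edge leaves the loop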
- if (!Preheader || mightUseCTR(Preheader)) - Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); - if (!Preheader) - return MadeChange; - - LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() - << "\n"); - - // Insert the count into the preheader and replace the condition used by the - // selected branch. - MadeChange = true; - - SCEVExpander SCEVE(*SE, *DL, "loopcnt"); - LLVMContext &C = SE->getContext(); - Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C); - if (!ExitCount->getType()->isPointerTy() && - ExitCount->getType() != CountType) - ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); - ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType)); - Value *ECValue = - SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator()); - - IRBuilder<> CountBuilder(Preheader->getTerminator()); - Module *M = Preheader->getParent()->getParent(); - Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, - CountType); - CountBuilder.CreateCall(MTCTRFunc, ECValue); - - IRBuilder<> CondBuilder(CountedExitBranch); - Value *DecFunc = - Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero); - Value *NewCond = CondBuilder.CreateCall(DecFunc, {}); - Value *OldCond = CountedExitBranch->getCondition(); - CountedExitBranch->setCondition(NewCond); - - // The false branch must exit the loop. - if (!L->contains(CountedExitBranch->getSuccessor(0))) - CountedExitBranch->swapSuccessors(); - - // The old condition may be dead now, and may have even created a dead PHI - // (the original induction variable). - RecursivelyDeleteTriviallyDeadInstructions(OldCond); - // Run through the basic blocks of the loop and see if any of them have dead - // PHIs that can be removed. - for (auto I : L->blocks()) - DeleteDeadPHIs(I); - - ++NumCTRLoops; - return MadeChange; -} - #ifndef NDEBUG static bool clobbersCTR(const MachineInstr &MI) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { diff --git a/lib/Target/PowerPC/PPCCallingConv.cpp b/lib/Target/PowerPC/PPCCallingConv.cpp new file mode 100644 index 000000000000..77cdf5c939dc --- /dev/null +++ b/lib/Target/PowerPC/PPCCallingConv.cpp @@ -0,0 +1,162 @@ +//===-- PPCCallingConv.h - --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PPCRegisterInfo.h" +#include "PPCCallingConv.h" +#include "PPCSubtarget.h" +#include "PPCCCState.h" +using namespace llvm; + +inline bool CC_PPC_AnyReg_Error(unsigned &, MVT &, MVT &, + CCValAssign::LocInfo &, ISD::ArgFlagsTy &, + CCState &) { + llvm_unreachable("The AnyReg calling convention is only supported by the " \ + "stackmap and patchpoint intrinsics."); + // gracefully fallback to PPC C calling convention on Release builds. 
+ return false; +} + +static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + return true; +} + +static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + static const MCPhysReg ArgRegs[] = { + PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10, + }; + const unsigned NumArgRegs = array_lengthof(ArgRegs); + + unsigned RegNum = State.getFirstUnallocated(ArgRegs); + + // Skip one register if the first unallocated register has an even register + // number and there are still argument registers available which have not been + // allocated yet. RegNum is actually an index into ArgRegs, which means we + // need to skip a register if RegNum is odd. + if (RegNum != NumArgRegs && RegNum % 2 == 1) { + State.AllocateReg(ArgRegs[RegNum]); + } + + // Always return false here, as this function only makes sure that the first + // unallocated register has an odd register number and does not actually + // allocate a register for the current argument. + return false; +} + +static bool CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128( + unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + static const MCPhysReg ArgRegs[] = { + PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10, + }; + const unsigned NumArgRegs = array_lengthof(ArgRegs); + + unsigned RegNum = State.getFirstUnallocated(ArgRegs); + int RegsLeft = NumArgRegs - RegNum; + + // Skip if there is not enough registers left for long double type (4 gpr regs + // in soft float mode) and put long double argument on the stack. + if (RegNum != NumArgRegs && RegsLeft < 4) { + for (int i = 0; i < RegsLeft; i++) { + State.AllocateReg(ArgRegs[RegNum + i]); + } + } + + return false; +} + +static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + static const MCPhysReg ArgRegs[] = { + PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8 + }; + + const unsigned NumArgRegs = array_lengthof(ArgRegs); + + unsigned RegNum = State.getFirstUnallocated(ArgRegs); + + // If there is only one Floating-point register left we need to put both f64 + // values of a split ppc_fp128 value on the stack. + if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) { + State.AllocateReg(ArgRegs[RegNum]); + } + + // Always return false here, as this function only makes sure that the two f64 + // values a ppc_fp128 value is split into are both passed in registers or both + // passed on the stack and does not actually allocate a register for the + // current argument. + return false; +} + +// Split F64 arguments into two 32-bit consecutive registers. +static bool CC_PPC32_SPE_CustomSplitFP64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + static const MCPhysReg HiRegList[] = { PPC::R3, PPC::R5, PPC::R7, PPC::R9 }; + static const MCPhysReg LoRegList[] = { PPC::R4, PPC::R6, PPC::R8, PPC::R10 }; + + // Try to get the first register. 
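// Aside: with SPE, an f64 argument is split into 32-bit halves carried in
// an adjacent GPR pair, the high word from HiRegList and the low word from
// LoRegList: the first f64 lands in R3/R4, the next in R5/R6, then R7/R8
// and R9/R10. If no high register remains, the routine declines (returns
// false) and the value falls through to the stack-assignment rules.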
+ unsigned Reg = State.AllocateReg(HiRegList); + if (!Reg) + return false; + + unsigned i; + for (i = 0; i < sizeof(HiRegList) / sizeof(HiRegList[0]); ++i) + if (HiRegList[i] == Reg) + break; + + unsigned T = State.AllocateReg(LoRegList[i]); + (void)T; + assert(T == LoRegList[i] && "Could not allocate register"); + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +// Same as above, but for return values, so only allocate for R3 and R4 +static bool CC_PPC32_SPE_RetF64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + static const MCPhysReg HiRegList[] = { PPC::R3 }; + static const MCPhysReg LoRegList[] = { PPC::R4 }; + + // Try to get the first register. + unsigned Reg = State.AllocateReg(HiRegList, LoRegList); + if (!Reg) + return false; + + unsigned i; + for (i = 0; i < sizeof(HiRegList) / sizeof(HiRegList[0]); ++i) + if (HiRegList[i] == Reg) + break; + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +#include "PPCGenCallingConv.inc" diff --git a/lib/Target/PowerPC/PPCCallingConv.h b/lib/Target/PowerPC/PPCCallingConv.h index eb904a858592..03d9be0a73d9 100644 --- a/lib/Target/PowerPC/PPCCallingConv.h +++ b/lib/Target/PowerPC/PPCCallingConv.h @@ -1,9 +1,8 @@ //=== PPCCallingConv.h - PPC Custom Calling Convention Routines -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,14 +19,27 @@ namespace llvm { -inline bool CC_PPC_AnyReg_Error(unsigned &, MVT &, MVT &, - CCValAssign::LocInfo &, ISD::ArgFlagsTy &, - CCState &) { - llvm_unreachable("The AnyReg calling convention is only supported by the " \ - "stackmap and patchpoint intrinsics."); - // gracefully fallback to PPC C calling convention on Release builds. 
- return false; -} +bool RetCC_PPC(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool RetCC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool RetCC_PPC_Cold(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool CC_PPC32_SVR4(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool CC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool CC_PPC32_SVR4_ByVal(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); +bool CC_PPC32_SVR4_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); } // End llvm namespace diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td index 22842d516e7d..369b9ce1a711 100644 --- a/lib/Target/PowerPC/PPCCallingConv.td +++ b/lib/Target/PowerPC/PPCCallingConv.td @@ -1,9 +1,8 @@ //===- PPCCallingConv.td - Calling Conventions for PowerPC -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -46,6 +45,7 @@ def RetCC_PPC64_AnyReg : CallingConv<[ ]>; // Return-value convention for PowerPC coldcc. +let Entry = 1 in def RetCC_PPC_Cold : CallingConv<[ // Use the same return registers as RetCC_PPC, but limited to only // one return value. The remaining return values will be saved to @@ -70,6 +70,7 @@ def RetCC_PPC_Cold : CallingConv<[ ]>; // Return-value convention for PowerPC +let Entry = 1 in def RetCC_PPC : CallingConv<[ CCIfCC<"CallingConv::AnyReg", CCDelegateTo>, @@ -90,7 +91,7 @@ def RetCC_PPC : CallingConv<[ CCIfSubtarget<"hasSPE()", CCIfType<[f32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>, CCIfSubtarget<"hasSPE()", - CCIfType<[f64], CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>, + CCIfType<[f64], CCCustom<"CC_PPC32_SPE_RetF64">>>, // For P9, f128 are passed in vector registers. CCIfType<[f128], @@ -126,6 +127,7 @@ def CC_PPC64_AnyReg : CallingConv<[ // Simple calling convention for 64-bit ELF PowerPC fast isel. // Only handle ints and floats. All ints are promoted to i64. // Vector types and quadword ints are not handled. +let Entry = 1 in def CC_PPC64_ELF_FIS : CallingConv<[ CCIfCC<"CallingConv::AnyReg", CCDelegateTo>, @@ -141,6 +143,7 @@ def CC_PPC64_ELF_FIS : CallingConv<[ // All small ints are promoted to i64. Vector types, quadword ints, // and multiple register returns are "supported" to avoid compile // errors, but none are handled by the fast selector. 
+let Entry = 1 in def RetCC_PPC64_ELF_FIS : CallingConv<[ CCIfCC<"CallingConv::AnyReg", CCDelegateTo>, @@ -179,6 +182,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[ CCIfType<[i32], CCIfSplit>>>, + CCIfType<[f64], + CCIfSubtarget<"hasSPE()", + CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>, CCIfSplit>>>, @@ -199,7 +205,7 @@ def CC_PPC32_SVR4_Common : CallingConv<[ CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>, CCIfType<[f64], CCIfSubtarget<"hasSPE()", - CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>, + CCCustom<"CC_PPC32_SPE_CustomSplitFP64">>>, CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>, @@ -228,12 +234,14 @@ def CC_PPC32_SVR4_Common : CallingConv<[ // This calling convention puts vector arguments always on the stack. It is used // to assign vector arguments which belong to the variable portion of the // parameter list of a variable argument function. +let Entry = 1 in def CC_PPC32_SVR4_VarArg : CallingConv<[ CCDelegateTo ]>; // In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to // put vector arguments in vector registers before putting them on the stack. +let Entry = 1 in def CC_PPC32_SVR4 : CallingConv<[ // QPX vectors mirror the scalar FP convention. CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()", @@ -265,6 +273,7 @@ def CC_PPC32_SVR4 : CallingConv<[ // The only purpose of CC_PPC32_SVR4_Custom_Dummy is to skip arguments which are // not passed by value. +let Entry = 1 in def CC_PPC32_SVR4_ByVal : CallingConv<[ CCIfByVal>, @@ -300,6 +309,13 @@ def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>; def CSR_SVR432_SPE : CalleeSavedRegs<(add CSR_SVR432_COMM, CSR_SPE)>; +def CSR_AIX32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, + R29, R30, R31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4 + )>; + def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20, X21, X22, X23, X24, X25, X26, X27, X28, X29, X30, X31, F14, F15, F16, F17, F18, @@ -316,6 +332,13 @@ def CSR_SVR464 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20, F27, F28, F29, F30, F31, CR2, CR3, CR4 )>; +def CSR_AIX64 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20, + X21, X22, X23, X24, X25, X26, X27, X28, + X29, X30, X31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4 + )>; + // CSRs that are handled by prologue, epilogue. def CSR_SRV464_TLS_PE : CalleeSavedRegs<(add)>; @@ -343,15 +366,22 @@ def CSR_NoRegs : CalleeSavedRegs<(add)>; // and value may be altered by inter-library calls. // Do not include r12 as it is used as a scratch register. // Do not include return registers r3, f1, v2. 
-def CSR_SVR32_ColdCC : CalleeSavedRegs<(add (sequence "R%u", 4, 10), - (sequence "R%u", 14, 31), - F0, (sequence "F%u", 2, 31), - (sequence "CR%u", 0, 7))>; +def CSR_SVR32_ColdCC_Common : CalleeSavedRegs<(add (sequence "R%u", 4, 10), + (sequence "R%u", 14, 31), + (sequence "CR%u", 0, 7))>; + +def CSR_SVR32_ColdCC : CalleeSavedRegs<(add CSR_SVR32_ColdCC_Common, + F0, (sequence "F%u", 2, 31))>; + def CSR_SVR32_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR32_ColdCC, (sequence "V%u", 0, 1), (sequence "V%u", 3, 31))>; +def CSR_SVR32_ColdCC_SPE : CalleeSavedRegs<(add CSR_SVR32_ColdCC_Common, + (sequence "S%u", 4, 10), + (sequence "S%u", 14, 31))>; + def CSR_SVR64_ColdCC : CalleeSavedRegs<(add (sequence "X%u", 4, 10), (sequence "X%u", 14, 31), F0, (sequence "F%u", 2, 31), diff --git a/lib/Target/PowerPC/PPCEarlyReturn.cpp b/lib/Target/PowerPC/PPCEarlyReturn.cpp index ac931f7d0ec0..aa5d830b549e 100644 --- a/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -1,9 +1,8 @@ //===------------- PPCEarlyReturn.cpp - Form Early Returns ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -37,10 +36,6 @@ using namespace llvm; STATISTIC(NumBCLR, "Number of early conditional returns"); STATISTIC(NumBLR, "Number of early returns"); -namespace llvm { - void initializePPCEarlyReturnPass(PassRegistry&); -} - namespace { // PPCEarlyReturn pass - For simple functions without epilogue code, move // returns up, and create conditional returns, to avoid unnecessary @@ -184,11 +179,11 @@ public: // nothing to do. if (MF.size() < 2) return Changed; - - for (MachineFunction::iterator I = MF.begin(); I != MF.end();) { + + // We can't use a range-based for loop due to clobbering the iterator. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E;) { MachineBasicBlock &B = *I++; - if (processBlock(B)) - Changed = true; + Changed |= processBlock(B); } return Changed; diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp index a03e691ef5bb..e8ef451c7ec9 100644 --- a/lib/Target/PowerPC/PPCExpandISEL.cpp +++ b/lib/Target/PowerPC/PPCExpandISEL.cpp @@ -1,9 +1,8 @@ //===------------- PPCExpandISEL.cpp - Expand ISEL instruction ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index 3b2d92db78b9..264d6b590f95 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -1,9 +1,8 @@ //===-- PPCFastISel.cpp - PowerPC FastISel implementation -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -152,6 +151,14 @@ class PPCFastISel final : public FastISel { bool isVSSRCRegClass(const TargetRegisterClass *RC) const { return RC->getID() == PPC::VSSRCRegClassID; } + unsigned copyRegToRegClass(const TargetRegisterClass *ToRC, + unsigned SrcReg, unsigned Flag = 0, + unsigned SubReg = 0) { + unsigned TmpReg = createResultReg(ToRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), TmpReg).addReg(SrcReg, Flag, SubReg); + return TmpReg; + } bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt, unsigned DestReg, const PPC::Predicate Pred); @@ -187,7 +194,6 @@ class PPCFastISel final : public FastISel { unsigned &NumBytes, bool IsVarArg); bool finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes); - LLVM_ATTRIBUTE_UNUSED CCAssignFn *usePPC32CCs(unsigned Flag); private: #include "PPCGenFastISel.inc" @@ -196,23 +202,6 @@ class PPCFastISel final : public FastISel { } // end anonymous namespace -#include "PPCGenCallingConv.inc" - -// Function whose sole purpose is to kill compiler warnings -// stemming from unused functions included from PPCGenCallingConv.inc. -CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) { - if (Flag == 1) - return CC_PPC32_SVR4; - else if (Flag == 2) - return CC_PPC32_SVR4_ByVal; - else if (Flag == 3) - return CC_PPC32_SVR4_VarArg; - else if (Flag == 4) - return RetCC_PPC_Cold; - else - return RetCC_PPC; -} - static Optional getComparePred(CmpInst::Predicate Pred) { switch (Pred) { // These are not representable with any single compare. @@ -874,7 +863,10 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, unsigned CmpOpc; bool NeedsExt = false; - auto RC = MRI.getRegClass(SrcReg1); + + auto RC1 = MRI.getRegClass(SrcReg1); + auto RC2 = SrcReg2 != 0 ? MRI.getRegClass(SrcReg2) : nullptr; + switch (SrcVT.SimpleTy) { default: return false; case MVT::f32: @@ -893,12 +885,10 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, } } else { CmpOpc = PPC::FCMPUS; - if (isVSSRCRegClass(RC)) { - unsigned TmpReg = createResultReg(&PPC::F4RCRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), TmpReg).addReg(SrcReg1); - SrcReg1 = TmpReg; - } + if (isVSSRCRegClass(RC1)) + SrcReg1 = copyRegToRegClass(&PPC::F4RCRegClass, SrcReg1); + if (RC2 && isVSSRCRegClass(RC2)) + SrcReg2 = copyRegToRegClass(&PPC::F4RCRegClass, SrcReg2); } break; case MVT::f64: @@ -915,7 +905,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, CmpOpc = PPC::EFDCMPGT; break; } - } else if (isVSFRCRegClass(RC)) { + } else if (isVSFRCRegClass(RC1) || (RC2 && isVSFRCRegClass(RC2))) { CmpOpc = PPC::XSCMPUDP; } else { CmpOpc = PPC::FCMPUD; @@ -997,12 +987,17 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) { // Round the result to single precision. 
unsigned DestReg; - + auto RC = MRI.getRegClass(SrcReg); if (PPCSubTarget->hasSPE()) { DestReg = createResultReg(&PPC::SPE4RCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::EFSCFD), DestReg) .addReg(SrcReg); + } else if (isVSFRCRegClass(RC)) { + DestReg = createResultReg(&PPC::VSSRCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(PPC::XSRSP), DestReg) + .addReg(SrcReg); } else { DestReg = createResultReg(&PPC::F4RCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -1217,21 +1212,19 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { if (SrcReg == 0) return false; - // Convert f32 to f64 if necessary. This is just a meaningless copy - // to get the register class right. + // Convert f32 to f64 or convert VSSRC to VSFRC if necessary. This is just a + // meaningless copy to get the register class right. const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg); - if (InRC == &PPC::F4RCRegClass) { - unsigned TmpReg = createResultReg(&PPC::F8RCRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), TmpReg) - .addReg(SrcReg); - SrcReg = TmpReg; - } + if (InRC == &PPC::F4RCRegClass) + SrcReg = copyRegToRegClass(&PPC::F8RCRegClass, SrcReg); + else if (InRC == &PPC::VSSRCRegClass) + SrcReg = copyRegToRegClass(&PPC::VSFRCRegClass, SrcReg); // Determine the opcode for the conversion, which takes place - // entirely within FPRs. + // entirely within FPRs or VSRs. unsigned DestReg; unsigned Opc; + auto RC = MRI.getRegClass(SrcReg); if (PPCSubTarget->hasSPE()) { DestReg = createResultReg(&PPC::GPRCRegClass); @@ -1239,6 +1232,12 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ; else Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ; + } else if (isVSFRCRegClass(RC)) { + DestReg = createResultReg(&PPC::VSFRCRegClass); + if (DstVT == MVT::i32) + Opc = IsSigned ? PPC::XSCVDPSXWS : PPC::XSCVDPUXWS; + else + Opc = IsSigned ? PPC::XSCVDPSXDS : PPC::XSCVDPUXDS; } else { DestReg = createResultReg(&PPC::F8RCRegClass); if (DstVT == MVT::i32) @@ -1520,11 +1519,7 @@ bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumByte if (RetVT == CopyVT) { const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT); - ResultReg = createResultReg(CpyRC); - - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg) - .addReg(SourcePhysReg); + ResultReg = copyRegToRegClass(CpyRC, SourcePhysReg); // If necessary, round the floating result to single precision. } else if (CopyVT == MVT::f64) { @@ -1537,12 +1532,9 @@ bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumByte // used along the fast-isel path (not lowered), and downstream logic // also doesn't like a direct subreg copy on a physical reg.) } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) { - ResultReg = createResultReg(&PPC::GPRCRegClass); // Convert physical register from G8RC to GPRC. SourcePhysReg -= PPC::X0 - PPC::R0; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg) - .addReg(SourcePhysReg); + ResultReg = copyRegToRegClass(&PPC::GPRCRegClass, SourcePhysReg); } assert(ResultReg && "ResultReg unset!"); @@ -1894,13 +1886,8 @@ bool PPCFastISel::SelectTrunc(const Instruction *I) { return false; // The only interesting case is when we need to switch register classes. 
- if (SrcVT == MVT::i64) { - unsigned ResultReg = createResultReg(&PPC::GPRCRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - ResultReg).addReg(SrcReg, 0, PPC::sub_32); - SrcReg = ResultReg; - } + if (SrcVT == MVT::i64) + SrcReg = copyRegToRegClass(&PPC::GPRCRegClass, SrcReg, 0, PPC::sub_32); updateValueMap(I, SrcReg); return true; @@ -1977,6 +1964,13 @@ bool PPCFastISel::fastSelectInstruction(const Instruction *I) { case Instruction::Sub: return SelectBinaryIntOp(I, ISD::SUB); case Instruction::Call: + // On AIX, call lowering uses the DAG-ISEL path currently so that the + // callee of the direct function call instruction will be mapped to the + // symbol for the function's entry point, which is distinct from the + // function descriptor symbol. The latter is the symbol whose XCOFF symbol + // name is the C-linkage name of the source level function. + if (TM.getTargetTriple().isOSAIX()) + break; return selectCall(I); case Instruction::Ret: return SelectRet(I); diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 8263954994d2..ebfb1ef7f49b 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1,9 +1,8 @@ //===-- PPCFrameLowering.cpp - PPC Frame Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -30,7 +29,6 @@ using namespace llvm; #define DEBUG_TYPE "framelowering" -STATISTIC(NumNoNeedForFrame, "Number of functions without frames"); STATISTIC(NumPESpillVSR, "Number of spills to vector in prologue"); STATISTIC(NumPEReloadVSR, "Number of reloads from vector in epilogue"); @@ -73,10 +71,10 @@ static unsigned computeFramePointerSaveOffset(const PPCSubtarget &STI) { } static unsigned computeLinkageSize(const PPCSubtarget &STI) { - if (STI.isDarwinABI() || STI.isPPC64()) + if ((STI.isDarwinABI() || STI.isAIXABI()) || STI.isPPC64()) return (STI.isELFv2ABI() ? 4 : 6) * (STI.isPPC64() ? 8 : 4); - // SVR4 ABI: + // 32-bit SVR4 ABI: return 8; } @@ -446,12 +444,27 @@ static bool MustSaveLR(const MachineFunction &MF, unsigned LR) { return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired(); } +/// determineFrameLayoutAndUpdate - Determine the size of the frame and maximum +/// call frame size. Update the MachineFunction object with the stack size. +unsigned +PPCFrameLowering::determineFrameLayoutAndUpdate(MachineFunction &MF, + bool UseEstimate) const { + unsigned NewMaxCallFrameSize = 0; + unsigned FrameSize = determineFrameLayout(MF, UseEstimate, + &NewMaxCallFrameSize); + MF.getFrameInfo().setStackSize(FrameSize); + MF.getFrameInfo().setMaxCallFrameSize(NewMaxCallFrameSize); + return FrameSize; +} + /// determineFrameLayout - Determine the size of the frame and maximum call /// frame size. 
-unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, - bool UpdateMF, - bool UseEstimate) const { - MachineFrameInfo &MFI = MF.getFrameInfo(); +unsigned +PPCFrameLowering::determineFrameLayout(const MachineFunction &MF, + bool UseEstimate, + unsigned *NewMaxCallFrameSize) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); // Get the number of bytes to allocate from the FrameInfo unsigned FrameSize = @@ -469,6 +482,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. !MustSaveLR(MF, LR) && // No need to save LR. + !FI->mustSaveTOC() && // No need to save TOC. !RegInfo->hasBasePointer(MF); // No special alignment. // Note: for PPC32 SVR4ABI (Non-DarwinABI), we can still generate stackless // Check whether we can skip adjusting the stack pointer (by using red zone) if (!DisableRedZone && CanUseRedZone && FitsInRedZone) { - NumNoNeedForFrame++; // No need for frame - if (UpdateMF) - MFI.setStackSize(0); return 0; } @@ -496,9 +507,9 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, if (MFI.hasVarSizedObjects()) maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask; - // Update maximum call frame size. - if (UpdateMF) - MFI.setMaxCallFrameSize(maxCallFrameSize); + // Update the new max call frame size if the caller passes in a valid pointer. + if (NewMaxCallFrameSize) + *NewMaxCallFrameSize = maxCallFrameSize; // Include call frame size in total. FrameSize += maxCallFrameSize; @@ -506,10 +517,6 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, // Make sure the frame is aligned. FrameSize = (FrameSize + AlignMask) & ~AlignMask; - // Update frame info. - if (UpdateMF) - MFI.setStackSize(FrameSize); - return FrameSize; } @@ -690,7 +697,7 @@ PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const { const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MachineFunction &MF = *(MBB->getParent()); bool HasBP = RegInfo->hasBasePointer(MF); - unsigned FrameSize = determineFrameLayout(MF, false); + unsigned FrameSize = determineFrameLayout(MF); int NegFrameSize = -FrameSize; bool IsLargeFrame = !isInt<16>(NegFrameSize); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -713,6 +720,50 @@ bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { return findScratchRegister(TmpMBB, true); } +bool PPCFrameLowering::stackUpdateCanBeMoved(MachineFunction &MF) const { + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + + // Abort if there is no register info or function info. + if (!RegInfo || !FI) + return false; + + // Only move the stack update on ELFv2 ABI and PPC64. + if (!Subtarget.isELFv2ABI() || !Subtarget.isPPC64()) + return false; + + // Check the frame size first and return false if it does not fit the + // requirements. + // We need a non-zero frame size as well as a frame that will fit in the red + // zone. This is because by moving the stack pointer update we are now storing + // to the red zone until the stack pointer is updated. If we get an interrupt + // inside the prologue but before the stack update we now have a number of + // stores to the red zone and those stores must all fit.
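// (Illustrative sketch, not patch text: on 64-bit ELFv2 the red zone is the
// 288 bytes below the stack pointer, so a 160-byte frame would pass the
// check below while a 512-byte frame would not.)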
+ MachineFrameInfo &MFI = MF.getFrameInfo(); + unsigned FrameSize = MFI.getStackSize(); + if (!FrameSize || FrameSize > Subtarget.getRedZoneSize()) + return false; + + // Frame pointers and base pointers complicate matters so don't do anything + // if we have them. For example having a frame pointer will sometimes require + // a copy of r1 into r31 and that makes keeping track of updates to r1 more + // difficult. + if (hasFP(MF) || RegInfo->hasBasePointer(MF)) + return false; + + // Calls to fast_cc functions use different rules for passing parameters on + // the stack from the ABI and using PIC base in the function imposes + // similar restrictions to using the base pointer. It is not generally safe + // to move the stack pointer update in these situations. + if (FI->hasFastCall() || FI->usesPICBase()) + return false; + + // Finally we can move the stack update if we do not require register + // scavenging. Register scavenging can introduce more spills and so + // may make the frame size larger than we have computed. + return !RegInfo->requiresFrameIndexScavenging(MF); +} + void PPCFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -748,7 +799,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, MBBI = MBB.begin(); // Work out frame sizes. - unsigned FrameSize = determineFrameLayout(MF); + unsigned FrameSize = determineFrameLayoutAndUpdate(MF); int NegFrameSize = -FrameSize; if (!isInt<32>(NegFrameSize)) llvm_unreachable("Unhandled stack size!"); @@ -759,6 +810,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // Check if the link register (LR) must be saved. PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); bool MustSaveLR = FI->mustSaveLR(); + bool MustSaveTOC = FI->mustSaveTOC(); const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs(); bool MustSaveCR = !MustSaveCRs.empty(); // Do we have a frame pointer and/or base pointer for this function? @@ -770,6 +822,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; + unsigned TOCReg = isPPC64 ? PPC::X2 : PPC::R2; unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg // ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.) @@ -855,6 +908,45 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, assert((isPPC64 || !MustSaveCR) && "Prologue CR saving supported only in 64-bit mode"); + // Check if we can move the stack update instruction (stdu) down the prologue + // past the callee saves. Hopefully this will avoid the situation where the + // saves are waiting for the store-with-update to complete. + MachineBasicBlock::iterator StackUpdateLoc = MBBI; + bool MovingStackUpdateDown = false; + + // Check if we can move the stack update. + if (stackUpdateCanBeMoved(MF)) { + const std::vector<CalleeSavedInfo> &Info = MFI.getCalleeSavedInfo(); + for (CalleeSavedInfo CSI : Info) { + int FrIdx = CSI.getFrameIdx(); + // If the frame index is not negative the callee saved info belongs to a + // stack object that is not a fixed stack object. We ignore non-fixed + // stack objects because we won't move the stack pointer update past them.
+ if (FrIdx >= 0) + continue; + + if (MFI.isFixedObjectIndex(FrIdx) && MFI.getObjectOffset(FrIdx) < 0) { + StackUpdateLoc++; + MovingStackUpdateDown = true; + } else { + // We need all of the Frame Indices to meet these conditions. + // If they do not, abort the whole operation. + StackUpdateLoc = MBBI; + MovingStackUpdateDown = false; + break; + } + } + + // If the operation was not aborted then update the object offset. + if (MovingStackUpdateDown) { + for (CalleeSavedInfo CSI : Info) { + int FrIdx = CSI.getFrameIdx(); + if (FrIdx < 0) + MFI.setObjectOffset(FrIdx, MFI.getObjectOffset(FrIdx) + NegFrameSize); + } + } + } + // If we need to spill the CR and the LR but we don't have two separate // registers available, we must spill them one at a time if (MustSaveCR && SingleScratchReg && MustSaveLR) { @@ -918,7 +1010,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, } if (MustSaveLR) - BuildMI(MBB, MBBI, dl, StoreInst) + BuildMI(MBB, StackUpdateLoc, dl, StoreInst) .addReg(ScratchReg, getKillRegState(true)) .addImm(LROffset) .addReg(SPReg); @@ -986,7 +1078,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, HasSTUX = true; } else if (!isLargeFrame) { - BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg) + BuildMI(MBB, StackUpdateLoc, dl, StoreUpdtInst, SPReg) .addReg(SPReg) .addImm(NegFrameSize) .addReg(SPReg); @@ -1004,6 +1096,16 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, HasSTUX = true; } + // Save the TOC register after the stack pointer update if a prologue TOC + // save is required for the function. + if (MustSaveTOC) { + assert(isELFv2ABI && "TOC saves in the prologue only supported on ELFv2"); + BuildMI(MBB, StackUpdateLoc, dl, TII.get(PPC::STD)) + .addReg(TOCReg, getKillRegState(true)) + .addImm(TOCSaveOffset) + .addReg(SPReg); + } + if (!HasRedZone) { assert(!isPPC64 && "A red zone is always available on PPC64"); if (HasSTUX) { @@ -1205,6 +1307,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, if (PPC::CRBITRCRegClass.contains(Reg)) continue; + if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC) + continue; + // For SVR4, don't emit a move for the CR spill slot if we haven't // spilled CRs. if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4) @@ -1234,6 +1339,12 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIRegister); } else { int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx()); + // We have changed the object offset above but we do not want to change + // the actual offsets in the CFI instruction so we have to undo the + // offset change here. + if (MovingStackUpdateDown) + Offset -= NegFrameSize; + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -1380,6 +1491,32 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, unsigned RBReg = SPReg; unsigned SPAdd = 0; + // Check if we can move the stack update instruction up the epilogue + // past the callee saves. This will allow the move to LR instruction + // to be executed before the restores of the callee saves which means + // that the callee saves can hide the latency from the MTLR instruction.
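// (Rough sketch of the intended reordering, illustrative only:
//   before: <reload CSRs> ; addi r1, r1, N ; ld r0, 16(r1) ; mtlr r0 ; blr
//   after:  addi r1, r1, N ; ld r0, 16(r1) ; mtlr r0 ;
//           <reload CSRs from below the new SP, i.e. the red zone> ; blr
// so the mtlr latency overlaps with the callee-saved reloads.)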
+ MachineBasicBlock::iterator StackUpdateLoc = MBBI; + if (stackUpdateCanBeMoved(MF)) { + const std::vector & Info = MFI.getCalleeSavedInfo(); + for (CalleeSavedInfo CSI : Info) { + int FrIdx = CSI.getFrameIdx(); + // If the frame index is not negative the callee saved info belongs to a + // stack object that is not a fixed stack object. We ignore non-fixed + // stack objects because we won't move the update of the stack pointer + // past them. + if (FrIdx >= 0) + continue; + + if (MFI.isFixedObjectIndex(FrIdx) && MFI.getObjectOffset(FrIdx) < 0) + StackUpdateLoc--; + else { + // Abort the operation as we can't update all CSR restores. + StackUpdateLoc = MBBI; + break; + } + } + } + if (FrameSize) { // In the prologue, the loaded (or persistent) stack pointer value is // offset by the STDU/STDUX/STWU/STWUX instruction. For targets with red @@ -1409,7 +1546,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, } } else if (!isLargeFrame && !HasBP && !MFI.hasVarSizedObjects()) { if (HasRedZone) { - BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + BuildMI(MBB, StackUpdateLoc, dl, AddImmInst, SPReg) .addReg(SPReg) .addImm(FrameSize); } else { @@ -1433,7 +1570,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, .addReg(FPReg); RBReg = FPReg; } - BuildMI(MBB, MBBI, dl, LoadInst, RBReg) + BuildMI(MBB, StackUpdateLoc, dl, LoadInst, RBReg) .addImm(0) .addReg(SPReg); } @@ -1466,7 +1603,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // a base register anyway, because it may happen to be R0. bool LoadedLR = false; if (MustSaveLR && RBReg == SPReg && isInt<16>(LROffset+SPAdd)) { - BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg) + BuildMI(MBB, StackUpdateLoc, dl, LoadInst, ScratchReg) .addImm(LROffset+SPAdd) .addReg(RBReg); LoadedLR = true; @@ -1538,7 +1675,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, .addReg(TempReg, getKillRegState(i == e-1)); if (MustSaveLR) - BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg); + BuildMI(MBB, StackUpdateLoc, dl, MTLRInst).addReg(ScratchReg); // Callee pop calling convention. Pop parameter/linkage area. Used for tail // call optimization @@ -1732,6 +1869,9 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); + assert((!MF.getInfo()->mustSaveTOC() || + (Reg != PPC::X2 && Reg != PPC::R2)) && + "Not expecting to try to spill R2 in a function that must save TOC"); if (PPC::GPRCRegClass.contains(Reg) || PPC::SPE4RCRegClass.contains(Reg)) { HasGPSaveArea = true; @@ -1947,7 +2087,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, // the 16-bit immediate. We don't know the complete frame size here // because we've not yet computed callee-saved register spills or the // needed alignment padding. 
- unsigned StackSize = determineFrameLayout(MF, false, true); + unsigned StackSize = determineFrameLayout(MF, true); MachineFrameInfo &MFI = MF.getFrameInfo(); if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) || hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) { @@ -2041,6 +2181,8 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); + PPCFunctionInfo *FI = MF->getInfo(); + bool MustSaveTOC = FI->mustSaveTOC(); DebugLoc DL; bool CRSpilled = false; MachineInstrBuilder CRMIB; @@ -2071,6 +2213,10 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, continue; } + // The actual spill will happen in the prologue. + if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC) + continue; + // Insert the spill to the stack frame. if (IsCRField) { PPCFunctionInfo *FuncInfo = MF->getInfo(); @@ -2198,6 +2344,8 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); + PPCFunctionInfo *FI = MF->getInfo(); + bool MustSaveTOC = FI->mustSaveTOC(); bool CR2Spilled = false; bool CR3Spilled = false; bool CR4Spilled = false; @@ -2220,6 +2368,9 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI()) continue; + if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC) + continue; + if (Reg == PPC::CR2) { CR2Spilled = true; // The spill slot is associated only with CR2, which is the diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index 69bd1484d6e5..d116e9fd22e1 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -1,9 +1,8 @@ //===-- PPCFrameLowering.h - Define frame lowering for PowerPC --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,7 +12,6 @@ #ifndef LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H #define LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H -#include "PPC.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" @@ -73,12 +71,29 @@ class PPCFrameLowering: public TargetFrameLowering { */ void createTailCallBranchInstr(MachineBasicBlock &MBB) const; + /** + * Check if the conditions are correct to allow for the stack update + * to be moved past the CSR save/restore code. + */ + bool stackUpdateCanBeMoved(MachineFunction &MF) const; + public: PPCFrameLowering(const PPCSubtarget &STI); - unsigned determineFrameLayout(MachineFunction &MF, - bool UpdateMF = true, - bool UseEstimate = false) const; + /** + * Determine the frame layout and update the machine function. + */ + unsigned determineFrameLayoutAndUpdate(MachineFunction &MF, + bool UseEstimate = false) const; + + /** + * Determine the frame layout but do not update the machine function. + * The MachineFunction object can be const in this case as it is not + * modified. 
+ */ + unsigned determineFrameLayout(const MachineFunction &MF, + bool UseEstimate = false, + unsigned *NewMaxCallFrameSize = nullptr) const; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 5f6966cecd61..391ebcc1a143 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -1,9 +1,8 @@ //===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,9 +11,8 @@ //===----------------------------------------------------------------------===// #include "PPCHazardRecognizers.h" -#include "PPC.h" #include "PPCInstrInfo.h" -#include "PPCTargetMachine.h" +#include "PPCSubtarget.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h index 4b502147ca63..5b32147ca88d 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.h +++ b/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -1,9 +1,8 @@ //===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 31acd0ff870f..543cac075f55 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -219,13 +218,6 @@ namespace { SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl); - /// SelectAddrImm - Returns true if the address N can be represented by - /// a base register plus a signed 16-bit displacement [r+imm]. - bool SelectAddrImm(SDValue N, SDValue &Disp, - SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0); - } - /// SelectAddrImmOffs - Return true if the operand is valid for a preinc /// immediate field. Note that the operand at this point is already the /// result of a prior SelectAddressRegImm call. 
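A note on the X-form selectors added in the next hunk (an illustrative sketch, not part of the patch itself): D-, DS- and DQ-form encodings constrain the signed 16-bit displacement to be a multiple of 1, 4 and 16 respectively, which is why these selectors thread 0/4/16 through as the EncodingAlignment. For example:
// std is DS-form, so "std r4, 6(r3)" is not encodable (6 % 4 != 0);
// SelectAddrIdxX4 therefore rejects [r+imm] and the indexed X-form is used:
//   li   r5, 6
//   stdx r4, r3, r5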
@@ -239,26 +231,61 @@ return false; } - /// SelectAddrIdx - Given the specified addressed, check to see if it can be - /// represented as an indexed [r+r] operation. Returns false if it can - /// be represented by [r+imm], which are preferred. + /// SelectAddrIdx - Given the specified address, check to see if it can be + /// represented as an indexed [r+r] operation. + /// This is for xform instructions whose associated displacement form is D. + /// The last parameter \p 0 means the associated D form has no requirement + /// on the 16-bit signed displacement. + /// Returns false if it can be represented by [r+imm], which are preferred. bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) { - return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG); + return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, 0); + } + + /// SelectAddrIdxX4 - Given the specified address, check to see if it can be + /// represented as an indexed [r+r] operation. + /// This is for xform instructions whose associated displacement form is DS. + /// The last parameter \p 4 means the associated DS form's 16-bit signed + /// displacement must be a multiple of 4. + /// Returns false if it can be represented by [r+imm], which are preferred. + bool SelectAddrIdxX4(SDValue N, SDValue &Base, SDValue &Index) { + return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, 4); + } + + /// SelectAddrIdxX16 - Given the specified address, check to see if it can be + /// represented as an indexed [r+r] operation. + /// This is for xform instructions whose associated displacement form is DQ. + /// The last parameter \p 16 means the associated DQ form's 16-bit signed + /// displacement must be a multiple of 16. + /// Returns false if it can be represented by [r+imm], which are preferred. + bool SelectAddrIdxX16(SDValue N, SDValue &Base, SDValue &Index) { + return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, 16); } - /// SelectAddrIdxOnly - Given the specified addressed, force it to be + /// SelectAddrIdxOnly - Given the specified address, force it to be /// represented as an indexed [r+r] operation. bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) { return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG); } + + /// SelectAddrImm - Returns true if the address N can be represented by + /// a base register plus a signed 16-bit displacement [r+imm]. + /// The last parameter \p 0 means the D form has no requirement on the + /// 16-bit signed displacement. + bool SelectAddrImm(SDValue N, SDValue &Disp, + SDValue &Base) { + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0); + } /// SelectAddrImmX4 - Returns true if the address N can be represented by - /// a base register plus a signed 16-bit displacement that is a multiple of 4. - /// Suitable for use by STD and friends. + /// a base register plus a signed 16-bit displacement that is a multiple of + /// 4 (last parameter). Suitable for use by STD and friends. bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4); } + /// SelectAddrImmX16 - Returns true if the address N can be represented by + /// a base register plus a signed 16-bit displacement that is a multiple of + /// 16 (last parameter). Suitable for use by STXV and friends.
bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) { return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16); } @@ -412,7 +439,8 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) { if (PPCSubTarget->isTargetELF()) { GlobalBaseReg = PPC::R30; - if (M->getPICLevel() == PICLevel::SmallPIC) { + if (!PPCSubTarget->isSecurePlt() && + M->getPICLevel() == PICLevel::SmallPIC) { BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true); @@ -2373,7 +2401,7 @@ public: // Here we try to match complex bit permutations into a set of // rotate-and-shift/shift/and/or instructions, using a set of heuristics - // known to produce optimial code for common cases (like i32 byte swapping). + // known to produce optimal code for common cases (like i32 byte swapping). SDNode *Select(SDNode *N) { Memoizer.clear(); auto Result = @@ -4214,12 +4242,12 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG, // Without this setb optimization, the outer SELECT_CC will be manually // selected to SELECT_CC_I4/SELECT_CC_I8 Pseudo, then expand-isel-pseudos pass - // transforms pseduo instruction to isel instruction. When there are more than + // transforms pseudo instruction to isel instruction. When there are more than // one use for result like zext/sext, with current optimization we only see // isel is replaced by setb but can't see any significant gain. Since // setb has longer latency than original isel, we should avoid this. Another // point is that setb requires comparison always kept, it can break the - // oppotunity to get the comparison away if we have in future. + // opportunity to get the comparison away if we have in future. if (!SetOrSelCC.hasOneUse() || (!InnerIsSel && !FalseRes.hasOneUse())) return false; @@ -4354,13 +4382,23 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (trySETCC(N)) return; break; - - case PPCISD::CALL: { - const Module *M = MF->getFunction().getParent(); - + // These nodes will be transformed into GETtlsADDR32 node, which + // later becomes BL_TLS __tls_get_addr(sym@tlsgd)@PLT + case PPCISD::ADDI_TLSLD_L_ADDR: + case PPCISD::ADDI_TLSGD_L_ADDR: { + const Module *Mod = MF->getFunction().getParent(); if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 || !PPCSubTarget->isSecurePlt() || !PPCSubTarget->isTargetELF() || Mod->getPICLevel() == PICLevel::SmallPIC) break; + // Attach global base pointer on GETtlsADDR32 node in order to + // generate secure plt code for TLS symbols.
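// (Sketch of the rationale, assuming the usual secure-PLT sequence: the
// __tls_get_addr call goes through the PLT, and 32-bit ELF PIC reaches the
// PLT through the GOT pointer in R30, so forcing getGlobalBaseReg() below
// keeps that base register live for the TLS call sequence.)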
+ getGlobalBaseReg(); + } break; + case PPCISD::CALL: { + if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 || + !TM.isPositionIndependent() || !PPCSubTarget->isSecurePlt() || + !PPCSubTarget->isTargetELF()) break; SDValue Op = N->getOperand(1); @@ -5305,7 +5343,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) { SDValue V = Queue.pop_back_val(); for (const SDValue &O : V.getNode()->ops()) { - unsigned b; + unsigned b = 0; uint64_t M = 0, A = 0; SDValue OLHS, ORHS; if (O.getOpcode() == ISD::OR) { diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 39608cb74bee..24d50074860d 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1,9 +1,8 @@ //===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -45,6 +44,7 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -70,8 +70,10 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" @@ -111,6 +113,9 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); static cl::opt<bool> DisableSCO("disable-ppc-sco", cl::desc("disable sibling call optimization on ppc"), cl::Hidden); +static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32", +cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden); + static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision", cl::desc("enable quad precision float support on ppc"), cl::Hidden); @@ -119,6 +124,8 @@ STATISTIC(NumSiblingCalls, "Number of sibling calls"); static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); +static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl); + // FIXME: Remove this once the bug has been fixed! extern cl::opt<bool> ANDIGlueBug; @@ -550,7 +557,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); - setOperationAction(ISD::ABS, VT, Custom); + + // For v2i64, these are only valid with P8Vector. This is corrected after + // the loop.
+ setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + + if (Subtarget.hasVSX()) { + setOperationAction(ISD::FMAXNUM, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + } // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { @@ -635,11 +653,28 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } + if (!Subtarget.hasP8Vector()) { + setOperationAction(ISD::SMAX, MVT::v2i64, Expand); + setOperationAction(ISD::SMIN, MVT::v2i64, Expand); + setOperationAction(ISD::UMAX, MVT::v2i64, Expand); + setOperationAction(ISD::UMIN, MVT::v2i64, Expand); + } + + for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8}) + setOperationAction(ISD::ABS, VT, Custom); // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle // with merges, splats, etc. setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); + // Vector truncates to sub-word integer that fit in an Altivec/VSX register + // are cheap, so handle them before they get expanded to scalar. + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::AND , MVT::v4i32, Legal); setOperationAction(ISD::OR , MVT::v4i32, Legal); setOperationAction(ISD::XOR , MVT::v4i32, Legal); @@ -804,6 +839,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FNEG, MVT::v2f64, Legal); setOperationAction(ISD::FABS, MVT::v4f32, Legal); setOperationAction(ISD::FABS, MVT::v2f64, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal); if (Subtarget.hasDirectMove()) setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); @@ -866,6 +903,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FPOWI, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); } + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); } @@ -1060,6 +1098,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) @@ -1232,22 +1271,6 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, return Align; } -unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const { - if (Subtarget.hasSPE() && VT == MVT::f64) - return 2; - return PPCTargetLowering::getNumRegisters(Context, VT); -} - -MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const { - if (Subtarget.hasSPE() && VT == MVT::f64) - return MVT::i32; - return PPCTargetLowering::getRegisterType(Context, VT); -} - bool PPCTargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } @@ -1256,6 +1279,10 @@ bool PPCTargetLowering::hasSPE() const { return Subtarget.hasSPE(); } +bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { + return VT.isScalarInteger(); +} + const char 
*PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; @@ -1365,7 +1392,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::QBFLT: return "PPCISD::QBFLT"; case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; + case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; + case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; + case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; + case PPCISD::FP_EXTEND_LH: return "PPCISD::FP_EXTEND_LH"; } return nullptr; } @@ -2202,16 +2233,43 @@ bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } + +/// SelectAddressEVXRegReg - Given the specified address, check to see if it can +/// be represented as an indexed [r+r] operation. +bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base, + SDValue &Index, + SelectionDAG &DAG) const { + for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); + UI != E; ++UI) { + if (MemSDNode *Memop = dyn_cast(*UI)) { + if (Memop->getMemoryVT() == MVT::f64) { + Base = N.getOperand(0); + Index = N.getOperand(1); + return true; + } + } + } + return false; +} + /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it -/// can be more efficiently represented with [r+imm]. +/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is +/// non-zero and N can be represented by a base register plus a signed 16-bit +/// displacement, make a more precise judgement by checking (displacement % \p +/// EncodingAlignment). bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, - SDValue &Index, - SelectionDAG &DAG) const { + SDValue &Index, SelectionDAG &DAG, + unsigned EncodingAlignment) const { int16_t imm = 0; if (N.getOpcode() == ISD::ADD) { - if (isIntS16Immediate(N.getOperand(1), imm)) - return false; // r+i + // Is there any SPE load/store (f64), which can't handle 16bit offset? + // SPE load/store can only handle 8-bit offsets. + if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG)) + return true; + if (isIntS16Immediate(N.getOperand(1), imm) && + (!EncodingAlignment || !(imm % EncodingAlignment))) + return false; // r+i if (N.getOperand(1).getOpcode() == PPCISD::Lo) return false; // r+i @@ -2219,8 +2277,9 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, Index = N.getOperand(1); return true; } else if (N.getOpcode() == ISD::OR) { - if (isIntS16Immediate(N.getOperand(1), imm)) - return false; // r+i can fold it if we can. + if (isIntS16Immediate(N.getOperand(1), imm) && + (!EncodingAlignment || !(imm % EncodingAlignment))) + return false; // r+i can fold it if we can. // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are provably @@ -2284,22 +2343,22 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. If \p Alignment is non-zero, only accept +/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept /// displacements that are multiples of that value. 
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, - unsigned Alignment) const { + unsigned EncodingAlignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. - if (SelectAddressRegReg(N, Disp, Base, DAG)) + if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment)) return false; if (N.getOpcode() == ISD::ADD) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Alignment || (imm % Alignment) == 0)) { + (!EncodingAlignment || (imm % EncodingAlignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); @@ -2323,7 +2382,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } else if (N.getOpcode() == ISD::OR) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Alignment || (imm % Alignment) == 0)) { + (!EncodingAlignment || (imm % EncodingAlignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. @@ -2349,7 +2408,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" int16_t Imm; - if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { + if (isIntS16Immediate(CN, Imm) && + (!EncodingAlignment || (Imm % EncodingAlignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -2359,7 +2419,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && - (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { + (!EncodingAlignment || (CN->getZExtValue() % EncodingAlignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. @@ -2416,24 +2476,45 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, /// Returns true if we should use a direct load into vector instruction /// (such as lxsd or lfd), instead of a load into gpr + direct move sequence. -static bool usePartialVectorLoads(SDNode *N) { - if (!N->hasOneUse()) - return false; +static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) { // If there are any other uses other than scalar to vector, then we should // keep it as a scalar load -> direct move pattern to prevent multiple - // loads. Currently, only check for i64 since we have lxsd/lfd to do this - // efficiently, but no update equivalent. - if (LoadSDNode *LD = dyn_cast(N)) { - EVT MemVT = LD->getMemoryVT(); - if (MemVT.isSimple() && MemVT.getSimpleVT().SimpleTy == MVT::i64) { - SDNode *User = *(LD->use_begin()); - if (User->getOpcode() == ISD::SCALAR_TO_VECTOR) - return true; - } + // loads. 
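// (Illustrative summary of the predicate that follows: i64 loads are always
// eligible (lxsd/lfd); i32 needs P8Vector; i16/i8 need P9Vector; and in
// every case the load's value result may only feed SCALAR_TO_VECTOR users.)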
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(N); + if (!LD) + return false; + + EVT MemVT = LD->getMemoryVT(); + if (!MemVT.isSimple()) + return false; + switch(MemVT.getSimpleVT().SimpleTy) { + case MVT::i64: + break; + case MVT::i32: + if (!ST.hasP8Vector()) + return false; + break; + case MVT::i16: + case MVT::i8: + if (!ST.hasP9Vector()) + return false; + break; + default: + return false; } - return false; + SDValue LoadedVal(N, 0); + if (!LoadedVal.hasOneUse()) + return false; + + for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); + UI != UE; ++UI) + if (UI.getUse().get().getResNo() == 0 && + UI->getOpcode() != ISD::SCALAR_TO_VECTOR) + return false; + + return true; } /// getPreIndexedAddressParts - returns true by value, base pointer and @@ -2464,7 +2545,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, // Do not generate pre-inc forms for specific loads that feed scalar_to_vector // instructions because we can fold these into a more efficient instruction // instead, (such as LXSD). - if (isLoad && usePartialVectorLoads(N)) { + if (isLoad && usePartialVectorLoads(N, Subtarget)) { return false; } @@ -2745,7 +2826,8 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, const Module *M = DAG.getMachineFunction().getFunction().getParent(); PICLevel::Level picLevel = M->getPICLevel(); - TLSModel::Model Model = getTargetMachine().getTLSModel(GV); + const TargetMachine &TM = getTargetMachine(); + TLSModel::Model Model = TM.getTLSModel(GV); if (Model == TLSModel::LocalExec) { SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, @@ -2769,8 +2851,14 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA); - } else - GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); + } else { + if (!TM.isPositionIndependent()) + GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); + else if (picLevel == PICLevel::SmallPIC) + GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); + else + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, TGA, GOTPtr); return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); @@ -3147,101 +3235,6 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV, nextOffset)); } -#include "PPCGenCallingConv.inc" - -// Function whose sole purpose is to kill compiler warnings -// stemming from unused functions included from PPCGenCallingConv.inc. -CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { - return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; -} - -bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - return true; -} - -bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - static const MCPhysReg ArgRegs[] = { - PPC::R3, PPC::R4, PPC::R5, PPC::R6, - PPC::R7, PPC::R8, PPC::R9, PPC::R10, - }; - const unsigned NumArgRegs = array_lengthof(ArgRegs); - - unsigned RegNum = State.getFirstUnallocated(ArgRegs); - - // Skip one register if the first unallocated register has an even register - // number and there are still argument registers available which have not been - // allocated yet.
RegNum is actually an index into ArgRegs, which means we - // need to skip a register if RegNum is odd. - if (RegNum != NumArgRegs && RegNum % 2 == 1) { - State.AllocateReg(ArgRegs[RegNum]); - } - - // Always return false here, as this function only makes sure that the first - // unallocated register has an odd register number and does not actually - // allocate a register for the current argument. - return false; -} - -bool -llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - static const MCPhysReg ArgRegs[] = { - PPC::R3, PPC::R4, PPC::R5, PPC::R6, - PPC::R7, PPC::R8, PPC::R9, PPC::R10, - }; - const unsigned NumArgRegs = array_lengthof(ArgRegs); - - unsigned RegNum = State.getFirstUnallocated(ArgRegs); - int RegsLeft = NumArgRegs - RegNum; - - // Skip if there is not enough registers left for long double type (4 gpr regs - // in soft float mode) and put long double argument on the stack. - if (RegNum != NumArgRegs && RegsLeft < 4) { - for (int i = 0; i < RegsLeft; i++) { - State.AllocateReg(ArgRegs[RegNum + i]); - } - } - - return false; -} - -bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - static const MCPhysReg ArgRegs[] = { - PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, - PPC::F8 - }; - - const unsigned NumArgRegs = array_lengthof(ArgRegs); - - unsigned RegNum = State.getFirstUnallocated(ArgRegs); - - // If there is only one Floating-point register left we need to put both f64 - // values of a split ppc_fp128 value on the stack. - if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) { - State.AllocateReg(ArgRegs[RegNum]); - } - - // Always return false here, as this function only makes sure that the two f64 - // values a ppc_fp128 value is split into are both passed in registers or both - // passed on the stack and does not actually allocate a register for the - // current argument. - return false; -} - /// FPR - The set of FP registers that should be allocated for arguments, /// on Darwin. static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, @@ -3449,7 +3442,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( // Reserve space for the linkage area on the stack. unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); CCInfo.AllocateStack(LinkageSize, PtrByteSize); - if (useSoftFloat() || hasSPE()) + if (useSoftFloat()) CCInfo.PreAnalyzeFormalArguments(Ins); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); @@ -3482,7 +3475,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( if (Subtarget.hasVSX()) RC = &PPC::VSFRCRegClass; else if (Subtarget.hasSPE()) - RC = &PPC::SPERCRegClass; + // SPE passes doubles in GPR pairs. + RC = &PPC::GPRCRegClass; else RC = &PPC::F8RCRegClass; break; @@ -3506,13 +3500,26 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( break; } - // Transform the arguments stored in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, - ValVT == MVT::i1 ? MVT::i32 : ValVT); - - if (ValVT == MVT::i1) - ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); + SDValue ArgValue; + // Transform the arguments stored in physical registers into + // virtual ones. 
+ if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) { + assert(i + 1 < e && "No second half of double precision argument"); + unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC); + unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC); + SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32); + SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32); + if (!Subtarget.isLittleEndian()) + std::swap (ArgValueLo, ArgValueHi); + ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo, + ArgValueHi); + } else { + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, + ValVT == MVT::i1 ? MVT::i32 : ValVT); + if (ValVT == MVT::i1) + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); + } InVals.push_back(ArgValue); } else { @@ -4448,24 +4455,27 @@ static bool isFunctionGlobalAddress(SDValue Callee); static bool callsShareTOCBase(const Function *Caller, SDValue Callee, const TargetMachine &TM) { - // If !G, Callee can be an external symbol. - GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); - if (!G) - return false; - + // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols + // don't have enough information to determine if the caller and callee share + // the same TOC base, so we have to pessimistically assume they don't for + // correctness. + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if (!G) + return false; + + const GlobalValue *GV = G->getGlobal(); // The medium and large code models are expected to provide a sufficiently // large TOC to provide all data addressing needs of a module with a // single TOC. Since each module will be addressed with a single TOC then we // only need to check that caller and callee don't cross dso boundaries. if (CodeModel::Medium == TM.getCodeModel() || CodeModel::Large == TM.getCodeModel()) - return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal()); + return TM.shouldAssumeDSOLocal(*Caller->getParent(), GV); // Otherwise we need to ensure callee and caller are in the same section, // since the linker may allocate multiple TOCs, and we don't know which // sections will belong to the same TOC base. - const GlobalValue *GV = G->getGlobal(); if (!GV->isStrongDefinitionForLinker()) return false; @@ -4917,6 +4927,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool isAIXABI = Subtarget.isAIXABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); NodeTys.push_back(MVT::Other); // Returns a chain @@ -4943,17 +4954,18 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64; + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, + // every direct call is) turn it into a TargetGlobalAddress / + // TargetExternalSymbol node so that legalize doesn't hack it. if (isFunctionGlobalAddress(Callee)) { GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee); + // A call to a TLS address is actually an indirect call to a // thread-specific pointer. unsigned OpFlags = 0; if (UsePlt) OpFlags = PPCII::MO_PLT; - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, - // every direct call is) turn it into a TargetGlobalAddress / - // TargetExternalSymbol node so that legalize doesn't hack it.
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, Callee.getValueType(), 0, OpFlags); needIndirectCall = false; @@ -5095,17 +5107,18 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); - // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live - // into the call. - // We do need to reserve X2 to appease the verifier for the PATCHPOINT. - if (isSVR4ABI && isPPC64) { + // All calls, in the AIX ABI and 64-bit ELF ABIs, need the TOC register + // live into the call. + // We do need to reserve R2/X2 to appease the verifier for the PATCHPOINT. + if ((isSVR4ABI && isPPC64) || isAIXABI) { setUsesTOCBasePtr(DAG); - // We cannot add X2 as an operand here for PATCHPOINT, because there is no - // way to mark dependencies as implicit here. We will add the X2 dependency - // in EmitInstrWithCustomInserter. - if (!isPatchPoint) - Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is + // no way to mark dependencies as implicit here. + // We will add the R2/X2 dependency in EmitInstrWithCustomInserter. + if (!isPatchPoint) + Ops.push_back(DAG.getRegister(isPPC64 ? PPC::X2 + : PPC::R2, PtrVT)); } return CallOpc; @@ -5129,10 +5142,27 @@ SDValue PPCTargetLowering::LowerCallResult( CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Val = DAG.getCopyFromReg(Chain, dl, - VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); + SDValue Val; + + if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) { + SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + if (!Subtarget.isLittleEndian()) + std::swap (Lo, Hi); + Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi); + } else { + Val = DAG.getCopyFromReg(Chain, dl, + VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); @@ -5206,18 +5236,24 @@ SDValue PPCTargetLowering::FinishCall( } // Add a NOP immediately after the branch instruction when using the 64-bit - // SVR4 ABI. At link time, if caller and callee are in a different module and + // SVR4 or the AIX ABI. + // At link time, if caller and callee are in a different module and // thus have a different TOC, the call will be replaced with a call to a stub // function which saves the current TOC, loads the TOC of the callee and // branches to the callee. The NOP will be replaced with a load instruction // which restores the TOC of the caller from the TOC save slot of the current // stack frame. If caller and callee belong to the same module (and have the - // same TOC), the NOP will remain unchanged. + // same TOC), the NOP will remain unchanged, or become some other NOP. 
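+  // A sketch of that link-time rewrite (offsets assume the 64-bit ELF ABIs,
+  // where the TOC save slot is at 24(r1) for ELFv2 and 40(r1) for ELFv1):
+  //   bl callee      ->   bl <call stub>   ; stub saves r2, loads callee TOC
+  //   nop            ->   ld 2, 24(1)      ; restore the caller's TOC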
MachineFunction &MF = DAG.getMachineFunction(); - if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() && - !isPatchPoint) { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + if (!isTailCall && !isPatchPoint && + ((Subtarget.isSVR4ABI() && Subtarget.isPPC64()) || + Subtarget.isAIXABI())) { if (CallOpc == PPCISD::BCTRL) { + if (Subtarget.isAIXABI()) + report_fatal_error("Indirect call on AIX is not implemented."); + // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. // See PrepareCall() for more information about calls through function @@ -5229,7 +5265,6 @@ SDValue PPCTargetLowering::FinishCall( // allocated and an unnecessary move instruction being generated. CallOpc = PPCISD::BCTRL_LOAD_TOC; - EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); @@ -5245,6 +5280,19 @@ SDValue PPCTargetLowering::FinishCall( } } + if (Subtarget.isAIXABI() && isFunctionGlobalAddress(Callee)) { + // On AIX, direct function calls reference the symbol for the function's + // entry point, which is named by inserting a "." before the function's + // C-linkage name. + GlobalAddressSDNode *G = cast(Callee); + auto &Context = DAG.getMachineFunction().getMMI().getContext(); + MCSymbol *S = Context.getOrCreateSymbol(Twine(".") + + Twine(G->getGlobal()->getName())); + Callee = DAG.getMCSymbol(S, PtrVT); + // Replace the GlobalAddressSDNode Callee with the MCSymbolSDNode. + Ops[1] = Callee; + } + Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); @@ -5314,16 +5362,20 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, !isTailCall) Callee = LowerGlobalAddress(Callee, DAG); - if (Subtarget.isSVR4ABI()) { - if (Subtarget.isPPC64()) - return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, isPatchPoint, Outs, OutVals, Ins, - dl, DAG, InVals, CS); - else - return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, isPatchPoint, Outs, OutVals, Ins, - dl, DAG, InVals, CS); - } + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) + return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, isPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); + + if (Subtarget.isSVR4ABI()) + return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, isPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); + + if (Subtarget.isAIXABI()) + return LowerCall_AIX(Chain, Callee, CallConv, isVarArg, + isTailCall, isPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, @@ -5444,12 +5496,15 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( bool seenFloatArg = false; // Walk the register/memloc assignments, inserting copies/loads. 
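+    // Note for the loop below: with SPE, a single f64 argument in
+    // Outs/OutVals is split by the calling convention into two i32 register
+    // locations, so ArgLocs can run ahead of the argument list; hence the
+    // separate indices tracked here.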
- for (unsigned i = 0, j = 0, e = ArgLocs.size(); + // i - Tracks the index into the list of registers allocated for the call + // RealArgIdx - Tracks the index into the list of actual function arguments + // j - Tracks the index into the list of byval arguments + for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size(); i != e; - ++i) { + ++i, ++RealArgIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; + SDValue Arg = OutVals[RealArgIdx]; + ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags; if (Flags.isByVal()) { // Argument is an aggregate which is passed by value, thus we need to @@ -5498,7 +5553,17 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( if (VA.isRegLoc()) { seenFloatArg |= VA.getLocVT().isFloatingPoint(); // Put argument in a physical register. - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) { + bool IsLE = Subtarget.isLittleEndian(); + SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(IsLE ? 0 : 1, dl)); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0))); + SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(IsLE ? 1 : 0, dl)); + RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(), + SVal.getValue(0))); + } else + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { // Put argument in the parameter list area of the current stack frame. assert(VA.isMemLoc()); @@ -6613,6 +6678,128 @@ SDValue PPCTargetLowering::LowerCall_Darwin( NumBytes, Ins, InVals, CS); } + +SDValue PPCTargetLowering::LowerCall_AIX( + SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, + bool isTailCall, bool isPatchPoint, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl &InVals, + ImmutableCallSite CS) const { + + assert((CallConv == CallingConv::C || CallConv == CallingConv::Fast) && + "Unimplemented calling convention!"); + if (isVarArg || isPatchPoint) + report_fatal_error("This call type is unimplemented on AIX."); + + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + bool isPPC64 = PtrVT == MVT::i64; + unsigned PtrByteSize = isPPC64 ? 8 : 4; + unsigned NumOps = Outs.size(); + + + // Count how many bytes are to be pushed on the stack, including the linkage + // area, parameter list area. + // On XCOFF, we start with 24/48, which is reserved space for + // [SP][CR][LR][2 x reserved][TOC]. + unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); + + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if the callee + // is variadic. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + unsigned NumBytes = LinkageSize + 8 * PtrByteSize; + + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog + // inserter pass. + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); + SDValue CallSeqStart = Chain; + + static const MCPhysReg GPR_32[] = { // 32-bit registers. + PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10 + }; + static const MCPhysReg GPR_64[] = { // 64-bit registers. 
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10 + }; + + const unsigned NumGPRs = isPPC64 ? array_lengthof(GPR_64) + : array_lengthof(GPR_32); + const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; + unsigned GPR_idx = 0; + + SmallVector, 8> RegsToPass; + + if (isTailCall) + report_fatal_error("Handling of tail call is unimplemented!"); + int SPDiff = 0; + + for (unsigned i = 0; i != NumOps; ++i) { + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + + // Promote integers if needed. + if (Arg.getValueType() == MVT::i1 || + (isPPC64 && Arg.getValueType() == MVT::i32)) { + unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + Arg = DAG.getNode(ExtOp, dl, PtrVT, Arg); + } + + // Note: "by value" is code for passing a structure by value, not + // basic types. + if (Flags.isByVal()) + report_fatal_error("Passing structure by value is unimplemented!"); + + switch (Arg.getSimpleValueType().SimpleTy) { + default: llvm_unreachable("Unexpected ValueType for argument!"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + if (GPR_idx != NumGPRs) + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); + else + report_fatal_error("Handling of placing parameters on the stack is " + "unimplemented!"); + break; + case MVT::f32: + case MVT::f64: + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + case MVT::v2f64: + case MVT::v2i64: + case MVT::v1i128: + case MVT::f128: + case MVT::v4f64: + case MVT::v4i1: + report_fatal_error("Handling of this parameter type is unimplemented!"); + } + } + + if (!isFunctionGlobalAddress(Callee) && + !isa(Callee)) + report_fatal_error("Handling of indirect call is unimplemented!"); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (auto Reg : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InFlag); + InFlag = Chain.getValue(1); + } + + return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, + /* unused except on PPC64 ELFv1 */ false, DAG, + RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, + NumBytes, Ins, InVals, CS); +} + bool PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, @@ -6644,11 +6831,11 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector RetOps(1, Chain); // Copy the result values into the output registers. - for (unsigned i = 0; i != RVLocs.size(); ++i) { + for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = OutVals[i]; + SDValue Arg = OutVals[RealResIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); @@ -6663,8 +6850,21 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; } - - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); + if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) { + bool isLittleEndian = Subtarget.isLittleEndian(); + // Legalize ret f64 -> ret 2 x i32. + SDValue SVal = + DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(isLittleEndian ? 
                                               0 : 1, dl));
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+      SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
+                         DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl));
+      Flag = Chain.getValue(1);
+      VA = RVLocs[++i]; // skip ahead to next loc
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
+    } else
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
     Flag = Chain.getValue(1);
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }
@@ -6890,6 +7090,61 @@ SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
                      Op.getOperand(0));
 }
 
+SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
+                                               SelectionDAG &DAG) const {
+
+  // Implements a vector truncate that fits in a vector register as a shuffle.
+  // We want to legalize vector truncates down to where the source fits in
+  // a vector register (and target is therefore smaller than vector register
+  // size). At that point legalization will try to custom lower the sub-legal
+  // result and get here - where we can contain the truncate as a single target
+  // operation.
+
+  // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
+  //   <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
+  //
+  // We will implement it for big-endian ordering as this (where x denotes
+  // undefined):
+  //   < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
+  //   < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
+  //
+  // The same operation in little-endian ordering will be:
+  //   <LSB1|MSB1, LSB2|MSB2, uu, uu, uu, uu, uu, uu> to
+  //   <LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
+
+  assert(Op.getValueType().isVector() && "Vector type expected.");
+
+  SDLoc DL(Op);
+  SDValue N1 = Op.getOperand(0);
+  unsigned SrcSize = N1.getValueType().getSizeInBits();
+  assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
+  SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
+
+  EVT TrgVT = Op.getValueType();
+  unsigned TrgNumElts = TrgVT.getVectorNumElements();
+  EVT EltVT = TrgVT.getVectorElementType();
+  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
+  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
+
+  // First list the elements we want to keep.
+  unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
+  SmallVector<int, 16> ShuffV;
+  if (Subtarget.isLittleEndian())
+    for (unsigned i = 0; i < TrgNumElts; ++i)
+      ShuffV.push_back(i * SizeMult);
+  else
+    for (unsigned i = 1; i <= TrgNumElts; ++i)
+      ShuffV.push_back(i * SizeMult - 1);
+
+  // Populate the remaining elements with undefs; any index into the second,
+  // undef operand of the shuffle will do.
+  for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
+    ShuffV.push_back(WideNumElts + 1);
+
+  SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
+  return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
+}
+
 /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
 /// possible.
 SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -9604,10 +9859,63 @@ SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
     BifID = Intrinsic::ppc_altivec_vmaxsh;
   else if (VT == MVT::v16i8)
     BifID = Intrinsic::ppc_altivec_vmaxsb;
-
+
   return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
 }
 
+// Custom lowering for fpext v2f32 to v2f64
+SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+
+  assert(Op.getOpcode() == ISD::FP_EXTEND &&
+         "Should only be called for ISD::FP_EXTEND");
+
+  // We only want to custom lower an extend from v2f32 to v2f64.
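+  // Shape of the rewrites performed below, in DAG notation (LD_VSX_LH loads
+  // a v2f32 into the lower half of a VSR):
+  //   (fpext:v2f64 (load a):v2f32)
+  //     -> (FP_EXTEND_LH (LD_VSX_LH a))
+  //   (fpext:v2f64 (fadd (load a), (load b)):v2f32)
+  //     -> (FP_EXTEND_LH (fadd:v4f32 (LD_VSX_LH a), (LD_VSX_LH b)))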
+  if (Op.getValueType() != MVT::v2f64 ||
+      Op.getOperand(0).getValueType() != MVT::v2f32)
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue Op0 = Op.getOperand(0);
+
+  switch (Op0.getOpcode()) {
+  default:
+    return SDValue();
+  case ISD::FADD:
+  case ISD::FMUL:
+  case ISD::FSUB: {
+    SDValue NewLoad[2];
+    for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
+      // Ensure both inputs are loads.
+      SDValue LdOp = Op0.getOperand(i);
+      if (LdOp.getOpcode() != ISD::LOAD)
+        return SDValue();
+      // Generate new load node.
+      LoadSDNode *LD = cast<LoadSDNode>(LdOp);
+      SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+      NewLoad[i] =
+        DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
+                                DAG.getVTList(MVT::v4f32, MVT::Other),
+                                LoadOps, LD->getMemoryVT(),
+                                LD->getMemOperand());
+    }
+    SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32,
+                                NewLoad[0], NewLoad[1],
+                                Op0.getNode()->getFlags());
+    return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewOp);
+  }
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op0);
+    SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+    SDValue NewLd =
+      DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
+                              DAG.getVTList(MVT::v4f32, MVT::Other),
+                              LoadOps, LD->getMemoryVT(), LD->getMemOperand());
+    return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewLd);
+  }
+  }
+  llvm_unreachable("ERROR: Should return for all cases within switch.");
+}
+
 /// LowerOperation - Provide custom lowering hooks for some operations.
 ///
 SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -9661,6 +9969,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::MUL:                return LowerMUL(Op, DAG);
   case ISD::ABS:                return LowerABS(Op, DAG);
+  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
 
   // For counter-based loop handling.
   case ISD::INTRINSIC_W_CHAIN:  return SDValue();
@@ -9701,7 +10010,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
   }
   case ISD::INTRINSIC_W_CHAIN: {
     if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
-        Intrinsic::ppc_is_decremented_ctr_nonzero)
+        Intrinsic::loop_decrement)
       break;
 
     assert(N->getValueType(0) == MVT::i1 &&
@@ -9737,6 +10046,14 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
    return;
+  case ISD::TRUNCATE: {
+    EVT TrgVT = N->getValueType(0);
+    if (TrgVT.isVector() &&
+        isOperationCustom(N->getOpcode(), TrgVT) &&
+        N->getOperand(0).getValueType().getSizeInBits() <= 128)
+      Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
+    return;
+  }
   case ISD::BITCAST:
     // Don't handle bitcast here.
return; @@ -9822,10 +10139,10 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); - unsigned dest = MI.getOperand(0).getReg(); - unsigned ptrA = MI.getOperand(1).getReg(); - unsigned ptrB = MI.getOperand(2).getReg(); - unsigned incr = MI.getOperand(3).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -9841,7 +10158,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned TmpReg = (!BinOpcode) ? incr : + Register TmpReg = (!BinOpcode) ? incr : RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass); @@ -9949,20 +10266,20 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned PtrReg = RegInfo.createVirtualRegister(RC); - unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC); - unsigned ShiftReg = + Register PtrReg = RegInfo.createVirtualRegister(RC); + Register Shift1Reg = RegInfo.createVirtualRegister(GPRC); + Register ShiftReg = isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC); - unsigned Incr2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned MaskReg = RegInfo.createVirtualRegister(GPRC); - unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC); - unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC); - unsigned Ptr1Reg; - unsigned TmpReg = + Register Incr2Reg = RegInfo.createVirtualRegister(GPRC); + Register MaskReg = RegInfo.createVirtualRegister(GPRC); + Register Mask2Reg = RegInfo.createVirtualRegister(GPRC); + Register Mask3Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC); + Register TmpDestReg = RegInfo.createVirtualRegister(GPRC); + Register Ptr1Reg; + Register TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC); // thisMBB: @@ -10764,23 +11081,23 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned PtrReg = RegInfo.createVirtualRegister(RC); - unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC); - unsigned ShiftReg = + Register PtrReg = RegInfo.createVirtualRegister(RC); + Register Shift1Reg = RegInfo.createVirtualRegister(GPRC); + Register ShiftReg = isLittleEndian ? 
Shift1Reg : RegInfo.createVirtualRegister(GPRC); - unsigned NewVal2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned NewVal3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned OldVal2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned OldVal3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned MaskReg = RegInfo.createVirtualRegister(GPRC); - unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC); - unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC); - unsigned Ptr1Reg; - unsigned TmpReg = RegInfo.createVirtualRegister(GPRC); - unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; + Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC); + Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC); + Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC); + Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC); + Register MaskReg = RegInfo.createVirtualRegister(GPRC); + Register Mask2Reg = RegInfo.createVirtualRegister(GPRC); + Register Mask3Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC); + Register TmpDestReg = RegInfo.createVirtualRegister(GPRC); + Register Ptr1Reg; + Register TmpReg = RegInfo.createVirtualRegister(GPRC); + Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; // thisMBB: // ... // fallthrough --> loopMBB @@ -10968,7 +11285,147 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); - return BB; + BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), + MI.getOperand(0).getReg()) + .addReg(CRReg); + } else if (MI.getOpcode() == PPC::TBEGIN_RET) { + DebugLoc Dl = MI.getDebugLoc(); + unsigned Imm = MI.getOperand(1).getImm(); + BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm); + BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), + MI.getOperand(0).getReg()) + .addReg(PPC::CR0EQ); + } else if (MI.getOpcode() == PPC::SETRNDi) { + DebugLoc dl = MI.getDebugLoc(); + unsigned OldFPSCRReg = MI.getOperand(0).getReg(); + + // Save FPSCR value. + BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); + + // The floating point rounding mode is in the bits 62:63 of FPCSR, and has + // the following settings: + // 00 Round to nearest + // 01 Round to 0 + // 10 Round to +inf + // 11 Round to -inf + + // When the operand is immediate, using the two least significant bits of + // the immediate to set the bits 62:63 of FPSCR. + unsigned Mode = MI.getOperand(1).getImm(); + BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0)) + .addImm(31); + + BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0)) + .addImm(30); + } else if (MI.getOpcode() == PPC::SETRND) { + DebugLoc dl = MI.getDebugLoc(); + + // Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg + // or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg. + // If the target doesn't have DirectMove, we should use stack to do the + // conversion, because the target doesn't have the instructions like mtvsrd + // or mfvsrd to do this conversion directly. 
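+  // Rough shape of what the helper below emits (opcodes depend on the copy
+  // direction):
+  //   with direct moves:    a plain COPY (selected to mtvsrd/mfvsrd)
+  //   without direct moves: stfd $src, slot(r1); ld $dst, slot(r1)
+  //                     or  std $src, slot(r1); lfd $dst, slot(r1)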
+  auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
+    if (Subtarget.hasDirectMove()) {
+      BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)
+        .addReg(SrcReg);
+    } else {
+      // Use the stack to do the register copy.
+      unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
+      MachineRegisterInfo &RegInfo = F->getRegInfo();
+      const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);
+      if (RC == &PPC::F8RCRegClass) {
+        // Copy a register from F8RCRegClass to G8RCRegClass.
+        assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
+               "Unsupported RegClass.");
+
+        StoreOp = PPC::STFD;
+        LoadOp = PPC::LD;
+      } else {
+        // Copy a register from G8RCRegClass to F8RCRegClass.
+        assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
+               (RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
+               "Unsupported RegClass.");
+      }
+
+      MachineFrameInfo &MFI = F->getFrameInfo();
+      int FrameIdx = MFI.CreateStackObject(8, 8, false);
+
+      MachineMemOperand *MMOStore = F->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+        MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+        MFI.getObjectAlignment(FrameIdx));
+
+      // Store the SrcReg into the stack.
+      BuildMI(*BB, MI, dl, TII->get(StoreOp))
+        .addReg(SrcReg)
+        .addImm(0)
+        .addFrameIndex(FrameIdx)
+        .addMemOperand(MMOStore);
+
+      MachineMemOperand *MMOLoad = F->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+        MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+        MFI.getObjectAlignment(FrameIdx));
+
+      // Load from the stack slot where SrcReg was stored into DestReg, which
+      // completes the register-class conversion from the class of SrcReg to
+      // the class of DestReg.
+      BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)
+        .addImm(0)
+        .addFrameIndex(FrameIdx)
+        .addMemOperand(MMOLoad);
+    }
+  };
+
+  unsigned OldFPSCRReg = MI.getOperand(0).getReg();
+
+  // Save the FPSCR value.
+  BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
+
+  // When the operand is a gprc register, use its two least significant bits
+  // and the mtfsf instruction to set the bits 62:63 of FPSCR:
+  //
+  //   copy OldFPSCRTmpReg, OldFPSCRReg
+  //   (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
+  //   rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
+  //   copy NewFPSCRReg, NewFPSCRTmpReg
+  //   mtfsf 255, NewFPSCRReg
+  MachineOperand SrcOp = MI.getOperand(1);
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  unsigned OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+
+  copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
+
+  unsigned ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+  unsigned ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+
+  // The first operand of INSERT_SUBREG should be a register which has
+  // subregisters; we only care about its RegClass, so we should use an
+  // IMPLICIT_DEF register.
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);
+  BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)
+    .addReg(ImDefReg)
+    .add(SrcOp)
+    .addImm(1);
+
+  unsigned NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+  BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
+    .addReg(OldFPSCRTmpReg)
+    .addReg(ExtSrcReg)
+    .addImm(0)
+    .addImm(62);
+
+  unsigned NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+  copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
+
+  // The mask 255 means that the 32:63 bits of NewFPSCRReg are placed into the
+  // 32:63 bits of FPSCR.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)) + .addImm(255) + .addReg(NewFPSCRReg) + .addImm(0) + .addImm(0); } else { llvm_unreachable("Unexpected instr type to insert"); } @@ -11006,7 +11463,9 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); - UseOneConstNR = true; + // The Newton-Raphson computation with a single constant does not provide + // enough accuracy on some CPUs. + UseOneConstNR = !Subtarget.needsTwoConstNR(); return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); } return SDValue(); @@ -12062,9 +12521,14 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { "Should be called with a BUILD_VECTOR node"); SDLoc dl(N); + + // Return early for non byte-sized type, as they can't be consecutive. + if (!N->getValueType(0).getVectorElementType().isByteSized()) + return SDValue(); + bool InputsAreConsecutiveLoads = true; bool InputsAreReverseConsecutive = true; - unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8; + unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize(); SDValue FirstInput = N->getOperand(0); bool IsRoundOfExtLoad = false; @@ -12332,9 +12796,8 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, ConstantSDNode *Ext2Op = dyn_cast(Ext2.getOperand(1)); if (!Ext1Op || !Ext2Op) return SDValue(); - if (Ext1.getValueType() != MVT::i32 || - Ext2.getValueType() != MVT::i32) - if (Ext1.getOperand(0) != Ext2.getOperand(0)) + if (Ext1.getOperand(0).getValueType() != MVT::v4i32 || + Ext1.getOperand(0) != Ext2.getOperand(0)) return SDValue(); int FirstElem = Ext1Op->getZExtValue(); @@ -12664,6 +13127,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return combineSRA(N, DCI); case ISD::SRL: return combineSRL(N, DCI); + case ISD::MUL: + return combineMUL(N, DCI); case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. return N->getOperand(0); @@ -13246,7 +13711,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(Cond.getOperand(1))->getZExtValue() == - Intrinsic::ppc_is_decremented_ctr_nonzero) { + Intrinsic::loop_decrement) { // We now need to make the intrinsic dead (it cannot be instruction // selected). 
@@ -13272,14 +13737,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (LHS.getOpcode() == ISD::AND && LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(LHS.getOperand(0).getOperand(1))->getZExtValue() == - Intrinsic::ppc_is_decremented_ctr_nonzero && + Intrinsic::loop_decrement && isa(LHS.getOperand(1)) && !isNullConstant(LHS.getOperand(1))) LHS = LHS.getOperand(0); if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(LHS.getOperand(1))->getZExtValue() == - Intrinsic::ppc_is_decremented_ctr_nonzero && + Intrinsic::loop_decrement && isa(RHS)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Counter decrement comparison is not EQ or NE"); @@ -13355,9 +13820,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); - case ISD::ABS: + case ISD::ABS: return combineABS(N, DCI); - case ISD::VSELECT: + case ISD::VSELECT: return combineVSelect(N, DCI); } @@ -13453,6 +13918,15 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { if (!ML) break; + if (!DisableInnermostLoopAlign32) { + // If the nested loop is an innermost loop, prefer to a 32-byte alignment, + // so that we can decrease cache misses and branch-prediction misses. + // Actual alignment of the loop will depend on the hotness check and other + // logic in alignBlocks. + if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty()) + return 5; + } + const PPCInstrInfo *TII = Subtarget.getInstrInfo(); // For small loops (between 5 and 8 instructions), align to a 32-byte @@ -13502,7 +13976,7 @@ PPCTargetLowering::getConstraintType(StringRef Constraint) const { return C_RegisterClass; } else if (Constraint == "wa" || Constraint == "wd" || Constraint == "wf" || Constraint == "ws" || - Constraint == "wi") { + Constraint == "wi" || Constraint == "ww") { return C_RegisterClass; // VSX registers. } return TargetLowering::getConstraintType(Constraint); @@ -13530,10 +14004,12 @@ PPCTargetLowering::getSingleConstraintMatchWeight( StringRef(constraint) == "wf") && type->isVectorTy()) return CW_Register; - else if (StringRef(constraint) == "ws" && type->isDoubleTy()) - return CW_Register; else if (StringRef(constraint) == "wi" && type->isIntegerTy(64)) return CW_Register; // just hold 64-bit integers data. + else if (StringRef(constraint) == "ws" && type->isDoubleTy()) + return CW_Register; + else if (StringRef(constraint) == "ww" && type->isFloatTy()) + return CW_Register; switch (*constraint) { default: @@ -13619,7 +14095,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Constraint == "wf" || Constraint == "wi") && Subtarget.hasVSX()) { return std::make_pair(0U, &PPC::VSRCRegClass); - } else if (Constraint == "ws" && Subtarget.hasVSX()) { + } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) { if (VT == MVT::f32 && Subtarget.hasP8Vector()) return std::make_pair(0U, &PPC::VSSRCRegClass); else @@ -13865,7 +14341,7 @@ bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const { if (CModel == CodeModel::Small || CModel == CodeModel::Large) return true; - // JumpTable and BlockAddress are accessed as got-indirect. + // JumpTable and BlockAddress are accessed as got-indirect. if (isa(GA) || isa(GA)) return true; @@ -14082,18 +14558,16 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. 
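/// For example (per the QPX case in the body below), a QPX subtarget may
/// return MVT::v4f64 for a 32-byte-aligned copy of at least 32 bytes
/// (64 for memset), leaving other cases to the generic logic.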
-EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
-                                           unsigned DstAlign, unsigned SrcAlign,
-                                           bool IsMemset, bool ZeroMemset,
-                                           bool MemcpyStrSrc,
-                                           MachineFunction &MF) const {
+EVT PPCTargetLowering::getOptimalMemOpType(
+    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+    bool ZeroMemset, bool MemcpyStrSrc,
+    const AttributeList &FuncAttributes) const {
   if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
-    const Function &F = MF.getFunction();
     // When expanding a memset, require at least two QPX instructions to cover
     // the cost of loading the value to be stored from the constant pool.
     if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
        (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
-        !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+        !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
       return MVT::v4f64;
     }
 
@@ -14178,6 +14652,7 @@ bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
 
 bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                        unsigned,
                                                        unsigned,
+                                                       MachineMemOperand::Flags,
                                                        bool *Fast) const {
   if (DisablePPCUnaligned)
     return false;
@@ -14324,7 +14799,7 @@ void PPCTargetLowering::insertCopiesSplitCSR(
       BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
           .addReg(*I);
 
-  // Insert the copy-back instructions right before the terminator
+  // Insert the copy-back instructions right before the terminator.
   for (auto *Exit : Exits)
     BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
             TII->get(TargetOpcode::COPY), *I)
@@ -14345,7 +14820,8 @@ void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
   return TargetLowering::insertSSPDeclarations(M);
 }
 
-bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+                                     bool ForCodeSize) const {
   if (!VT.isSimple() || !Subtarget.hasVSX())
     return false;
 
@@ -14585,6 +15061,89 @@ SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
   return SDValue();
 }
 
+SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
+  if (!ConstOpOrElement)
+    return SDValue();
+
+  // An imul is usually smaller than the alternative sequence for a legal type.
+  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+      isOperationLegal(ISD::MUL, N->getValueType(0)))
+    return SDValue();
+
+  auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
+    switch (this->Subtarget.getDarwinDirective()) {
+    default:
+      // TODO: enhance the condition for subtargets before pwr8
+      return false;
+    case PPC::DIR_PWR8:
+      //  type        mul     add    shl
+      //  scalar       4       1      1
+      //  vector       7       2      2
+      return true;
+    case PPC::DIR_PWR9:
+      //  type        mul     add    shl
+      //  scalar       5       2      2
+      //  vector       7       2      2
+
+      // The cycle counts of the related operations are shown in the tables
+      // above: mul is 5 (scalar) / 7 (vector), while add/sub/shl are all 2 for
+      // both scalar and vector types. For two-instruction patterns,
+      // add/sub + shl costs 4, so the transform is always profitable; but for
+      // three-instruction patterns such as
+      // (mul x, -(2^N + 1)) => -(add (shl x, N), x), sub + add + shl costs 6,
+      // so we should only do it for vector types.
+      return IsAddOne && IsNeg ?
VT.isVector() : true; + } + }; + + EVT VT = N->getValueType(0); + SDLoc DL(N); + + const APInt &MulAmt = ConstOpOrElement->getAPIntValue(); + bool IsNeg = MulAmt.isNegative(); + APInt MulAmtAbs = MulAmt.abs(); + + if ((MulAmtAbs - 1).isPowerOf2()) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + // (mul x, -(2^N + 1)) => -(add (shl x, N), x) + + if (!IsProfitable(IsNeg, true, VT)) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT)); + SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); + + if (!IsNeg) + return Res; + + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res); + } else if ((MulAmtAbs + 1).isPowerOf2()) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) + + if (!IsProfitable(IsNeg, false, VT)) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT)); + + if (!IsNeg) + return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0); + else + return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1); + + } else { + return SDValue(); + } +} + bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { // Only duplicate to increase tail-calls for the 64bit SysV ABIs. if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 30acd60eba6f..97422c6eda36 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -1,9 +1,8 @@ //===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,7 +14,6 @@ #ifndef LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H #define LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H -#include "PPC.h" #include "PPCInstrInfo.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" @@ -41,7 +39,7 @@ namespace llvm { // the enum. The order of elements in this enum matters! // Values that are added after this entry: // STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE - // are considerd memory opcodes and are treated differently than entries + // are considered memory opcodes and are treated differently than entries // that come before it. For example, ADD or MUL should be placed before // the ISD::FIRST_TARGET_MEMORY_OPCODE while a LOAD or STORE should come // after it. @@ -161,7 +159,7 @@ namespace llvm { /// CALL - A direct function call. /// CALL_NOP is a call with the special NOP which follows 64-bit - /// SVR4 calls. + /// SVR4 calls and 32-bit/64-bit AIX calls. CALL, CALL_NOP, /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a @@ -193,9 +191,18 @@ namespace llvm { /// Direct move from a GPR to a VSX register (zero) MTVSRZ, - /// Direct move of 2 consective GPR to a VSX register. + /// Direct move of 2 consecutive GPR to a VSX register. 
      BUILD_FP128,
 
+      /// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and
+      /// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is
+      /// unsupported for this target.
+      /// Merge 2 GPRs to a single SPE register.
+      BUILD_SPE64,
+
+      /// Extract SPE register component, second argument is high or low.
+      EXTRACT_SPE,
+
       /// Extract a subvector from signed integer vector and convert to FP.
       /// It is primarily used to convert a (widened) illegal integer vector
       /// type to a legal floating point vector type.
@@ -265,11 +272,11 @@ namespace llvm {
       CR6UNSET,
 
       /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
-      /// on PPC32.
+      /// for non-position independent code on PPC32.
       PPC32_GOT,
 
       /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
-      /// local dynamic TLS on PPC32.
+      /// local dynamic TLS and position independent code on PPC32.
      PPC32_PICGOT,
 
       /// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec
@@ -405,6 +412,9 @@ namespace llvm {
       /// representation.
       QBFLT,
 
+      /// Custom extend v4f32 to v2f64.
+      FP_EXTEND_LH,
+
       /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
       /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
       /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
@@ -446,6 +456,10 @@ namespace llvm {
       /// an xxswapd.
       LXVD2X,
 
+      /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a
+      /// v2f32 value into the lower half of a VSR register.
+      LD_VSX_LH,
+
       /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
       /// Maps directly to an stxvd2x instruction that will be preceded by
       /// an xxswapd.
@@ -620,6 +634,8 @@ namespace llvm {
       return true;
     }
 
+    bool preferIncOfAddToSubOfNot(EVT VT) const override;
+
     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
       return VT.isScalarInteger();
     }
@@ -653,18 +669,27 @@ namespace llvm {
                                    ISD::MemIndexedMode &AM,
                                    SelectionDAG &DAG) const override;
 
+    /// SelectAddressEVXRegReg - Given the specified address, check to see if
+    /// it can be more efficiently represented as [r+imm].
+    bool SelectAddressEVXRegReg(SDValue N, SDValue &Base, SDValue &Index,
+                                SelectionDAG &DAG) const;
+
     /// SelectAddressRegReg - Given the specified address, check to see if it
-    /// can be represented as an indexed [r+r] operation. Returns false if it
-    /// can be more efficiently represented with [r+imm].
+    /// can be more efficiently represented as [r+imm]. If \p EncodingAlignment
+    /// is non-zero, only accept displacements which are not suitable for
+    /// [r+imm]. Returns false if it can be represented by [r+imm], which is
+    /// preferred.
     bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
-                             SelectionDAG &DAG) const;
+                             SelectionDAG &DAG,
+                             unsigned EncodingAlignment = 0) const;
 
     /// SelectAddressRegImm - Returns true if the address N can be represented
     /// by a base register plus a signed 16-bit displacement [r+imm], and if it
-    /// is not better represented as reg+reg. If Aligned is true, only accept
-    /// displacements suitable for STD and friends, i.e. multiples of 4.
+    /// is not better represented as reg+reg. If \p EncodingAlignment is
+    /// non-zero, only accept displacements suitable for instruction encoding
+    /// requirements, i.e. multiples of 4 for DS form.
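+    /// For example, with \p EncodingAlignment == 4, a displacement of 12 can
+    /// be selected as [r+12] (DS-form encodable), while a displacement of 10
+    /// cannot, and is left to the reg+reg path.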
bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, - SelectionDAG &DAG, unsigned Alignment) const; + SelectionDAG &DAG, + unsigned EncodingAlignment) const; /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. @@ -833,14 +858,14 @@ namespace llvm { EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; + const AttributeList &FuncAttributes) const override; /// Is unaligned memory access allowed for the given type, and is it fast /// relative to software emulation. - bool allowsMisalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - unsigned Align = 1, - bool *Fast = nullptr) const override; + bool allowsMisalignedMemoryAccesses( + EVT VT, unsigned AddrSpace, unsigned Align = 1, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *Fast = nullptr) const override; /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be @@ -888,7 +913,8 @@ namespace llvm { bool useLoadStackGuardNode() const override; void insertSSPDeclarations(Module &M) const override; - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; unsigned getJumpTableEncoding() const override; bool isJumpTableRelative() const override; @@ -898,14 +924,6 @@ namespace llvm { unsigned JTI, MCContext &Ctx) const override; - unsigned getNumRegistersForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const override; - - MVT getRegisterTypeForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const override; - private: struct ReuseLoadInfo { SDValue Ptr; @@ -953,6 +971,8 @@ namespace llvm { SDValue LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const; + SDValue LowerTRUNCATEVector(SDValue Op, SelectionDAG &DAG) const; + SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const; SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const; @@ -1019,6 +1039,7 @@ namespace llvm { SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; @@ -1106,6 +1127,15 @@ namespace llvm { const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite CS) const; + SDValue LowerCall_AIX(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool isTailCall, bool isPatchPoint, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, + const SDLoc &dl, SelectionDAG &DAG, + SmallVectorImpl &InVals, + ImmutableCallSite CS) const; SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; @@ -1119,6 +1149,7 @@ namespace llvm { SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineTRUNCATE(SDNode *N, 
DAGCombinerInfo &DCI) const; SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const; @@ -1137,8 +1168,6 @@ namespace llvm { int &RefinementSteps) const override; unsigned combineRepeatedFPDivisors() const override; - CCAssignFn *useFastISelCCs(unsigned Flag) const; - SDValue combineElementTruncationToVectorTruncation(SDNode *N, DAGCombinerInfo &DCI) const; @@ -1169,30 +1198,6 @@ namespace llvm { } // end namespace PPC - bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); - - bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); - - bool - CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); - - bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); - bool isIntS16Immediate(SDNode *N, int16_t &Imm); bool isIntS16Immediate(SDValue Op, int16_t &Imm); diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 2ce6ad3293eb..d598567f8e4e 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1,9 +1,8 @@ //===-- PPCInstr64Bit.td - The PowerPC 64-bit Support ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -168,7 +167,7 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs), (ins memrix:$src), "bctrl\n\tld 2, $src", IIC_BrB, - [(PPCbctrl_load_toc ixaddr:$src)]>, + [(PPCbctrl_load_toc iaddrX4:$src)]>, Requires<[In64BitMode]>; } @@ -193,6 +192,12 @@ def : Pat<(PPCcall (i64 texternalsym:$dst)), def : Pat<(PPCcall_nop (i64 texternalsym:$dst)), (BL8_NOP texternalsym:$dst)>; +// Calls for AIX +def : Pat<(PPCcall (i64 mcsym:$dst)), + (BL8 mcsym:$dst)>; +def : Pat<(PPCcall_nop (i64 mcsym:$dst)), + (BL8_NOP mcsym:$dst)>; + // Atomic operations // FIXME: some of these might be used with constant operands. This will result // in constant materialization instructions that may be redundant. 
We currently @@ -383,7 +388,7 @@ def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS), PPC970_DGroup_First, PPC970_Unit_FXU; } let hasSideEffects = 1, Defs = [CTR8] in { -let Pattern = [(int_ppc_mtctr i64:$rS)] in +let Pattern = [(int_set_loop_iterations i64:$rS)] in def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS), "mtctr $rS", IIC_SprMTSPR>, PPC970_DGroup_First, PPC970_Unit_FXU; @@ -720,10 +725,17 @@ defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), "sradi", "$rA, $rS, $SH", IIC_IntRotateDI, [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64; -defm EXTSWSLI : XSForm_1r<31, 445, (outs g8rc:$rA), (ins gprc:$rS, u6imm:$SH), - "extswsli", "$rA, $rS, $SH", IIC_IntRotateDI, - [(set i64:$rA, (PPCextswsli i32:$rS, (i32 imm:$SH)))]>, - isPPC64, Requires<[IsISA3_0]>; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm EXTSWSLI_32_64 : XSForm_1r<31, 445, (outs g8rc:$rA), + (ins gprc:$rS, u6imm:$SH), + "extswsli", "$rA, $rS, $SH", IIC_IntRotateDI, + [(set i64:$rA, + (PPCextswsli i32:$rS, (i32 imm:$SH)))]>, + isPPC64, Requires<[IsISA3_0]>; + +defm EXTSWSLI : XSForm_1rc<31, 445, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), + "extswsli", "$rA, $rS, $SH", IIC_IntRotateDI, + []>, isPPC64, Requires<[IsISA3_0]>; // For fast-isel: let isCodeGenOnly = 1, Defs = [CARRY] in @@ -773,13 +785,21 @@ def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), let Predicates = [IsISA3_0] in { def MADDHD : VAForm_1a<48, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), "maddhd $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; -def MADDHDU : VAForm_1a<49, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), +def MADDHDU : VAForm_1a<49, + (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), "maddhdu $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; -def MADDLD : VAForm_1a<51, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), - "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; +def MADDLD : VAForm_1a<51, (outs gprc :$RT), (ins gprc:$RA, gprc:$RB, gprc:$RC), + "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, + [(set i32:$RT, (add_without_simm16 (mul_without_simm16 i32:$RA, i32:$RB), i32:$RC))]>, + isPPC64; def SETB : XForm_44<31, 128, (outs gprc:$RT), (ins crrc:$BFA), "setb $RT, $BFA", IIC_IntGeneral>, isPPC64; let Interpretation64Bit = 1, isCodeGenOnly = 1 in { + def MADDLD8 : VAForm_1a<51, + (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), + "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, + [(set i64:$RT, (add_without_simm16 (mul_without_simm16 i64:$RA, i64:$RB), i64:$RC))]>, + isPPC64; def SETB8 : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA), "setb $RT, $BFA", IIC_IntGeneral>, isPPC64; } @@ -911,7 +931,7 @@ def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src), def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src), "lwa $rD, $src", IIC_LdStLWA, [(set i64:$rD, - (aligned4sextloadi32 ixaddr:$src))]>, isPPC64, + (aligned4sextloadi32 iaddrX4:$src))]>, isPPC64, PPC970_DGroup_Cracked; let Interpretation64Bit = 1, isCodeGenOnly = 1 in def LHAX8: XForm_1_memOp<31, 343, (outs g8rc:$rD), (ins memrr:$src), @@ -920,7 +940,7 @@ def LHAX8: XForm_1_memOp<31, 343, (outs g8rc:$rD), (ins memrr:$src), PPC970_DGroup_Cracked; def LWAX : XForm_1_memOp<31, 341, (outs g8rc:$rD), (ins memrr:$src), "lwax $rD, $src", IIC_LdStLHA, - [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64, + [(set i64:$rD, (sextloadi32 xaddrX4:$src))]>, isPPC64, PPC970_DGroup_Cracked; // For fast-isel: let isCodeGenOnly = 1, mayLoad = 1 in { @@ -1022,7 +1042,7 @@ def 
LWZUX8 : XForm_1_memOp<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), let PPC970_Unit = 2 in { def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src), "ld $rD, $src", IIC_LdStLD, - [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64; + [(set i64:$rD, (aligned4load iaddrX4:$src))]>, isPPC64; // The following four definitions are selected for small code model only. // Otherwise, we need to create two instructions to form a 32-bit offset, // so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select(). @@ -1045,7 +1065,7 @@ def LDtocBA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), def LDX : XForm_1_memOp<31, 21, (outs g8rc:$rD), (ins memrr:$src), "ldx $rD, $src", IIC_LdStLD, - [(set i64:$rD, (load xaddr:$src))]>, isPPC64; + [(set i64:$rD, (load xaddrX4:$src))]>, isPPC64; def LDBRX : XForm_1_memOp<31, 532, (outs g8rc:$rD), (ins memrr:$src), "ldbrx $rD, $src", IIC_LdStLoad, [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64; @@ -1214,10 +1234,10 @@ def STWX8 : XForm_8_memOp<31, 151, (outs), (ins g8rc:$rS, memrr:$dst), // Normal 8-byte stores. def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst), "std $rS, $dst", IIC_LdStSTD, - [(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64; + [(aligned4store i64:$rS, iaddrX4:$dst)]>, isPPC64; def STDX : XForm_8_memOp<31, 149, (outs), (ins g8rc:$rS, memrr:$dst), "stdx $rS, $dst", IIC_LdStSTD, - [(store i64:$rS, xaddr:$dst)]>, isPPC64, + [(store i64:$rS, xaddrX4:$dst)]>, isPPC64, PPC970_DGroup_Cracked; def STDBRX: XForm_8_memOp<31, 660, (outs), (ins g8rc:$rS, memrr:$dst), "stdbrx $rS, $dst", IIC_LdStStore, @@ -1433,11 +1453,11 @@ def : Pat<(unaligned4store i64:$rS, xoaddr:$dst), (STDX $rS, xoaddr:$dst)>; // 64-bits atomic loads and stores -def : Pat<(atomic_load_64 ixaddr:$src), (LD memrix:$src)>; -def : Pat<(atomic_load_64 xaddr:$src), (LDX memrr:$src)>; +def : Pat<(atomic_load_64 iaddrX4:$src), (LD memrix:$src)>; +def : Pat<(atomic_load_64 xaddrX4:$src), (LDX memrr:$src)>; -def : Pat<(atomic_store_64 ixaddr:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>; -def : Pat<(atomic_store_64 xaddr:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>; +def : Pat<(atomic_store_64 iaddrX4:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>; +def : Pat<(atomic_store_64 xaddrX4:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>; let Predicates = [IsISA3_0] in { diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 69b19e45c3e9..8176c5120a83 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -1,9 +1,8 @@ //===-- PPCInstrAltivec.td - The PowerPC Altivec Extension -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -822,7 +821,9 @@ def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>; def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>; def VCMPGTUWo : VCMPo<646, "vcmpgtuw. 
$vD, $vA, $vB", v4i32>; -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1, + isReMaterializable = 1 in { + def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins), "vxor $vD, $vD, $vD", IIC_VecFP, [(set v16i8:$vD, (v16i8 immAllZerosV))]>; @@ -899,6 +900,32 @@ def : Pat<(v1i128 (bitconvert (v4i32 VRRC:$src))), (v1i128 VRRC:$src)>; def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>; def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>; +// Max/Min +def : Pat<(v16i8 (umax v16i8:$src1, v16i8:$src2)), + (v16i8 (VMAXUB $src1, $src2))>; +def : Pat<(v16i8 (smax v16i8:$src1, v16i8:$src2)), + (v16i8 (VMAXSB $src1, $src2))>; +def : Pat<(v8i16 (umax v8i16:$src1, v8i16:$src2)), + (v8i16 (VMAXUH $src1, $src2))>; +def : Pat<(v8i16 (smax v8i16:$src1, v8i16:$src2)), + (v8i16 (VMAXSH $src1, $src2))>; +def : Pat<(v4i32 (umax v4i32:$src1, v4i32:$src2)), + (v4i32 (VMAXUW $src1, $src2))>; +def : Pat<(v4i32 (smax v4i32:$src1, v4i32:$src2)), + (v4i32 (VMAXSW $src1, $src2))>; +def : Pat<(v16i8 (umin v16i8:$src1, v16i8:$src2)), + (v16i8 (VMINUB $src1, $src2))>; +def : Pat<(v16i8 (smin v16i8:$src1, v16i8:$src2)), + (v16i8 (VMINSB $src1, $src2))>; +def : Pat<(v8i16 (umin v8i16:$src1, v8i16:$src2)), + (v8i16 (VMINUH $src1, $src2))>; +def : Pat<(v8i16 (smin v8i16:$src1, v8i16:$src2)), + (v8i16 (VMINSH $src1, $src2))>; +def : Pat<(v4i32 (umin v4i32:$src1, v4i32:$src2)), + (v4i32 (VMINUW $src1, $src2))>; +def : Pat<(v4i32 (smin v4i32:$src1, v4i32:$src2)), + (v4i32 (VMINSW $src1, $src2))>; + // Shuffles. // Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x) diff --git a/lib/Target/PowerPC/PPCInstrBuilder.h b/lib/Target/PowerPC/PPCInstrBuilder.h index cf71b1c59869..323f7e39adf7 100644 --- a/lib/Target/PowerPC/PPCInstrBuilder.h +++ b/lib/Target/PowerPC/PPCInstrBuilder.h @@ -1,9 +1,8 @@ //===-- PPCInstrBuilder.h - Aides for building PPC insts --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index 2fe765dd99e1..a48eb1690695 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -1,9 +1,8 @@ //===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -38,14 +37,6 @@ class I opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin> let TSFlags{2} = PPC970_Cracked; let TSFlags{5-3} = PPC970_Unit; - /// Indicate that the VSX instruction is to use VSX numbering/encoding. - /// Since ISA 3.0, there are scalar instructions that use the upper - /// half of the VSX register set only. 
Rather than adding further complexity - /// to the register class set, the VSX registers just include the Altivec - /// registers and this flag decides the numbering to be used for them. - bits<1> UseVSXReg = 0; - let TSFlags{6} = UseVSXReg; - // Indicate that this instruction is of type X-Form Load or Store bits<1> XFormMemOp = 0; let TSFlags{7} = XFormMemOp; @@ -74,7 +65,6 @@ class PPC970_Unit_VALU { bits<3> PPC970_Unit = 5; } class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; } class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; } -class UseVSXReg { bits<1> UseVSXReg = 1; } class XFormMemOp { bits<1> XFormMemOp = 1; } // Two joined instructions; used to emit two adjacent instructions as one. @@ -730,6 +720,7 @@ class XForm_25_memOp opcode, bits<10> xo, dag OOL, dag IOL, : XForm_base_r3xo_memOp { } +// [PO RT /// RB XO RC] class XForm_26 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : XForm_base_r3xo { @@ -1193,9 +1184,9 @@ class XX2_RD6_DCMX7_RS6 opcode, bits<4> xo1, bits<3> xo2, let Inst{11-15} = DCMX{4-0}; let Inst{16-20} = XB{4-0}; let Inst{21-24} = xo1; - let Inst{25} = DCMX{5}; + let Inst{25} = DCMX{6}; let Inst{26-28} = xo2; - let Inst{29} = DCMX{6}; + let Inst{29} = DCMX{5}; let Inst{30} = XB{5}; let Inst{31} = XT{5}; } diff --git a/lib/Target/PowerPC/PPCInstrHTM.td b/lib/Target/PowerPC/PPCInstrHTM.td index 0efe797c765d..104b57a70a2e 100644 --- a/lib/Target/PowerPC/PPCInstrHTM.td +++ b/lib/Target/PowerPC/PPCInstrHTM.td @@ -1,9 +1,8 @@ //===-- PPCInstrHTM.td - The PowerPC Hardware Transactional Memory -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -21,55 +20,53 @@ def HTM_get_imm : SDNodeXForm; let hasSideEffects = 1 in { -def TCHECK_RET : PPCCustomInserterPseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>; +def TCHECK_RET : PPCCustomInserterPseudo<(outs gprc:$out), (ins), "#TCHECK_RET", []>; +def TBEGIN_RET : PPCCustomInserterPseudo<(outs gprc:$out), (ins u1imm:$R), "#TBEGIN_RET", []>; } let Predicates = [HasHTM] in { +let Defs = [CR0] in { def TBEGIN : XForm_htm0 <31, 654, - (outs crrc0:$ret), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR, []>; + (outs), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR, []>; def TEND : XForm_htm1 <31, 686, - (outs crrc0:$ret), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR, []>; + (outs), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR, []>; def TABORT : XForm_base_r3xo <31, 910, - (outs crrc0:$ret), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR, + (outs), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR, []>, isDOT { let RST = 0; let B = 0; } def TABORTWC : XForm_base_r3xo <31, 782, - (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B), + (outs), (ins u5imm:$RTS, gprc:$A, gprc:$B), "tabortwc. $RTS, $A, $B", IIC_SprMTSPR, []>, isDOT; def TABORTWCI : XForm_base_r3xo <31, 846, - (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B), + (outs), (ins u5imm:$RTS, gprc:$A, u5imm:$B), "tabortwci. $RTS, $A, $B", IIC_SprMTSPR, []>, isDOT; def TABORTDC : XForm_base_r3xo <31, 814, - (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B), + (outs), (ins u5imm:$RTS, gprc:$A, gprc:$B), "tabortdc. 
$RTS, $A, $B", IIC_SprMTSPR, []>, isDOT; def TABORTDCI : XForm_base_r3xo <31, 878, - (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B), + (outs), (ins u5imm:$RTS, gprc:$A, u5imm:$B), "tabortdci. $RTS, $A, $B", IIC_SprMTSPR, []>, isDOT; def TSR : XForm_htm2 <31, 750, - (outs crrc0:$ret), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>, + (outs), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>, isDOT; -def TCHECK : XForm_htm3 <31, 718, - (outs), (ins crrc:$BF), "tcheck $BF", IIC_SprMTSPR, []>; - - def TRECLAIM : XForm_base_r3xo <31, 942, - (outs crrc:$ret), (ins gprc:$A), "treclaim. $A", + (outs), (ins gprc:$A), "treclaim. $A", IIC_SprMTSPR, []>, isDOT { let RST = 0; @@ -77,13 +74,17 @@ def TRECLAIM : XForm_base_r3xo <31, 942, } def TRECHKPT : XForm_base_r3xo <31, 1006, - (outs crrc:$ret), (ins), "trechkpt.", IIC_SprMTSPR, []>, + (outs), (ins), "trechkpt.", IIC_SprMTSPR, []>, isDOT { let RST = 0; let A = 0; let B = 0; } +} + +def TCHECK : XForm_htm3 <31, 718, + (outs crrc:$BF), (ins), "tcheck $BF", IIC_SprMTSPR, []>; // Builtins // All HTM instructions, with the exception of tcheck, set CR0 with the @@ -94,15 +95,11 @@ def TRECHKPT : XForm_base_r3xo <31, 1006, // tbegin builtin API which defines a return value of 1 as success. def : Pat<(int_ppc_tbegin i32:$R), - (XORI - (EXTRACT_SUBREG ( - TBEGIN (HTM_get_imm imm:$R)), sub_eq), - 1)>; + (XORI (TBEGIN_RET(HTM_get_imm imm:$R)), 1)>; def : Pat<(int_ppc_tend i32:$R), (TEND (HTM_get_imm imm:$R))>; - def : Pat<(int_ppc_tabort i32:$R), (TABORT $R)>; @@ -167,6 +164,8 @@ def : Pat<(int_ppc_tsuspend), (TSR 0)>; def : Pat<(i64 (int_ppc_ttest)), - (RLDICL (i64 (COPY (TABORTWCI 0, ZERO, 0))), 36, 28)>; + (RLDICL (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (TABORTWCI 0, (LI 0), 0), sub_32)), + 36, 28)>; } // [HasHTM] diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index d754ce2990d2..a787bdd56b9d 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- PPCInstrInfo.cpp - PowerPC Instruction Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -333,6 +332,17 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::ADDIStocHA: case PPC::ADDItocL: case PPC::LOAD_STACK_GUARD: + case PPC::XXLXORz: + case PPC::XXLXORspz: + case PPC::XXLXORdpz: + case PPC::V_SET0B: + case PPC::V_SET0H: + case PPC::V_SET0: + case PPC::V_SETALLONESB: + case PPC::V_SETALLONESH: + case PPC::V_SETALLONES: + case PPC::CRSET: + case PPC::CRUNSET: return true; } return false; @@ -381,9 +391,9 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // Swap op1/op2 assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) && "Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMIo."); - unsigned Reg0 = MI.getOperand(0).getReg(); - unsigned Reg1 = MI.getOperand(1).getReg(); - unsigned Reg2 = MI.getOperand(2).getReg(); + Register Reg0 = MI.getOperand(0).getReg(); + Register Reg1 = MI.getOperand(1).getReg(); + Register Reg2 = MI.getOperand(2).getReg(); unsigned SubReg1 = MI.getOperand(1).getSubReg(); unsigned SubReg2 = MI.getOperand(2).getSubReg(); bool Reg1IsKill = MI.getOperand(1).isKill(); @@ -411,7 +421,7 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, if (NewMI) { // Create a new instruction. - unsigned Reg0 = ChangeReg0 ? Reg2 : MI.getOperand(0).getReg(); + Register Reg0 = ChangeReg0 ? Reg2 : MI.getOperand(0).getReg(); bool Reg0IsDead = MI.getOperand(0).isDead(); return BuildMI(MF, MI.getDebugLoc(), MI.getDesc()) .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead)) @@ -942,12 +952,16 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } else if (PPC::G8RCRegClass.contains(SrcReg) && PPC::VSFRCRegClass.contains(DestReg)) { + assert(Subtarget.hasDirectMove() && + "Subtarget doesn't support directmove, don't know how to copy."); BuildMI(MBB, I, DL, get(PPC::MTVSRD), DestReg).addReg(SrcReg); NumGPRtoVSRSpill++; getKillRegState(KillSrc); return; } else if (PPC::VSFRCRegClass.contains(SrcReg) && PPC::G8RCRegClass.contains(DestReg)) { + assert(Subtarget.hasDirectMove() && + "Subtarget doesn't support directmove, don't know how to copy."); BuildMI(MBB, I, DL, get(PPC::MFVSRD), DestReg).addReg(SrcReg); getKillRegState(KillSrc); return; @@ -963,7 +977,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - unsigned Opc; if (PPC::GPRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::OR; @@ -996,6 +1009,8 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opc = PPC::QVFMRb; else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::CROR; + else if (PPC::SPE4RCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::OR; else if (PPC::SPERCRegClass.contains(DestReg, SrcReg)) Opc = PPC::EVOR; else @@ -1066,6 +1081,10 @@ unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float8Spill; } else if (PPC::F4RCRegClass.contains(Reg)) { OpcodeIndex = SOK_Float4Spill; + } else if (PPC::SPERCRegClass.contains(Reg)) { + OpcodeIndex = SOK_SPESpill; + } else if (PPC::SPE4RCRegClass.contains(Reg)) { + OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.contains(Reg)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.contains(Reg)) { @@ -1152,6 +1171,10 @@ PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float8Spill; } else if (PPC::F4RCRegClass.contains(Reg)) { OpcodeIndex = SOK_Float4Spill; + } else if 
(PPC::SPERCRegClass.contains(Reg)) { + OpcodeIndex = SOK_SPESpill; + } else if (PPC::SPE4RCRegClass.contains(Reg)) { + OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.contains(Reg)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.contains(Reg)) { @@ -1632,6 +1655,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD) return false; + const TargetRegisterInfo *TRI = &getRegisterInfo(); // The record forms set the condition register based on a signed comparison // with zero (so says the ISA manual). This is not as straightforward as it // seems, however, because this is always a 64-bit comparison on PPC64, even @@ -1645,6 +1669,11 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW; bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD; + // Look through copies unless that gets us to a physical register. + unsigned ActualSrc = TRI->lookThruCopyLike(SrcReg, MRI); + if (TargetRegisterInfo::isVirtualRegister(ActualSrc)) + SrcReg = ActualSrc; + // Get the unique definition of SrcReg. MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); if (!MI) return false; @@ -1745,7 +1774,6 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, return false; PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm(); - PPC::Predicate NewPred = Pred; unsigned PredCond = PPC::getPredicateCondition(Pred); unsigned PredHint = PPC::getPredicateHint(Pred); int16_t Immed = (int16_t)Value; @@ -1755,25 +1783,23 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (Immed == -1 && PredCond == PPC::PRED_GT) // We convert "greater than -1" into "greater than or equal to 0", // since we are assuming signed comparison by !equalityOnly - NewPred = PPC::getPredicate(PPC::PRED_GE, PredHint); + Pred = PPC::getPredicate(PPC::PRED_GE, PredHint); else if (Immed == -1 && PredCond == PPC::PRED_LE) // We convert "less than or equal to -1" into "less than 0". - NewPred = PPC::getPredicate(PPC::PRED_LT, PredHint); + Pred = PPC::getPredicate(PPC::PRED_LT, PredHint); else if (Immed == 1 && PredCond == PPC::PRED_LT) // We convert "less than 1" into "less than or equal to 0". - NewPred = PPC::getPredicate(PPC::PRED_LE, PredHint); + Pred = PPC::getPredicate(PPC::PRED_LE, PredHint); else if (Immed == 1 && PredCond == PPC::PRED_GE) // We convert "greater than or equal to 1" into "greater than 0". - NewPred = PPC::getPredicate(PPC::PRED_GT, PredHint); + Pred = PPC::getPredicate(PPC::PRED_GT, PredHint); else return false; - PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), - NewPred)); + PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), Pred)); } // Search for Sub. - const TargetRegisterInfo *TRI = &getRegisterInfo(); --I; // Get ready to iterate backward from CmpInstr. 
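The four predicate rewrites above let optimizeCompareInstr reuse the CR0 result of a record-form instruction, which always compares against zero, for a compare against +1 or -1. A minimal standalone sketch of that mapping, using a hypothetical Pred enum in place of PPC::Predicate (the branch-hint bits carried by getPredicateHint and re-attached via PPC::getPredicate are omitted here):

    #include <cstdint>
    #include <optional>

    // Hypothetical stand-in for PPC::Predicate; the real enum also encodes
    // branch hints, which the code above re-attaches via PPC::getPredicate().
    enum class Pred { LT, LE, GT, GE };

    // Fold a signed compare against +/-1 into the compare-with-zero implicitly
    // performed by a record-form instruction, mirroring the table above.
    std::optional<Pred> foldToZeroCompare(int16_t Imm, Pred P) {
      if (Imm == -1 && P == Pred::GT) return Pred::GE; // x > -1   <=>  x >= 0
      if (Imm == -1 && P == Pred::LE) return Pred::LT; // x <= -1  <=>  x < 0
      if (Imm == 1 && P == Pred::LT)  return Pred::LE; // x < 1    <=>  x <= 0
      if (Imm == 1 && P == Pred::GE)  return Pred::GT; // x >= 1   <=>  x > 0
      return std::nullopt; // any other (imm, pred) pair is left alone
    }

Note these equivalences hold only for signed comparison, which is why the code above guards them with !equalityOnly and the signed compare opcodes.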
@@ -1992,7 +2018,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, unsigned PPCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); - if (Opcode == PPC::INLINEASM) { + if (Opcode == PPC::INLINEASM || Opcode == PPC::INLINEASM_BR) { const MachineFunction *MF = MI.getParent()->getParent(); const char *AsmStr = MI.getOperand(0).getSymbolName(); return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); @@ -2358,13 +2384,6 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI; It++; unsigned Reg = MI.getOperand(i).getReg(); - // MachineInstr::readsRegister only returns true if the machine - // instruction reads the exact register or its super-register. It - // does not consider uses of sub-registers which seems like strange - // behaviour. Nonetheless, if we end up with a 64-bit register here, - // get the corresponding 32-bit register to check. - if (PPC::G8RCRegClass.contains(Reg)) - Reg = Reg - PPC::X0 + PPC::R0; // Is this register defined by some form of add-immediate (including // load-immediate) within this basic block? @@ -2381,7 +2400,7 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( return &*It; } break; - } else if (It->readsRegister(Reg, &getRegisterInfo())) + } else if (It->readsRegister(Reg, &getRegisterInfo())) // If we see another use of this reg between the def and the MI, // we want to flag it so the def isn't deleted. SeenIntermediateUse = true; @@ -2424,6 +2443,83 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const { return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0]; } +void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, + unsigned RegNo) const { + const MachineRegisterInfo &MRI = + StartMI.getParent()->getParent()->getRegInfo(); + if (MRI.isSSA()) + return; + + // Instructions between [StartMI, EndMI] should be in the same basic block. + assert((StartMI.getParent() == EndMI.getParent()) && + "Instructions are not in same basic block"); + + bool IsKillSet = false; + + auto clearOperandKillInfo = [=] (MachineInstr &MI, unsigned Index) { + MachineOperand &MO = MI.getOperand(Index); + if (MO.isReg() && MO.isUse() && MO.isKill() && + getRegisterInfo().regsOverlap(MO.getReg(), RegNo)) + MO.setIsKill(false); + }; + + // Set killed flag for EndMI. + // No need to do anything if EndMI defines RegNo. + int UseIndex = + EndMI.findRegisterUseOperandIdx(RegNo, false, &getRegisterInfo()); + if (UseIndex != -1) { + EndMI.getOperand(UseIndex).setIsKill(true); + IsKillSet = true; + // Clear killed flag for other EndMI operands related to RegNo. In some + // unexpected cases, killed may be set multiple times for the same register + // operand in the same MI. + for (int i = 0, e = EndMI.getNumOperands(); i != e; ++i) + if (i != UseIndex) + clearOperandKillInfo(EndMI, i); + } + + // Walking the inst in reverse order (EndMI -> StartMI]. + MachineBasicBlock::reverse_iterator It = EndMI; + MachineBasicBlock::reverse_iterator E = EndMI.getParent()->rend(); + // EndMI has been handled above, skip it here. + It++; + MachineOperand *MO = nullptr; + for (; It != E; ++It) { + // Skip instructions which could not be a def/use of RegNo. + if (It->isDebugInstr() || It->isPosition()) + continue; + + // Clear killed flag for all It operands related to RegNo. In some + // unexpected cases, killed may be set multiple times for the same register + // operand in the same MI. 
+ for (int i = 0, e = It->getNumOperands(); i != e; ++i) + clearOperandKillInfo(*It, i); + + // If killed is not set, set killed for its last use or set dead for its def + // if no use is found. + if (!IsKillSet) { + if ((MO = It->findRegisterUseOperand(RegNo, false, &getRegisterInfo()))) { + // Use found, set it killed. + IsKillSet = true; + MO->setIsKill(true); + continue; + } else if ((MO = It->findRegisterDefOperand(RegNo, false, true, + &getRegisterInfo()))) { + // No use found, set dead for its def. + assert(&*It == &StartMI && "No new def between StartMI and EndMI."); + MO->setIsDead(true); + break; + } + } + + if ((&*It) == &StartMI) + break; + } + // Ensure RegNo liveness is killed after EndMI. + assert((IsKillSet || (MO && MO->isDead())) && + "RegNo should be killed or dead"); +} + // If this instruction has an immediate form and one of its operands is a // result of a load-immediate or an add-immediate, convert it to // the immediate form if the constant is in range. @@ -2440,8 +2536,9 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, return false; assert(ForwardingOperand < MI.getNumOperands() && "The forwarding operand needs to be valid at this point"); - bool KillFwdDefMI = !SeenIntermediateUse && - MI.getOperand(ForwardingOperand).isKill(); + bool IsForwardingOperandKilled = MI.getOperand(ForwardingOperand).isKill(); + bool KillFwdDefMI = !SeenIntermediateUse && IsForwardingOperandKilled; + unsigned ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg(); if (KilledDef && KillFwdDefMI) *KilledDef = DefMI; @@ -2450,8 +2547,9 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, // If this is a reg+reg instruction that has a reg+imm form, // and one of the operands is produced by an add-immediate, // try to convert it. - if (HasImmForm && transformToImmFormFedByAdd(MI, III, ForwardingOperand, - *DefMI, KillFwdDefMI)) + if (HasImmForm && + transformToImmFormFedByAdd(MI, III, ForwardingOperand, *DefMI, + KillFwdDefMI)) return true; if ((DefMI->getOpcode() != PPC::LI && DefMI->getOpcode() != PPC::LI8) || @@ -2466,7 +2564,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, // If this is a reg+reg instruction that has a reg+imm form, // and one of the operands is produced by LI, convert it now. if (HasImmForm) - return transformToImmFormFedByLI(MI, III, ForwardingOperand, SExtImm); + return transformToImmFormFedByLI(MI, III, ForwardingOperand, *DefMI, SExtImm); bool ReplaceWithLI = false; bool Is64BitLI = false; @@ -2486,6 +2584,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, case PPC::CMPLDI: { // Doing this post-RA would require dataflow analysis to reliably find uses // of the CR register set by the compare. + // No need to fixup killed/dead flag since this transformation is only valid + // before RA. if (PostRA) return false; // If a compare-immediate is fed by an immediate and is itself an input of @@ -2662,6 +2762,14 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, if (KilledDef && SetCR) *KilledDef = nullptr; replaceInstrWithLI(MI, LII); + + // Fixup killed/dead flag after transformation. 
+ // Pattern: + // ForwardingOperandReg = LI imm1 + // y = op2 imm2, ForwardingOperandReg(killed) + if (IsForwardingOperandKilled) + fixupIsDeadOrKill(*DefMI, MI, ForwardingOperandReg); + LLVM_DEBUG(dbgs() << "With:\n"); LLVM_DEBUG(MI.dump()); return true; @@ -2669,10 +2777,6 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, return false; } -static bool isVFReg(unsigned Reg) { - return PPC::VFRCRegClass.contains(Reg); -} - bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III, bool PostRA) const { unsigned Opc = MI.getOpcode(); @@ -3007,7 +3111,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::LXSSPX: if (PostRA) { - if (isVFReg(MI.getOperand(0).getReg())) + if (isVFRegister(MI.getOperand(0).getReg())) III.ImmOpcode = PPC::LXSSP; else { III.ImmOpcode = PPC::LFS; @@ -3021,7 +3125,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::LXSDX: if (PostRA) { - if (isVFReg(MI.getOperand(0).getReg())) + if (isVFRegister(MI.getOperand(0).getReg())) III.ImmOpcode = PPC::LXSD; else { III.ImmOpcode = PPC::LFD; @@ -3039,7 +3143,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::STXSSPX: if (PostRA) { - if (isVFReg(MI.getOperand(0).getReg())) + if (isVFRegister(MI.getOperand(0).getReg())) III.ImmOpcode = PPC::STXSSP; else { III.ImmOpcode = PPC::STFS; @@ -3053,7 +3157,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::STXSDX: if (PostRA) { - if (isVFReg(MI.getOperand(0).getReg())) + if (isVFRegister(MI.getOperand(0).getReg())) III.ImmOpcode = PPC::STXSD; else { III.ImmOpcode = PPC::STFD; @@ -3110,7 +3214,7 @@ static void swapMIOperands(MachineInstr &MI, unsigned Op1, unsigned Op2) { } } -// Check if the 'MI' that has the index OpNoForForwarding +// Check if the 'MI' that has the index OpNoForForwarding // meets the requirement described in the ImmInstrInfo. bool PPCInstrInfo::isUseMIElgibleForForwarding(MachineInstr &MI, const ImmInstrInfo &III, @@ -3156,7 +3260,7 @@ bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI, MachineOperand *&RegMO) const { unsigned Opc = DefMI.getOpcode(); if (Opc != PPC::ADDItocL && Opc != PPC::ADDI && Opc != PPC::ADDI8) - return false; + return false; assert(DefMI.getNumOperands() >= 3 && "Add inst must have at least three operands"); @@ -3169,11 +3273,10 @@ bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI, return isAnImmediateOperand(*ImmMO); } -bool PPCInstrInfo::isRegElgibleForForwarding(const MachineOperand &RegMO, - const MachineInstr &DefMI, - const MachineInstr &MI, - bool KillDefMI - ) const { +bool PPCInstrInfo::isRegElgibleForForwarding( + const MachineOperand &RegMO, const MachineInstr &DefMI, + const MachineInstr &MI, bool KillDefMI, + bool &IsFwdFeederRegKilled) const { // x = addi y, imm // ... // z = lfdx 0, x -> z = lfd imm(y) @@ -3184,14 +3287,7 @@ bool PPCInstrInfo::isRegElgibleForForwarding(const MachineOperand &RegMO, if (MRI.isSSA()) return false; - // MachineInstr::readsRegister only returns true if the machine - // instruction reads the exact register or its super-register. It - // does not consider uses of sub-registers which seems like strange - // behaviour. Nonetheless, if we end up with a 64-bit register here, - // get the corresponding 32-bit register to check. unsigned Reg = RegMO.getReg(); - if (PPC::G8RCRegClass.contains(Reg)) - Reg = Reg - PPC::X0 + PPC::R0; // Walking the inst in reverse(MI-->DefMI) to get the last DEF of the Reg. 
MachineBasicBlock::const_reverse_iterator It = MI; @@ -3200,15 +3296,17 @@ bool PPCInstrInfo::isRegElgibleForForwarding(const MachineOperand &RegMO, for (; It != E; ++It) { if (It->modifiesRegister(Reg, &getRegisterInfo()) && (&*It) != &DefMI) return false; + else if (It->killsRegister(Reg, &getRegisterInfo()) && (&*It) != &DefMI) + IsFwdFeederRegKilled = true; // Made it to DefMI without encountering a clobber. if ((&*It) == &DefMI) break; } assert((&*It) == &DefMI && "DefMI is missing"); - // If DefMI also uses the register to be forwarded, we can only forward it + // If DefMI also defines the register to be forwarded, we can only forward it // if DefMI is being erased. - if (DefMI.readsRegister(Reg, &getRegisterInfo())) + if (DefMI.modifiesRegister(Reg, &getRegisterInfo())) return KillDefMI; return true; @@ -3271,11 +3369,9 @@ bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO, // is the literal zero, attempt to forward the source of the add-immediate to // the corresponding D-Form instruction with the displacement coming from // the immediate being added. -bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI, - const ImmInstrInfo &III, - unsigned OpNoForForwarding, - MachineInstr &DefMI, - bool KillDefMI) const { +bool PPCInstrInfo::transformToImmFormFedByAdd( + MachineInstr &MI, const ImmInstrInfo &III, unsigned OpNoForForwarding, + MachineInstr &DefMI, bool KillDefMI) const { // RegMO ImmMO // | | // x = addi reg, imm <----- DefMI @@ -3300,10 +3396,19 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI, if (!isImmElgibleForForwarding(*ImmMO, DefMI, III, Imm)) return false; + bool IsFwdFeederRegKilled = false; // Check if the RegMO can be forwarded to MI. - if (!isRegElgibleForForwarding(*RegMO, DefMI, MI, KillDefMI)) + if (!isRegElgibleForForwarding(*RegMO, DefMI, MI, KillDefMI, + IsFwdFeederRegKilled)) return false; + // Get killed info in case fixup needed after transformation. + unsigned ForwardKilledOperandReg = ~0U; + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + bool PostRA = !MRI.isSSA(); + if (PostRA && MI.getOperand(OpNoForForwarding).isKill()) + ForwardKilledOperandReg = MI.getOperand(OpNoForForwarding).getReg(); + // We know that, the MI and DefMI both meet the pattern, and // the Imm also meet the requirement with the new Imm-form. // It is safe to do the transformation now. @@ -3327,7 +3432,7 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI, // Otherwise, it is Constant Pool Index(CPI) or Global, // which is relocation in fact. We need to replace the special zero // register with ImmMO. - // Before that, we need to fixup the target flags for imm. + // Before that, we need to fixup the target flags for imm. // For some reason, we miss to set the flag for the ImmMO if it is CPI. if (DefMI.getOpcode() == PPC::ADDItocL) ImmMO->setTargetFlags(PPCII::MO_TOC_LO); @@ -3354,6 +3459,22 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI, // Update the opcode. MI.setDesc(get(III.ImmOpcode)); + // Fix up killed/dead flag after transformation. 
+ // Pattern 1: + // x = ADD KilledFwdFeederReg, imm + // n = opn KilledFwdFeederReg(killed), regn + // y = XOP 0, x + // Pattern 2: + // x = ADD reg(killed), imm + // y = XOP 0, x + if (IsFwdFeederRegKilled || RegMO->isKill()) + fixupIsDeadOrKill(DefMI, MI, RegMO->getReg()); + // Pattern 3: + // ForwardKilledOperandReg = ADD reg, imm + // y = XOP 0, ForwardKilledOperandReg(killed) + if (ForwardKilledOperandReg != ~0U) + fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + LLVM_DEBUG(dbgs() << "With:\n"); LLVM_DEBUG(MI.dump()); @@ -3363,6 +3484,7 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, const ImmInstrInfo &III, unsigned ConstantOpNo, + MachineInstr &DefMI, int64_t Imm) const { MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); bool PostRA = !MRI.isSSA(); @@ -3401,6 +3523,11 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, return false; } + // Get killed info in case fixup needed after transformation. + unsigned ForwardKilledOperandReg = ~0U; + if (PostRA && MI.getOperand(ConstantOpNo).isKill()) + ForwardKilledOperandReg = MI.getOperand(ConstantOpNo).getReg(); + unsigned Opc = MI.getOpcode(); bool SpecialShift32 = Opc == PPC::SLW || Opc == PPC::SLWo || Opc == PPC::SRW || Opc == PPC::SRWo; @@ -3483,6 +3610,13 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, } } } + + // Fix up killed/dead flag after transformation. + // Pattern: + // ForwardKilledOperandReg = LI imm + // y = XOP reg, ForwardKilledOperandReg(killed) + if (ForwardKilledOperandReg != ~0U) + fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); return true; } @@ -3784,3 +3918,133 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, } return false; } + +bool PPCInstrInfo::isBDNZ(unsigned Opcode) const { + return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ)); +} + +bool PPCInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, + MachineInstr *&CmpInst) const { + MachineBasicBlock *LoopEnd = L.getBottomBlock(); + MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator(); + // We really "analyze" only CTR loops right now. + if (I != LoopEnd->end() && isBDNZ(I->getOpcode())) { + IndVarInst = nullptr; + CmpInst = &*I; + return false; + } + return true; +} + +MachineInstr * +PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const { + + unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop); + + // The loop set-up instruction should be in the preheader. + for (auto &I : PreHeader.instrs()) + if (I.getOpcode() == LOOPi) + return &I; + return nullptr; +} + +unsigned PPCInstrInfo::reduceLoopCount( + MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, + MachineInstr &Cmp, SmallVectorImpl &Cond, + SmallVectorImpl &PrevInsts, unsigned Iter, + unsigned MaxIter) const { + // We expect a hardware loop currently. This means that IndVar is set + // to null, and the compare is the ENDLOOP instruction. + assert((!IndVar) && isBDNZ(Cmp.getOpcode()) && "Expecting a CTR loop"); + MachineFunction *MF = MBB.getParent(); + DebugLoc DL = Cmp.getDebugLoc(); + MachineInstr *Loop = findLoopInstr(PreHeader); + if (!Loop) + return 0; + unsigned LoopCountReg = Loop->getOperand(0).getReg(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg); + + if (!LoopCount) + return 0; + // If the loop trip count is a compile-time value, then just change the + // value. 
+ if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) { + int64_t Offset = LoopCount->getOperand(1).getImm(); + if (Offset <= 1) { + LoopCount->eraseFromParent(); + Loop->eraseFromParent(); + return 0; + } + LoopCount->getOperand(1).setImm(Offset - 1); + return Offset - 1; + } + + // The loop trip count is a run-time value. + // We need to subtract one from the trip count, + // and insert a branch later to check if we're done with the loop. + + // Since the BDZ/BDZ8 that we will insert will also decrease the ctr by 1, + // we don't need to generate anything here. + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(MachineOperand::CreateReg( + Subtarget.isPPC64() ? PPC::CTR8 : PPC::CTR, true)); + return LoopCountReg; +} + +// Return true if we can extract the base operand and byte offset of an +// instruction, along with the memory width. Width is the size of memory that +// is being loaded/stored. +bool PPCInstrInfo::getMemOperandWithOffsetWidth( + const MachineInstr &LdSt, + const MachineOperand *&BaseReg, + int64_t &Offset, + unsigned &Width, + const TargetRegisterInfo *TRI) const { + assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); + + // Handle only loads/stores with base register followed by immediate offset. + if (LdSt.getNumExplicitOperands() != 3) + return false; + if (!LdSt.getOperand(1).isImm() || !LdSt.getOperand(2).isReg()) + return false; + + if (!LdSt.hasOneMemOperand()) + return false; + + Width = (*LdSt.memoperands_begin())->getSize(); + Offset = LdSt.getOperand(1).getImm(); + BaseReg = &LdSt.getOperand(2); + return true; +} + +bool PPCInstrInfo::areMemAccessesTriviallyDisjoint( + const MachineInstr &MIa, const MachineInstr &MIb, + AliasAnalysis * /*AA*/) const { + assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); + assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); + + if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || + MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) + return false; + + // Retrieve the base register, offset from the base register and width. Width + // is the size of memory that is being loaded/stored (e.g. 1, 2, 4). If the + // base registers are identical, and the offset of a lower memory access + + // the width doesn't overlap the offset of a higher memory access, + // then the memory accesses are different. + const TargetRegisterInfo *TRI = &getRegisterInfo(); + const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; + int64_t OffsetA = 0, OffsetB = 0; + unsigned int WidthA = 0, WidthB = 0; + if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) && + getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) { + if (BaseOpA->isIdenticalTo(*BaseOpB)) { + int LowOffset = std::min(OffsetA, OffsetB); + int HighOffset = std::max(OffsetA, OffsetB); + int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; + if (LowOffset + LowWidth <= HighOffset) + return true; + } + } + return false; +} diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 7ed558b835af..70fb757e8f1e 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -1,9 +1,8 @@ //===-- PPCInstrInfo.h - PowerPC Instruction Information --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,7 +13,6 @@ #ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H #define LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H -#include "PPC.h" #include "PPCRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -66,9 +64,6 @@ enum { /// Shift count to bypass PPC970 flags NewDef_Shift = 6, - /// The VSX instruction that uses VSX register (vs0-vs63), instead of VMX - /// register (v0-v31). - UseVSXReg = 0x1 << NewDef_Shift, /// This instruction is an X-Form memory operation. XFormMemOp = 0x1 << (NewDef_Shift+1) }; @@ -129,12 +124,12 @@ class PPCInstrInfo : public PPCGenInstrInfo { // If the inst has imm-form and one of its operands is produced by an LI, // put the imm into the inst directly and remove the LI if possible. bool transformToImmFormFedByLI(MachineInstr &MI, const ImmInstrInfo &III, - unsigned ConstantOpNo, int64_t Imm) const; + unsigned ConstantOpNo, MachineInstr &DefMI, + int64_t Imm) const; // If the inst has imm-form and one of its operands is produced by an // add-immediate, try to transform it when possible. bool transformToImmFormFedByAdd(MachineInstr &MI, const ImmInstrInfo &III, - unsigned ConstantOpNo, - MachineInstr &DefMI, + unsigned ConstantOpNo, MachineInstr &DefMI, bool KillDefMI) const; // Try to find whether the instruction 'MI' contains any operand that // could be forwarded from some inst that feeds it. If yes, return the @@ -159,8 +154,8 @@ class PPCInstrInfo : public PPCGenInstrInfo { int64_t &Imm) const; bool isRegElgibleForForwarding(const MachineOperand &RegMO, const MachineInstr &DefMI, - const MachineInstr &MI, - bool KillDefMI) const; + const MachineInstr &MI, bool KillDefMI, + bool &IsFwdFeederRegKilled) const; const unsigned *getStoreOpcodesForSpillArray() const; const unsigned *getLoadOpcodesForSpillArray() const; virtual void anchor(); @@ -362,6 +357,22 @@ public: unsigned SrcReg2, int Mask, int Value, const MachineRegisterInfo *MRI) const override; + + /// Return true if we can extract the base operand and byte offset of an + /// instruction, along with the memory width. Width is the size of memory + /// that is being loaded/stored (e.g. 1, 2, 4, 8). + bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, + int64_t &Offset, unsigned &Width, + const TargetRegisterInfo *TRI) const; + + /// Return true if two MIs access different memory addresses and false + /// otherwise. + bool + areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, + AliasAnalysis *AA = nullptr) const override; + /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. /// @@ -412,6 +423,18 @@ public: bool convertToImmediateForm(MachineInstr &MI, MachineInstr **KilledDef = nullptr) const; + + /// Fixup killed/dead flag for register \p RegNo between instructions [\p + /// StartMI, \p EndMI]. Some PostRA transformations may violate register + /// killed/dead flag semantics; this function can be called to fix them up. Before + /// calling this function, + /// 1. Ensure that \p RegNo liveness is killed after instruction \p EndMI. + /// 2. Ensure that there is no new definition between (\p StartMI, \p EndMI) + /// and the only possible definition for \p RegNo is \p StartMI or \p EndMI. + /// 3. Ensure that all instructions between [\p StartMI, \p EndMI] are in the same + /// basic block. 
+ void fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, + unsigned RegNo) const; void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const; void replaceInstrOperandWithImm(MachineInstr &MI, unsigned OpNo, int64_t Imm) const; @@ -429,14 +452,55 @@ public: /// operands). static unsigned getRegNumForOperand(const MCInstrDesc &Desc, unsigned Reg, unsigned OpNo) { - if (Desc.TSFlags & PPCII::UseVSXReg) { - if (isVRRegister(Reg)) - Reg = PPC::VSX32 + (Reg - PPC::V0); - else if (isVFRegister(Reg)) - Reg = PPC::VSX32 + (Reg - PPC::VF0); + int16_t regClass = Desc.OpInfo[OpNo].RegClass; + switch (regClass) { + // We store F0-F31, VF0-VF31 in MCOperand and it should be F0-F31, + // VSX32-VSX63 during encoding/disassembling + case PPC::VSSRCRegClassID: + case PPC::VSFRCRegClassID: + if (isVFRegister(Reg)) + return PPC::VSX32 + (Reg - PPC::VF0); + break; + // We store VSL0-VSL31, V0-V31 in MCOperand and it should be VSL0-VSL31, + // VSX32-VSX63 during encoding/disassembling + case PPC::VSRCRegClassID: + if (isVRRegister(Reg)) + return PPC::VSX32 + (Reg - PPC::V0); + break; + // Other RegClasses don't need mapping + default: + break; } return Reg; } + + /// Check whether \p Opcode is BDNZ (Decrement CTR and branch if it is still nonzero). + bool isBDNZ(unsigned Opcode) const; + + /// Find the hardware loop instruction used to set up the specified loop. + /// On PPC, we have two instructions used to set up the hardware loop + /// (MTCTRloop, MTCTR8loop) with corresponding endloop (BDNZ, BDNZ8) + /// instructions to indicate the end of a loop. + MachineInstr *findLoopInstr(MachineBasicBlock &PreHeader) const; + + /// Analyze the loop code to find the loop induction variable and compare used + /// to compute the number of iterations. Currently, we analyze loops that are + /// controlled using hardware loops. In this case, the induction variable + /// instruction is null. For all other cases, this function returns true, + /// which means we're unable to analyze it. \p IndVarInst and \p CmpInst will + /// return new values when we can analyze the readonly loop \p L; otherwise, + /// nothing is changed. + bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, + MachineInstr *&CmpInst) const override; + /// Generate code to reduce the loop iteration by one and check if the loop + /// is finished. Return the value/register of the new loop count. We need + /// this function when peeling off one or more iterations of a loop. This + /// function assumes the last iteration is peeled first. + unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, + MachineInstr *IndVar, MachineInstr &Cmp, + SmallVectorImpl &Cond, + SmallVectorImpl &PrevInsts, + unsigned Iter, unsigned MaxIter) const override; }; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index dd3f1ac79089..c313337047f0 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -1,9 +1,8 @@ //===-- PPCInstrInfo.td - The PowerPC Instruction Set ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -231,6 +230,18 @@ def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128", SDTCisSameAs<1,2>]>, []>; +def PPCbuild_spe64: SDNode<"PPCISD::BUILD_SPE64", + SDTypeProfile<1, 2, + [SDTCisVT<0, f64>, SDTCisVT<1,i32>, + SDTCisVT<1,i32>]>, + []>; + +def PPCextract_spe : SDNode<"PPCISD::EXTRACT_SPE", + SDTypeProfile<1, 2, + [SDTCisVT<0, i32>, SDTCisVT<1, f64>, + SDTCisPtrTy<2>]>, + []>; + // These are target-independent nodes, but have target-specific formats. def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; @@ -458,6 +469,17 @@ def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), return !isOffsetMultipleOf(N, 16); }]>; +// PatFrag for binary operations whose operands are both non-constant. +class BinOpWithoutSImm16Operand : + PatFrag<(ops node:$left, node:$right), (opcode node:$left, node:$right), [{ + int16_t Imm; + return !isIntS16Immediate(N->getOperand(0), Imm) + && !isIntS16Immediate(N->getOperand(1), Imm); +}]>; + +def add_without_simm16 : BinOpWithoutSImm16Operand; +def mul_without_simm16 : BinOpWithoutSImm16Operand; + //===----------------------------------------------------------------------===// // PowerPC Flag Definitions. @@ -546,10 +568,6 @@ def PPCRegCRRCAsmOperand : AsmOperandClass { def crrc : RegisterOperand { let ParserMatchClass = PPCRegCRRCAsmOperand; } -def crrc0 : RegisterOperand { - let ParserMatchClass = PPCRegCRRCAsmOperand; -} - def PPCRegSPERCAsmOperand : AsmOperandClass { let Name = "RegSPERC"; let PredicateMethod = "isRegNumber"; } @@ -737,7 +755,9 @@ def abscondbrtarget : Operand { def calltarget : Operand { let PrintMethod = "printBranchOperand"; let EncoderMethod = "getDirectBrEncoding"; + let DecoderMethod = "DecodePCRel24BranchTarget"; let ParserMatchClass = PPCDirectBrAsmOperand; + let OperandType = "OPERAND_PCREL"; } def abscalltarget : Operand { let PrintMethod = "printAbsBranchOperand"; @@ -881,11 +901,24 @@ def pred : Operand { } // Define PowerPC specific addressing mode. -def iaddr : ComplexPattern; -def xaddr : ComplexPattern; + +// d-form +def iaddr : ComplexPattern; // "stb" +// ds-form +def iaddrX4 : ComplexPattern; // "std" +// dq-form +def iaddrX16 : ComplexPattern; // "stxv" + +// Below forms are all x-form addressing modes, use three different ones so we +// can make an accurate check for x-form instructions in ISEL. +// x-form addressing mode whose associated displacement form is D. +def xaddr : ComplexPattern; // "stbx" +// x-form addressing mode whose associated displacement form is DS. +def xaddrX4 : ComplexPattern; // "stdx" +// x-form addressing mode whose associated displacement form is DQ. +def xaddrX16 : ComplexPattern; // "stxvx" + def xoaddr : ComplexPattern; -def ixaddr : ComplexPattern; // "std" -def iqaddr : ComplexPattern; // "stxv" // The address in a single register. This is used with the SjLj // pseudo-instructions. @@ -1309,6 +1342,15 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { } } +// Set the float rounding mode. 
+let Uses = [RM], Defs = [RM] in { +def SETRNDi : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins u2imm:$RND), + "#SETRNDi", [(set f64:$FRT, (int_ppc_setrnd (i32 imm:$RND)))]>; + +def SETRND : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins gprc:$in), + "#SETRND", [(set f64:$FRT, (int_ppc_setrnd gprc :$in))]>; +} + let Defs = [LR] in def MovePCtoLR : PPCEmitTimePseudo<(outs), (ins), "#MovePCtoLR", []>, PPC970_Unit_BRU; @@ -1435,6 +1477,9 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { def BCLn : BForm_4<16, 4, 0, 1, (outs), (ins crbitrc:$bi, condbrtarget:$dst), "bcl 4, $bi, $dst">; + def BL_NOP : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func), + "bl $func\n\tnop", IIC_BrB, []>; } } let Uses = [CTR, RM] in { @@ -2512,6 +2557,7 @@ def CRORC : XLForm_1<19, 417, (outs crbitrc:$CRD), [(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>; let isCodeGenOnly = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins), "creqv $dst, $dst, $dst", IIC_BrCR, [(set i1:$dst, 1)]>; @@ -2519,6 +2565,7 @@ def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins), def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins), "crxor $dst, $dst, $dst", IIC_BrCR, [(set i1:$dst, 0)]>; +} let Defs = [CR1EQ], CRD = 6 in { def CR6SET : XLForm_1_ext<19, 289, (outs), (ins), @@ -2566,7 +2613,7 @@ def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS), PPC970_DGroup_First, PPC970_Unit_FXU; } let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in { -let Pattern = [(int_ppc_mtctr i32:$rS)] in +let Pattern = [(int_set_loop_iterations i32:$rS)] in def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS), "mtctr $rS", IIC_SprMTSPR>, PPC970_DGroup_First, PPC970_Unit_FXU; @@ -2993,9 +3040,16 @@ def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm), // Calls def : Pat<(PPCcall (i32 tglobaladdr:$dst)), (BL tglobaladdr:$dst)>; + def : Pat<(PPCcall (i32 texternalsym:$dst)), (BL texternalsym:$dst)>; +// Calls for AIX only +def : Pat<(PPCcall (i32 mcsym:$dst)), + (BL mcsym:$dst)>; +def : Pat<(PPCcall_nop (i32 mcsym:$dst)), + (BL_NOP mcsym:$dst)>; + def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm), (TCRETURNdi tglobaladdr:$dst, imm:$imm)>; @@ -4071,6 +4125,10 @@ def SLBMFEV : XLForm_1_gen<31, 851, (outs gprc:$RT), (ins gprc:$RB), def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>; +let Defs = [CR0] in +def SLBFEEo : XForm_26<31, 979, (outs gprc:$RT), (ins gprc:$RB), + "slbfee. $RT, $RB", IIC_SprSLBFEE, []>, isDOT; + def TLBIA : XForm_0<31, 370, (outs), (ins), "tlbia", IIC_SprTLBIA, []>; diff --git a/lib/Target/PowerPC/PPCInstrQPX.td b/lib/Target/PowerPC/PPCInstrQPX.td index ef589ad01fd7..d67041d46d9f 100644 --- a/lib/Target/PowerPC/PPCInstrQPX.td +++ b/lib/Target/PowerPC/PPCInstrQPX.td @@ -1,9 +1,8 @@ //===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCInstrSPE.td b/lib/Target/PowerPC/PPCInstrSPE.td index 9f5891a45f22..935c3044ae47 100644 --- a/lib/Target/PowerPC/PPCInstrSPE.td +++ b/lib/Target/PowerPC/PPCInstrSPE.td @@ -1,9 +1,8 @@ //=======-- PPCInstrSPE.td - The PowerPC SPE Extension -*- tablegen -*-=======// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -512,7 +511,7 @@ def EVLWWSPLATX : EVXForm_1<792, (outs sperc:$RT), (ins memrr:$src), def EVMERGEHI : EVXForm_1<556, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB), "evmergehi $RT, $RA, $RB", IIC_VecGeneral, []>; -def EVMERGELO : EVXForm_1<557, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB), +def EVMERGELO : EVXForm_1<557, (outs sperc:$RT), (ins gprc:$RA, gprc:$RB), "evmergelo $RT, $RA, $RB", IIC_VecGeneral, []>; def EVMERGEHILO : EVXForm_1<558, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB), "evmergehilo $RT, $RA, $RB", IIC_VecGeneral, []>; @@ -887,4 +886,14 @@ def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)), (SELECT_SPE (CRANDC $lhs, $rhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)), (SELECT_SPE (CRXOR $lhs, $rhs), $tval, $fval)>; + + +def : Pat<(f64 (PPCbuild_spe64 i32:$rB, i32:$rA)), + (f64 (COPY_TO_REGCLASS (EVMERGELO $rA, $rB), SPERC))>; + +def : Pat<(i32 (PPCextract_spe f64:$rA, 1)), + (i32 (EXTRACT_SUBREG (EVMERGEHI $rA, $rA), sub_32))>; +def : Pat<(i32 (PPCextract_spe f64:$rA, 0)), + (i32 (EXTRACT_SUBREG $rA, sub_32))>; + } diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 0f073388dc74..07f38a61d098 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -1,9 +1,8 @@ //===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -54,6 +53,15 @@ def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass { def spilltovsrrc : RegisterOperand { let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand; } + +def SDT_PPCldvsxlh : SDTypeProfile<1, 1, [ + SDTCisVT<0, v4f32>, SDTCisPtrTy<1> +]>; + +def SDT_PPCfpextlh : SDTypeProfile<1, 1, [ + SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32> +]>; + // Little-endian-specific nodes. 
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> @@ -85,6 +93,10 @@ def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>; def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>; +def PPCfpextlh : SDNode<"PPCISD::FP_EXTEND_LH", SDT_PPCfpextlh, []>; +def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + multiclass XX3Form_Rcr opcode, bits<7> xo, string asmbase, string asmstr, InstrItinClass itin, Intrinsic Int, ValueType OutTy, ValueType InTy> { @@ -124,7 +136,6 @@ def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">; let Predicates = [HasVSX] in { let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. -let UseVSXReg = 1 in { let hasSideEffects = 0 in { // VSX instructions don't have side effects. let Uses = [RM] in { @@ -841,12 +852,12 @@ let Uses = [RM] in { "xxlxor $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>; } // isCommutable - let isCodeGenOnly = 1 in - def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins), + + let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1, + isReMaterializable = 1 in { + def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set v4i32:$XT, (v4i32 immAllZerosV))]>; - - let isCodeGenOnly = 1 in { def XXLXORdpz : XX3Form_SetZero<60, 154, (outs vsfrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, @@ -895,11 +906,10 @@ let Uses = [RM] in { (PPCxxsplt v4i32:$XB, imm32SExt16:$UIM))]>; let isCodeGenOnly = 1 in def XXSPLTWs : XX2Form_2<60, 164, - (outs vsrc:$XT), (ins vfrc:$XB, u2imm:$UIM), + (outs vsrc:$XT), (ins vsfrc:$XB, u2imm:$UIM), "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>; } // hasSideEffects -} // UseVSXReg = 1 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. @@ -961,6 +971,10 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. def : Pat<(v4i32 (vnot_ppc v4i32:$A)), (v4i32 (XXLNOR $A, $A))>; +def : Pat<(v4i32 (or (and (vnot_ppc v4i32:$C), v4i32:$A), + (and v4i32:$B, v4i32:$C))), + (v4i32 (XXSEL $A, $B, $C))>; + let Predicates = [IsBigEndian] in { def : Pat<(v2f64 (scalar_to_vector f64:$A)), (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>; @@ -1063,6 +1077,8 @@ def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)), def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)), (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>; +def : Pat<(v2f64 (PPCfpextlh v4f32:$C)), (XVCVSPDP (XXMRGHW $C, $C))>; + // Loads. 
let Predicates = [HasVSX, HasOnlySwappingMemOps] in { def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>; @@ -1176,6 +1192,15 @@ def : Pat<(vselect v4i32:$vA, v4f32:$vB, v4f32:$vC), def : Pat<(vselect v2i64:$vA, v2f64:$vB, v2f64:$vC), (XXSEL $vC, $vB, $vA)>; +def : Pat<(v4f32 (fmaxnum v4f32:$src1, v4f32:$src2)), + (v4f32 (XVMAXSP $src1, $src2))>; +def : Pat<(v4f32 (fminnum v4f32:$src1, v4f32:$src2)), + (v4f32 (XVMINSP $src1, $src2))>; +def : Pat<(v2f64 (fmaxnum v2f64:$src1, v2f64:$src2)), + (v2f64 (XVMAXDP $src1, $src2))>; +def : Pat<(v2f64 (fminnum v2f64:$src1, v2f64:$src2)), + (v2f64 (XVMINDP $src1, $src2))>; + let Predicates = [IsLittleEndian] in { def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))), (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; @@ -1248,7 +1273,7 @@ def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">; def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">; let Predicates = [HasP8Vector] in { let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. - let isCommutable = 1, UseVSXReg = 1 in { + let isCommutable = 1 in { def XXLEQV : XX3Form<60, 186, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxleqv $XT, $XA, $XB", IIC_VecGeneral, @@ -1258,12 +1283,11 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. "xxlnand $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (vnot_ppc (and v4i32:$XA, v4i32:$XB)))]>; - } // isCommutable, UseVSXReg + } // isCommutable def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B), (XXLEQV $A, $B)>; - let UseVSXReg = 1 in { def XXLORC : XX3Form<60, 170, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlorc $XT, $XA, $XB", IIC_VecGeneral, @@ -1312,7 +1336,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. "#STIWX", [(PPCstfiwx f64:$XT, xoaddr:$dst)]>; } // mayStore - } // UseVSXReg = 1 def : Pat<(f64 (extloadf32 xoaddr:$src)), (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>; @@ -1342,7 +1365,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)), (SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>; - let UseVSXReg = 1 in { // VSX Elementary Scalar FP arithmetic (SP) let isCommutable = 1 in { def XSADDSP : XX3Form<60, 0, @@ -1354,7 +1376,10 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. "xsmulsp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fmul f32:$XA, f32:$XB))]>; } // isCommutable - + def XSSUBSP : XX3Form<60, 8, + (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), + "xssubsp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>; def XSDIVSP : XX3Form<60, 24, (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), "xsdivsp $XT, $XA, $XB", IIC_FPDivS, @@ -1374,10 +1399,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. (outs vssrc:$XT), (ins vssrc:$XB), "xsrsqrtesp $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfrsqrte f32:$XB))]>; - def XSSUBSP : XX3Form<60, 8, - (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), - "xssubsp $XT, $XA, $XB", IIC_VecFP, - [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>; // FMA Instructions let BaseName = "XSMADDASP" in { @@ -1470,7 +1491,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
"xscvdpspn $XT, $XB", IIC_VecFP, []>; def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB), "xscvspdpn $XT, $XB", IIC_VecFP, []>; - } // UseVSXReg = 1 let Predicates = [IsLittleEndian] in { def : Pat; + def : Pat<(v2i64 (smax v2i64:$src1, v2i64:$src2)), + (v2i64 (VMAXSD (COPY_TO_REGCLASS $src1, VRRC), + (COPY_TO_REGCLASS $src2, VRRC)))>; + def : Pat<(v2i64 (umax v2i64:$src1, v2i64:$src2)), + (v2i64 (VMAXUD (COPY_TO_REGCLASS $src1, VRRC), + (COPY_TO_REGCLASS $src2, VRRC)))>; + def : Pat<(v2i64 (smin v2i64:$src1, v2i64:$src2)), + (v2i64 (VMINSD (COPY_TO_REGCLASS $src1, VRRC), + (COPY_TO_REGCLASS $src2, VRRC)))>; + def : Pat<(v2i64 (umin v2i64:$src1, v2i64:$src2)), + (v2i64 (VMINUD (COPY_TO_REGCLASS $src1, VRRC), + (COPY_TO_REGCLASS $src2, VRRC)))>; } // AddedComplexity = 400 } // HasP8Vector -let UseVSXReg = 1, AddedComplexity = 400 in { +let AddedComplexity = 400 in { let Predicates = [HasDirectMove] in { // VSX direct move instructions def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT), @@ -1525,7 +1557,7 @@ let Predicates = [HasDirectMove] in { [(set i64:$rA, (PPCmfvsr f64:$XT))]>, Requires<[In64BitMode]>; let isCodeGenOnly = 1 in - def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vrrc:$XT), + def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsrc:$XT), "mfvsrd $rA, $XT", IIC_VecGeneral, []>, Requires<[In64BitMode]>; @@ -1557,7 +1589,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in { []>, Requires<[In64BitMode]>; } // IsISA3_0, HasDirectMove -} // UseVSXReg = 1 +} // AddedComplexity = 400 // We want to parse this from asm, but we don't want to emit this as it would // be emitted with a VSX reg. So leave Emit = 0 here. @@ -2415,7 +2447,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { list pattern> : X_VT5_XO5_VB5_VSFR, isDOT; - let UseVSXReg = 1 in { // [PO T XO B XO BX /] class XX2_RT5_XO5_XB6 opcode, bits<5> xo2, bits<9> xo, string opc, list pattern> @@ -2434,7 +2465,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { InstrItinClass itin, list pattern> : XX3Form; - } // UseVSXReg = 1 // [PO VRT VRA VRB XO /] class X_VT5_VA5_VB5 opcode, bits<10> xo, string opc, @@ -2482,69 +2512,70 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let isCommutable = 1 in { def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp", [(set f128:$vT, (fadd f128:$vA, f128:$vB))]>; + def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp", + [(set f128:$vT, (fmul f128:$vA, f128:$vB))]>; + } + def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" , + [(set f128:$vT, (fsub f128:$vA, f128:$vB))]>; + def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp", + [(set f128:$vT, (fdiv f128:$vA, f128:$vB))]>; + // Square-Root + def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp", + [(set f128:$vT, (fsqrt f128:$vB))]>; + // (Negative) Multiply-{Add/Subtract} + def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp", + [(set f128:$vT, + (fma f128:$vA, f128:$vB, + f128:$vTi))]>; + def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp" , + [(set f128:$vT, + (fma f128:$vA, f128:$vB, + (fneg f128:$vTi)))]>; + def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp", + [(set f128:$vT, + (fneg (fma f128:$vA, f128:$vB, + f128:$vTi)))]>; + def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp", + [(set f128:$vT, + (fneg (fma f128:$vA, f128:$vB, + (fneg f128:$vTi))))]>; + + let isCommutable = 1 in { def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo", [(set f128:$vT, (int_ppc_addf128_round_to_odd f128:$vA, f128:$vB))]>; - def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp", - [(set 
f128:$vT, (fmul f128:$vA, f128:$vB))]>; def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo", [(set f128:$vT, (int_ppc_mulf128_round_to_odd f128:$vA, f128:$vB))]>; } - - def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" , - [(set f128:$vT, (fsub f128:$vA, f128:$vB))]>; def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo", [(set f128:$vT, (int_ppc_subf128_round_to_odd f128:$vA, f128:$vB))]>; - def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp", - [(set f128:$vT, (fdiv f128:$vA, f128:$vB))]>; def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo", [(set f128:$vT, (int_ppc_divf128_round_to_odd f128:$vA, f128:$vB))]>; - - // Square-Root - def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp", - [(set f128:$vT, (fsqrt f128:$vB))]>; def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo", [(set f128:$vT, (int_ppc_sqrtf128_round_to_odd f128:$vB))]>; - // (Negative) Multiply-{Add/Subtract} - def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp", - [(set f128:$vT, - (fma f128:$vA, f128:$vB, - f128:$vTi))]>; def XSMADDQPO : X_VT5_VA5_VB5_FMA_Ro<63, 388, "xsmaddqpo", [(set f128:$vT, (int_ppc_fmaf128_round_to_odd f128:$vA,f128:$vB,f128:$vTi))]>; - def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp" , - [(set f128:$vT, - (fma f128:$vA, f128:$vB, - (fneg f128:$vTi)))]>; def XSMSUBQPO : X_VT5_VA5_VB5_FMA_Ro<63, 420, "xsmsubqpo" , [(set f128:$vT, (int_ppc_fmaf128_round_to_odd f128:$vA, f128:$vB, (fneg f128:$vTi)))]>; - def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp", - [(set f128:$vT, - (fneg (fma f128:$vA, f128:$vB, - f128:$vTi)))]>; def XSNMADDQPO: X_VT5_VA5_VB5_FMA_Ro<63, 452, "xsnmaddqpo", [(set f128:$vT, (fneg (int_ppc_fmaf128_round_to_odd f128:$vA, f128:$vB, f128:$vTi)))]>; - def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp", - [(set f128:$vT, - (fneg (fma f128:$vA, f128:$vB, - (fneg f128:$vTi))))]>; def XSNMSUBQPO: X_VT5_VA5_VB5_FMA_Ro<63, 484, "xsnmsubqpo", [(set f128:$vT, (fneg (int_ppc_fmaf128_round_to_odd @@ -2572,8 +2603,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // DP/QP Compare Exponents def XSCMPEXPDP : XX3Form_1<60, 59, (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), - "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>, - UseVSXReg; + "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>; def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>; // DP Compare ==, >=, >, != @@ -2631,7 +2661,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))), (f128 (XSCVUDQP (LIWZX xoaddr:$src)))>; - let UseVSXReg = 1 in { //===--------------------------------------------------------------------===// // Round to Floating-Point Integer Instructions @@ -2648,8 +2677,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { [(set v4f32:$XT, (int_ppc_vsx_xvcvsphp v4f32:$XB))]>; - } // UseVSXReg = 1 - // Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a // separate pattern so that it can convert the input register class from // VRRC(v8i16) to VSRC. 
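The quad-precision FMA definitions above differ only in where the negations land. A hedged scalar sketch of the four variants (editor's illustration in C++; shown on double, while the patterns themselves operate on f128 with vTi as the addend):

#include <cmath>

double like_xsmaddqp (double a, double b, double c) { return  std::fma(a, b,  c); }
double like_xsmsubqp (double a, double b, double c) { return  std::fma(a, b, -c); }
double like_xsnmaddqp(double a, double b, double c) { return -std::fma(a, b,  c); }
double like_xsnmsubqp(double a, double b, double c) { return -std::fma(a, b, -c); }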
@@ -2691,7 +2718,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Insert Exponent DP/QP // XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB), - "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>, UseVSXReg; + "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>; // vB NOTE: only vB.dword[0] is used, that's why we don't use // X_VT5_VA5_VB5 form def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB), @@ -2712,7 +2739,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v2i64 (XSXEXPQP $vA)), sub_64)))>; // Vector Insert Word - let UseVSXReg = 1 in { // XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB. def XXINSERTW : XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT), @@ -2726,7 +2752,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165, (outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM), "xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>; - } // UseVSXReg = 1 // Vector Insert Exponent DP/SP def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc, @@ -2759,20 +2784,17 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { //===--------------------------------------------------------------------===// // Test Data Class SP/DP/QP - let UseVSXReg = 1 in { def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298, (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB), "xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>; def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362, (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB), "xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>; - } // UseVSXReg = 1 def XSTSTDCQP : X_BF3_DCMX7_RS5 <63, 708, (outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB), "xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>; // Vector Test Data Class SP/DP - let UseVSXReg = 1 in { def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5, (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB), "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP, @@ -2783,7 +2805,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP, [(set v2i64: $XT, (int_ppc_vsx_xvtstdcdp v2f64:$XB, imm:$DCMX))]>; - } // UseVSXReg = 1 //===--------------------------------------------------------------------===// @@ -2824,7 +2845,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Vector Splat Immediate Byte def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8), - "xxspltib $XT, $IMM8", IIC_VecPerm, []>, UseVSXReg; + "xxspltib $XT, $IMM8", IIC_VecPerm, []>; //===--------------------------------------------------------------------===// // Vector/Scalar Load/Store Instructions @@ -2834,7 +2855,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let mayLoad = 1, mayStore = 0 in { // Load Vector def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src), - "lxv $XT, $src", IIC_LdStLFD, []>, UseVSXReg; + "lxv $XT, $src", IIC_LdStLFD, []>; // Load DWord def LXSD : DSForm_1<57, 2, (outs vfrc:$vD), (ins memrix:$src), "lxsd $vD, $src", IIC_LdStLFD, []>; @@ -2847,7 +2868,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { class X_XT6_RA5_RB5 opcode, bits<10> xo, string opc, RegisterOperand vtype, list pattern> : XX1Form_memOp, UseVSXReg; + !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>; // Load as Integer Byte/Halfword & Zero Indexed def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc, @@ -2861,16 +2882,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Load Vector Indexed def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc, - [(set 
v2f64:$XT, (load xaddr:$src))]>; + [(set v2f64:$XT, (load xaddrX16:$src))]>; // Load Vector (Left-justified) with Length def LXVL : XX1Form_memOp<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB), "lxvl $XT, $src, $rB", IIC_LdStLoad, - [(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>, - UseVSXReg; + [(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>; def LXVLL : XX1Form_memOp<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB), "lxvll $XT, $src, $rB", IIC_LdStLoad, - [(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>, - UseVSXReg; + [(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>; // Load Vector Word & Splat Indexed def LXVWSX : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>; @@ -2881,7 +2900,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let mayStore = 1, mayLoad = 0 in { // Store Vector def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst), - "stxv $XT, $dst", IIC_LdStSTFD, []>, UseVSXReg; + "stxv $XT, $dst", IIC_LdStSTFD, []>; // Store DWord def STXSD : DSForm_1<61, 2, (outs), (ins vfrc:$vS, memrix:$dst), "stxsd $vS, $dst", IIC_LdStSTFD, []>; @@ -2893,7 +2912,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { class X_XS6_RA5_RB5 opcode, bits<10> xo, string opc, RegisterOperand vtype, list pattern> : XX1Form_memOp, UseVSXReg; + !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>; // Store as Integer Byte/Halfword Indexed def STXSIBX : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsfrc, @@ -2901,8 +2920,8 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { def STXSIHX : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsfrc, [(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>; let isCodeGenOnly = 1 in { - def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vrrc, []>; - def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vrrc, []>; + def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsrc, []>; + def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsrc, []>; } // Store Vector Halfword*8/Byte*16 Indexed @@ -2911,21 +2930,19 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Store Vector Indexed def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc, - [(store v2f64:$XT, xaddr:$dst)]>; + [(store v2f64:$XT, xaddrX16:$dst)]>; // Store Vector (Left-justified) with Length def STXVL : XX1Form_memOp<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB), "stxvl $XT, $dst, $rB", IIC_LdStLoad, [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst, - i64:$rB)]>, - UseVSXReg; + i64:$rB)]>; def STXVLL : XX1Form_memOp<31, 429, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB), "stxvll $XT, $dst, $rB", IIC_LdStLoad, [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst, - i64:$rB)]>, - UseVSXReg; + i64:$rB)]>; } // mayStore let Predicates = [IsLittleEndian] in { @@ -3045,24 +3062,24 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { } // IsLittleEndian, HasP9Vector // D-Form Load/Store - def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(f128 (quadwOffsetLoad iqaddr:$src)), + def : Pat<(v4i32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>; + def : Pat<(f128 (quadwOffsetLoad iaddrX16:$src)), 
(COPY_TO_REGCLASS (LXV memrix16:$src), VRRC)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddrX16:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddrX16:$src)), (LXV memrix16:$src)>; - def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(quadwOffsetStore f128:$rS, iqaddr:$dst), + def : Pat<(quadwOffsetStore v4f32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v4i32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2f64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore f128:$rS, iaddrX16:$dst), (STXV (COPY_TO_REGCLASS $rS, VSRC), memrix16:$dst)>; - def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst), + def : Pat<(quadwOffsetStore v2i64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst), + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>; @@ -3159,109 +3176,109 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let Predicates = [IsBigEndian, HasP9Vector] in { // Scalar stores of i8 def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst), - (STXSIBXv $S, xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), 
xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>; // Scalar stores of i16 def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst), - (STXSIHXv $S, xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>; } // IsBigEndian, HasP9Vector let Predicates = [IsLittleEndian, HasP9Vector] in { // Scalar stores of i8 def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 
(VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst), - (STXSIBXv $S, xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst), - (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>; + (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>; // Scalar stores of i16 def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 
(VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst), - (STXSIHXv $S, xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst), - (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>; + (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>; } // IsLittleEndian, HasP9Vector @@ -3273,53 +3290,97 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { def DFLOADf32 : PPCPostRAExpPseudo<(outs vssrc:$XT), (ins memrix:$src), "#DFLOADf32", - [(set f32:$XT, (load ixaddr:$src))]>; + [(set f32:$XT, (load iaddrX4:$src))]>; def DFLOADf64 : PPCPostRAExpPseudo<(outs vsfrc:$XT), (ins memrix:$src), "#DFLOADf64", - [(set f64:$XT, (load ixaddr:$src))]>; + [(set f64:$XT, (load iaddrX4:$src))]>; def DFSTOREf32 : PPCPostRAExpPseudo<(outs), (ins vssrc:$XT, memrix:$dst), "#DFSTOREf32", - [(store f32:$XT, ixaddr:$dst)]>; + [(store f32:$XT, iaddrX4:$dst)]>; def DFSTOREf64 : PPCPostRAExpPseudo<(outs), (ins vsfrc:$XT, memrix:$dst), "#DFSTOREf64", - [(store f64:$XT, ixaddr:$dst)]>; + [(store f64:$XT, iaddrX4:$dst)]>; - def : Pat<(f64 (extloadf32 ixaddr:$src)), - (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>; - def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))), - (f32 (DFLOADf32 ixaddr:$src))>; + def : Pat<(f64 (extloadf32 iaddrX4:$src)), + (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$src), VSFRC)>; + def : Pat<(f32 (fpround (f64 (extloadf32 iaddrX4:$src)))), + (f32 (DFLOADf32 iaddrX4:$src))>; + def : Pat<(v4f32 (PPCldvsxlh xaddr:$src)), + (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC)>; + def : Pat<(v4f32 (PPCldvsxlh iaddrX4:$src)), + (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC)>; let AddedComplexity = 400 in { // The following pseudoinstructions are used to ensure the utilization // of all 64 VSX registers. 
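The endian-guarded blocks that follow all turn on one fact: sub_64 of a VSX register aliases its most-significant doubleword, and which v2i64/v2f64 element sits there depends on endianness. A hedged C++ sketch of the mapping (editor's illustration; the helper is invented):

// The element reachable directly through sub_64; the other element first
// needs an XXPERMDI $A, $A, 2 doubleword swap, which is why the LE patterns
// below swap for element 0 and the BE patterns swap for element 1.
static inline int direct_element(bool IsLittleEndian) {
  return IsLittleEndian ? 1 : 0;
}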
let Predicates = [IsLittleEndian, HasP9Vector] in { - def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))), + def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))), (v2i64 (XXPERMDIs - (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>; - def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))), + (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC), 2))>; + def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))), (v2i64 (XXPERMDIs - (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>; + (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC), 2))>; - def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))), + def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))), (v2f64 (XXPERMDIs - (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>; - def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))), + (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC), 2))>; + def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))), (v2f64 (XXPERMDIs - (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>; - } + (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC), 2))>; + def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src), + (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), + sub_64), xaddrX4:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src), + (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), + sub_64), xaddrX4:$src)>; + def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src), + (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src), + (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>; + def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src), + (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), + sub_64), iaddrX4:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src), + (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64), + iaddrX4:$src)>; + def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src), + (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src), + (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>; + } // IsLittleEndian, HasP9Vector let Predicates = [IsBigEndian, HasP9Vector] in { - def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))), - (v2i64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>; - def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))), - (v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>; - - def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))), - (v2f64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>; - def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))), - (v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>; - } + def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))), + (v2i64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>; + def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))), + (v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>; + + def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))), + (v2f64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>; + def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))), + (v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>; + def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src), + (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), + sub_64), xaddrX4:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src), + (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, 
$A, 2), + sub_64), xaddrX4:$src)>; + def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src), + (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src), + (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>; + def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src), + (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), + sub_64), iaddrX4:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src), + (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), + sub_64), iaddrX4:$src)>; + def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src), + (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src), + (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>; + } // IsBigEndian, HasP9Vector } let Predicates = [IsBigEndian, HasP9Vector] in { @@ -3455,14 +3516,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { } // IsLittleEndian, HasP9Vector // Convert (Un)Signed DWord in memory -> QP - def : Pat<(f128 (sint_to_fp (i64 (load xaddr:$src)))), - (f128 (XSCVSDQP (LXSDX xaddr:$src)))>; - def : Pat<(f128 (sint_to_fp (i64 (load ixaddr:$src)))), - (f128 (XSCVSDQP (LXSD ixaddr:$src)))>; - def : Pat<(f128 (uint_to_fp (i64 (load xaddr:$src)))), - (f128 (XSCVUDQP (LXSDX xaddr:$src)))>; - def : Pat<(f128 (uint_to_fp (i64 (load ixaddr:$src)))), - (f128 (XSCVUDQP (LXSD ixaddr:$src)))>; + def : Pat<(f128 (sint_to_fp (i64 (load xaddrX4:$src)))), + (f128 (XSCVSDQP (LXSDX xaddrX4:$src)))>; + def : Pat<(f128 (sint_to_fp (i64 (load iaddrX4:$src)))), + (f128 (XSCVSDQP (LXSD iaddrX4:$src)))>; + def : Pat<(f128 (uint_to_fp (i64 (load xaddrX4:$src)))), + (f128 (XSCVUDQP (LXSDX xaddrX4:$src)))>; + def : Pat<(f128 (uint_to_fp (i64 (load iaddrX4:$src)))), + (f128 (XSCVUDQP (LXSD iaddrX4:$src)))>; // Convert Unsigned HWord in memory -> QP def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi16)), @@ -3483,13 +3544,13 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Instructions for store(fptosi). // The 8-byte version is repeated here due to availability of D-Form STXSD. 
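In source terms, the patterns below match a quad-precision value converted to an integer and stored immediately. A hedged C++ sketch of that shape (editor's illustration; assumes the __float128 extension is available on the target):

// Lowered as xscvqpsdz (truncating QP-to-signed-doubleword convert)
// followed by stxsd/stxsdx, per the patterns that follow.
void store_f128_as_i64(long long *p, __float128 x) {
  *p = (long long)x;
}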
def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddr:$dst, 8), + (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddrX4:$dst, 8), (STXSDX (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), - xaddr:$dst)>; + xaddrX4:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), ixaddr:$dst, 8), + (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), iaddrX4:$dst, 8), (STXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), - ixaddr:$dst)>; + iaddrX4:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 4), (STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>; @@ -3500,11 +3561,11 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 1), (STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddr:$dst, 8), - (STXSDX (XSCVDPSXDS f64:$src), xaddr:$dst)>; + (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddrX4:$dst, 8), + (STXSDX (XSCVDPSXDS f64:$src), xaddrX4:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), ixaddr:$dst, 8), - (STXSD (XSCVDPSXDS f64:$src), ixaddr:$dst)>; + (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), iaddrX4:$dst, 8), + (STXSD (XSCVDPSXDS f64:$src), iaddrX4:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 2), (STXSIHX (XSCVDPSXWS f64:$src), xoaddr:$dst)>; @@ -3514,13 +3575,13 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Instructions for store(fptoui). def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddr:$dst, 8), + (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddrX4:$dst, 8), (STXSDX (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), - xaddr:$dst)>; + xaddrX4:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), ixaddr:$dst, 8), + (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), iaddrX4:$dst, 8), (STXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), - ixaddr:$dst)>; + iaddrX4:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 4), (STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>; @@ -3531,11 +3592,11 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 1), (STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddr:$dst, 8), - (STXSDX (XSCVDPUXDS f64:$src), xaddr:$dst)>; + (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddrX4:$dst, 8), + (STXSDX (XSCVDPUXDS f64:$src), xaddrX4:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr - (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), ixaddr:$dst, 8), - (STXSD (XSCVDPUXDS f64:$src), ixaddr:$dst)>; + (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), iaddrX4:$dst, 8), + (STXSD (XSCVDPUXDS f64:$src), iaddrX4:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 2), (STXSIHX (XSCVDPUXWS f64:$src), xoaddr:$dst)>; @@ -3668,13 +3729,13 @@ def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } def FltToLongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddrX4:$A))))); } def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); 
} def FltToULongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddrX4:$A))))); } def FltToLong { dag A = (i64 (PPCmfvsr (f64 (PPCfctidz (fpextend f32:$A))))); @@ -3704,13 +3765,13 @@ def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } def DblToIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddrX4:$A))))); } def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } def DblToUIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddrX4:$A))))); } def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); @@ -3834,8 +3895,38 @@ let AddedComplexity = 400 in { def : Pat; + def : Pat<(store (i32 (extractelt v4i32:$A, 1)), xoaddr:$src), + (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>; + def : Pat<(store (f32 (extractelt v4f32:$A, 1)), xoaddr:$src), + (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>; + + // Elements in a register on a BE system are in order <0, 1, 2, 3>. + // The store instructions store the second word from the left. + // So to align element zero, we need to modulo-left-shift by 3 words. + // Similar logic applies for elements 2 and 3. + foreach Idx = [ [0,3], [2,1], [3,2] ] in { + def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src), + (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))), + sub_64), xoaddr:$src)>; + def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src), + (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))), + sub_64), xoaddr:$src)>; + } } + let Predicates = [HasP8Vector, IsBigEndian, NoP9Vector] in { + def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src), + (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src), + (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>; + def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src), + (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64), + xoaddr:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src), + (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64), + xoaddr:$src)>; + } + // Big endian, available on all targets with VSX let Predicates = [IsBigEndian, HasVSX] in { def : Pat<(v2f64 (build_vector f64:$A, f64:$B)), @@ -3871,8 +3962,38 @@ let AddedComplexity = 400 in { def : Pat; + def : Pat<(store (i32 (extractelt v4i32:$A, 2)), xoaddr:$src), + (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>; + def : Pat<(store (f32 (extractelt v4f32:$A, 2)), xoaddr:$src), + (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>; + + // Elements in a register on a LE system are in order <3, 2, 1, 0>. + // The store instructions store the second word from the left. + // So to align element 3, we need to modulo-left-shift by 3 words. + // Similar logic applies for elements 0 and 1. 
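The big-endian foreach above and the little-endian foreach below pick the XXSLDWI rotate that lands the requested element in the word STIWX stores from. A hedged sketch of the index arithmetic (editor's illustration, C++):

// STIWX stores word position 1 (the second word from the left). Element e
// sits at word position e (BE) or 3 - e (LE); rotating left by SH words
// sends position p to p - SH (mod 4), so solve p - SH == 1 for SH.
static inline int xxsldwi_shift(int e, bool IsLittleEndian) {
  int p = IsLittleEndian ? 3 - e : e;
  return (p - 1) & 3;
}
// The BE pairs [0,3], [2,1], [3,2] (element 1 needs no rotate) and the LE
// pairs [0,2], [1,1], [3,3] (element 2 needs no rotate) both satisfy this.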
+ foreach Idx = [ [0,2], [1,1], [3,3] ] in { + def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src), + (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))), + sub_64), xoaddr:$src)>; + def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src), + (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))), + sub_64), xoaddr:$src)>; + } } + let Predicates = [HasP8Vector, IsLittleEndian, NoP9Vector] in { + def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src), + (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64), + xoaddr:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src), + (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64), + xoaddr:$src)>; + def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src), + (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>; + def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src), + (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>; + } + let Predicates = [IsLittleEndian, HasVSX] in { // Little endian, available on all targets with VSX def : Pat<(v2f64 (build_vector f64:$A, f64:$B)), @@ -3969,17 +4090,17 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; + (XSCVDPSXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; + (XSCVDPUXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS - (DFLOADf32 ixaddr:$A), + (DFLOADf32 iaddrX4:$A), VSFRC)), 0))>; def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS - (DFLOADf32 ixaddr:$A), + (DFLOADf32 iaddrX4:$A), VSFRC)), 0))>; } diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index 0b57dd9b618d..4d45d96d4479 100644 --- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -1,9 +1,8 @@ //===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -65,12 +64,6 @@ static cl::opt MaxVars("ppc-preinc-prep-max-vars", STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form"); -namespace llvm { - - void initializePPCLoopPreIncPrepPass(PassRegistry&); - -} // end namespace llvm - namespace { class PPCLoopPreIncPrep : public FunctionPass { @@ -338,7 +331,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { // iteration space), insert a new preheader for the loop. 
if (!LoopPredecessor || !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { - LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); + LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA); if (LoopPredecessor) MadeChange = true; } diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index e731c0bc0c23..027e6bd1ba06 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -1,9 +1,8 @@ //===-- PPCMCInstLower.cpp - Convert PPC MachineInstr to an MCInst --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -111,16 +110,16 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, RefKind = MCSymbolRefExpr::VK_PLT; const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + const Module *M = MF->getFunction().getParent(); const PPCSubtarget *Subtarget = &(MF->getSubtarget()); const TargetMachine &TM = Printer.TM; const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx); - // -msecure-plt option works only in PIC mode. If secure plt mode - // is on add 32768 to symbol. + // If -msecure-plt -fPIC, add 32768 to symbol. if (Subtarget->isSecurePlt() && TM.isPositionIndependent() && + M->getPICLevel() == PICLevel::BigPIC && MO.getTargetFlags() == PPCII::MO_PLT) - Expr = MCBinaryExpr::createAdd(Expr, - MCConstantExpr::create(32768, Ctx), - Ctx); + Expr = + MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(32768, Ctx), Ctx); if (!MO.isJTI() && MO.getOffset()) Expr = MCBinaryExpr::createAdd(Expr, diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp index 0068df19f0c8..446246358e96 100644 --- a/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -1,9 +1,8 @@ //===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===---------------------------------------------------------------------===// // @@ -22,9 +21,12 @@ #include "PPC.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" +#include "PPCMachineFunctionInfo.h" #include "PPCTargetMachine.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -38,6 +40,7 @@ using namespace llvm; STATISTIC(RemoveTOCSave, "Number of TOC saves removed"); STATISTIC(MultiTOCSaves, "Number of functions with multiple TOC saves that must be kept"); +STATISTIC(NumTOCSavesInPrologue, "Number of TOC saves placed in the prologue"); STATISTIC(NumEliminatedSExt, "Number of eliminated sign-extensions"); STATISTIC(NumEliminatedZExt, "Number of eliminated zero-extensions"); STATISTIC(NumOptADDLIs, "Number of optimized ADD instruction fed by LI"); @@ -48,6 +51,10 @@ STATISTIC(NumFunctionsEnteredInMIPeephole, STATISTIC(NumFixedPointIterations, "Number of fixed-point iterations converting reg-reg instructions " "to reg-imm ones"); +STATISTIC(NumRotatesCollapsed, + "Number of pairs of rotate left, clear left/right collapsed"); +STATISTIC(NumEXTSWAndSLDICombined, + "Number of pairs of EXTSW and SLDI combined as EXTSWSLI"); static cl::opt FixedPointRegToImm("ppc-reg-to-imm-fixed-point", cl::Hidden, cl::init(true), @@ -83,6 +90,9 @@ struct PPCMIPeephole : public MachineFunctionPass { private: MachineDominatorTree *MDT; + MachinePostDominatorTree *MPDT; + MachineBlockFrequencyInfo *MBFI; + uint64_t EntryFreq; // Initialize class variables. void initialize(MachineFunction &MFParm); @@ -93,6 +103,8 @@ private: // Perform peepholes. bool eliminateRedundantCompare(void); bool eliminateRedundantTOCSaves(std::map &TOCSaves); + bool combineSEXTAndSHL(MachineInstr &MI, MachineInstr *&ToErase); + bool emitRLDICWhenLoweringJumpTables(MachineInstr &MI); void UpdateTOCSaves(std::map &TOCSaves, MachineInstr *MI); @@ -100,7 +112,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -118,6 +134,9 @@ void PPCMIPeephole::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); MDT = &getAnalysis(); + MPDT = &getAnalysis(); + MBFI = &getAnalysis(); + EntryFreq = MBFI->getEntryFreq(); TII = MF->getSubtarget().getInstrInfo(); LLVM_DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n"); LLVM_DEBUG(MF->dump()); @@ -198,6 +217,30 @@ getKnownLeadingZeroCount(MachineInstr *MI, const PPCInstrInfo *TII) { void PPCMIPeephole::UpdateTOCSaves( std::map &TOCSaves, MachineInstr *MI) { assert(TII->isTOCSaveMI(*MI) && "Expecting a TOC save instruction here"); + assert(MF->getSubtarget().isELFv2ABI() && + "TOC-save removal only supported on ELFv2"); + PPCFunctionInfo *FI = MF->getInfo(); + + MachineBasicBlock *Entry = &MF->front(); + uint64_t CurrBlockFreq = MBFI->getBlockFreq(MI->getParent()).getFrequency(); + + // If the block in which the TOC save resides is in a block that + // post-dominates Entry, or a block that is hotter than entry (keep in mind + // that early MachineLICM has already run so the TOC save won't be hoisted) + // we can just do the save in the prologue. 
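A hedged example of when the rule above fires (editor's illustration, not from the patch): an indirect call inside a hot loop sits in a block whose frequency exceeds the entry block's, so a single TOC save in the prologue beats one inside the loop. The code that follows implements exactly this decision.

void walk(void (*visit)(int), int n) {
  for (int i = 0; i < n; ++i)
    visit(i); // indirect call needs the TOC saved; the loop body is hotter
              // than entry, so the save belongs in the prologue
}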
+ if (CurrBlockFreq > EntryFreq || MPDT->dominates(MI->getParent(), Entry)) + FI->setMustSaveTOC(true); + + // If we are saving the TOC in the prologue, all the TOC saves can be removed + // from the code. + if (FI->mustSaveTOC()) { + for (auto &TOCSave : TOCSaves) + TOCSave.second = false; + // Add new instruction to map. + TOCSaves[MI] = false; + return; + } + bool Keep = true; for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) { MachineInstr *CurrInst = It->first; @@ -758,6 +801,11 @@ bool PPCMIPeephole::simplifyCode(void) { NumOptADDLIs++; break; } + case PPC::RLDICR: { + Simplified |= emitRLDICWhenLoweringJumpTables(MI) || + combineSEXTAndSHL(MI, ToErase); + break; + } } } @@ -771,6 +819,10 @@ bool PPCMIPeephole::simplifyCode(void) { // Eliminate all the TOC save instructions which are redundant. Simplified |= eliminateRedundantTOCSaves(TOCSaves); + PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>(); + if (FI->mustSaveTOC()) + NumTOCSavesInPrologue++; + // We try to eliminate redundant compare instruction. Simplified |= eliminateRedundantCompare(); @@ -1275,10 +1327,136 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) { return Simplified; } +// We miss the opportunity to emit an RLDIC when lowering jump tables +// since ISEL sees only a single basic block. When selecting, the clear +// and shift left will be in different blocks. +bool PPCMIPeephole::emitRLDICWhenLoweringJumpTables(MachineInstr &MI) { + if (MI.getOpcode() != PPC::RLDICR) + return false; + + unsigned SrcReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + return false; + + MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); + if (SrcMI->getOpcode() != PPC::RLDICL) + return false; + + MachineOperand MOpSHSrc = SrcMI->getOperand(2); + MachineOperand MOpMBSrc = SrcMI->getOperand(3); + MachineOperand MOpSHMI = MI.getOperand(2); + MachineOperand MOpMEMI = MI.getOperand(3); + if (!(MOpSHSrc.isImm() && MOpMBSrc.isImm() && MOpSHMI.isImm() && + MOpMEMI.isImm())) + return false; + + uint64_t SHSrc = MOpSHSrc.getImm(); + uint64_t MBSrc = MOpMBSrc.getImm(); + uint64_t SHMI = MOpSHMI.getImm(); + uint64_t MEMI = MOpMEMI.getImm(); + uint64_t NewSH = SHSrc + SHMI; + uint64_t NewMB = MBSrc - SHMI; + if (NewMB > 63 || NewSH > 63) + return false; + + // The bits cleared with RLDICL are [0, MBSrc). + // The bits cleared with RLDICR are (MEMI, 63]. + // After the RLDICL/RLDICR sequence, the bits cleared are: + // [0, MBSrc-SHMI) and (MEMI, 63]. + // + // The bits cleared with RLDIC are [0, NewMB) and (63-NewSH, 63]. + if ((63 - NewSH) != MEMI) + return false; + + LLVM_DEBUG(dbgs() << "Converting pair: "); + LLVM_DEBUG(SrcMI->dump()); + LLVM_DEBUG(MI.dump()); + + MI.setDesc(TII->get(PPC::RLDIC)); + MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); + MI.getOperand(2).setImm(NewSH); + MI.getOperand(3).setImm(NewMB); + + LLVM_DEBUG(dbgs() << "To: "); + LLVM_DEBUG(MI.dump()); + NumRotatesCollapsed++; + return true; +} + +// For a case like the following in LLVM IR: +// entry: +// %iconv = sext i32 %index to i64 +// br i1 undef, label %true, label %false +// true: +// %ptr = getelementptr inbounds i32, i32* null, i64 %iconv +// ... +// PPCISelLowering::combineSHL fails to combine, because sext and shl are in +// different BBs when conducting instruction selection. We can do a peephole +// optimization to combine these two instructions into extswsli after + instruction selection.
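The combine below rests on a simple scalar equivalence. A minimal C++ sketch (editor's illustration; the helper name is invented):

#include <cstdint>

// extsw (sign-extend word) followed by rldicr rA, rS, SH, 63-SH (which acts
// as a left shift by SH) equals the single ISA 3.0 extswsli rA, rS, SH.
// The shift is done on the unsigned representation to keep the sketch well
// defined in C++.
static inline int64_t like_extswsli(int32_t rS, unsigned SH) {
  return (int64_t)((uint64_t)(int64_t)rS << SH);
}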
+bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI, + MachineInstr *&ToErase) { + if (MI.getOpcode() != PPC::RLDICR) + return false; + + if (!MF->getSubtarget<PPCSubtarget>().isISA3_0()) + return false; + + assert(MI.getNumOperands() == 4 && "RLDICR should have 4 operands"); + + MachineOperand MOpSHMI = MI.getOperand(2); + MachineOperand MOpMEMI = MI.getOperand(3); + if (!(MOpSHMI.isImm() && MOpMEMI.isImm())) + return false; + + uint64_t SHMI = MOpSHMI.getImm(); + uint64_t MEMI = MOpMEMI.getImm(); + if (SHMI + MEMI != 63) + return false; + + unsigned SrcReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + return false; + + MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); + if (SrcMI->getOpcode() != PPC::EXTSW && + SrcMI->getOpcode() != PPC::EXTSW_32_64) + return false; + + // If the register defined by extsw has more than one use, the combination + // is not needed. + if (!MRI->hasOneNonDBGUse(SrcReg)) + return false; + + LLVM_DEBUG(dbgs() << "Combining pair: "); + LLVM_DEBUG(SrcMI->dump()); + LLVM_DEBUG(MI.dump()); + + MachineInstr *NewInstr = + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), + SrcMI->getOpcode() == PPC::EXTSW ? TII->get(PPC::EXTSWSLI) + : TII->get(PPC::EXTSWSLI_32_64), + MI.getOperand(0).getReg()) + .add(SrcMI->getOperand(1)) + .add(MOpSHMI); + (void)NewInstr; + + LLVM_DEBUG(dbgs() << "TO: "); + LLVM_DEBUG(NewInstr->dump()); + ++NumEXTSWAndSLDICombined; + ToErase = &MI; + // SrcMI, which is extsw, is of no use now; erase it. + SrcMI->eraseFromParent(); + return true; +} + } // end anonymous namespace INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE, "PowerPC MI Peephole Optimization", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, "PowerPC MI Peephole Optimization", false, false) diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp index 3923417257e8..2f65d6a2855b 100644 --- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp +++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- PPCMachineFunctionInfo.cpp - Private data used for PowerPC --------===// // -// The LLVM Compiler Infrastructure - -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h index 8a3f50aa9565..dfae19804d94 100644 --- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -1,9 +1,8 @@ //===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=// // -// The LLVM Compiler Infrastructure - -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -45,6 +44,12 @@ class PPCFunctionInfo : public MachineFunctionInfo { /// PEI. bool MustSaveLR; + /// MustSaveTOC - Indicates that the TOC save needs to be performed in the + /// prologue of the function. This is typically the case when there are + /// indirect calls in the function and it is more profitable to save the + /// TOC pointer in the prologue than in the block(s) containing the call(s). + bool MustSaveTOC = false; + /// Do we have to disable shrink-wrapping? This has to be set if we emit any /// instructions that clobber LR in the entry block because discovering this /// in PEI is too late (happens after shrink-wrapping). @@ -152,6 +157,9 @@ public: void setMustSaveLR(bool U) { MustSaveLR = U; } bool mustSaveLR() const { return MustSaveLR; } + void setMustSaveTOC(bool U) { MustSaveTOC = U; } + bool mustSaveTOC() const { return MustSaveTOC; } + /// We certainly don't want to shrink wrap functions if we've emitted a /// MovePCtoLR8 as that has to go into the entry, so the prologue definitely /// has to go into the entry block. diff --git a/lib/Target/PowerPC/PPCMachineScheduler.cpp b/lib/Target/PowerPC/PPCMachineScheduler.cpp new file mode 100644 index 000000000000..a38c8f475066 --- /dev/null +++ b/lib/Target/PowerPC/PPCMachineScheduler.cpp @@ -0,0 +1,83 @@ +//===- PPCMachineScheduler.cpp - MI Scheduler for PowerPC -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PPCMachineScheduler.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" + +using namespace llvm; + +static cl::opt<bool> +DisableAddiLoadHeuristic("disable-ppc-sched-addi-load", + cl::desc("Disable scheduling addi instruction before " "load for ppc"), cl::Hidden); + +bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary &Zone) const { + if (DisableAddiLoadHeuristic) + return false; + + auto isADDIInstr = [&] (const MachineInstr &Inst) { + return Inst.getOpcode() == PPC::ADDI || Inst.getOpcode() == PPC::ADDI8; + }; + + SchedCandidate &FirstCand = Zone.isTop() ? TryCand : Cand; + SchedCandidate &SecondCand = Zone.isTop() ? Cand : TryCand; + if (isADDIInstr(*FirstCand.SU->getInstr()) && + SecondCand.SU->getInstr()->mayLoad()) { + TryCand.Reason = Stall; + return true; + } + if (FirstCand.SU->getInstr()->mayLoad() && + isADDIInstr(*SecondCand.SU->getInstr())) { + TryCand.Reason = NoCand; + return true; + } + + return false; +} + +void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + GenericScheduler::tryCandidate(Cand, TryCand, Zone); + + if (!Cand.isValid() || !Zone) + return; + + // Add the powerpc specific heuristic only when TryCand isn't selected or + // is selected only by node order. + if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand) + return; + + // There are some benefits to scheduling the ADDI before the load to hide + // the latency, as RA may create a true dependency between the load and the + // ADDI. + if (biasAddiLoadCandidate(Cand, TryCand, *Zone)) + return; +} + +void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) { + // Custom PPC PostRA specific behavior here.
+ PostGenericScheduler::enterMBB(MBB); +} + +void PPCPostRASchedStrategy::leaveMBB() { + // Custom PPC PostRA specific behavior here. + PostGenericScheduler::leaveMBB(); +} + +void PPCPostRASchedStrategy::initialize(ScheduleDAGMI *Dag) { + // Custom PPC PostRA specific initialization here. + PostGenericScheduler::initialize(Dag); +} + +SUnit *PPCPostRASchedStrategy::pickNode(bool &IsTopNode) { + // Custom PPC PostRA specific scheduling here. + return PostGenericScheduler::pickNode(IsTopNode); +} + diff --git a/lib/Target/PowerPC/PPCMachineScheduler.h b/lib/Target/PowerPC/PPCMachineScheduler.h new file mode 100644 index 000000000000..93532d9545a6 --- /dev/null +++ b/lib/Target/PowerPC/PPCMachineScheduler.h @@ -0,0 +1,49 @@ +//===- PPCMachineScheduler.h - Custom PowerPC MI scheduler --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Custom PowerPC MI scheduler. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_POWERPCMACHINESCHEDULER_H +#define LLVM_LIB_TARGET_POWERPC_POWERPCMACHINESCHEDULER_H + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +/// A MachineSchedStrategy implementation for PowerPC pre RA scheduling. +class PPCPreRASchedStrategy : public GenericScheduler { +public: + PPCPreRASchedStrategy(const MachineSchedContext *C) : + GenericScheduler(C) {} +protected: + void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const override; +private: + bool biasAddiLoadCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary &Zone) const; +}; + +/// A MachineSchedStrategy implementation for PowerPC post RA scheduling. +class PPCPostRASchedStrategy : public PostGenericScheduler { +public: + PPCPostRASchedStrategy(const MachineSchedContext *C) : + PostGenericScheduler(C) {} + +protected: + void initialize(ScheduleDAGMI *Dag) override; + SUnit *pickNode(bool &IsTopNode) override; + void enterMBB(MachineBasicBlock *MBB) override; + void leaveMBB() override; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_POWERPC_POWERPCMACHINESCHEDULER_H diff --git a/lib/Target/PowerPC/PPCPerfectShuffle.h b/lib/Target/PowerPC/PPCPerfectShuffle.h index 8a1d68011c5f..d0d84efdbd20 100644 --- a/lib/Target/PowerPC/PPCPerfectShuffle.h +++ b/lib/Target/PowerPC/PPCPerfectShuffle.h @@ -1,9 +1,8 @@ //===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCPfmCounters.td b/lib/Target/PowerPC/PPCPfmCounters.td index d2a09f30c0f3..20b9efdc9df9 100644 --- a/lib/Target/PowerPC/PPCPfmCounters.td +++ b/lib/Target/PowerPC/PPCPfmCounters.td @@ -1,9 +1,8 @@ //===-- PPCPfmCounters.td - PPC Hardware Counters ----------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp index 4458b92ceb5e..d83c92276800 100644 --- a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp +++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp @@ -1,9 +1,8 @@ //===--------- PPCPreEmitPeephole.cpp - Late peephole optimizations -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp index 25b2b54cbe98..3a83cc27439c 100644 --- a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp +++ b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp @@ -1,9 +1,8 @@ //===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -31,10 +30,6 @@ using namespace llvm; STATISTIC(NumSimplified, "Number of QPX load splats simplified"); -namespace llvm { - void initializePPCQPXLoadSplatPass(PassRegistry&); -} - namespace { struct PPCQPXLoadSplat : public MachineFunctionPass { static char ID; diff --git a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp index 173fc18b9ebf..8eaa6dfe2bf7 100644 --- a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp +++ b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp @@ -1,9 +1,8 @@ //===---- PPCReduceCRLogicals.cpp - Reduce CR Bit Logical operations ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===---------------------------------------------------------------------===// // @@ -49,10 +48,6 @@ STATISTIC(NumNotSplitChainCopies, STATISTIC(NumNotSplitWrongOpcode, "Number of blocks not split due to the wrong opcode."); -namespace llvm { - void initializePPCReduceCRLogicalsPass(PassRegistry&); -} - /// Given a basic block \p Successor that potentially contains PHIs, this /// function will look for any incoming values in the PHIs that are supposed to /// be coming from \p OrigMBB but whose definition is actually in \p NewMBB. @@ -171,9 +166,33 @@ static bool splitMBB(BlockSplitInfo &BSI) { : *ThisMBB->succ_begin(); MachineBasicBlock *NewBRTarget = BSI.BranchToFallThrough ? OrigFallThrough : OrigTarget; - BranchProbability ProbToNewTarget = - !BSI.MBPI ? BranchProbability::getUnknown() - : BSI.MBPI->getEdgeProbability(ThisMBB, NewBRTarget); + + // It's impossible to know the precise branch probability after the split. + // But it still needs to be reasonable: the total probability of reaching the + // original targets should not change. + // After the split, NewBRTarget will have two incoming edges. Assume P0 is the + // original branch probability to NewBRTarget, and P1 and P2 are the new branch + // probabilities to NewBRTarget after the split. If the two edge frequencies are + // the same, then + // F * P1 = F * P0 / 2 ==> P1 = P0 / 2 + // F * (1 - P1) * P2 = F * P1 ==> P2 = P1 / (1 - P1) + BranchProbability ProbToNewTarget, ProbFallThrough; // Prob for new Br. + BranchProbability ProbOrigTarget, ProbOrigFallThrough; // Prob for orig Br. + ProbToNewTarget = ProbFallThrough = BranchProbability::getUnknown(); + ProbOrigTarget = ProbOrigFallThrough = BranchProbability::getUnknown(); + if (BSI.MBPI) { + if (BSI.BranchToFallThrough) { + ProbToNewTarget = BSI.MBPI->getEdgeProbability(ThisMBB, OrigFallThrough) / 2; + ProbFallThrough = ProbToNewTarget.getCompl(); + ProbOrigFallThrough = ProbToNewTarget / ProbToNewTarget.getCompl(); + ProbOrigTarget = ProbOrigFallThrough.getCompl(); + } else { + ProbToNewTarget = BSI.MBPI->getEdgeProbability(ThisMBB, OrigTarget) / 2; + ProbFallThrough = ProbToNewTarget.getCompl(); + ProbOrigTarget = ProbToNewTarget / ProbToNewTarget.getCompl(); + ProbOrigFallThrough = ProbOrigTarget.getCompl(); + } + } // Create a new basic block. MachineBasicBlock::iterator InsertPoint = BSI.SplitBefore; @@ -185,11 +204,16 @@ static bool splitMBB(BlockSplitInfo &BSI) { // Move everything after SplitBefore into the new block. NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end()); NewMBB->transferSuccessors(ThisMBB); + if (!ProbOrigTarget.isUnknown()) { + auto MBBI = std::find(NewMBB->succ_begin(), NewMBB->succ_end(), OrigTarget); + NewMBB->setSuccProbability(MBBI, ProbOrigTarget); + MBBI = std::find(NewMBB->succ_begin(), NewMBB->succ_end(), OrigFallThrough); + NewMBB->setSuccProbability(MBBI, ProbOrigFallThrough); + } - // Add the two successors to ThisMBB. The probabilities come from the - // existing blocks if available. + // Add the two successors to ThisMBB. ThisMBB->addSuccessor(NewBRTarget, ProbToNewTarget); - ThisMBB->addSuccessor(NewMBB, ProbToNewTarget.getCompl()); + ThisMBB->addSuccessor(NewMBB, ProbFallThrough); // Add the branches to ThisMBB.
BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(), diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 3d067aa8e621..12554ea8d079 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- PPCRegisterInfo.cpp - PowerPC Register Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "PPCRegisterInfo.h" -#include "PPC.h" #include "PPCFrameLowering.h" #include "PPCInstrBuilder.h" #include "PPCMachineFunctionInfo.h" @@ -71,6 +69,14 @@ StackPtrConst("ppc-stack-ptr-caller-preserved", "caller preserved registers can be LICM candidates"), cl::init(true), cl::Hidden); +static cl::opt<unsigned> +MaxCRBitSpillDist("ppc-max-crbit-spill-dist", + cl::desc("Maximum search distance for definition of CR bit " + "spill on ppc"), + cl::Hidden, cl::init(100)); + +static unsigned offsetMinAlignForOpcode(unsigned OpC); + PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM) : PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR, TM.isPPC64() ? 0 : 1, @@ -153,30 +159,39 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (TM.isPPC64() && MF->getInfo<PPCFunctionInfo>()->isSplitCSR()) return CSR_SRV464_TLS_PE_SaveList; - if (Subtarget.hasSPE()) - return CSR_SVR432_SPE_SaveList; - // On PPC64, we might need to save r2 (but only if it is not reserved). bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2); + // Cold calling convention CSRs. if (MF->getFunction().getCallingConv() == CallingConv::Cold) { - return TM.isPPC64() - ? (Subtarget.hasAltivec() - ? (SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList - : CSR_SVR64_ColdCC_Altivec_SaveList) - : (SaveR2 ? CSR_SVR64_ColdCC_R2_SaveList - : CSR_SVR64_ColdCC_SaveList)) - : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_SaveList - : CSR_SVR32_ColdCC_SaveList); + if (TM.isPPC64()) { + if (Subtarget.hasAltivec()) + return SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList + : CSR_SVR64_ColdCC_Altivec_SaveList; + return SaveR2 ? CSR_SVR64_ColdCC_R2_SaveList + : CSR_SVR64_ColdCC_SaveList; + } + // 32-bit targets. + if (Subtarget.hasAltivec()) + return CSR_SVR32_ColdCC_Altivec_SaveList; + else if (Subtarget.hasSPE()) + return CSR_SVR32_ColdCC_SPE_SaveList; + return CSR_SVR32_ColdCC_SaveList; } - - return TM.isPPC64() - ? (Subtarget.hasAltivec() - ? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList - : CSR_SVR464_Altivec_SaveList) - : (SaveR2 ? CSR_SVR464_R2_SaveList : CSR_SVR464_SaveList)) - : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_SaveList - : CSR_SVR432_SaveList); + // Standard calling convention CSRs. + if (TM.isPPC64()) { + if (Subtarget.hasAltivec()) + return SaveR2 ? CSR_SVR464_R2_Altivec_SaveList + : CSR_SVR464_Altivec_SaveList; + return SaveR2 ? CSR_SVR464_R2_SaveList + : CSR_SVR464_SaveList; + } + // 32-bit targets.
+ if (Subtarget.hasAltivec()) + return CSR_SVR432_Altivec_SaveList; + else if (Subtarget.hasSPE()) + return CSR_SVR432_SPE_SaveList; + return CSR_SVR432_SaveList; } const MCPhysReg * @@ -221,18 +236,26 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF, : CSR_Darwin64_RegMask) : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask : CSR_Darwin32_RegMask); + if (Subtarget.isAIXABI()) { + assert(!Subtarget.hasAltivec() && "Altivec is not implemented on AIX yet."); + return TM.isPPC64() ? CSR_AIX64_RegMask : CSR_AIX32_RegMask; + } if (CC == CallingConv::Cold) { return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR64_ColdCC_Altivec_RegMask : CSR_SVR64_ColdCC_RegMask) : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_RegMask - : CSR_SVR32_ColdCC_RegMask); + : (Subtarget.hasSPE() + ? CSR_SVR32_ColdCC_SPE_RegMask + : CSR_SVR32_ColdCC_RegMask)); } return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask : CSR_SVR464_RegMask) : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask - : CSR_SVR432_RegMask); + : (Subtarget.hasSPE() + ? CSR_SVR432_SPE_RegMask + : CSR_SVR432_RegMask)); } const uint32_t* @@ -288,6 +311,11 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, PPC::R13); // Small Data Area pointer register } + // Always reserve r2 on AIX for now. + // TODO: Make r2 allocatable on AIX/XCOFF for some leaf functions. + if (Subtarget.isAIXABI()) + markSuperRegs(Reserved, PPC::R2); // System-reserved register + // On PPC64, r13 is the thread pointer. Never allocate this register. if (TM.isPPC64()) markSuperRegs(Reserved, PPC::R13); @@ -316,6 +344,51 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const PPCInstrInfo *InstrInfo = Subtarget.getInstrInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const std::vector<CalleeSavedInfo> &Info = MFI.getCalleeSavedInfo(); + + // If the callee saved info is invalid we have to default to true for safety. + if (!MFI.isCalleeSavedInfoValid()) + return true; + + // We will require the use of X-Forms when the frame is larger than what can + // be represented in the signed 16-bit immediate of a D-Form. + // If we need an X-Form then we need a register to store the address offset. + unsigned FrameSize = MFI.getStackSize(); + // A signed 16-bit immediate leaves 15 bits for the magnitude, so the + // FrameSize cannot use more than 15 bits. + if (FrameSize & ~0x7FFF) + return true; + + // The callee saved info is valid so it can be traversed. + // Check for registers that need saving but do not have load or store forms + // where the address offset is an immediate. + for (unsigned i = 0; i < Info.size(); i++) { + int FrIdx = Info[i].getFrameIdx(); + unsigned Reg = Info[i].getReg(); + + unsigned Opcode = InstrInfo->getStoreOpcodeForSpill(Reg); + if (!MFI.isFixedObjectIndex(FrIdx)) { + // This is not a fixed object. If it requires alignment then we may still + // need to use the XForm. + if (offsetMinAlignForOpcode(Opcode) > 1) + return true; + } + + // This is either: + // 1) A fixed frame index object, which we know is aligned, so as long as we + // have a valid DForm/DSForm/DQForm (non XForm) we don't need to consider + // the alignment here. + // 2) A non-fixed object, in which case we now know that the min required + // alignment is no more than 1, based on the previous check.
+ if (InstrInfo->isXFormMemOp(Opcode)) + return true; + } + return false; +} + bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg, const MachineFunction &MF) const { assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); @@ -664,6 +737,7 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, MachineFunction &MF = *MBB.getParent(); const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + const TargetRegisterInfo* TRI = Subtarget.getRegisterInfo(); DebugLoc dl = MI.getDebugLoc(); bool LP64 = TM.isPPC64(); @@ -673,27 +747,59 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); unsigned SrcReg = MI.getOperand(0).getReg(); - // We need to move the CR field that contains the CR bit we are spilling. - // The super register may not be explicitly defined (i.e. it can be defined - // by a CR-logical that only defines the subreg) so we state that the CR - // field is undef. Also, in order to preserve the kill flag on the CR bit, - // we add it as an implicit use. - BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) + // Search up the BB to find the definition of the CR bit. + MachineBasicBlock::reverse_iterator Ins; + unsigned CRBitSpillDistance = 0; + for (Ins = MI; Ins != MBB.rend(); Ins++) { + // Definition found. + if (Ins->modifiesRegister(SrcReg, TRI)) + break; + // Unable to find CR bit definition within maximum search distance. + if (CRBitSpillDistance == MaxCRBitSpillDist) { + Ins = MI; + break; + } + // Skip debug instructions when counting CR bit spill distance. + if (!Ins->isDebugInstr()) + CRBitSpillDistance++; + } + + // Unable to find the definition of the CR bit in the MBB. + if (Ins == MBB.rend()) + Ins = MI; + + // There is no need to extract the CR bit if its value is already known. + switch (Ins->getOpcode()) { + case PPC::CRUNSET: + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LI8 : PPC::LI), Reg) + .addImm(0); + break; + case PPC::CRSET: + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LIS8 : PPC::LIS), Reg) + .addImm(-32768); + break; + default: + // We need to move the CR field that contains the CR bit we are spilling. + // The super register may not be explicitly defined (i.e. it can be defined + // by a CR-logical that only defines the subreg) so we state that the CR + // field is undef. Also, in order to preserve the kill flag on the CR bit, + // we add it as an implicit use. + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) .addReg(getCRFromCRBit(SrcReg), RegState::Undef) .addReg(SrcReg, RegState::Implicit | getKillRegState(MI.getOperand(0).isKill())); - // If the saved register wasn't CR0LT, shift the bits left so that the bit to - // store is the first one. Mask all but that bit. - unsigned Reg1 = Reg; - Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - - // rlwinm rA, rA, ShiftBits, 0, 0. - BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg) - .addReg(Reg1, RegState::Kill) - .addImm(getEncodingValue(SrcReg)) - .addImm(0).addImm(0); + // If the saved register wasn't CR0LT, shift the bits left so that the bit + // to store is the first one. Mask all but that bit. + unsigned Reg1 = Reg; + Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + // rlwinm rA, rA, ShiftBits, 0, 0. + BuildMI(MBB, II, dl, TII.get(LP64 ?
PPC::RLWINM8 : PPC::RLWINM), Reg) + .addReg(Reg1, RegState::Kill) + .addImm(getEncodingValue(SrcReg)) + .addImm(0).addImm(0); + } addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW)) .addReg(Reg, RegState::Kill), FrameIndex); @@ -826,9 +932,7 @@ bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, } // If the offset must be a multiple of some value, return what that value is. -static unsigned offsetMinAlign(const MachineInstr &MI) { - unsigned OpC = MI.getOpcode(); - +static unsigned offsetMinAlignForOpcode(unsigned OpC) { switch (OpC) { default: return 1; @@ -847,12 +951,21 @@ static unsigned offsetMinAlign(const MachineInstr &MI) { case PPC::STXSD: case PPC::STXSSP: return 4; + case PPC::EVLDD: + case PPC::EVSTDD: + return 8; case PPC::LXV: case PPC::STXV: return 16; } } +// If the offset must be a multiple of some value, return what that value is. +static unsigned offsetMinAlign(const MachineInstr &MI) { + unsigned OpC = MI.getOpcode(); + return offsetMinAlignForOpcode(OpC); +} + // Return the OffsetOperandNo given the FIOperandNum (and the instruction). static unsigned getOffsetONFromFION(const MachineInstr &MI, unsigned FIOperandNum) { @@ -963,7 +1076,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // happen in invalid code. assert(OpC != PPC::DBG_VALUE && "This should be handled in a target-independent way"); - if (!noImmForm && ((isInt<16>(Offset) && + bool OffsetFitsMnemonic = (OpC == PPC::EVSTDD || OpC == PPC::EVLDD) ? + isUInt<8>(Offset) : + isInt<16>(Offset); + if (!noImmForm && ((OffsetFitsMnemonic && ((Offset % offsetMinAlign(MI)) == 0)) || OpC == TargetOpcode::STACKMAP || OpC == TargetOpcode::PATCHPOINT)) { @@ -1001,7 +1117,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (noImmForm) OperandBase = 1; - else if (OpC != TargetOpcode::INLINEASM) { + else if (OpC != TargetOpcode::INLINEASM && + OpC != TargetOpcode::INLINEASM_BR) { assert(ImmToIdxMap.count(OpC) && "No indexed form of load or store available!"); unsigned NewOpcode = ImmToIdxMap.find(OpC)->second; @@ -1016,7 +1133,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true); } -unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const PPCFrameLowering *TFI = getFrameLowering(MF); if (!TM.isPPC64()) @@ -1025,7 +1142,7 @@ unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return TFI->hasFP(MF) ? PPC::X31 : PPC::X1; } -unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const { +Register PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const { const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); if (!hasBasePointer(MF)) return getFrameRegister(MF); @@ -1080,7 +1197,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { MachineBasicBlock &MBB = *MI->getParent(); MachineFunction &MF = *MBB.getParent(); const PPCFrameLowering *TFI = getFrameLowering(MF); - unsigned StackEst = TFI->determineFrameLayout(MF, false, true); + unsigned StackEst = TFI->determineFrameLayout(MF, true); // If we likely don't need a stack frame, then we probably don't need a // virtual base register either.
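[Editor's note: the EVLDD/EVSTDD special case in the hunk above is easy to misread, so here is a minimal standalone C++ sketch of the offset-legality predicate that eliminateFrameIndex now applies. The Op enum and helper names are hypothetical stand-ins, not LLVM API: SPE doubleword loads/stores only encode an unsigned 8-bit, 8-byte-aligned displacement, while ordinary D-forms take a signed 16-bit displacement subject to the opcode's minimum alignment.]

#include <cstdint>

// Hypothetical stand-ins for PPC::EVLDD etc.; only the encoding properties
// matter for this sketch.
enum class Op { EVLDD, EVSTDD, LWZ, LXV };

static unsigned offsetMinAlign(Op O) {
  switch (O) {
  case Op::EVLDD:
  case Op::EVSTDD: return 8;  // SPE doubleword: 8-byte aligned
  case Op::LXV:    return 16; // DQ-form: 16-byte aligned
  default:         return 1;
  }
}

static bool isInt16(int64_t X) { return X >= -32768 && X <= 32767; }
static bool isUInt8(int64_t X) { return X >= 0 && X <= 255; }

// True if Offset can stay in the immediate form of the mnemonic; otherwise
// the frame-index elimination must fall back to an indexed (X-form) access
// through a scratch register.
bool offsetFitsMnemonic(Op O, int64_t Offset) {
  bool InRange = (O == Op::EVLDD || O == Op::EVSTDD) ? isUInt8(Offset)
                                                     : isInt16(Offset);
  return InRange && (Offset % offsetMinAlign(O) == 0);
}

[Under these assumptions, offsetFitsMnemonic(Op::EVLDD, 248) holds, while an offset of 256 or a misaligned 12 forces the indexed form.]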
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index e93fe4ce3453..a50e05920cd4 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -1,9 +1,8 @@ //===-- PPCRegisterInfo.h - PowerPC Register Information Impl ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,13 +14,14 @@ #ifndef LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H #define LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H -#include "PPC.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #define GET_REGINFO_HEADER #include "PPCGenRegisterInfo.inc" namespace llvm { +class PPCTargetMachine; inline static unsigned getCRFromCRBit(unsigned SrcReg) { unsigned Reg = 0; @@ -90,9 +90,7 @@ public: return true; } - bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { - return true; - } + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override { return true; @@ -134,10 +132,10 @@ public: int64_t Offset) const override; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; // Base pointer (stack realignment) support. - unsigned getBaseRegister(const MachineFunction &MF) const; + Register getBaseRegister(const MachineFunction &MF) const; bool hasBasePointer(const MachineFunction &MF) const; /// stripRegisterPrefix - This method strips the character prefix from a diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index d0d29b6d2c7d..af0dff6347a6 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -1,9 +1,8 @@ //===-- PPCRegisterInfo.td - The PowerPC Register File -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -375,8 +374,6 @@ def CRBITRC : RegisterClass<"PPC", [i1], 32, def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6, CR7, CR2, CR3, CR4)>; -def CRRC0 : RegisterClass<"PPC", [i32], 32, (add CR0)>; - // The CTR registers are not allocatable because they're used by the // decrement-and-branch instructions, and thus need to stay live across // multiple basic blocks. 
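[Editor's note: the requiresFrameIndexScavenging override declared in PPCRegisterInfo.h above keys off the FrameSize & ~0x7FFF test shown in the implementation earlier. A minimal self-contained check of why that mask is the right one, assuming the frame size is unsigned and a D-form displacement is a signed 16-bit value:]

#include <cassert>
#include <cstdint>

// Any bit at or above bit 15 set means the value exceeds 32767, i.e. it no
// longer fits the 15 magnitude bits of a signed 16-bit displacement.
bool needsXForm(uint64_t FrameSize) {
  return (FrameSize & ~0x7FFFull) != 0;
}

int main() {
  assert(!needsXForm(0));
  assert(!needsXForm(32767)); // largest frame reachable with a D-form
  assert(needsXForm(32768));  // one past the limit: X-form required
  assert(needsXForm(1u << 20));
  return 0;
}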
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td index c8fe7d7eea78..4fa29d96ca14 100644 --- a/lib/Target/PowerPC/PPCSchedule.td +++ b/lib/Target/PowerPC/PPCSchedule.td @@ -1,9 +1,8 @@ //===-- PPCSchedule.td - PowerPC Scheduling Definitions ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -106,6 +105,7 @@ def IIC_VecVSL : InstrItinClass; def IIC_VecVSR : InstrItinClass; def IIC_SprMTMSRD : InstrItinClass; def IIC_SprSLIE : InstrItinClass; +def IIC_SprSLBFEE : InstrItinClass; def IIC_SprSLBIE : InstrItinClass; def IIC_SprSLBIEG : InstrItinClass; def IIC_SprSLBMTE : InstrItinClass; diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td index 646822eedbe0..708261fc7cc8 100644 --- a/lib/Target/PowerPC/PPCSchedule440.td +++ b/lib/Target/PowerPC/PPCSchedule440.td @@ -1,9 +1,8 @@ //===-- PPCSchedule440.td - PPC 440 Scheduling Definitions -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td index f34c1accc0fd..c2b298524e00 100644 --- a/lib/Target/PowerPC/PPCScheduleA2.td +++ b/lib/Target/PowerPC/PPCScheduleA2.td @@ -1,9 +1,8 @@ //===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCScheduleE500.td b/lib/Target/PowerPC/PPCScheduleE500.td index 479a970b2537..74744dda54f7 100644 --- a/lib/Target/PowerPC/PPCScheduleE500.td +++ b/lib/Target/PowerPC/PPCScheduleE500.td @@ -1,9 +1,8 @@ //===-- PPCScheduleE500.td - e500 Scheduling Defs ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td index d8bda073833f..1a1c041565b6 100644 --- a/lib/Target/PowerPC/PPCScheduleE500mc.td +++ b/lib/Target/PowerPC/PPCScheduleE500mc.td @@ -1,9 +1,8 @@ //===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td index 3e50803955c4..4480d7fba4fb 100644 --- a/lib/Target/PowerPC/PPCScheduleE5500.td +++ b/lib/Target/PowerPC/PPCScheduleE5500.td @@ -1,9 +1,8 @@ //===-- PPCScheduleE500mc.td - e5500 Scheduling Defs -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td index 0995b7200d93..8f1907f2c016 100644 --- a/lib/Target/PowerPC/PPCScheduleG3.td +++ b/lib/Target/PowerPC/PPCScheduleG3.td @@ -1,9 +1,8 @@ //===-- PPCScheduleG3.td - PPC G3 Scheduling Definitions ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td index 1b15c7b3c7ad..0eabc49d7841 100644 --- a/lib/Target/PowerPC/PPCScheduleG4.td +++ b/lib/Target/PowerPC/PPCScheduleG4.td @@ -1,9 +1,8 @@ //===-- PPCScheduleG4.td - PPC G4 Scheduling Definitions ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td index 0044c3c6a449..9c84aec638d7 100644 --- a/lib/Target/PowerPC/PPCScheduleG4Plus.td +++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td @@ -1,9 +1,8 @@ //===-- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. 
----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td index c802b80170fb..087073537796 100644 --- a/lib/Target/PowerPC/PPCScheduleG5.td +++ b/lib/Target/PowerPC/PPCScheduleG5.td @@ -1,9 +1,8 @@ //===-- PPCScheduleG5.td - PPC G5 Scheduling Definitions ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td index 1d6e509819da..5a8c1eb2b837 100644 --- a/lib/Target/PowerPC/PPCScheduleP7.td +++ b/lib/Target/PowerPC/PPCScheduleP7.td @@ -1,9 +1,8 @@ //===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td index ff39dfda7016..70a58f42a98a 100644 --- a/lib/Target/PowerPC/PPCScheduleP8.td +++ b/lib/Target/PowerPC/PPCScheduleP8.td @@ -1,9 +1,8 @@ //===-- PPCScheduleP8.td - PPC P8 Scheduling Definitions ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCScheduleP9.td b/lib/Target/PowerPC/PPCScheduleP9.td index a1e625c855e0..6a79cca89194 100644 --- a/lib/Target/PowerPC/PPCScheduleP9.td +++ b/lib/Target/PowerPC/PPCScheduleP9.td @@ -1,9 +1,8 @@ //===-- PPCScheduleP9.td - PPC P9 Scheduling Definitions ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -51,8 +50,21 @@ let SchedModel = P9Model in { // ***************** Processor Resources ***************** - //Dispatcher: - def DISPATCHER : ProcResource<12>; + // Dispatcher slots: + // x0, x1, x2, and x3 are the dedicated slice dispatch ports, where each + // corresponds to one of the four execution slices. + def DISPx02 : ProcResource<2>; + def DISPx13 : ProcResource<2>; + // The xa and xb ports can be used to send an iop to either of the two slices + // of the superslice, but are restricted to iops with only two primary sources. + def DISPxab : ProcResource<2>; + // b0 and b1 are dedicated dispatch ports into the branch slice. + def DISPb01 : ProcResource<2>; + + // Any non-BR dispatch ports + def DISP_NBR + : ProcResGroup<[ DISPx02, DISPx13, DISPxab]>; + def DISP_SS : ProcResGroup<[ DISPx02, DISPx13]>; // Issue Ports // An instruction can go down one of two issue queues. @@ -117,8 +129,37 @@ let SchedModel = P9Model in { // ***************** SchedWriteRes Definitions ***************** - //Dispatcher - def DISP_1C : SchedWriteRes<[DISPATCHER]> { + // Dispatcher + // Dispatch Rules: '-' or 'V' + // Vector ('V') - vector iops (128-bit operand) take only one decode and + // dispatch slot but are dispatched to both the even and odd slices of a + // superslice. + def DISP_1C : SchedWriteRes<[DISP_NBR]> { + let NumMicroOps = 0; + let Latency = 1; + } + // Dispatch Rules: 'E' + // Even slice ('E') - certain operations must be sent only to an even slice. + // Also consumes the odd dispatch slot of the same superslice at dispatch. + def DISP_EVEN_1C : SchedWriteRes<[ DISPx02, DISPx13 ]> { + let NumMicroOps = 0; + let Latency = 1; + } + // Dispatch Rules: 'P' + // Paired ('P') - certain cracked and expanded iops are paired such that they + // must dispatch together to the same superslice. + def DISP_PAIR_1C : SchedWriteRes<[ DISP_SS, DISP_SS]> { + let NumMicroOps = 0; + let Latency = 1; + } + // Tuple Restricted ('R') - certain iops preclude dispatching more than one + // operation per slice for the superslice to which they are dispatched. + def DISP_3SLOTS_1C : SchedWriteRes<[DISPx02, DISPx13, DISPxab]> { + let NumMicroOps = 0; + let Latency = 1; + } + // Each execution and branch slice can receive up to two iops per cycle. + def DISP_BR_1C : SchedWriteRes<[ DISPxab ]> { let NumMicroOps = 0; let Latency = 1; } @@ -148,7 +189,7 @@ let SchedModel = P9Model in { // ALU Units // An ALU may take either 2 or 3 cycles to complete the operation. - // However, the ALU unit is only every busy for 1 cycle at a time and may + // However, the ALU unit is only ever busy for 1 cycle at a time and may // receive new instructions each cycle. def P9_ALU_2C : SchedWriteRes<[ALU]> { let Latency = 2; } @@ -203,10 +244,6 @@ let SchedModel = P9Model in { // DP Unit // A DP unit may take from 2 to 36 cycles to complete. // Some DP operations keep the unit busy for up to 10 cycles.
- def P9_DP_2C : SchedWriteRes<[DP]> { - let Latency = 2; - } - def P9_DP_5C : SchedWriteRes<[DP]> { let Latency = 5; } @@ -228,11 +265,6 @@ let SchedModel = P9Model in { let Latency = 22; } - def P9_DP_24C_8 : SchedWriteRes<[DP]> { - let ResourceCycles = [8]; - let Latency = 24; - } - def P9_DPO_24C_8 : SchedWriteRes<[DPO]> { let ResourceCycles = [8]; let Latency = 24; @@ -248,11 +280,6 @@ let SchedModel = P9Model in { let Latency = 22; } - def P9_DP_27C_7 : SchedWriteRes<[DP]> { - let ResourceCycles = [7]; - let Latency = 27; - } - def P9_DPE_27C_10 : SchedWriteRes<[DP]> { let ResourceCycles = [10]; let Latency = 27; @@ -383,16 +410,12 @@ let SchedModel = P9Model in { def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>; def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>; def P9_StoreAndALUOp_3C : WriteSequence<[P9_LS_1C, P9_ALU_2C]>; - def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>; def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>; def P9_ALU2OpAndALU2Op_6C : WriteSequence<[P9_ALU_3C, P9_ALU_3C]>; def P9_ALUOpAndALUOpAndALUOp_6C : WriteSequence<[P9_ALU_2C, P9_ALU_2C, P9_ALU_2C]>; def P9_DPOpAndALUOp_7C : WriteSequence<[P9_DP_5C, P9_ALU_2C]>; - def P9_DPOpAndALUOp_9C : WriteSequence<[P9_DP_7C, P9_ALU_2C]>; def P9_DPOpAndALU2Op_10C : WriteSequence<[P9_DP_7C, P9_ALU_3C]>; - def P9_DPOpAndALUOp_24C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_2C]>; - def P9_DPOpAndALUOp_35C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_2C]>; def P9_DPOpAndALU2Op_25C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_3C]>; def P9_DPOpAndALU2Op_29C_5 : WriteSequence<[P9_DP_26C_5, P9_ALU_3C]>; def P9_DPOpAndALU2Op_36C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_3C]>; diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index c0cbfd779cb9..6aa7528634d3 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -1,9 +1,8 @@ //===-- PowerPCSubtarget.cpp - PPC Subtarget Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -40,6 +39,11 @@ static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned", cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"), cl::Hidden); +static cl::opt<bool> + EnableMachinePipeliner("ppc-enable-pipeliner", + cl::desc("Enable Machine Pipeliner for PPC"), + cl::init(false), cl::Hidden); + PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { initializeEnvironment(); @@ -68,6 +72,7 @@ void PPCSubtarget::initializeEnvironment() { HasFPU = false; HasQPX = false; HasVSX = false; + NeedsTwoConstNR = false; HasP8Vector = false; HasP8Altivec = false; HasP8Crypto = false; @@ -103,11 +108,13 @@ void PPCSubtarget::initializeEnvironment() { HasDirectMove = false; IsQPXStackUnaligned = false; HasHTM = false; - HasFusion = false; HasFloat128 = false; IsISA3_0 = false; UseLongCalls = false; SecurePlt = false; + VectorsUseTwoUnits = false; + UsePPCPreRASchedStrategy = false; + UsePPCPostRASchedStrategy = false; HasPOPCNTD = POPCNTD_Unavailable; } @@ -138,6 +145,10 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (isDarwin()) HasLazyResolverStubs = true; + if (TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD() || + TargetTriple.isMusl()) + SecurePlt = true; + if (HasSPE && IsPPC64) + report_fatal_error( "SPE is only supported for 32-bit targets.\n", false); if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU)) @@ -175,10 +186,14 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const { return false; } -bool PPCSubtarget::enableMachineScheduler() const { - return true; +bool PPCSubtarget::enableMachineScheduler() const { return true; } + +bool PPCSubtarget::enableMachinePipeliner() const { + return (DarwinDirective == PPC::DIR_PWR9) && EnableMachinePipeliner; } +bool PPCSubtarget::useDFAforSMS() const { return false; } + // This overrides the PostRAScheduler bit in the SchedModel for each CPU. bool PPCSubtarget::enablePostRAScheduler() const { return true; } diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index c56f254d6bec..55fec1cb6d99 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -1,9 +1,8 @@ //===-- PPCSubtarget.h - Define Subtarget for the PPC ----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -99,6 +98,7 @@ protected: bool HasSPE; bool HasQPX; bool HasVSX; + bool NeedsTwoConstNR; bool HasP8Vector; bool HasP8Altivec; bool HasP8Crypto; @@ -131,11 +131,13 @@ protected: bool HasPartwordAtomics; bool HasDirectMove; bool HasHTM; - bool HasFusion; bool HasFloat128; bool IsISA3_0; bool UseLongCalls; bool SecurePlt; + bool VectorsUseTwoUnits; + bool UsePPCPreRASchedStrategy; + bool UsePPCPostRASchedStrategy; POPCNTDKind HasPOPCNTD; @@ -244,6 +246,7 @@ public: bool hasFPU() const { return HasFPU; } bool hasQPX() const { return HasQPX; } bool hasVSX() const { return HasVSX; } + bool needsTwoConstNR() const { return NeedsTwoConstNR; } bool hasP8Vector() const { return HasP8Vector; } bool hasP8Altivec() const { return HasP8Altivec; } bool hasP8Crypto() const { return HasP8Crypto; } @@ -260,6 +263,7 @@ public: bool isPPC4xx() const { return IsPPC4xx; } bool isPPC6xx() const { return IsPPC6xx; } bool isSecurePlt() const {return SecurePlt; } + bool vectorsUseTwoUnits() const {return VectorsUseTwoUnits; } bool isE500() const { return IsE500; } bool isFeatureMFTB() const { return FeatureMFTB; } bool isDeprecatedDST() const { return DeprecatedDST; } @@ -267,6 +271,8 @@ public: bool hasInvariantFunctionDescriptors() const { return HasInvariantFunctionDescriptors; } + bool usePPCPreRASchedStrategy() const { return UsePPCPreRASchedStrategy; } + bool usePPCPostRASchedStrategy() const { return UsePPCPostRASchedStrategy; } bool hasPartwordAtomics() const { return HasPartwordAtomics; } bool hasDirectMove() const { return HasDirectMove; } @@ -285,7 +291,6 @@ public: } bool hasHTM() const { return HasHTM; } - bool hasFusion() const { return HasFusion; } bool hasFloat128() const { return HasFloat128; } bool isISA3_0() const { return IsISA3_0; } bool useLongCalls() const { return UseLongCalls; } @@ -307,16 +312,21 @@ public: bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isDarwinABI() const { return isTargetMachO() || isDarwin(); } - bool isSVR4ABI() const { return !isDarwinABI(); } + bool isAIXABI() const { return TargetTriple.isOSAIX(); } + bool isSVR4ABI() const { return !isDarwinABI() && !isAIXABI(); } bool isELFv2ABI() const; /// Originally, this function return hasISEL(). Now we always enable it, /// but may expand the ISEL instruction later. bool enableEarlyIfConversion() const override { return true; } - // Scheduling customization. + /// Scheduling customization. bool enableMachineScheduler() const override; - // This overrides the PostRAScheduler bit in the SchedModel for each CPU. + /// Pipeliner customization. + bool enableMachinePipeliner() const override; + /// Machine Pipeliner customization + bool useDFAforSMS() const override; + /// This overrides the PostRAScheduler bit in the SchedModel for each CPU. bool enablePostRAScheduler() const override; AntiDepBreakMode getAntiDepBreakMode() const override; void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override; diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index ac36abbe8439..fb826c4a32f1 100644 --- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -1,9 +1,8 @@ //===---------- PPCTLSDynamicCall.cpp - TLS Dynamic Call Fixup ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -35,10 +34,6 @@ using namespace llvm; #define DEBUG_TYPE "ppc-tls-dynamic-call" -namespace llvm { - void initializePPCTLSDynamicCallPass(PassRegistry&); -} - namespace { struct PPCTLSDynamicCall : public MachineFunctionPass { static char ID; diff --git a/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/lib/Target/PowerPC/PPCTOCRegDeps.cpp index 17345b6ca8d3..3eb0569fb955 100644 --- a/lib/Target/PowerPC/PPCTOCRegDeps.cpp +++ b/lib/Target/PowerPC/PPCTOCRegDeps.cpp @@ -1,9 +1,8 @@ //===-- PPCTOCRegDeps.cpp - Add Extra TOC Register Dependencies -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -83,10 +82,6 @@ using namespace llvm; #define DEBUG_TYPE "ppc-toc-reg-deps" -namespace llvm { - void initializePPCTOCRegDepsPass(PassRegistry&); -} - namespace { // PPCTOCRegDeps pass - For simple functions without epilogue code, move // returns up, and create conditional returns, to avoid unnecessary diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 580d057602f5..ce00f848dd72 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,9 +13,11 @@ #include "PPCTargetMachine.h" #include "MCTargetDesc/PPCMCTargetDesc.h" #include "PPC.h" +#include "PPCMachineScheduler.h" #include "PPCSubtarget.h" #include "PPCTargetObjectFile.h" #include "PPCTargetTransformInfo.h" +#include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -100,6 +101,19 @@ extern "C" void LLVMInitializePowerPCTarget() { RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); +#ifndef NDEBUG + initializePPCCTRLoopsVerifyPass(PR); +#endif + initializePPCLoopPreIncPrepPass(PR); + initializePPCTOCRegDepsPass(PR); + initializePPCEarlyReturnPass(PR); + initializePPCVSXCopyPass(PR); + initializePPCVSXFMAMutatePass(PR); + initializePPCVSXSwapRemovalPass(PR); + initializePPCReduceCRLogicalsPass(PR); + initializePPCBSelPass(PR); + initializePPCBranchCoalescingPass(PR); + initializePPCQPXLoadSplatPass(PR); initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); initializePPCPreEmitPeepholePass(PR); @@ -199,6 +213,8 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, case Triple::ppc64le: return PPCTargetMachine::PPC_ABI_ELFv2; case Triple::ppc64: + if (TT.getEnvironment() == llvm::Triple::ELFv2) + return PPCTargetMachine::PPC_ABI_ELFv2; return PPCTargetMachine::PPC_ABI_ELFv1; default: return PPCTargetMachine::PPC_ABI_UNKNOWN; @@ -227,9 +243,9 @@ static CodeModel::Model getEffectivePPCCodeModel(const Triple &TT, Optional<CodeModel::Model> CM, bool JIT) { if (CM) { if (*CM == CodeModel::Tiny) - report_fatal_error("Target does not support the tiny CodeModel"); + report_fatal_error("Target does not support the tiny CodeModel", false); if (*CM == CodeModel::Kernel) - report_fatal_error("Target does not support the kernel CodeModel"); + report_fatal_error("Target does not support the kernel CodeModel", false); return *CM; } if (!TT.isOSDarwin() && !JIT && @@ -238,6 +254,29 @@ static CodeModel::Model getEffectivePPCCodeModel(const Triple &TT, return CodeModel::Small; } + +static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) { + const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>(); + ScheduleDAGMILive *DAG = + new ScheduleDAGMILive(C, ST.usePPCPreRASchedStrategy() ? + llvm::make_unique<PPCPreRASchedStrategy>(C) : + llvm::make_unique<GenericScheduler>(C)); + // add DAG Mutations here. + DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); + return DAG; +} + +static ScheduleDAGInstrs *createPPCPostMachineScheduler( + MachineSchedContext *C) { + const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>(); + ScheduleDAGMI *DAG = + new ScheduleDAGMI(C, ST.usePPCPostRASchedStrategy() ? + llvm::make_unique<PPCPostRASchedStrategy>(C) : + llvm::make_unique<PostGenericScheduler>(C), true); + // add DAG Mutations here. + return DAG; +} + // The FeatureString here is a little subtle.
We are modifying the feature // string with what are (currently) non-function specific overrides as it goes // into the LLVMTargetMachine constructor and then using the stored value in the @@ -331,6 +370,14 @@ public: void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { + return createPPCMachineScheduler(C); + } + ScheduleDAGInstrs * + createPostMachineScheduler(MachineSchedContext *C) const override { + return createPPCPostMachineScheduler(C); + } }; } // end anonymous namespace @@ -374,7 +421,7 @@ bool PPCPassConfig::addPreISel() { addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine())); if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) - addPass(createPPCCTRLoops()); + addPass(createHardwareLoopsPass()); return false; } @@ -441,6 +488,9 @@ void PPCPassConfig::addPreRegAlloc() { } if (EnableExtraTOCRegDeps) addPass(createPPCTOCRegDepsPass()); + + if (getOptLevel() != CodeGenOpt::None) + addPass(&MachinePipelinerID); } void PPCPassConfig::addPreSched2() { @@ -469,3 +519,13 @@ TargetTransformInfo PPCTargetMachine::getTargetTransformInfo(const Function &F) { return TargetTransformInfo(PPCTTIImpl(this, F)); } + +static MachineSchedRegistry +PPCPreRASchedRegistry("ppc-prera", + "Run PowerPC PreRA specific scheduler", + createPPCMachineScheduler); + +static MachineSchedRegistry +PPCPostRASchedRegistry("ppc-postra", + "Run PowerPC PostRA specific scheduler", + createPPCPostMachineScheduler); diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 75b98a815ab4..fd1d14ae32d4 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -1,9 +1,8 @@ //===-- PPCTargetMachine.h - Define TargetMachine for PowerPC ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -59,10 +58,6 @@ public: const Triple &TT = getTargetTriple(); return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); }; - - bool isMachineVerifierClean() const override { - return false; - } }; } // end namespace llvm diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/lib/Target/PowerPC/PPCTargetObjectFile.cpp index a049dc3fda93..e237fab1b267 100644 --- a/lib/Target/PowerPC/PPCTargetObjectFile.cpp +++ b/lib/Target/PowerPC/PPCTargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- PPCTargetObjectFile.cpp - PPC Object Info -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h index 417b8ed0d612..78a5840c87c7 100644 --- a/lib/Target/PowerPC/PPCTargetObjectFile.h +++ b/lib/Target/PowerPC/PPCTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- PPCTargetObjectFile.h - PPC Object Info -----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h index 310fea9ef09f..e17361d997fd 100644 --- a/lib/Target/PowerPC/PPCTargetStreamer.h +++ b/lib/Target/PowerPC/PPCTargetStreamer.h @@ -1,9 +1,8 @@ //===- PPCTargetStreamer.h - PPC Target Streamer ----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index bc9bcab83a0a..ff3dfbfaca05 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1,17 +1,18 @@ //===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "PPCTargetTransformInfo.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -32,6 +33,13 @@ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false), cl::desc("Enable using coldcc calling conv for cold " "internal functions")); +// The latency of mtctr is only justified if there are more than 4 +// comparisons that will be removed as a result. +static cl::opt<unsigned> +SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, + cl::desc("Loops with a constant trip count smaller than " + "this value will not use the count register.")); + //===----------------------------------------------------------------------===// // // PPC cost model.
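[Editor's note: a short sketch of the policy the new min-ctr-loop-threshold option expresses, as an illustration rather than the pass's actual code. Converting a loop to an mtctr/bdnz form pays a one-time mtctr latency, so it is only worthwhile when the known trip count supplies at least the threshold's worth of removed compare-and-branch pairs.]

#include <cstdint>
#include <optional>

constexpr unsigned SmallCTRLoopThreshold = 4; // mirrors the cl::init(4) above

// Loops with a known trip count below the threshold keep their ordinary
// compare-and-branch form; unknown trip counts are not blocked by this rule.
bool ctrLoopIsProfitable(std::optional<uint64_t> ConstTripCount) {
  if (!ConstTripCount)
    return true;
  return *ConstTripCount >= SmallCTRLoopThreshold;
}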
@@ -205,6 +213,341 @@ unsigned PPCTTIImpl::getUserCost(const User *U,
   return BaseT::getUserCost(U, Operands);
 }
 
+bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
+                             TargetLibraryInfo *LibInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+
+  // Loop through the inline asm constraints and look for something that
+  // clobbers ctr.
+  auto asmClobbersCTR = [](InlineAsm *IA) {
+    InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+    for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+      InlineAsm::ConstraintInfo &C = CIV[i];
+      if (C.Type != InlineAsm::isInput)
+        for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+          if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+            return true;
+    }
+    return false;
+  };
+
+  // Determining the address of a TLS variable results in a function call in
+  // certain TLS models.
+  std::function<bool(const Value *)> memAddrUsesCTR =
+      [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
+    const auto *GV = dyn_cast<GlobalValue>(MemAddr);
+    if (!GV) {
+      // Recurse to check for constants that refer to TLS global variables.
+      if (const auto *CV = dyn_cast<Constant>(MemAddr))
+        for (const auto &CO : CV->operands())
+          if (memAddrUsesCTR(CO))
+            return true;
+
+      return false;
+    }
+
+    if (!GV->isThreadLocal())
+      return false;
+    TLSModel::Model Model = TM.getTLSModel(GV);
+    return Model == TLSModel::GeneralDynamic ||
+           Model == TLSModel::LocalDynamic;
+  };
+
+  auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
+    if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+      return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
+
+    return false;
+  };
+
+  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+       J != JE; ++J) {
+    if (CallInst *CI = dyn_cast<CallInst>(J)) {
+      // Inline ASM is okay, unless it clobbers the ctr register.
+      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+        if (asmClobbersCTR(IA))
+          return true;
+        continue;
+      }
+
+      if (Function *F = CI->getCalledFunction()) {
+        // Most intrinsics don't become function calls, but some might.
+        // sin, cos, exp and log are always calls.
+        unsigned Opcode = 0;
+        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+          switch (F->getIntrinsicID()) {
+          default: continue;
+          // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
+          // we're definitely using CTR.
+          case Intrinsic::set_loop_iterations:
+          case Intrinsic::loop_decrement:
+            return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                         !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+   // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
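+          // The cases below, by contrast, only become calls when the
+          // corresponding ISD node is not legal or custom for the operand
+          // type; each is mapped to its ISD opcode and checked after the
+          // switch.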
+          case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:               Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:              Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:               Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint:          Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:              Opcode = ISD::FROUND;     break;
+          case Intrinsic::minnum:             Opcode = ISD::FMINNUM;    break;
+          case Intrinsic::maxnum:             Opcode = ISD::FMAXNUM;    break;
+          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO;      break;
+          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO;      break;
+          }
+        }
+
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls.
+          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc_copysign:
+          case LibFunc_copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc_copysignl:
+            return true;
+          case LibFunc_fabs:
+          case LibFunc_fabsf:
+          case LibFunc_fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc_sqrt:
+          case LibFunc_sqrtf:
+          case LibFunc_sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc_floor:
+          case LibFunc_floorf:
+          case LibFunc_floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc_nearbyint:
+          case LibFunc_nearbyintf:
+          case LibFunc_nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc_ceil:
+          case LibFunc_ceilf:
+          case LibFunc_ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc_rint:
+          case LibFunc_rintf:
+          case LibFunc_rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc_round:
+          case LibFunc_roundf:
+          case LibFunc_roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc_trunc:
+          case LibFunc_truncf:
+          case LibFunc_truncl:
+            Opcode = ISD::FTRUNC; break;
+          case LibFunc_fmin:
+          case LibFunc_fminf:
+          case LibFunc_fminl:
+            Opcode = ISD::FMINNUM; break;
+          case LibFunc_fmax:
+          case LibFunc_fmaxf:
+          case LibFunc_fmaxl:
+            Opcode = ISD::FMAXNUM; break;
+          }
+        }
+
+        if (Opcode) {
+          EVT EVTy =
+              TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
+
+          if (EVTy == MVT::Other)
+            return true;
+
+          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
+            continue;
+          else if (EVTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
+            continue;
+
+          return true;
+        }
+      }
+
+      return true;
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
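+      // (ppc_fp128 is the IBM double-double long-double format; its
+      // arithmetic is typically lowered to runtime calls such as __gcc_qadd,
+      // so treating any ppc_f128 BinaryOperator as a call is conservative
+      // but safe here.)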
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
+        return true;
+    } else if (isLargeIntegerTy(!TM.isPPC64(),
+                                J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (!TM.isPPC64() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+        return true;
+    }
+
+    // FREM is always a call.
+    if (J->getOpcode() == Instruction::FRem)
+      return true;
+
+    if (ST->useSoftFloat()) {
+      switch(J->getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FCmp:
+        return true;
+      }
+    }
+
+    for (Value *Operand : J->operands())
+      if (memAddrUsesCTR(Operand))
+        return true;
+  }
+
+  return false;
+}
+
+bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          HardwareLoopInfo &HWLoopInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+  TargetSchedModel SchedModel;
+  SchedModel.init(ST);
+
+  // Do not convert small short loops to CTR loop.
+  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
+  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+    SmallPtrSet<const Value *, 4> EphValues;
+    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+    CodeMetrics Metrics;
+    for (BasicBlock *BB : L->blocks())
+      Metrics.analyzeBasicBlock(BB, *this, EphValues);
+    // 6 is an approximate latency for the mtctr instruction.
+    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+      return false;
+  }
+
+  // We don't want to spill/restore the counter register, and so we don't
+  // want to use the counter register if the loop contains calls.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(*I, LibInfo))
+      return false;
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  // If there is an exit edge known to be frequently taken,
+  // we should not transform this loop.
+  for (auto &BB : ExitingBlocks) {
+    Instruction *TI = BB->getTerminator();
+    if (!TI) continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      uint64_t TrueWeight = 0, FalseWeight = 0;
+      if (!BI->isConditional() ||
+          !BI->extractProfMetadata(TrueWeight, FalseWeight))
+        continue;
+
+      // If the exit path is more frequent than the loop path,
+      // we return here without further analysis for this loop.
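+      // (Worked example: with !prof branch_weights {90, 10} on an exiting
+      // conditional whose first successor leaves the loop, TrueIsExit is
+      // true and FalseWeight (10) < TrueWeight (90), so the exit dominates
+      // and the check below rejects the loop.)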
+      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+      if ((TrueIsExit && FalseWeight < TrueWeight) ||
+          (!TrueIsExit && FalseWeight > TrueWeight))
+        return false;
+    }
+  }
+
+  LLVMContext &C = L->getHeader()->getContext();
+  HWLoopInfo.CountType = TM.isPPC64() ?
+    Type::getInt64Ty(C) : Type::getInt32Ty(C);
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
 void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   if (ST->getDarwinDirective() == PPC::DIR_A2) {
@@ -239,17 +582,12 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
   return LoopHasReductions;
 }
 
-const PPCTTIImpl::TTI::MemCmpExpansionOptions *
-PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
-  static const auto Options = []() {
-    TTI::MemCmpExpansionOptions Options;
-    Options.LoadSizes.push_back(8);
-    Options.LoadSizes.push_back(4);
-    Options.LoadSizes.push_back(2);
-    Options.LoadSizes.push_back(1);
-    return Options;
-  }();
-  return &Options;
+PPCTTIImpl::TTI::MemCmpExpansionOptions
+PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+  TTI::MemCmpExpansionOptions Options;
+  Options.LoadSizes = {8, 4, 2, 1};
+  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+  return Options;
 }
 
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
@@ -324,6 +662,33 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   return 2;
 }
 
+// Adjust the cost of vector instructions on targets on which there is overlap
+// between the vector and scalar units, thereby reducing the overall throughput
+// of vector code wrt. scalar code.
+int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
+                                     Type *Ty2) {
+  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
+    return Cost;
+
+  std::pair<int, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
+  // If type legalization involves splitting the vector, we don't want to
+  // double the cost at every step - only the last step.
+  if (LT1.first != 1 || !LT1.second.isVector())
+    return Cost;
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  if (TLI->isOperationExpand(ISD, LT1.second))
+    return Cost;
+
+  if (Ty2) {
+    std::pair<int, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
+    if (LT2.first != 1 || !LT2.second.isVector())
+      return Cost;
+  }
+
+  return Cost * 2;
+}
+
 int PPCTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
@@ -331,8 +696,9 @@ int PPCTTIImpl::getArithmeticInstrCost(
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
   // Fallback to the default implementation.
-  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
-                                       Opd1PropInfo, Opd2PropInfo);
+  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                           Opd1PropInfo, Opd2PropInfo);
+  return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
 }
 
 int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
@@ -345,19 +711,22 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   // instruction). We need one such shuffle instruction for each actual
   // register (this is not true for arbitrary shuffles, but is true for the
   // structured types of shuffles covered by TTI::ShuffleKind).
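+  // (On subtargets modeled by vectorsUseTwoUnits(), vectorCostAdjustment
+  // doubles this per-register count, reflecting that vector ops compete
+  // with scalar ops for the same two execution units.)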
-  return LT.first;
+  return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp,
+                              nullptr);
 }
 
 int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                  const Instruction *I) {
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+  int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src);
+  return vectorCostAdjustment(Cost, Opcode, Dst, Src);
 }
 
 int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                    const Instruction *I) {
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+  int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+  return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
 }
 
 int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -366,18 +735,23 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
+  Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr);
+
   if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
-    // Double-precision scalars are already located in index #0.
-    if (Index == 0)
+    // Double-precision scalars are already located in index #0 (or #1 if LE).
+    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
+        Index == (ST->isLittleEndian() ? 1 : 0))
       return 0;
 
-    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+    return Cost;
+
   } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
     // Floating point scalars are already located in index #0.
     if (Index == 0)
       return 0;
 
-    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+    return Cost;
   }
 
   // Estimated cost of a load-hit-store delay.  This was obtained
@@ -394,9 +768,9 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   // these need to be estimated as very costly.
   if (ISD == ISD::EXTRACT_VECTOR_ELT ||
       ISD == ISD::INSERT_VECTOR_ELT)
-    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
+    return LHSPenalty + Cost;
 
-  return BaseT::getVectorInstrCost(Opcode, Val, Index);
+  return Cost;
 }
 
 int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
@@ -407,6 +781,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
          "Invalid Opcode");
 
   int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+  Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);
 
   bool IsAltivecType = ST->hasAltivec() &&
                        (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
@@ -500,3 +875,25 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
 
   return Cost;
 }
+
+bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
+                            LoopInfo *LI, DominatorTree *DT,
+                            AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
+  // Process nested loops first.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    if (canSaveCmp(*I, BI, SE, LI, DT, AC, LibInfo))
+      return false; // Stop search.
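+  // (There is a single counter register, so if an inner loop already
+  // qualifies as a hardware loop, the outer loop must not also claim CTR;
+  // returning false here abandons the whole nest.)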
+
+  HardwareLoopInfo HWLoopInfo(L);
+
+  if (!HWLoopInfo.canAnalyze(*LI))
+    return false;
+
+  if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
+    return false;
+
+  if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
+    return false;
+
+  *BI = HWLoopInfo.ExitBranch;
+  return true;
+}
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 9221a910288a..5d76ee418b69 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -1,9 +1,8 @@
 //===-- PPCTargetTransformInfo.h - PPC specific TTI -------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
@@ -17,7 +16,6 @@
 #ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
 #define LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
 
-#include "PPC.h"
 #include "PPCTargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
@@ -35,6 +33,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
 
   const PPCSubtarget *getST() const { return ST; }
   const PPCTargetLowering *getTLI() const { return TLI; }
+  bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
 
 public:
   explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
@@ -54,6 +53,13 @@ public:
   unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
 
   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                HardwareLoopInfo &HWLoopInfo);
+  bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
+                  DominatorTree *DT, AssumptionCache *AC,
+                  TargetLibraryInfo *LibInfo);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
@@ -63,14 +69,15 @@ public:
   /// @{
   bool useColdCCForColdCall(Function &F);
   bool enableAggressiveInterleaving(bool LoopHasReductions);
-  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
-      bool IsZeroCmp) const;
+  TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+                                                    bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getCacheLineSize();
   unsigned getPrefetchDistance();
   unsigned getMaxInterleaveFactor(unsigned VF);
+  int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
diff --git a/lib/Target/PowerPC/PPCVSXCopy.cpp b/lib/Target/PowerPC/PPCVSXCopy.cpp
index 93fe3230ab81..719ed7b63878 100644
--- a/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ b/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -1,9 +1,8 @@
 //===-------------- PPCVSXCopy.cpp - VSX Copy Legalization ----------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -37,10 +36,6 @@ using namespace llvm; #define DEBUG_TYPE "ppc-vsx-copy" -namespace llvm { - void initializePPCVSXCopyPass(PassRegistry&); -} - namespace { // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers // (Altivec and scalar floating-point registers), we need to transform the diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 6586f503a7b8..ce78239df0a8 100644 --- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -1,9 +1,8 @@ //===--------------- PPCVSXFMAMutate.cpp - VSX FMA Mutation ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index 1be193e08c01..44175af7f9b6 100644 --- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -1,9 +1,8 @@ //===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===---------------------------------------------------------------------===// // @@ -60,10 +59,6 @@ using namespace llvm; #define DEBUG_TYPE "ppc-vsx-swaps" -namespace llvm { - void initializePPCVSXSwapRemovalPass(PassRegistry&); -} - namespace { // A PPCVSXSwapEntry is created for each machine instruction that @@ -427,6 +422,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { // of opcodes having a common attribute in TableGen. Should this // change, this is a prime candidate to use such a mechanism. case PPC::INLINEASM: + case PPC::INLINEASM_BR: case PPC::EXTRACT_SUBREG: case PPC::INSERT_SUBREG: case PPC::COPY_TO_REGCLASS: diff --git a/lib/Target/PowerPC/README_P9.txt b/lib/Target/PowerPC/README_P9.txt index d56f7cca7b21..c9984b7604bd 100644 --- a/lib/Target/PowerPC/README_P9.txt +++ b/lib/Target/PowerPC/README_P9.txt @@ -512,8 +512,8 @@ Fixed Point Facility: "lxsdx $XT, $src", IIC_LdStLFD, [(set f64:$XT, (load xoaddr:$src))]>; - . (set f64:$XT, (load ixaddr:$src)) - (set f64:$XT, (store ixaddr:$dst)) + . (set f64:$XT, (load iaddrX4:$src)) + (set f64:$XT, (store iaddrX4:$dst)) - Load/Store SP, with conversion from/to DP: lxssp stxssp . Similar to lxsspx/stxsspx: @@ -521,8 +521,8 @@ Fixed Point Facility: "lxsspx $XT, $src", IIC_LdStLFD, [(set f32:$XT, (load xoaddr:$src))]>; - . (set f32:$XT, (load ixaddr:$src)) - (set f32:$XT, (store ixaddr:$dst)) + . (set f32:$XT, (load iaddrX4:$src)) + (set f32:$XT, (store iaddrX4:$dst)) - Load as Integer Byte/Halfword & Zero Indexed: lxsibzx lxsihzx . 
Similar to lxsiwzx: diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp index 979595264472..99b5dec74668 100644 --- a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp +++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp @@ -1,14 +1,12 @@ //===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "PPC.h" -#include "llvm/IR/Module.h" +#include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h new file mode 100644 index 000000000000..2d0afbfb1be0 --- /dev/null +++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h @@ -0,0 +1,22 @@ +//===-- PowerPCTargetInfo.h - PowerPC Target Implementation -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_TARGETINFO_POWERPCTARGETINFO_H +#define LLVM_LIB_TARGET_POWERPC_TARGETINFO_POWERPCTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getThePPC32Target(); +Target &getThePPC64Target(); +Target &getThePPC64LETarget(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_POWERPC_TARGETINFO_POWERPCTARGETINFO_H diff --git a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 1d1112cc5124..0172c6298772 100644 --- a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1,9 +1,8 @@ //===-- RISCVAsmParser.cpp - Parse RISCV assembly to MCInst instructions --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -11,6 +10,7 @@ #include "MCTargetDesc/RISCVMCExpr.h" #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "MCTargetDesc/RISCVTargetStreamer.h" +#include "TargetInfo/RISCVTargetInfo.h" #include "Utils/RISCVBaseInfo.h" #include "Utils/RISCVMatInt.h" #include "llvm/ADT/STLExtras.h" @@ -21,6 +21,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" @@ -47,6 +48,7 @@ class RISCVAsmParser : public MCTargetAsmParser { SMLoc getLoc() const { return getParser().getTok().getLoc(); } bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); } + bool isRV32E() const { return getSTI().hasFeature(RISCV::FeatureRV32E); } RISCVTargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); @@ -79,14 +81,42 @@ class RISCVAsmParser : public MCTargetAsmParser { // synthesize the desired immedate value into the destination register. void emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out); + // Helper to emit a combination of AUIPC and SecondOpcode. Used to implement + // helpers such as emitLoadLocalAddress and emitLoadAddress. + void emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg, + const MCExpr *Symbol, RISCVMCExpr::VariantKind VKHi, + unsigned SecondOpcode, SMLoc IDLoc, MCStreamer &Out); + // Helper to emit pseudo instruction "lla" used in PC-rel addressing. void emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + // Helper to emit pseudo instruction "la" used in GOT/PC-rel addressing. + void emitLoadAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + + // Helper to emit pseudo instruction "la.tls.ie" used in initial-exec TLS + // addressing. + void emitLoadTLSIEAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + + // Helper to emit pseudo instruction "la.tls.gd" used in global-dynamic TLS + // addressing. + void emitLoadTLSGDAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + + // Helper to emit pseudo load/store instruction with a symbol. + void emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode, SMLoc IDLoc, + MCStreamer &Out, bool HasTmpReg); + + // Checks that a PseudoAddTPRel is using x4/tp in its second input operand. + // Enforcing this using a restricted register class for the second input + // operand of PseudoAddTPRel results in a poor diagnostic due to the fact + // 'add' is an overloaded mnemonic. + bool checkPseudoAddTPRel(MCInst &Inst, OperandVector &Operands); + /// Helper for processing MC instructions that have been successfully matched /// by MatchAndEmitInstruction. Modifications to the emitted instructions, /// like the expansion of pseudo instructions (e.g., "li"), can be performed /// in this method. 
- bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + bool processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, + MCStreamer &Out); // Auto-generated instruction matching functions #define GET_ASSEMBLER_HEADER @@ -99,6 +129,7 @@ class RISCVAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands); OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands); OperandMatchResultTy parseBareSymbol(OperandVector &Operands); + OperandMatchResultTy parseCallSymbol(OperandVector &Operands); OperandMatchResultTy parseJALOffset(OperandVector &Operands); bool parseOperand(OperandVector &Operands, StringRef Mnemonic); @@ -269,6 +300,27 @@ public: VK == RISCVMCExpr::VK_RISCV_None; } + bool isCallSymbol() const { + int64_t Imm; + RISCVMCExpr::VariantKind VK; + // Must be of 'immediate' type but not a constant. + if (!isImm() || evaluateConstantImm(getImm(), Imm, VK)) + return false; + return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) && + (VK == RISCVMCExpr::VK_RISCV_CALL || + VK == RISCVMCExpr::VK_RISCV_CALL_PLT); + } + + bool isTPRelAddSymbol() const { + int64_t Imm; + RISCVMCExpr::VariantKind VK; + // Must be of 'immediate' type but not a constant. + if (!isImm() || evaluateConstantImm(getImm(), Imm, VK)) + return false; + return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) && + VK == RISCVMCExpr::VK_RISCV_TPREL_ADD; + } + bool isCSRSystemRegister() const { return isSystemRegister(); } /// Return true if the operand is a valid for the fence instruction e.g. @@ -463,7 +515,8 @@ public: IsValid = isInt<12>(Imm); return IsValid && ((IsConstantImm && VK == RISCVMCExpr::VK_RISCV_None) || VK == RISCVMCExpr::VK_RISCV_LO || - VK == RISCVMCExpr::VK_RISCV_PCREL_LO); + VK == RISCVMCExpr::VK_RISCV_PCREL_LO || + VK == RISCVMCExpr::VK_RISCV_TPREL_LO); } bool isSImm12Lsb0() const { return isBareSimmNLsb0<12>(); } @@ -489,10 +542,12 @@ public: bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); if (!IsConstantImm) { IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm); - return IsValid && VK == RISCVMCExpr::VK_RISCV_HI; + return IsValid && (VK == RISCVMCExpr::VK_RISCV_HI || + VK == RISCVMCExpr::VK_RISCV_TPREL_HI); } else { return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None || - VK == RISCVMCExpr::VK_RISCV_HI); + VK == RISCVMCExpr::VK_RISCV_HI || + VK == RISCVMCExpr::VK_RISCV_TPREL_HI); } } @@ -505,10 +560,16 @@ public: bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); if (!IsConstantImm) { IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm); - return IsValid && VK == RISCVMCExpr::VK_RISCV_PCREL_HI; + return IsValid && (VK == RISCVMCExpr::VK_RISCV_PCREL_HI || + VK == RISCVMCExpr::VK_RISCV_GOT_HI || + VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI || + VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI); } else { return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None || - VK == RISCVMCExpr::VK_RISCV_PCREL_HI); + VK == RISCVMCExpr::VK_RISCV_PCREL_HI || + VK == RISCVMCExpr::VK_RISCV_GOT_HI || + VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI || + VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI); } } @@ -753,7 +814,7 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, default: break; case Match_Success: - return processInstruction(Inst, IDLoc, Out); + return processInstruction(Inst, IDLoc, Operands, Out); case Match_MissingFeature: return Error(IDLoc, "instruction use requires an option to be enabled"); case Match_MnemonicFail: @@ -844,8 +905,8 @@ bool 
RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidSImm12: return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 11), (1 << 11) - 1, - "operand must be a symbol with %lo/%pcrel_lo modifier or an integer in " - "the range"); + "operand must be a symbol with %lo/%pcrel_lo/%tprel_lo modifier or an " + "integer in the range"); case Match_InvalidSImm12Lsb0: return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 11), (1 << 11) - 2, @@ -856,13 +917,15 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, "immediate must be a multiple of 2 bytes in the range"); case Match_InvalidUImm20LUI: return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 20) - 1, - "operand must be a symbol with %hi() " - "modifier or an integer in the range"); + "operand must be a symbol with " + "%hi/%tprel_hi modifier or an integer in " + "the range"); case Match_InvalidUImm20AUIPC: return generateImmOutOfRangeError( Operands, ErrorInfo, 0, (1 << 20) - 1, - "operand must be a symbol with %pcrel_hi() modifier or an integer in " - "the range"); + "operand must be a symbol with a " + "%pcrel_hi/%got_pcrel_hi/%tls_ie_pcrel_hi/%tls_gd_pcrel_hi modifier or " + "an integer in the range"); case Match_InvalidSImm21Lsb0JAL: return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 20), (1 << 20) - 2, @@ -888,11 +951,33 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return Error(ErrorLoc, "operand must be a bare symbol name"); } + case Match_InvalidCallSymbol: { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); + return Error(ErrorLoc, "operand must be a bare symbol name"); + } + case Match_InvalidTPRelAddSymbol: { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); + return Error(ErrorLoc, "operand must be a symbol with %tprel_add modifier"); + } } llvm_unreachable("Unknown match type detected!"); } +// Attempts to match Name as a register (either using the default name or +// alternative ABI names), setting RegNo to the matching register. Upon +// failure, returns true and sets RegNo to 0. If IsRV32E then registers +// x16-x31 will be rejected. +static bool matchRegisterNameHelper(bool IsRV32E, unsigned &RegNo, + StringRef Name) { + RegNo = MatchRegisterName(Name); + if (RegNo == 0) + RegNo = MatchRegisterAltName(Name); + if (IsRV32E && RegNo >= RISCV::X16 && RegNo <= RISCV::X31) + RegNo = 0; + return RegNo == 0; +} + bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { const AsmToken &Tok = getParser().getTok(); @@ -901,42 +986,45 @@ bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, RegNo = 0; StringRef Name = getLexer().getTok().getIdentifier(); - if (!MatchRegisterName(Name) || !MatchRegisterAltName(Name)) { - getParser().Lex(); // Eat identifier token. - return false; - } + if (matchRegisterNameHelper(isRV32E(), RegNo, Name)) + return Error(StartLoc, "invalid register name"); - return Error(StartLoc, "invalid register name"); + getParser().Lex(); // Eat identifier token. + return false; } OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands, bool AllowParens) { SMLoc FirstS = getLoc(); bool HadParens = false; - AsmToken Buf[2]; + AsmToken LParen; - // If this a parenthesised register name is allowed, parse it atomically + // If this is an LParen and a parenthesised register name is allowed, parse it + // atomically. 
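+  // (We peek two tokens ahead and only consume the '(' when it encloses a
+  // plain "(reg)" form; otherwise the '(' is un-lexed again below, leaving
+  // e.g. parenthesised constant expressions for the immediate parser.)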
   if (AllowParens && getLexer().is(AsmToken::LParen)) {
+    AsmToken Buf[2];
     size_t ReadCount = getLexer().peekTokens(Buf);
     if (ReadCount == 2 && Buf[1].getKind() == AsmToken::RParen) {
       HadParens = true;
+      LParen = getParser().getTok();
       getParser().Lex(); // Eat '('
     }
   }
 
   switch (getLexer().getKind()) {
   default:
+    if (HadParens)
+      getLexer().UnLex(LParen);
     return MatchOperand_NoMatch;
   case AsmToken::Identifier:
     StringRef Name = getLexer().getTok().getIdentifier();
-    unsigned RegNo = MatchRegisterName(Name);
+    unsigned RegNo;
+    matchRegisterNameHelper(isRV32E(), RegNo, Name);
+
     if (RegNo == 0) {
-      RegNo = MatchRegisterAltName(Name);
-      if (RegNo == 0) {
-        if (HadParens)
-          getLexer().UnLex(Buf[0]);
-        return MatchOperand_NoMatch;
-      }
+      if (HadParens)
+        getLexer().UnLex(LParen);
+      return MatchOperand_NoMatch;
     }
     if (HadParens)
       Operands.push_back(RISCVOperand::createToken("(", FirstS, isRV64()));
@@ -965,6 +1053,8 @@ RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
   case AsmToken::LParen:
   case AsmToken::Minus:
   case AsmToken::Plus:
+  case AsmToken::Exclaim:
+  case AsmToken::Tilde:
   case AsmToken::Integer:
   case AsmToken::String: {
     if (getParser().parseExpression(Res))
@@ -1029,8 +1119,11 @@ OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) {
   default:
     return MatchOperand_NoMatch;
   case AsmToken::LParen:
+  case AsmToken::Dot:
   case AsmToken::Minus:
   case AsmToken::Plus:
+  case AsmToken::Exclaim:
+  case AsmToken::Tilde:
   case AsmToken::Integer:
   case AsmToken::String:
   case AsmToken::Identifier:
@@ -1093,12 +1186,55 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) {
   if (getLexer().getKind() != AsmToken::Identifier)
     return MatchOperand_NoMatch;
 
+  StringRef Identifier;
+  AsmToken Tok = getLexer().getTok();
+
+  if (getParser().parseIdentifier(Identifier))
+    return MatchOperand_ParseFail;
+
+  if (Identifier.consume_back("@plt")) {
+    Error(getLoc(), "'@plt' operand not valid for instruction");
+    return MatchOperand_ParseFail;
+  }
+
+  MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+
+  if (Sym->isVariable()) {
+    const MCExpr *V = Sym->getVariableValue(/*SetUsed=*/false);
+    if (!isa<MCSymbolRefExpr>(V)) {
+      getLexer().UnLex(Tok); // Put back if it's not a bare symbol.
+      return MatchOperand_NoMatch;
+    }
+    Res = V;
+  } else
+    Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+  Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+  const MCExpr *Res;
+
+  if (getLexer().getKind() != AsmToken::Identifier)
+    return MatchOperand_NoMatch;
+
+  // Avoid parsing the register in `call rd, foo` as a call symbol.
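+  // (In `call t0, foo` the identifier `t0` is followed by a comma rather
+  // than end-of-statement, so it is matched as the optional link-register
+  // operand instead of the callee symbol.)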
+ if (getLexer().peekTok().getKind() != AsmToken::EndOfStatement) + return MatchOperand_NoMatch; + StringRef Identifier; if (getParser().parseIdentifier(Identifier)) return MatchOperand_ParseFail; + RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL; + if (Identifier.consume_back("@plt")) + Kind = RISCVMCExpr::VK_RISCV_CALL_PLT; + MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); + Res = RISCVMCExpr::create(Res, Kind, getContext()); Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); return MatchOperand_Success; } @@ -1408,42 +1544,144 @@ void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value, } } -void RISCVAsmParser::emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc, - MCStreamer &Out) { - // The local load address pseudo-instruction "lla" is used in PC-relative - // addressing of symbols: - // lla rdest, symbol - // expands to - // TmpLabel: AUIPC rdest, %pcrel_hi(symbol) - // ADDI rdest, %pcrel_lo(TmpLabel) +void RISCVAsmParser::emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg, + const MCExpr *Symbol, + RISCVMCExpr::VariantKind VKHi, + unsigned SecondOpcode, SMLoc IDLoc, + MCStreamer &Out) { + // A pair of instructions for PC-relative addressing; expands to + // TmpLabel: AUIPC TmpReg, VKHi(symbol) + // OP DestReg, TmpReg, %pcrel_lo(TmpLabel) MCContext &Ctx = getContext(); MCSymbol *TmpLabel = Ctx.createTempSymbol( "pcrel_hi", /* AlwaysAddSuffix */ true, /* CanBeUnnamed */ false); Out.EmitLabel(TmpLabel); - MCOperand DestReg = Inst.getOperand(0); - const RISCVMCExpr *Symbol = RISCVMCExpr::create( - Inst.getOperand(1).getExpr(), RISCVMCExpr::VK_RISCV_PCREL_HI, Ctx); - + const RISCVMCExpr *SymbolHi = RISCVMCExpr::create(Symbol, VKHi, Ctx); emitToStreamer( - Out, MCInstBuilder(RISCV::AUIPC).addOperand(DestReg).addExpr(Symbol)); + Out, MCInstBuilder(RISCV::AUIPC).addOperand(TmpReg).addExpr(SymbolHi)); const MCExpr *RefToLinkTmpLabel = RISCVMCExpr::create(MCSymbolRefExpr::create(TmpLabel, Ctx), RISCVMCExpr::VK_RISCV_PCREL_LO, Ctx); - emitToStreamer(Out, MCInstBuilder(RISCV::ADDI) - .addOperand(DestReg) + emitToStreamer(Out, MCInstBuilder(SecondOpcode) .addOperand(DestReg) + .addOperand(TmpReg) .addExpr(RefToLinkTmpLabel)); } +void RISCVAsmParser::emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out) { + // The load local address pseudo-instruction "lla" is used in PC-relative + // addressing of local symbols: + // lla rdest, symbol + // expands to + // TmpLabel: AUIPC rdest, %pcrel_hi(symbol) + // ADDI rdest, rdest, %pcrel_lo(TmpLabel) + MCOperand DestReg = Inst.getOperand(0); + const MCExpr *Symbol = Inst.getOperand(1).getExpr(); + emitAuipcInstPair(DestReg, DestReg, Symbol, RISCVMCExpr::VK_RISCV_PCREL_HI, + RISCV::ADDI, IDLoc, Out); +} + +void RISCVAsmParser::emitLoadAddress(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out) { + // The load address pseudo-instruction "la" is used in PC-relative and + // GOT-indirect addressing of global symbols: + // la rdest, symbol + // expands to either (for non-PIC) + // TmpLabel: AUIPC rdest, %pcrel_hi(symbol) + // ADDI rdest, rdest, %pcrel_lo(TmpLabel) + // or (for PIC) + // TmpLabel: AUIPC rdest, %got_pcrel_hi(symbol) + // Lx rdest, %pcrel_lo(TmpLabel)(rdest) + MCOperand DestReg = Inst.getOperand(0); + const MCExpr *Symbol = Inst.getOperand(1).getExpr(); + unsigned SecondOpcode; + RISCVMCExpr::VariantKind VKHi; + // FIXME: Should check .option (no)pic when implemented + if 
(getContext().getObjectFileInfo()->isPositionIndependent()) { + SecondOpcode = isRV64() ? RISCV::LD : RISCV::LW; + VKHi = RISCVMCExpr::VK_RISCV_GOT_HI; + } else { + SecondOpcode = RISCV::ADDI; + VKHi = RISCVMCExpr::VK_RISCV_PCREL_HI; + } + emitAuipcInstPair(DestReg, DestReg, Symbol, VKHi, SecondOpcode, IDLoc, Out); +} + +void RISCVAsmParser::emitLoadTLSIEAddress(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out) { + // The load TLS IE address pseudo-instruction "la.tls.ie" is used in + // initial-exec TLS model addressing of global symbols: + // la.tls.ie rdest, symbol + // expands to + // TmpLabel: AUIPC rdest, %tls_ie_pcrel_hi(symbol) + // Lx rdest, %pcrel_lo(TmpLabel)(rdest) + MCOperand DestReg = Inst.getOperand(0); + const MCExpr *Symbol = Inst.getOperand(1).getExpr(); + unsigned SecondOpcode = isRV64() ? RISCV::LD : RISCV::LW; + emitAuipcInstPair(DestReg, DestReg, Symbol, RISCVMCExpr::VK_RISCV_TLS_GOT_HI, + SecondOpcode, IDLoc, Out); +} + +void RISCVAsmParser::emitLoadTLSGDAddress(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out) { + // The load TLS GD address pseudo-instruction "la.tls.gd" is used in + // global-dynamic TLS model addressing of global symbols: + // la.tls.gd rdest, symbol + // expands to + // TmpLabel: AUIPC rdest, %tls_gd_pcrel_hi(symbol) + // ADDI rdest, rdest, %pcrel_lo(TmpLabel) + MCOperand DestReg = Inst.getOperand(0); + const MCExpr *Symbol = Inst.getOperand(1).getExpr(); + emitAuipcInstPair(DestReg, DestReg, Symbol, RISCVMCExpr::VK_RISCV_TLS_GD_HI, + RISCV::ADDI, IDLoc, Out); +} + +void RISCVAsmParser::emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode, + SMLoc IDLoc, MCStreamer &Out, + bool HasTmpReg) { + // The load/store pseudo-instruction does a pc-relative load with + // a symbol. + // + // The expansion looks like this + // + // TmpLabel: AUIPC tmp, %pcrel_hi(symbol) + // [S|L]X rd, %pcrel_lo(TmpLabel)(tmp) + MCOperand DestReg = Inst.getOperand(0); + unsigned SymbolOpIdx = HasTmpReg ? 2 : 1; + unsigned TmpRegOpIdx = HasTmpReg ? 
1 : 0; + MCOperand TmpReg = Inst.getOperand(TmpRegOpIdx); + const MCExpr *Symbol = Inst.getOperand(SymbolOpIdx).getExpr(); + emitAuipcInstPair(DestReg, TmpReg, Symbol, RISCVMCExpr::VK_RISCV_PCREL_HI, + Opcode, IDLoc, Out); +} + +bool RISCVAsmParser::checkPseudoAddTPRel(MCInst &Inst, + OperandVector &Operands) { + assert(Inst.getOpcode() == RISCV::PseudoAddTPRel && "Invalid instruction"); + assert(Inst.getOperand(2).isReg() && "Unexpected second operand kind"); + if (Inst.getOperand(2).getReg() != RISCV::X4) { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[3]).getStartLoc(); + return Error(ErrorLoc, "the second input operand must be tp/x4 when using " + "%tprel_add modifier"); + } + + return false; +} + bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, + OperandVector &Operands, MCStreamer &Out) { Inst.setLoc(IDLoc); - if (Inst.getOpcode() == RISCV::PseudoLI) { + switch (Inst.getOpcode()) { + default: + break; + case RISCV::PseudoLI: { unsigned Reg = Inst.getOperand(0).getReg(); const MCOperand &Op1 = Inst.getOperand(1); if (Op1.isExpr()) { @@ -1463,9 +1701,68 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, Imm = SignExtend64<32>(Imm); emitLoadImm(Reg, Imm, Out); return false; - } else if (Inst.getOpcode() == RISCV::PseudoLLA) { + } + case RISCV::PseudoLLA: emitLoadLocalAddress(Inst, IDLoc, Out); return false; + case RISCV::PseudoLA: + emitLoadAddress(Inst, IDLoc, Out); + return false; + case RISCV::PseudoLA_TLS_IE: + emitLoadTLSIEAddress(Inst, IDLoc, Out); + return false; + case RISCV::PseudoLA_TLS_GD: + emitLoadTLSGDAddress(Inst, IDLoc, Out); + return false; + case RISCV::PseudoLB: + emitLoadStoreSymbol(Inst, RISCV::LB, IDLoc, Out, /*HasTmpReg=*/false); + return false; + case RISCV::PseudoLBU: + emitLoadStoreSymbol(Inst, RISCV::LBU, IDLoc, Out, /*HasTmpReg=*/false); + return false; + case RISCV::PseudoLH: + emitLoadStoreSymbol(Inst, RISCV::LH, IDLoc, Out, /*HasTmpReg=*/false); + return false; + case RISCV::PseudoLHU: + emitLoadStoreSymbol(Inst, RISCV::LHU, IDLoc, Out, /*HasTmpReg=*/false); + return false; + case RISCV::PseudoLW: + emitLoadStoreSymbol(Inst, RISCV::LW, IDLoc, Out, /*HasTmpReg=*/false); + return false; + case RISCV::PseudoLWU: + emitLoadStoreSymbol(Inst, RISCV::LWU, IDLoc, Out, /*HasTmpReg=*/false); + return false; + case RISCV::PseudoLD: + emitLoadStoreSymbol(Inst, RISCV::LD, IDLoc, Out, /*HasTmpReg=*/false); + return false; + case RISCV::PseudoFLW: + emitLoadStoreSymbol(Inst, RISCV::FLW, IDLoc, Out, /*HasTmpReg=*/true); + return false; + case RISCV::PseudoFLD: + emitLoadStoreSymbol(Inst, RISCV::FLD, IDLoc, Out, /*HasTmpReg=*/true); + return false; + case RISCV::PseudoSB: + emitLoadStoreSymbol(Inst, RISCV::SB, IDLoc, Out, /*HasTmpReg=*/true); + return false; + case RISCV::PseudoSH: + emitLoadStoreSymbol(Inst, RISCV::SH, IDLoc, Out, /*HasTmpReg=*/true); + return false; + case RISCV::PseudoSW: + emitLoadStoreSymbol(Inst, RISCV::SW, IDLoc, Out, /*HasTmpReg=*/true); + return false; + case RISCV::PseudoSD: + emitLoadStoreSymbol(Inst, RISCV::SD, IDLoc, Out, /*HasTmpReg=*/true); + return false; + case RISCV::PseudoFSW: + emitLoadStoreSymbol(Inst, RISCV::FSW, IDLoc, Out, /*HasTmpReg=*/true); + return false; + case RISCV::PseudoFSD: + emitLoadStoreSymbol(Inst, RISCV::FSD, IDLoc, Out, /*HasTmpReg=*/true); + return false; + case RISCV::PseudoAddTPRel: + if (checkPseudoAddTPRel(Inst, Operands)) + return true; + break; } emitToStreamer(Out, Inst); diff --git a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp 
b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index eafa09d56315..36200c03f703 100644
--- a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -1,9 +1,8 @@
 //===-- RISCVDisassembler.cpp - Disassembler for RISCV --------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -12,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "TargetInfo/RISCVTargetInfo.h"
 #include "Utils/RISCVBaseInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -70,7 +70,13 @@ static const unsigned GPRDecoderTable[] = {
 static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo,
                                            uint64_t Address,
                                            const void *Decoder) {
-  if (RegNo > sizeof(GPRDecoderTable))
+  const FeatureBitset &FeatureBits =
+      static_cast<const MCDisassembler *>(Decoder)
+          ->getSubtargetInfo()
+          .getFeatureBits();
+  bool IsRV32E = FeatureBits[RISCV::FeatureRV32E];
+
+  if (RegNo > array_lengthof(GPRDecoderTable) || (IsRV32E && RegNo > 15))
     return MCDisassembler::Fail;
 
   // We must define our own mapping from RegNo to register identifier.
@@ -95,7 +101,7 @@ static const unsigned FPR32DecoderTable[] = {
 static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo,
                                              uint64_t Address,
                                              const void *Decoder) {
-  if (RegNo > sizeof(FPR32DecoderTable))
+  if (RegNo > array_lengthof(FPR32DecoderTable))
     return MCDisassembler::Fail;
 
   // We must define our own mapping from RegNo to register identifier.
@@ -131,7 +137,7 @@ static const unsigned FPR64DecoderTable[] = {
 static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo,
                                              uint64_t Address,
                                              const void *Decoder) {
-  if (RegNo > sizeof(FPR64DecoderTable))
+  if (RegNo > array_lengthof(FPR64DecoderTable))
     return MCDisassembler::Fail;
 
   // We must define our own mapping from RegNo to register identifier.
diff --git a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
deleted file mode 100644
index 979c8f4e2fa7..000000000000
--- a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//===-- RISCVInstPrinter.cpp - Convert RISCV MCInst to asm syntax ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an RISCV MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "RISCVInstPrinter.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
-#include "Utils/RISCVBaseInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "asm-printer"
-
-// Include the auto-generated portion of the assembly writer.
-#define PRINT_ALIAS_INSTR
-#include "RISCVGenAsmWriter.inc"
-
-// Include the auto-generated portion of the compress emitter.
-#define GEN_UNCOMPRESS_INSTR
-#include "RISCVGenCompressInstEmitter.inc"
-
-static cl::opt<bool>
-    NoAliases("riscv-no-aliases",
-              cl::desc("Disable the emission of assembler pseudo instructions"),
-              cl::init(false), cl::Hidden);
-
-void RISCVInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                                 StringRef Annot, const MCSubtargetInfo &STI) {
-  bool Res = false;
-  const MCInst *NewMI = MI;
-  MCInst UncompressedMI;
-  if (!NoAliases)
-    Res = uncompressInst(UncompressedMI, *MI, MRI, STI);
-  if (Res)
-    NewMI = const_cast<MCInst *>(&UncompressedMI);
-  if (NoAliases || !printAliasInstr(NewMI, STI, O))
-    printInstruction(NewMI, STI, O);
-  printAnnotation(O, Annot);
-}
-
-void RISCVInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
-  O << getRegisterName(RegNo);
-}
-
-void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                    const MCSubtargetInfo &STI, raw_ostream &O,
-                                    const char *Modifier) {
-  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
-  const MCOperand &MO = MI->getOperand(OpNo);
-
-  if (MO.isReg()) {
-    printRegName(O, MO.getReg());
-    return;
-  }
-
-  if (MO.isImm()) {
-    O << MO.getImm();
-    return;
-  }
-
-  assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
-}
-
-void RISCVInstPrinter::printCSRSystemRegister(const MCInst *MI, unsigned OpNo,
-                                              const MCSubtargetInfo &STI,
-                                              raw_ostream &O) {
-  unsigned Imm = MI->getOperand(OpNo).getImm();
-  auto SysReg = RISCVSysReg::lookupSysRegByEncoding(Imm);
-  if (SysReg && SysReg->haveRequiredFeatures(STI.getFeatureBits()))
-    O << SysReg->Name;
-  else
-    O << Imm;
-}
-
-void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
-                                     const MCSubtargetInfo &STI,
-                                     raw_ostream &O) {
-  unsigned FenceArg = MI->getOperand(OpNo).getImm();
-  assert (((FenceArg >> 4) == 0) && "Invalid immediate in printFenceArg");
-
-  if ((FenceArg & RISCVFenceField::I) != 0)
-    O << 'i';
-  if ((FenceArg & RISCVFenceField::O) != 0)
-    O << 'o';
-  if ((FenceArg & RISCVFenceField::R) != 0)
-    O << 'r';
-  if ((FenceArg & RISCVFenceField::W) != 0)
-    O << 'w';
-  if (FenceArg == 0)
-    O << "unknown";
-}
-
-void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo,
-                                   const MCSubtargetInfo &STI, raw_ostream &O) {
-  auto FRMArg =
-      static_cast<RISCVFPRndMode::RoundingMode>(MI->getOperand(OpNo).getImm());
-  O << RISCVFPRndMode::roundingModeToString(FRMArg);
-}
diff --git a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h
deleted file mode 100644
index 0f9bed184996..000000000000
--- a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//===-- RISCVInstPrinter.h - Convert RISCV MCInst to asm syntax ---*- C++ -*--//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the
University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints a RISCV MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_RISCV_INSTPRINTER_RISCVINSTPRINTER_H -#define LLVM_LIB_TARGET_RISCV_INSTPRINTER_RISCVINSTPRINTER_H - -#include "MCTargetDesc/RISCVMCTargetDesc.h" -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { -class MCOperand; - -class RISCVInstPrinter : public MCInstPrinter { -public: - RISCVInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - void printRegName(raw_ostream &O, unsigned RegNo) const override; - - void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O, const char *Modifier = nullptr); - void printCSRSystemRegister(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printFenceArg(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printFRMArg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - - // Autogenerated by tblgen. - void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, - const MCSubtargetInfo &STI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = RISCV::ABIRegAltName); -}; -} // namespace llvm - -#endif diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 7672fea5d95b..ee5f760ebcb0 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -1,9 +1,8 @@ //===-- RISCVAsmBackend.cpp - RISCV Assembler Backend ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -17,6 +16,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -33,6 +33,10 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm, switch ((unsigned)Fixup.getKind()) { default: break; + case RISCV::fixup_riscv_got_hi20: + case RISCV::fixup_riscv_tls_got_hi20: + case RISCV::fixup_riscv_tls_gd_hi20: + return true; case RISCV::fixup_riscv_pcrel_lo12_i: case RISCV::fixup_riscv_pcrel_lo12_s: // For pcrel_lo12, force a relocation if the target of the corresponding @@ -48,6 +52,11 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm, default: llvm_unreachable("Unexpected fixup kind for pcrel_lo12"); break; + case RISCV::fixup_riscv_got_hi20: + case RISCV::fixup_riscv_tls_got_hi20: + case RISCV::fixup_riscv_tls_gd_hi20: + ShouldForce = true; + break; case RISCV::fixup_riscv_pcrel_hi20: ShouldForce = T->getValue()->findAssociatedFragment() != Fixup.getValue()->findAssociatedFragment(); @@ -153,16 +162,12 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { return false; // The canonical nop on RISC-V is addi x0, x0, 0. - uint64_t Nop32Count = Count / 4; - for (uint64_t i = Nop32Count; i != 0; --i) + for (; Count >= 4; Count -= 4) OS.write("\x13\0\0\0", 4); // The canonical nop on RVC is c.nop. - if (HasStdExtC) { - uint64_t Nop16Count = (Count - Nop32Count * 4) / 2; - for (uint64_t i = Nop16Count; i != 0; --i) - OS.write("\x01\0", 2); - } + if (Count && HasStdExtC) + OS.write("\x01\0", 2); return true; } @@ -173,6 +178,10 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, switch (Kind) { default: llvm_unreachable("Unknown fixup kind!"); + case RISCV::fixup_riscv_got_hi20: + case RISCV::fixup_riscv_tls_got_hi20: + case RISCV::fixup_riscv_tls_gd_hi20: + llvm_unreachable("Relocation should be unconditionally forced\n"); case FK_Data_1: case FK_Data_2: case FK_Data_4: @@ -180,12 +189,15 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, return Value; case RISCV::fixup_riscv_lo12_i: case RISCV::fixup_riscv_pcrel_lo12_i: + case RISCV::fixup_riscv_tprel_lo12_i: return Value & 0xfff; case RISCV::fixup_riscv_lo12_s: case RISCV::fixup_riscv_pcrel_lo12_s: + case RISCV::fixup_riscv_tprel_lo12_s: return (((Value >> 5) & 0x7f) << 25) | ((Value & 0x1f) << 7); case RISCV::fixup_riscv_hi20: case RISCV::fixup_riscv_pcrel_hi20: + case RISCV::fixup_riscv_tprel_hi20: // Add 1 if bit 11 is 1, to compensate for low 12 bits being negative. return ((Value + 0x800) >> 12) & 0xfffff; case RISCV::fixup_riscv_jal: { @@ -223,7 +235,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = (Sbit << 31) | (Mid6 << 25) | (Lo4 << 8) | (Hi1 << 7); return Value; } - case RISCV::fixup_riscv_call: { + case RISCV::fixup_riscv_call: + case RISCV::fixup_riscv_call_plt: { // Jalr will add UpperImm with the sign-extended 12-bit LowerImm, // we need to add 0x800ULL before extract upper bits to reflect the // effect of the sign extension. @@ -287,6 +300,60 @@ void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, } } +// Linker relaxation may change code size. We have to insert Nops +// for .align directive when linker relaxation enabled. 
+// The linker can then satisfy alignment by removing Nops.
+// The function returns the total size of the Nops we need to insert.
+bool RISCVAsmBackend::shouldInsertExtraNopBytesForCodeAlign(
+    const MCAlignFragment &AF, unsigned &Size) {
+  // Calculate the Nops Size only when linker relaxation is enabled.
+  if (!STI.getFeatureBits()[RISCV::FeatureRelax])
+    return false;
+
+  bool HasStdExtC = STI.getFeatureBits()[RISCV::FeatureStdExtC];
+  unsigned MinNopLen = HasStdExtC ? 2 : 4;
+
+  if (AF.getAlignment() <= MinNopLen) {
+    return false;
+  } else {
+    Size = AF.getAlignment() - MinNopLen;
+    return true;
+  }
+}
+
+// We need to insert an R_RISCV_ALIGN relocation type to indicate the
+// position of the Nops and the total number of Nop bytes inserted when
+// linker relaxation is enabled.
+// The function inserts a fixup_riscv_align fixup which will eventually be
+// transformed into an R_RISCV_ALIGN relocation type.
+bool RISCVAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm,
+                                                    const MCAsmLayout &Layout,
+                                                    MCAlignFragment &AF) {
+  // Insert the fixup only when linker relaxation is enabled.
+  if (!STI.getFeatureBits()[RISCV::FeatureRelax])
+    return false;
+
+  // Calculate the total Nops we need to insert. If there are none to
+  // insert, simply return.
+  unsigned Count;
+  if (!shouldInsertExtraNopBytesForCodeAlign(AF, Count) || (Count == 0))
+    return false;
+
+  MCContext &Ctx = Asm.getContext();
+  const MCExpr *Dummy = MCConstantExpr::create(0, Ctx);
+  // Create the fixup_riscv_align fixup.
+  MCFixup Fixup =
+      MCFixup::create(0, Dummy, MCFixupKind(RISCV::fixup_riscv_align), SMLoc());
+
+  uint64_t FixedValue = 0;
+  MCValue NopBytes = MCValue::get(Count);
+
+  Asm.getWriter().recordRelocation(Asm, Layout, &AF, Fixup, NopBytes,
+                                   FixedValue);
+
+  return true;
+}
+
 std::unique_ptr<MCObjectTargetWriter>
 RISCVAsmBackend::createObjectTargetWriter() const {
   return createRISCVELFObjectWriter(OSABI, Is64Bit);
 }
@@ -298,5 +365,5 @@ MCAsmBackend *llvm::createRISCVAsmBackend(const Target &T,
                                           const MCTargetOptions &Options) {
   const Triple &TT = STI.getTargetTriple();
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
-  return new RISCVAsmBackend(STI, OSABI, TT.isArch64Bit());
+  return new RISCVAsmBackend(STI, OSABI, TT.isArch64Bit(), Options);
 }
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index b98e45f4053f..254249c87dc8 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -1,9 +1,8 @@
 //===-- RISCVAsmBackend.h - RISCV Assembler Backend -----------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -12,6 +11,7 @@ #include "MCTargetDesc/RISCVFixupKinds.h" #include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "Utils/RISCVBaseInfo.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -26,21 +26,45 @@ class RISCVAsmBackend : public MCAsmBackend { uint8_t OSABI; bool Is64Bit; bool ForceRelocs = false; + const MCTargetOptions &TargetOptions; + RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown; public: - RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit) - : MCAsmBackend(support::little), STI(STI), OSABI(OSABI), - Is64Bit(Is64Bit) {} + RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit, + const MCTargetOptions &Options) + : MCAsmBackend(support::little), STI(STI), OSABI(OSABI), Is64Bit(Is64Bit), + TargetOptions(Options) { + TargetABI = RISCVABI::computeTargetABI( + STI.getTargetTriple(), STI.getFeatureBits(), Options.getABIName()); + RISCVFeatures::validate(STI.getTargetTriple(), STI.getFeatureBits()); + } ~RISCVAsmBackend() override {} void setForceRelocs() { ForceRelocs = true; } + // Returns true if relocations will be forced for shouldForceRelocation by + // default. This will be true if relaxation is enabled or had previously + // been enabled. + bool willForceRelocations() const { + return ForceRelocs || STI.getFeatureBits()[RISCV::FeatureRelax]; + } + // Generate diff expression relocations if the relax feature is enabled or had // previously been enabled, otherwise it is safe for the assembler to // calculate these internally. bool requiresDiffExpressionRelocations() const override { - return STI.getFeatureBits()[RISCV::FeatureRelax] || ForceRelocs; + return willForceRelocations(); } + + // Return Size with extra Nop Bytes for alignment directive in code section. + bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF, + unsigned &Size) override; + + // Insert target specific fixup type for alignment directive in code section. 
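// A minimal sketch of the sizing rule behind the alignment hooks declared
// here, written as a hypothetical standalone helper (not this patch's API):
// with the C extension the smallest nop is 2 bytes, otherwise 4, and an
// alignment no larger than one nop needs no worst-case padding.
#include <cstdint>
static uint64_t worstCaseAlignPadding(uint64_t Alignment, bool HasStdExtC) {
  const uint64_t MinNopLen = HasStdExtC ? 2 : 4;
  // Reserve Alignment - MinNopLen bytes of nops; the R_RISCV_ALIGN
  // relocation lets the linker trim them once final addresses are known.
  return (Alignment <= MinNopLen) ? 0 : Alignment - MinNopLen;
}
// e.g. worstCaseAlignPadding(16, /*HasStdExtC=*/true) == 14.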
+ bool shouldInsertFixupForCodeAlign(MCAssembler &Asm, + const MCAsmLayout &Layout, + MCAlignFragment &AF) override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef Data, uint64_t Value, bool IsResolved, @@ -80,12 +104,21 @@ public: { "fixup_riscv_pcrel_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_riscv_pcrel_lo12_i", 20, 12, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_riscv_pcrel_lo12_s", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_got_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_tprel_hi20", 12, 20, 0 }, + { "fixup_riscv_tprel_lo12_i", 20, 12, 0 }, + { "fixup_riscv_tprel_lo12_s", 0, 32, 0 }, + { "fixup_riscv_tprel_add", 0, 0, 0 }, + { "fixup_riscv_tls_got_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_tls_gd_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_riscv_call", 0, 64, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_riscv_relax", 0, 0, 0 } + { "fixup_riscv_call_plt", 0, 64, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_relax", 0, 0, 0 }, + { "fixup_riscv_align", 0, 0, 0 } }; static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds, "Not all fixup kinds added to Infos array"); @@ -107,6 +140,9 @@ public: bool writeNopData(raw_ostream &OS, uint64_t Count) const override; + + const MCTargetOptions &getTargetOptions() const { return TargetOptions; } + RISCVABI::ABI getTargetABI() const { return TargetABI; } }; } diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 9b88614aa693..3ccbc86d2619 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- RISCVELFObjectWriter.cpp - RISCV ELF Writer -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -49,7 +48,42 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, const MCFixup &Fixup, bool IsPCRel) const { // Determine the type of the relocation - switch ((unsigned)Fixup.getKind()) { + unsigned Kind = Fixup.getKind(); + if (IsPCRel) { + switch (Kind) { + default: + llvm_unreachable("invalid fixup kind!"); + case FK_Data_4: + case FK_PCRel_4: + return ELF::R_RISCV_32_PCREL; + case RISCV::fixup_riscv_pcrel_hi20: + return ELF::R_RISCV_PCREL_HI20; + case RISCV::fixup_riscv_pcrel_lo12_i: + return ELF::R_RISCV_PCREL_LO12_I; + case RISCV::fixup_riscv_pcrel_lo12_s: + return ELF::R_RISCV_PCREL_LO12_S; + case RISCV::fixup_riscv_got_hi20: + return ELF::R_RISCV_GOT_HI20; + case RISCV::fixup_riscv_tls_got_hi20: + return ELF::R_RISCV_TLS_GOT_HI20; + case RISCV::fixup_riscv_tls_gd_hi20: + return ELF::R_RISCV_TLS_GD_HI20; + case RISCV::fixup_riscv_jal: + return ELF::R_RISCV_JAL; + case RISCV::fixup_riscv_branch: + return ELF::R_RISCV_BRANCH; + case RISCV::fixup_riscv_rvc_jump: + return ELF::R_RISCV_RVC_JUMP; + case RISCV::fixup_riscv_rvc_branch: + return ELF::R_RISCV_RVC_BRANCH; + case RISCV::fixup_riscv_call: + return ELF::R_RISCV_CALL; + case RISCV::fixup_riscv_call_plt: + return ELF::R_RISCV_CALL_PLT; + } + } + + switch (Kind) { default: llvm_unreachable("invalid fixup kind!"); case FK_Data_4: @@ -78,24 +112,18 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_RISCV_LO12_I; case RISCV::fixup_riscv_lo12_s: return ELF::R_RISCV_LO12_S; - case RISCV::fixup_riscv_pcrel_hi20: - return ELF::R_RISCV_PCREL_HI20; - case RISCV::fixup_riscv_pcrel_lo12_i: - return ELF::R_RISCV_PCREL_LO12_I; - case RISCV::fixup_riscv_pcrel_lo12_s: - return ELF::R_RISCV_PCREL_LO12_S; - case RISCV::fixup_riscv_jal: - return ELF::R_RISCV_JAL; - case RISCV::fixup_riscv_branch: - return ELF::R_RISCV_BRANCH; - case RISCV::fixup_riscv_rvc_jump: - return ELF::R_RISCV_RVC_JUMP; - case RISCV::fixup_riscv_rvc_branch: - return ELF::R_RISCV_RVC_BRANCH; - case RISCV::fixup_riscv_call: - return ELF::R_RISCV_CALL; + case RISCV::fixup_riscv_tprel_hi20: + return ELF::R_RISCV_TPREL_HI20; + case RISCV::fixup_riscv_tprel_lo12_i: + return ELF::R_RISCV_TPREL_LO12_I; + case RISCV::fixup_riscv_tprel_lo12_s: + return ELF::R_RISCV_TPREL_LO12_S; + case RISCV::fixup_riscv_tprel_add: + return ELF::R_RISCV_TPREL_ADD; case RISCV::fixup_riscv_relax: return ELF::R_RISCV_RELAX; + case RISCV::fixup_riscv_align: + return ELF::R_RISCV_ALIGN; } } diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp index a6ba1e41e964..40fa195f3790 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp @@ -1,9 +1,8 @@ //===-- RISCVELFStreamer.cpp - RISCV ELF Target Streamer Methods ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,7 +11,9 @@ //===----------------------------------------------------------------------===// #include "RISCVELFStreamer.h" +#include "MCTargetDesc/RISCVAsmBackend.h" #include "RISCVMCTargetDesc.h" +#include "Utils/RISCVBaseInfo.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -23,14 +24,35 @@ RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI) : RISCVTargetStreamer(S) { MCAssembler &MCA = getStreamer().getAssembler(); - const FeatureBitset &Features = STI.getFeatureBits(); + auto &MAB = static_cast(MCA.getBackend()); + RISCVABI::ABI ABI = MAB.getTargetABI(); + assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI"); unsigned EFlags = MCA.getELFHeaderEFlags(); if (Features[RISCV::FeatureStdExtC]) EFlags |= ELF::EF_RISCV_RVC; + switch (ABI) { + case RISCVABI::ABI_ILP32: + case RISCVABI::ABI_LP64: + break; + case RISCVABI::ABI_ILP32F: + case RISCVABI::ABI_LP64F: + EFlags |= ELF::EF_RISCV_FLOAT_ABI_SINGLE; + break; + case RISCVABI::ABI_ILP32D: + case RISCVABI::ABI_LP64D: + EFlags |= ELF::EF_RISCV_FLOAT_ABI_DOUBLE; + break; + case RISCVABI::ABI_ILP32E: + EFlags |= ELF::EF_RISCV_RVE; + break; + case RISCVABI::ABI_Unknown: + llvm_unreachable("Improperly initialised target ABI"); + } + MCA.setELFHeaderEFlags(EFlags); } diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h index 1f36bbc43882..138df786eaf3 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h @@ -1,9 +1,8 @@ //===-- RISCVELFStreamer.h - RISCV ELF Target Streamer ---------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index 6a1224be774e..6c7933340608 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -1,9 +1,8 @@ //===-- RISCVFixupKinds.h - RISCV Specific Fixup Entries --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -35,6 +34,27 @@ enum Fixups { // fixup_riscv_pcrel_lo12_s - 12-bit fixup corresponding to pcrel_lo(foo) for // the S-type store instructions fixup_riscv_pcrel_lo12_s, + // fixup_riscv_got_hi20 - 20-bit fixup corresponding to got_pcrel_hi(foo) for + // instructions like auipc + fixup_riscv_got_hi20, + // fixup_riscv_tprel_hi20 - 20-bit fixup corresponding to tprel_hi(foo) for + // instructions like lui + fixup_riscv_tprel_hi20, + // fixup_riscv_tprel_lo12_i - 12-bit fixup corresponding to tprel_lo(foo) for + // instructions like addi + fixup_riscv_tprel_lo12_i, + // fixup_riscv_tprel_lo12_s - 12-bit fixup corresponding to tprel_lo(foo) for + // the S-type store instructions + fixup_riscv_tprel_lo12_s, + // fixup_riscv_tprel_add - A fixup corresponding to %tprel_add(foo) for the + // add_tls instruction. Used to provide a hint to the linker. + fixup_riscv_tprel_add, + // fixup_riscv_tls_got_hi20 - 20-bit fixup corresponding to + // tls_ie_pcrel_hi(foo) for instructions like auipc + fixup_riscv_tls_got_hi20, + // fixup_riscv_tls_gd_hi20 - 20-bit fixup corresponding to + // tls_gd_pcrel_hi(foo) for instructions like auipc + fixup_riscv_tls_gd_hi20, // fixup_riscv_jal - 20-bit fixup for symbol references in the jal // instruction fixup_riscv_jal, @@ -50,9 +70,17 @@ enum Fixups { // fixup_riscv_call - A fixup representing a call attached to the auipc // instruction in a pair composed of adjacent auipc+jalr instructions. fixup_riscv_call, + // fixup_riscv_call_plt - A fixup representing a procedure linkage table call + // attached to the auipc instruction in a pair composed of adjacent auipc+jalr + // instructions. + fixup_riscv_call_plt, // fixup_riscv_relax - Used to generate an R_RISCV_RELAX relocation type, // which indicates the linker may relax the instruction pair. fixup_riscv_relax, + // fixup_riscv_align - Used to generate an R_RISCV_ALIGN relocation type, + // which indicates the linker should fixup the alignment after linker + // relaxation. + fixup_riscv_align, // fixup_riscv_invalid - used as a sentinel and a marker, must be last fixup fixup_riscv_invalid, diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp new file mode 100644 index 000000000000..fe37b70811d8 --- /dev/null +++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -0,0 +1,114 @@ +//===-- RISCVInstPrinter.cpp - Convert RISCV MCInst to asm syntax ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an RISCV MCInst to a .s file. 
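// The hi/lo fixup kinds added above all share one arithmetic convention: a
// lo12 part is consumed sign-extended, so the matching hi20 part must round
// up when bit 11 of the value is set (the "+ 0x800" in adjustFixupValue). A
// small self-contained sketch of that split, with hypothetical names:
#include <cassert>
#include <cstdint>
static void splitHiLo(uint32_t Value, uint32_t &Hi20, int32_t &Lo12) {
  Hi20 = ((Value + 0x800) >> 12) & 0xfffff; // %hi / %pcrel_hi / %tprel_hi
  Lo12 = (int32_t)(Value << 20) >> 20;      // %lo / %pcrel_lo / %tprel_lo
}
static void checkSplitHiLo() {
  uint32_t Hi;
  int32_t Lo;
  splitHiLo(0x12345ffc, Hi, Lo);
  // (0x12346 << 12) + (-4) == 0x12345ffc: hi rounded up, lo went negative.
  assert(Hi == 0x12346 && Lo == -4);
}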
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVInstPrinter.h"
+#include "MCTargetDesc/RISCVMCExpr.h"
+#include "Utils/RISCVBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "RISCVGenAsmWriter.inc"
+
+// Include the auto-generated portion of the compress emitter.
+#define GEN_UNCOMPRESS_INSTR
+#include "RISCVGenCompressInstEmitter.inc"
+
+static cl::opt<bool>
+    NoAliases("riscv-no-aliases",
+              cl::desc("Disable the emission of assembler pseudo instructions"),
+              cl::init(false), cl::Hidden);
+
+void RISCVInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                 StringRef Annot, const MCSubtargetInfo &STI) {
+  bool Res = false;
+  const MCInst *NewMI = MI;
+  MCInst UncompressedMI;
+  if (!NoAliases)
+    Res = uncompressInst(UncompressedMI, *MI, MRI, STI);
+  if (Res)
+    NewMI = const_cast<MCInst *>(&UncompressedMI);
+  if (NoAliases || !printAliasInstr(NewMI, STI, O))
+    printInstruction(NewMI, STI, O);
+  printAnnotation(O, Annot);
+}
+
+void RISCVInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+  O << getRegisterName(RegNo);
+}
+
+void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI, raw_ostream &O,
+                                    const char *Modifier) {
+  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &MO = MI->getOperand(OpNo);
+
+  if (MO.isReg()) {
+    printRegName(O, MO.getReg());
+    return;
+  }
+
+  if (MO.isImm()) {
+    O << MO.getImm();
+    return;
+  }
+
+  assert(MO.isExpr() && "Unknown operand kind in printOperand");
+  MO.getExpr()->print(O, &MAI);
+}
+
+void RISCVInstPrinter::printCSRSystemRegister(const MCInst *MI, unsigned OpNo,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  auto SysReg = RISCVSysReg::lookupSysRegByEncoding(Imm);
+  if (SysReg && SysReg->haveRequiredFeatures(STI.getFeatureBits()))
+    O << SysReg->Name;
+  else
+    O << Imm;
+}
+
+void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  unsigned FenceArg = MI->getOperand(OpNo).getImm();
+  assert (((FenceArg >> 4) == 0) && "Invalid immediate in printFenceArg");
+
+  if ((FenceArg & RISCVFenceField::I) != 0)
+    O << 'i';
+  if ((FenceArg & RISCVFenceField::O) != 0)
+    O << 'o';
+  if ((FenceArg & RISCVFenceField::R) != 0)
+    O << 'r';
+  if ((FenceArg & RISCVFenceField::W) != 0)
+    O << 'w';
+  if (FenceArg == 0)
+    O << "unknown";
+}
+
+void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  auto FRMArg =
+      static_cast<RISCVFPRndMode::RoundingMode>(MI->getOperand(OpNo).getImm());
+  O << RISCVFPRndMode::roundingModeToString(FRMArg);
+}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
new file mode 100644
index 000000000000..5ca1d3fa20fe
--- /dev/null
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
@@ -0,0 +1,54 @@
+//===-- RISCVInstPrinter.h - Convert RISCV MCInst to asm syntax ---*- C++ -*--//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a RISCV MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVINSTPRINTER_H +#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVINSTPRINTER_H + +#include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { +class MCOperand; + +class RISCVInstPrinter : public MCInstPrinter { +public: + RISCVInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + void printRegName(raw_ostream &O, unsigned RegNo) const override; + + void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O, const char *Modifier = nullptr); + void printCSRSystemRegister(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFenceArg(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFRMArg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, + const MCSubtargetInfo &STI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = RISCV::ABIRegAltName); +}; +} // namespace llvm + +#endif diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp index 780dae410cd0..983629692883 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- RISCVMCAsmInfo.cpp - RISCV Asm properties -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -22,6 +21,7 @@ RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) { CommentString = "#"; AlignmentIsInBytes = false; SupportsDebugInformation = true; + ExceptionsType = ExceptionHandling::DwarfCFI; Data16bitsDirective = "\t.half\t"; Data32bitsDirective = "\t.word\t"; } diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h index 901a1eba8af2..043fdb7c08c0 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h @@ -1,9 +1,8 @@ //===-- RISCVMCAsmInfo.h - RISCV Asm Info ----------------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index c5a4ffc0e360..0fc775f63ed4 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
 //===-- RISCVMCCodeEmitter.cpp - Convert RISCV code to machine code -------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -57,6 +56,10 @@ public:
                           SmallVectorImpl<MCFixup> &Fixups,
                           const MCSubtargetInfo &STI) const;
 
+  void expandAddTPRel(const MCInst &MI, raw_ostream &OS,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const;
+
   /// TableGen'erated function for getting the binary encoding for an
   /// instruction.
   uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -85,28 +88,34 @@ MCCodeEmitter *llvm::createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
   return new RISCVMCCodeEmitter(Ctx, MCII);
 }
 
-// Expand PseudoCALL and PseudoTAIL to AUIPC and JALR with relocation types.
-// We expand PseudoCALL and PseudoTAIL while encoding, meaning AUIPC and JALR
-// won't go through RISCV MC to MC compressed instruction transformation. This
-// is acceptable because AUIPC has no 16-bit form and C_JALR have no immediate
-// operand field. We let linker relaxation deal with it. When linker
-// relaxation enabled, AUIPC and JALR have chance relax to JAL. If C extension
-// is enabled, JAL has chance relax to C_JAL.
+// Expand PseudoCALL(Reg) and PseudoTAIL to AUIPC and JALR with relocation
+// types. We expand PseudoCALL(Reg) and PseudoTAIL while encoding, meaning
+// AUIPC and JALR won't go through the RISCV MC to MC compressed instruction
+// transformation. This is acceptable because AUIPC has no 16-bit form and
+// C_JALR has no immediate operand field. We let linker relaxation deal with
+// it: when linker relaxation is enabled, AUIPC and JALR may relax to JAL,
+// and if the C extension is also enabled, JAL may in turn relax to C_JAL.
 void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
                                             SmallVectorImpl<MCFixup> &Fixups,
                                             const MCSubtargetInfo &STI) const {
   MCInst TmpInst;
-  MCOperand Func = MI.getOperand(0);
-  unsigned Ra = (MI.getOpcode() == RISCV::PseudoTAIL) ? RISCV::X6 : RISCV::X1;
+  MCOperand Func;
+  unsigned Ra;
+  if (MI.getOpcode() == RISCV::PseudoTAIL) {
+    Func = MI.getOperand(0);
+    Ra = RISCV::X6;
+  } else if (MI.getOpcode() == RISCV::PseudoCALLReg) {
+    Func = MI.getOperand(1);
+    Ra = MI.getOperand(0).getReg();
+  } else {
+    Func = MI.getOperand(0);
+    Ra = RISCV::X1;
+  }
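// Written out as assembly, the three call-like pseudos expand as follows (a
// single R_RISCV_CALL or R_RISCV_CALL_PLT relocation on the auipc relocates
// the whole pair):
//   PseudoCALL  f        ->  auipc x1, f ; jalr x1, x1, 0   (link in ra)
//   PseudoTAIL  f        ->  auipc x6, f ; jalr x0, x6, 0   (no link, t1 scratch)
//   PseudoCALLReg rd, f  ->  auipc rd, f ; jalr rd, rd, 0   (caller-chosen link)
// A sketch of the same (auipc register, link register) selection with plain
// encodings, as a hypothetical helper (x1 = ra, x6 = t1, x0 discards):
#include <utility>
enum class RISCVCallPseudo { Call, Tail, CallReg };
static std::pair<unsigned, unsigned> callExpansionRegs(RISCVCallPseudo P,
                                                       unsigned Rd) {
  switch (P) {
  case RISCVCallPseudo::Call:    return {1, 1};
  case RISCVCallPseudo::Tail:    return {6, 0};
  case RISCVCallPseudo::CallReg: return {Rd, Rd};
  }
  return {1, 1}; // unreachable; keeps -Wreturn-type quiet
}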
 
   uint32_t Binary;
   assert(Func.isExpr() && "Expected expression");
-  const MCExpr *Expr = Func.getExpr();
-
-  // Create function call expression CallExpr for AUIPC.
-  const MCExpr *CallExpr =
-      RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_CALL, Ctx);
+  const MCExpr *CallExpr = Func.getExpr();
 
   // Emit AUIPC Ra, Func with R_RISCV_CALL relocation type.
   TmpInst = MCInstBuilder(RISCV::AUIPC)
@@ -119,12 +128,50 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
     // Emit JALR X0, X6, 0
     TmpInst = MCInstBuilder(RISCV::JALR).addReg(RISCV::X0).addReg(Ra).addImm(0);
   else
-    // Emit JALR X1, X1, 0
+    // Emit JALR Ra, Ra, 0
    TmpInst = MCInstBuilder(RISCV::JALR).addReg(Ra).addReg(Ra).addImm(0);
   Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
   support::endian::write(OS, Binary, support::little);
 }
 
+// Expand PseudoAddTPRel to a simple ADD with the correct relocation.
+void RISCVMCCodeEmitter::expandAddTPRel(const MCInst &MI, raw_ostream &OS,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  MCOperand DestReg = MI.getOperand(0);
+  MCOperand SrcReg = MI.getOperand(1);
+  MCOperand TPReg = MI.getOperand(2);
+  assert(TPReg.isReg() && TPReg.getReg() == RISCV::X4 &&
+         "Expected thread pointer as second input to TP-relative add");
+
+  MCOperand SrcSymbol = MI.getOperand(3);
+  assert(SrcSymbol.isExpr() &&
+         "Expected expression as third input to TP-relative add");
+
+  const RISCVMCExpr *Expr = dyn_cast<RISCVMCExpr>(SrcSymbol.getExpr());
+  assert(Expr && Expr->getKind() == RISCVMCExpr::VK_RISCV_TPREL_ADD &&
+         "Expected tprel_add relocation on TP-relative symbol");
+
+  // Emit the correct tprel_add relocation for the symbol.
+  Fixups.push_back(MCFixup::create(
+      0, Expr, MCFixupKind(RISCV::fixup_riscv_tprel_add), MI.getLoc()));
+
+  // Emit fixup_riscv_relax for tprel_add where the relax feature is enabled.
+  if (STI.getFeatureBits()[RISCV::FeatureRelax]) {
+    const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx);
+    Fixups.push_back(MCFixup::create(
+        0, Dummy, MCFixupKind(RISCV::fixup_riscv_relax), MI.getLoc()));
+  }
+
+  // Emit a normal ADD instruction with the given operands.
+  MCInst TmpInst = MCInstBuilder(RISCV::ADD)
+                       .addOperand(DestReg)
+                       .addOperand(SrcReg)
+                       .addOperand(TPReg);
+  uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+  support::endian::write(OS, Binary, support::little);
+}
+
 void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
                                            SmallVectorImpl<MCFixup> &Fixups,
                                            const MCSubtargetInfo &STI) const {
@@ -132,13 +179,20 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
   // Get byte count of instruction.
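// The add expanded above is the middle of the canonical local-exec TLS
// sequence; each instruction carries its own relocation:
//   lui   a5, %tprel_hi(sym)            # R_RISCV_TPREL_HI20
//   add   a5, a5, tp, %tprel_add(sym)   # R_RISCV_TPREL_ADD (linker hint only)
//   lw    t0, %tprel_lo(sym)(a5)        # R_RISCV_TPREL_LO12_I
// A host-side sketch of the address arithmetic those relocations encode,
// assuming a small non-negative offset from the thread pointer:
#include <cstdint>
static uint64_t localExecAddress(uint64_t ThreadPointer, uint32_t TprelOffset) {
  uint64_t Hi = (TprelOffset + 0x800u) >> 12;      // %tprel_hi -> lui
  int32_t Lo = (int32_t)(TprelOffset << 20) >> 20; // %tprel_lo -> load offset
  return ThreadPointer + (Hi << 12) + Lo;          // the TP-relative add
}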
unsigned Size = Desc.getSize(); - if (MI.getOpcode() == RISCV::PseudoCALL || + if (MI.getOpcode() == RISCV::PseudoCALLReg || + MI.getOpcode() == RISCV::PseudoCALL || MI.getOpcode() == RISCV::PseudoTAIL) { expandFunctionCall(MI, OS, Fixups, STI); MCNumEmitted += 2; return; } + if (MI.getOpcode() == RISCV::PseudoAddTPRel) { + expandAddTPRel(MI, OS, Fixups, STI); + MCNumEmitted += 1; + return; + } + switch (Size) { default: llvm_unreachable("Unhandled encodeInstruction length!"); @@ -205,6 +259,7 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, const MCExpr *Expr = MO.getExpr(); MCExpr::ExprKind Kind = Expr->getKind(); RISCV::Fixups FixupKind = RISCV::fixup_riscv_invalid; + bool RelaxCandidate = false; if (Kind == MCExpr::Target) { const RISCVMCExpr *RVExpr = cast(Expr); @@ -212,6 +267,13 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, case RISCVMCExpr::VK_RISCV_None: case RISCVMCExpr::VK_RISCV_Invalid: llvm_unreachable("Unhandled fixup kind!"); + case RISCVMCExpr::VK_RISCV_TPREL_ADD: + // tprel_add is only used to indicate that a relocation should be emitted + // for an add instruction used in TP-relative addressing. It should not be + // expanded as if representing an actual instruction operand and so to + // encounter it here is an error. + llvm_unreachable( + "VK_RISCV_TPREL_ADD should not represent an instruction operand"); case RISCVMCExpr::VK_RISCV_LO: if (MIFrm == RISCVII::InstFormatI) FixupKind = RISCV::fixup_riscv_lo12_i; @@ -219,9 +281,11 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, FixupKind = RISCV::fixup_riscv_lo12_s; else llvm_unreachable("VK_RISCV_LO used with unexpected instruction format"); + RelaxCandidate = true; break; case RISCVMCExpr::VK_RISCV_HI: FixupKind = RISCV::fixup_riscv_hi20; + RelaxCandidate = true; break; case RISCVMCExpr::VK_RISCV_PCREL_LO: if (MIFrm == RISCVII::InstFormatI) @@ -231,12 +295,42 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, else llvm_unreachable( "VK_RISCV_PCREL_LO used with unexpected instruction format"); + RelaxCandidate = true; break; case RISCVMCExpr::VK_RISCV_PCREL_HI: FixupKind = RISCV::fixup_riscv_pcrel_hi20; + RelaxCandidate = true; + break; + case RISCVMCExpr::VK_RISCV_GOT_HI: + FixupKind = RISCV::fixup_riscv_got_hi20; + break; + case RISCVMCExpr::VK_RISCV_TPREL_LO: + if (MIFrm == RISCVII::InstFormatI) + FixupKind = RISCV::fixup_riscv_tprel_lo12_i; + else if (MIFrm == RISCVII::InstFormatS) + FixupKind = RISCV::fixup_riscv_tprel_lo12_s; + else + llvm_unreachable( + "VK_RISCV_TPREL_LO used with unexpected instruction format"); + RelaxCandidate = true; + break; + case RISCVMCExpr::VK_RISCV_TPREL_HI: + FixupKind = RISCV::fixup_riscv_tprel_hi20; + RelaxCandidate = true; + break; + case RISCVMCExpr::VK_RISCV_TLS_GOT_HI: + FixupKind = RISCV::fixup_riscv_tls_got_hi20; + break; + case RISCVMCExpr::VK_RISCV_TLS_GD_HI: + FixupKind = RISCV::fixup_riscv_tls_gd_hi20; break; case RISCVMCExpr::VK_RISCV_CALL: FixupKind = RISCV::fixup_riscv_call; + RelaxCandidate = true; + break; + case RISCVMCExpr::VK_RISCV_CALL_PLT: + FixupKind = RISCV::fixup_riscv_call_plt; + RelaxCandidate = true; break; } } else if (Kind == MCExpr::SymbolRef && @@ -258,13 +352,15 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, MCFixup::create(0, Expr, MCFixupKind(FixupKind), MI.getLoc())); ++MCNumFixups; - if (EnableRelax) { - if (FixupKind == RISCV::fixup_riscv_call) { - Fixups.push_back( - MCFixup::create(0, Expr, 
                        MCFixupKind(RISCV::fixup_riscv_relax),
-                       MI.getLoc()));
-      ++MCNumFixups;
-    }
+  // Ensure an R_RISCV_RELAX relocation will be emitted if linker relaxation is
+  // enabled and the current fixup will result in a relocation that may be
+  // relaxed.
+  if (EnableRelax && RelaxCandidate) {
+    const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx);
+    Fixups.push_back(
+        MCFixup::create(0, Dummy, MCFixupKind(RISCV::fixup_riscv_relax),
+                        MI.getLoc()));
+    ++MCNumFixups;
   }
 
   return 0;
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index 53648a5922c8..ae25ec818171 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -1,9 +1,8 @@
 //===-- RISCVMCExpr.cpp - RISCV specific MC expression classes ------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -12,9 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "RISCV.h"
 #include "RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVAsmBackend.h"
+#include "RISCV.h"
 #include "RISCVFixupKinds.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
@@ -32,11 +34,15 @@ const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, VariantKind Kind,
 }
 
 void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  bool HasVariant =
-      ((getKind() != VK_RISCV_None) && (getKind() != VK_RISCV_CALL));
+  VariantKind Kind = getKind();
+  bool HasVariant = ((Kind != VK_RISCV_None) && (Kind != VK_RISCV_CALL) &&
+                     (Kind != VK_RISCV_CALL_PLT));
+
   if (HasVariant)
     OS << '%' << getVariantKindName(getKind()) << '(';
   Expr->print(OS, MAI);
+  if (Kind == VK_RISCV_CALL_PLT)
+    OS << "@plt";
   if (HasVariant)
     OS << ')';
 }
@@ -50,19 +56,30 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup() const {
   if (!AUIPCSRE)
     return nullptr;
 
-  const auto *DF =
-      dyn_cast_or_null<MCDataFragment>(AUIPCSRE->findAssociatedFragment());
+  const MCSymbol *AUIPCSymbol = &AUIPCSRE->getSymbol();
+  const auto *DF = dyn_cast_or_null<MCDataFragment>(AUIPCSymbol->getFragment());
+
   if (!DF)
     return nullptr;
 
-  const MCSymbol *AUIPCSymbol = &AUIPCSRE->getSymbol();
+  uint64_t Offset = AUIPCSymbol->getOffset();
+  if (DF->getContents().size() == Offset) {
+    DF = dyn_cast_or_null<MCDataFragment>(DF->getNextNode());
+    if (!DF)
+      return nullptr;
+    Offset = 0;
+  }
+
   for (const MCFixup &F : DF->getFixups()) {
-    if (F.getOffset() != AUIPCSymbol->getOffset())
+    if (F.getOffset() != Offset)
       continue;
 
     switch ((unsigned)F.getKind()) {
     default:
      continue;
+    case RISCV::fixup_riscv_got_hi20:
+    case RISCV::fixup_riscv_tls_got_hi20:
+    case RISCV::fixup_riscv_tls_gd_hi20:
     case RISCV::fixup_riscv_pcrel_hi20:
       return &F;
     }
@@ -79,6 +96,16 @@ bool RISCVMCExpr::evaluatePCRelLo(MCValue &Res, const MCAsmLayout *Layout,
   // (<real target> + <offset from this fixup to the auipc fixup>). The Fixup
   // is pcrel relative to the VK_RISCV_PCREL_LO fixup, so we need to add the
   // offset to the VK_RISCV_PCREL_HI Fixup from VK_RISCV_PCREL_LO to correct.
+
+  // Don't try to evaluate if the fixup will be forced as a relocation (e.g.
+  // as linker relaxation is enabled).
+  // If we evaluated pcrel_lo in this case, the modified fixup will be
+  // converted into a relocation that no longer points to the pcrel_hi as the
+  // linker requires.
+  auto &RAB =
+      static_cast<RISCVAsmBackend &>(Layout->getAssembler().getBackend());
+  if (RAB.willForceRelocations())
+    return false;
+
   MCValue AUIPCLoc;
   if (!getSubExpr()->evaluateAsValue(AUIPCLoc, *Layout))
     return false;
@@ -137,6 +164,12 @@ bool RISCVMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
     case VK_RISCV_HI:
     case VK_RISCV_PCREL_LO:
     case VK_RISCV_PCREL_HI:
+    case VK_RISCV_GOT_HI:
+    case VK_RISCV_TPREL_LO:
+    case VK_RISCV_TPREL_HI:
+    case VK_RISCV_TPREL_ADD:
+    case VK_RISCV_TLS_GOT_HI:
+    case VK_RISCV_TLS_GD_HI:
       return false;
     }
   }
@@ -154,6 +187,12 @@ RISCVMCExpr::VariantKind RISCVMCExpr::getVariantKindForName(StringRef name) {
       .Case("hi", VK_RISCV_HI)
       .Case("pcrel_lo", VK_RISCV_PCREL_LO)
       .Case("pcrel_hi", VK_RISCV_PCREL_HI)
+      .Case("got_pcrel_hi", VK_RISCV_GOT_HI)
+      .Case("tprel_lo", VK_RISCV_TPREL_LO)
+      .Case("tprel_hi", VK_RISCV_TPREL_HI)
+      .Case("tprel_add", VK_RISCV_TPREL_ADD)
+      .Case("tls_ie_pcrel_hi", VK_RISCV_TLS_GOT_HI)
+      .Case("tls_gd_pcrel_hi", VK_RISCV_TLS_GD_HI)
       .Default(VK_RISCV_Invalid);
 }
 
@@ -169,14 +208,71 @@ StringRef RISCVMCExpr::getVariantKindName(VariantKind Kind) {
     return "pcrel_lo";
   case VK_RISCV_PCREL_HI:
     return "pcrel_hi";
+  case VK_RISCV_GOT_HI:
+    return "got_pcrel_hi";
+  case VK_RISCV_TPREL_LO:
+    return "tprel_lo";
+  case VK_RISCV_TPREL_HI:
+    return "tprel_hi";
+  case VK_RISCV_TPREL_ADD:
+    return "tprel_add";
+  case VK_RISCV_TLS_GOT_HI:
+    return "tls_ie_pcrel_hi";
+  case VK_RISCV_TLS_GD_HI:
+    return "tls_gd_pcrel_hi";
   }
 }
 
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expression");
+    break;
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
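// Why this walk exists: ELF requires every symbol referenced through a TLS
// relocation to carry the STT_TLS type, and the reference may be buried
// under constant-offset nodes. The same recursion over a hypothetical mini
// expression tree (a sketch, not the MCExpr API):
struct MiniExpr {
  enum Kind { Sym, Const, Bin, Un } K;
  MiniExpr *LHS = nullptr, *RHS = nullptr;
  bool IsTLS = false;
};
static void markTLSSyms(MiniExpr *E) {
  switch (E->K) {
  case MiniExpr::Sym:   E->IsTLS = true; break; // setType(ELF::STT_TLS)
  case MiniExpr::Const: break;
  case MiniExpr::Bin:   markTLSSyms(E->LHS); markTLSSyms(E->RHS); break;
  case MiniExpr::Un:    markTLSSyms(E->LHS); break;
  }
}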
+  case MCExpr::SymbolRef: {
+    // We're known to be under a TLS fixup, so any symbol should be
+    // modified. There should be only one.
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
+    break;
+  }
+
+  case MCExpr::Unary:
+    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void RISCVMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch (getKind()) {
+  default:
+    return;
+  case VK_RISCV_TPREL_HI:
+  case VK_RISCV_TLS_GOT_HI:
+  case VK_RISCV_TLS_GD_HI:
+    break;
+  }
+
+  fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
+
 bool RISCVMCExpr::evaluateAsConstant(int64_t &Res) const {
   MCValue Value;
 
   if (Kind == VK_RISCV_PCREL_HI || Kind == VK_RISCV_PCREL_LO ||
-      Kind == VK_RISCV_CALL)
+      Kind == VK_RISCV_GOT_HI || Kind == VK_RISCV_TPREL_HI ||
+      Kind == VK_RISCV_TPREL_LO || Kind == VK_RISCV_TPREL_ADD ||
+      Kind == VK_RISCV_TLS_GOT_HI || Kind == VK_RISCV_TLS_GD_HI ||
+      Kind == VK_RISCV_CALL || Kind == VK_RISCV_CALL_PLT)
     return false;
 
   if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr))
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index 4eafcc08b51f..b5a292dc1b1a 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -1,9 +1,8 @@
 //===-- RISCVMCExpr.h - RISCV specific MC expression classes ----*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -29,7 +28,14 @@ public:
     VK_RISCV_HI,
     VK_RISCV_PCREL_LO,
     VK_RISCV_PCREL_HI,
+    VK_RISCV_GOT_HI,
+    VK_RISCV_TPREL_LO,
+    VK_RISCV_TPREL_HI,
+    VK_RISCV_TPREL_ADD,
+    VK_RISCV_TLS_GOT_HI,
+    VK_RISCV_TLS_GD_HI,
     VK_RISCV_CALL,
+    VK_RISCV_CALL_PLT,
     VK_RISCV_Invalid
   };
 
@@ -53,11 +59,11 @@ public:
 
   const MCExpr *getSubExpr() const { return Expr; }
 
-  /// Get the MCExpr of the VK_RISCV_PCREL_HI Fixup that the
-  /// VK_RISCV_PCREL_LO points to.
+  /// Get the corresponding PC-relative HI fixup that a VK_RISCV_PCREL_LO
+  /// points to.
   ///
   /// \returns nullptr if this isn't a VK_RISCV_PCREL_LO pointing to a
-  /// VK_RISCV_PCREL_HI.
+  /// known PC-relative HI fixup.
   const MCFixup *getPCRelHiFixup() const;
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
@@ -68,8 +74,7 @@ public:
     return getSubExpr()->findAssociatedFragment();
   }
 
-  // There are no TLS RISCVMCExprs at the moment.
-  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
 
   bool evaluateAsConstant(int64_t &Res) const;
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 133f3cd3d39a..bc45262ab2de 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -1,9 +1,8 @@
 //===-- RISCVMCTargetDesc.cpp - RISCV Target Descriptions -----------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -12,10 +11,11 @@ //===----------------------------------------------------------------------===// #include "RISCVMCTargetDesc.h" -#include "InstPrinter/RISCVInstPrinter.h" #include "RISCVELFStreamer.h" +#include "RISCVInstPrinter.h" #include "RISCVMCAsmInfo.h" #include "RISCVTargetStreamer.h" +#include "TargetInfo/RISCVTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInstrInfo.h" @@ -50,7 +50,13 @@ static MCRegisterInfo *createRISCVMCRegisterInfo(const Triple &TT) { static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT) { - return new RISCVMCAsmInfo(TT); + MCAsmInfo *MAI = new RISCVMCAsmInfo(TT); + + unsigned SP = MRI.getDwarfRegNum(RISCV::X2, true); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0); + MAI->addInitialFrameState(Inst); + + return MAI; } static MCSubtargetInfo *createRISCVMCSubtargetInfo(const Triple &TT, diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h index 0228253c08cb..b30997533ddf 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- RISCVMCTargetDesc.h - RISCV Target Descriptions ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,9 +32,6 @@ class Triple; class raw_ostream; class raw_pwrite_stream; -Target &getTheRISCV32Target(); -Target &getTheRISCV64Target(); - MCCodeEmitter *createRISCVMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp index 8d5ef3dbd17f..913e1f744192 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp @@ -1,9 +1,8 @@ //===-- RISCVTargetStreamer.cpp - RISCV Target Streamer Methods -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h index 74ec9e303933..1becc134b2a2 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h @@ -1,9 +1,8 @@ //===-- RISCVTargetStreamer.h - RISCV Target Streamer ----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/RISCV/RISCV.h b/lib/Target/RISCV/RISCV.h index b25aee46200d..834a1d171143 100644 --- a/lib/Target/RISCV/RISCV.h +++ b/lib/Target/RISCV/RISCV.h @@ -1,9 +1,8 @@ //===-- RISCV.h - Top-level interface for RISCV -----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/RISCV/RISCV.td b/lib/Target/RISCV/RISCV.td index 0e86e2bc5e98..e19b70b8e709 100644 --- a/lib/Target/RISCV/RISCV.td +++ b/lib/Target/RISCV/RISCV.td @@ -1,9 +1,8 @@ //===-- RISCV.td - Describe the RISCV Target Machine -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -55,23 +54,29 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">, def RV64 : HwMode<"+64bit">; def RV32 : HwMode<"-64bit">; +def FeatureRV32E + : SubtargetFeature<"e", "IsRV32E", "true", + "Implements RV32E (provides 16 rather than 32 GPRs)">; +def IsRV32E : Predicate<"Subtarget->isRV32E()">, + AssemblerPredicate<"FeatureRV32E">; + def FeatureRelax : SubtargetFeature<"relax", "EnableLinkerRelax", "true", "Enable Linker relaxation.">; //===----------------------------------------------------------------------===// -// Registers, calling conventions, instruction descriptions. +// Named operands for CSR instructions. //===----------------------------------------------------------------------===// -include "RISCVRegisterInfo.td" -include "RISCVCallingConv.td" -include "RISCVInstrInfo.td" +include "RISCVSystemOperands.td" //===----------------------------------------------------------------------===// -// Named operands for CSR instructions. +// Registers, calling conventions, instruction descriptions. //===----------------------------------------------------------------------===// -include "RISCVSystemOperands.td" +include "RISCVRegisterInfo.td" +include "RISCVCallingConv.td" +include "RISCVInstrInfo.td" //===----------------------------------------------------------------------===// // RISC-V processors supported. diff --git a/lib/Target/RISCV/RISCVAsmPrinter.cpp b/lib/Target/RISCV/RISCVAsmPrinter.cpp index bdf8e5d840b3..57631dcb5115 100644 --- a/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- RISCVAsmPrinter.cpp - RISCV LLVM assembly writer ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,9 +12,10 @@ //===----------------------------------------------------------------------===// #include "RISCV.h" -#include "InstPrinter/RISCVInstPrinter.h" +#include "MCTargetDesc/RISCVInstPrinter.h" #include "MCTargetDesc/RISCVMCExpr.h" #include "RISCVTargetMachine.h" +#include "TargetInfo/RISCVTargetInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -43,11 +43,9 @@ public: void EmitInstruction(const MachineInstr *MI) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; void EmitToStreamer(MCStreamer &S, const MCInst &Inst); bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, @@ -84,39 +82,50 @@ void RISCVAsmPrinter::EmitInstruction(const MachineInstr *MI) { } bool RISCVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) { - if (AsmVariant != 0) - report_fatal_error("There are no defined alternate asm variants"); - // First try the generic code, which knows about modifiers like 'c' and 'n'. - if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS)) + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS)) return false; - if (!ExtraCode) { - const MachineOperand &MO = MI->getOperand(OpNo); - switch (MO.getType()) { - case MachineOperand::MO_Immediate: - OS << MO.getImm(); - return false; - case MachineOperand::MO_Register: - OS << RISCVInstPrinter::getRegisterName(MO.getReg()); - return false; + const MachineOperand &MO = MI->getOperand(OpNo); + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. + + switch (ExtraCode[0]) { default: + return true; // Unknown modifier. + case 'z': // Print zero register if zero, regular printing otherwise. + if (MO.isImm() && MO.getImm() == 0) { + OS << RISCVInstPrinter::getRegisterName(RISCV::X0); + return false; + } break; + case 'i': // Literal 'i' if operand is not a register. 
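// How the two modifiers are meant to be used from inline assembly, assuming
// GCC-compatible constraint letters ('J' accepts the constant zero):
//   asm volatile("sw %z1, 0(%0)" :: "r"(p), "rJ"(v) : "memory");
//     // prints x0 when v folds to 0, sparing a register
//   asm volatile("add%i2 %0, %1, %2" : "=r"(d) : "r"(a), "ri"(b));
//     // appends 'i' to the mnemonic when operand 2 is an immediate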
+ if (!MO.isReg()) + OS << 'i'; + return false; } } + switch (MO.getType()) { + case MachineOperand::MO_Immediate: + OS << MO.getImm(); + return false; + case MachineOperand::MO_Register: + OS << RISCVInstPrinter::getRegisterName(MO.getReg()); + return false; + default: + break; + } + return true; } bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNo, unsigned AsmVariant, + unsigned OpNo, const char *ExtraCode, raw_ostream &OS) { - if (AsmVariant != 0) - report_fatal_error("There are no defined alternate asm variants"); - if (!ExtraCode) { const MachineOperand &MO = MI->getOperand(OpNo); // For now, we only support register memory operands in registers and @@ -128,7 +137,7 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } - return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS); + return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, ExtraCode, OS); } // Force static initialization. diff --git a/lib/Target/RISCV/RISCVCallingConv.td b/lib/Target/RISCV/RISCVCallingConv.td index ef146258c383..db13e6e8beca 100644 --- a/lib/Target/RISCV/RISCVCallingConv.td +++ b/lib/Target/RISCV/RISCVCallingConv.td @@ -1,9 +1,8 @@ //===-- RISCVCallingConv.td - Calling Conventions RISCV ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,7 +13,16 @@ // The RISC-V calling convention is handled with custom code in // RISCVISelLowering.cpp (CC_RISCV). -def CSR : CalleeSavedRegs<(add X1, X3, X4, X8, X9, (sequence "X%u", 18, 27))>; +def CSR_ILP32_LP64 + : CalleeSavedRegs<(add X1, X3, X4, X8, X9, (sequence "X%u", 18, 27))>; + +def CSR_ILP32F_LP64F + : CalleeSavedRegs<(add CSR_ILP32_LP64, + F8_32, F9_32, (sequence "F%u_32", 18, 27))>; + +def CSR_ILP32D_LP64D + : CalleeSavedRegs<(add CSR_ILP32_LP64, + F8_64, F9_64, (sequence "F%u_64", 18, 27))>; // Needed for implementation of RISCVRegisterInfo::getNoPreservedMask() def CSR_NoRegs : CalleeSavedRegs<(add)>; diff --git a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 35c185aa5edd..1c5171a7b7a4 100644 --- a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -1,9 +1,8 @@ //===-- RISCVExpandPseudoInsts.cpp - Expand pseudo instructions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -55,6 +54,22 @@ private: bool expandAtomicCmpXchg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsMasked, int Width, MachineBasicBlock::iterator &NextMBBI); + bool expandAuipcInstPair(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned FlagsHi, unsigned SecondOpcode); + bool expandLoadLocalAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadTLSIEAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadTLSGDAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); }; char RISCVExpandPseudo::ID = 0; @@ -87,6 +102,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoAtomicLoadNand32: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32, NextMBBI); + case RISCV::PseudoAtomicLoadNand64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 64, + NextMBBI); case RISCV::PseudoMaskedAtomicSwap32: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, true, 32, NextMBBI); @@ -111,8 +129,18 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, NextMBBI); case RISCV::PseudoCmpXchg32: return expandAtomicCmpXchg(MBB, MBBI, false, 32, NextMBBI); + case RISCV::PseudoCmpXchg64: + return expandAtomicCmpXchg(MBB, MBBI, false, 64, NextMBBI); case RISCV::PseudoMaskedCmpXchg32: return expandAtomicCmpXchg(MBB, MBBI, true, 32, NextMBBI); + case RISCV::PseudoLLA: + return expandLoadLocalAddress(MBB, MBBI, NextMBBI); + case RISCV::PseudoLA: + return expandLoadAddress(MBB, MBBI, NextMBBI); + case RISCV::PseudoLA_TLS_IE: + return expandLoadTLSIEAddress(MBB, MBBI, NextMBBI); + case RISCV::PseudoLA_TLS_GD: + return expandLoadTLSGDAddress(MBB, MBBI, NextMBBI); } return false; @@ -152,12 +180,61 @@ static unsigned getSCForRMW32(AtomicOrdering Ordering) { } } +static unsigned getLRForRMW64(AtomicOrdering Ordering) { + switch (Ordering) { + default: + llvm_unreachable("Unexpected AtomicOrdering"); + case AtomicOrdering::Monotonic: + return RISCV::LR_D; + case AtomicOrdering::Acquire: + return RISCV::LR_D_AQ; + case AtomicOrdering::Release: + return RISCV::LR_D; + case AtomicOrdering::AcquireRelease: + return RISCV::LR_D_AQ; + case AtomicOrdering::SequentiallyConsistent: + return RISCV::LR_D_AQ_RL; + } +} + +static unsigned getSCForRMW64(AtomicOrdering Ordering) { + switch (Ordering) { + default: + llvm_unreachable("Unexpected AtomicOrdering"); + case AtomicOrdering::Monotonic: + return RISCV::SC_D; + case AtomicOrdering::Acquire: + return RISCV::SC_D; + case AtomicOrdering::Release: + return RISCV::SC_D_RL; + case AtomicOrdering::AcquireRelease: + return RISCV::SC_D_RL; + case AtomicOrdering::SequentiallyConsistent: + return RISCV::SC_D_AQ_RL; + } +} + +static unsigned getLRForRMW(AtomicOrdering Ordering, int Width) { + if (Width == 32) + return getLRForRMW32(Ordering); + if (Width == 64) + return getLRForRMW64(Ordering); + llvm_unreachable("Unexpected LR width\n"); +} + +static unsigned getSCForRMW(AtomicOrdering Ordering, int Width) { + if (Width == 32) + return getSCForRMW32(Ordering); + if (Width == 64) + return 
getSCForRMW64(Ordering); + llvm_unreachable("Unexpected SC width\n"); +} + static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL, MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopMBB, MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) { - assert(Width == 32 && "RV64 atomic expansion currently unsupported"); unsigned DestReg = MI.getOperand(0).getReg(); unsigned ScratchReg = MI.getOperand(1).getReg(); unsigned AddrReg = MI.getOperand(2).getReg(); @@ -166,11 +243,11 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, static_cast(MI.getOperand(4).getImm()); // .loop: - // lr.w dest, (addr) + // lr.[w|d] dest, (addr) // binop scratch, dest, val - // sc.w scratch, scratch, (addr) + // sc.[w|d] scratch, scratch, (addr) // bnez scratch, loop - BuildMI(LoopMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg) + BuildMI(LoopMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg) .addReg(AddrReg); switch (BinOp) { default: @@ -184,7 +261,7 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, .addImm(-1); break; } - BuildMI(LoopMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg) + BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg) .addReg(AddrReg) .addReg(ScratchReg); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) @@ -219,7 +296,7 @@ static void doMaskedAtomicBinOpExpansion( const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL, MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopMBB, MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) { - assert(Width == 32 && "RV64 atomic expansion currently unsupported"); + assert(Width == 32 && "Should never need to expand masked 64-bit operations"); unsigned DestReg = MI.getOperand(0).getReg(); unsigned ScratchReg = MI.getOperand(1).getReg(); unsigned AddrReg = MI.getOperand(2).getReg(); @@ -333,7 +410,7 @@ bool RISCVExpandPseudo::expandAtomicMinMaxOp( MachineBasicBlock::iterator &NextMBBI) { assert(IsMasked == true && "Should only need to expand masked atomic max/min"); - assert(Width == 32 && "RV64 atomic expansion currently unsupported"); + assert(Width == 32 && "Should never need to expand masked 64-bit operations"); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); @@ -451,7 +528,6 @@ bool RISCVExpandPseudo::expandAtomicMinMaxOp( bool RISCVExpandPseudo::expandAtomicCmpXchg( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsMasked, int Width, MachineBasicBlock::iterator &NextMBBI) { - assert(Width == 32 && "RV64 atomic expansion currently unsupported"); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB.getParent(); @@ -483,18 +559,18 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg( if (!IsMasked) { // .loophead: - // lr.w dest, (addr) + // lr.[w|d] dest, (addr) // bne dest, cmpval, done - BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg) + BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg) .addReg(AddrReg); BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE)) .addReg(DestReg) .addReg(CmpValReg) .addMBB(DoneMBB); // .looptail: - // sc.w scratch, newval, (addr) + // sc.[w|d] scratch, newval, (addr) // bnez scratch, loophead - BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg) + BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg) .addReg(AddrReg) .addReg(NewValReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) @@ -507,7 +583,7 @@ bool 
RISCVExpandPseudo::expandAtomicCmpXchg( // and scratch, dest, mask // bne scratch, cmpval, done unsigned MaskReg = MI.getOperand(5).getReg(); - BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg) + BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg) .addReg(AddrReg); BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), ScratchReg) .addReg(DestReg) @@ -525,7 +601,7 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg( // bnez scratch, loophead insertMaskedMerge(TII, DL, LoopTailMBB, ScratchReg, DestReg, NewValReg, MaskReg, ScratchReg); - BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg) + BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg) .addReg(AddrReg) .addReg(ScratchReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) @@ -545,6 +621,90 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg( return true; } +bool RISCVExpandPseudo::expandAuipcInstPair( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, unsigned FlagsHi, + unsigned SecondOpcode) { + MachineFunction *MF = MBB.getParent(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + unsigned DestReg = MI.getOperand(0).getReg(); + const MachineOperand &Symbol = MI.getOperand(1); + + MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + // Tell AsmPrinter that we unconditionally want the symbol of this label to be + // emitted. + NewMBB->setLabelMustBeEmitted(); + + MF->insert(++MBB.getIterator(), NewMBB); + + BuildMI(NewMBB, DL, TII->get(RISCV::AUIPC), DestReg) + .addDisp(Symbol, 0, FlagsHi); + BuildMI(NewMBB, DL, TII->get(SecondOpcode), DestReg) + .addReg(DestReg) + .addMBB(NewMBB, RISCVII::MO_PCREL_LO); + + // Move all the rest of the instructions to NewMBB. + NewMBB->splice(NewMBB->end(), &MBB, std::next(MBBI), MBB.end()); + // Update machine-CFG edges. + NewMBB->transferSuccessorsAndUpdatePHIs(&MBB); + // Make the original basic block fall-through to the new. + MBB.addSuccessor(NewMBB); + + // Make sure live-ins are correctly attached to this new basic block. + LivePhysRegs LiveRegs; + computeAndAddLiveIns(LiveRegs, *NewMBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + +bool RISCVExpandPseudo::expandLoadLocalAddress( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + return expandAuipcInstPair(MBB, MBBI, NextMBBI, RISCVII::MO_PCREL_HI, + RISCV::ADDI); +} + +bool RISCVExpandPseudo::expandLoadAddress( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineFunction *MF = MBB.getParent(); + + unsigned SecondOpcode; + unsigned FlagsHi; + if (MF->getTarget().isPositionIndependent()) { + const auto &STI = MF->getSubtarget(); + SecondOpcode = STI.is64Bit() ? RISCV::LD : RISCV::LW; + FlagsHi = RISCVII::MO_GOT_HI; + } else { + SecondOpcode = RISCV::ADDI; + FlagsHi = RISCVII::MO_PCREL_HI; + } + return expandAuipcInstPair(MBB, MBBI, NextMBBI, FlagsHi, SecondOpcode); +} + +bool RISCVExpandPseudo::expandLoadTLSIEAddress( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineFunction *MF = MBB.getParent(); + + const auto &STI = MF->getSubtarget(); + unsigned SecondOpcode = STI.is64Bit() ? 
RISCV::LD : RISCV::LW; + return expandAuipcInstPair(MBB, MBBI, NextMBBI, RISCVII::MO_TLS_GOT_HI, + SecondOpcode); +} + +bool RISCVExpandPseudo::expandLoadTLSGDAddress( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + return expandAuipcInstPair(MBB, MBBI, NextMBBI, RISCVII::MO_TLS_GD_HI, + RISCV::ADDI); +} + } // end of anonymous namespace INITIALIZE_PASS(RISCVExpandPseudo, "riscv-expand-pseudo", diff --git a/lib/Target/RISCV/RISCVFrameLowering.cpp b/lib/Target/RISCV/RISCVFrameLowering.cpp index 74417899c8da..32c3b9684d2c 100644 --- a/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1,9 +1,8 @@ //===-- RISCVFrameLowering.cpp - RISCV Frame Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -19,6 +18,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCDwarf.h" using namespace llvm; @@ -97,6 +97,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); auto *RVFI = MF.getInfo(); + const RISCVRegisterInfo *RI = STI.getRegisterInfo(); + const RISCVInstrInfo *TII = STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.begin(); unsigned FPReg = getFPReg(STI); @@ -120,6 +122,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // Allocate space on the stack if necessary. adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup); + // Emit ".cfi_def_cfa_offset StackSize" + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + // The frame pointer is callee-saved, and code has been generated for us to // save it to the stack. We need to skip over the storing of callee-saved // registers as the frame pointer must be modified after it has been saved @@ -129,10 +137,28 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, const std::vector &CSI = MFI.getCalleeSavedInfo(); std::advance(MBBI, CSI.size()); + // Iterate over list of callee-saved registers and emit .cfi_offset + // directives. + for (const auto &Entry : CSI) { + int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx()); + unsigned Reg = Entry.getReg(); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, RI->getDwarfRegNum(Reg, true), Offset)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + // Generate new FP. 
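The CFI directives added above all follow one emission pattern: build an MCCFIInstruction describing how to recompute the CFA (createDefCfaOffset, createDefCfa) or where a callee-saved register was spilled (createOffset), register it with the MachineFunction, and attach a CFI_INSTRUCTION pseudo carrying its index. A condensed sketch of that pattern; the helper name emitCFIInst is illustrative and not part of this patch:

static void emitCFIInst(MachineFunction &MF, MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                        const TargetInstrInfo *TII,
                        const MCCFIInstruction &CFIInst) {
  // Register the directive with the function, then reference it by index
  // from a CFI_INSTRUCTION pseudo inserted at the current point.
  unsigned CFIIndex = MF.addFrameInst(CFIInst);
  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
}

// Usage matching the ".cfi_def_cfa_offset StackSize" case above:
//   emitCFIInst(MF, MBB, MBBI, DL, TII,
//               MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));

The prologue then regenerates the frame pointer and, once it holds its new value, redefines the CFA in terms of it: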
- if (hasFP(MF)) + if (hasFP(MF)) { adjustReg(MBB, MBBI, DL, FPReg, SPReg, StackSize - RVFI->getVarArgsSaveSize(), MachineInstr::FrameSetup); + + // Emit ".cfi_def_cfa $fp, 0" + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( + nullptr, RI->getDwarfRegNum(FPReg, true), 0)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } } void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, @@ -142,6 +168,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); auto *RVFI = MF.getInfo(); DebugLoc DL = MBBI->getDebugLoc(); + const RISCVInstrInfo *TII = STI.getInstrInfo(); unsigned FPReg = getFPReg(STI); unsigned SPReg = getSPReg(STI); @@ -151,19 +178,58 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, auto LastFrameDestroy = std::prev(MBBI, MFI.getCalleeSavedInfo().size()); uint64_t StackSize = MFI.getStackSize(); + uint64_t FPOffset = StackSize - RVFI->getVarArgsSaveSize(); // Restore the stack pointer using the value of the frame pointer. Only // necessary if the stack pointer was modified, meaning the stack size is // unknown. if (RI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) { assert(hasFP(MF) && "frame pointer should not have been eliminated"); - adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, - -StackSize + RVFI->getVarArgsSaveSize(), + adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, -FPOffset, MachineInstr::FrameDestroy); } + if (hasFP(MF)) { + // To find the instruction restoring FP from stack. + for (auto &I = LastFrameDestroy; I != MBBI; ++I) { + if (I->mayLoad() && I->getOperand(0).isReg()) { + unsigned DestReg = I->getOperand(0).getReg(); + if (DestReg == FPReg) { + // If there is frame pointer, after restoring $fp registers, we + // need adjust CFA to ($sp - FPOffset). + // Emit ".cfi_def_cfa $sp, -FPOffset" + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( + nullptr, RI->getDwarfRegNum(SPReg, true), -FPOffset)); + BuildMI(MBB, std::next(I), DL, + TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + break; + } + } + } + } + + // Add CFI directives for callee-saved registers. + const std::vector &CSI = MFI.getCalleeSavedInfo(); + // Iterate over list of callee-saved registers and emit .cfi_restore + // directives. + for (const auto &Entry : CSI) { + unsigned Reg = Entry.getReg(); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore( + nullptr, RI->getDwarfRegNum(Reg, true))); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + // Deallocate stack adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); + + // After restoring $sp, we need to adjust CFA to $(sp + 0) + // Emit ".cfi_def_cfa_offset 0" + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h index ca653c2b9f17..0e045c3ff853 100644 --- a/lib/Target/RISCV/RISCVFrameLowering.h +++ b/lib/Target/RISCV/RISCVFrameLowering.h @@ -1,9 +1,8 @@ //===-- RISCVFrameLowering.h - Define frame lowering for RISCV -*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index aa80365feb83..d0a3af375a6d 100644 --- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- RISCVISelDAGToDAG.cpp - A dag to dag inst selector for RISCV ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -156,7 +155,15 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } } + break; } + case RISCVISD::READ_CYCLE_WIDE: + assert(!Subtarget->is64Bit() && "READ_CYCLE_WIDE is only used on riscv32"); + + ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ReadCycleWide, DL, MVT::i32, + MVT::i32, MVT::Other, + Node->getOperand(0))); + return; } // Select the default instruction. diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp index 508dcbd009ed..ce7b85911ab6 100644 --- a/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1,9 +1,8 @@ //===-- RISCVISelLowering.cpp - RISCV DAG Lowering Implementation --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -18,6 +17,8 @@ #include "RISCVRegisterInfo.h" #include "RISCVSubtarget.h" #include "RISCVTargetMachine.h" +#include "Utils/RISCVMatInt.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -43,6 +44,24 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, const RISCVSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { + if (Subtarget.isRV32E()) + report_fatal_error("Codegen not yet implemented for RV32E"); + + RISCVABI::ABI ABI = Subtarget.getTargetABI(); + assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI"); + + switch (ABI) { + default: + report_fatal_error("Don't know how to lower this ABI"); + case RISCVABI::ABI_ILP32: + case RISCVABI::ABI_ILP32F: + case RISCVABI::ABI_ILP32D: + case RISCVABI::ABI_LP64: + case RISCVABI::ABI_LP64F: + case RISCVABI::ABI_LP64D: + break; + } + MVT XLenVT = Subtarget.getXLenVT(); // Set up the register classes. 
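The ABI validated here also determines which callee-saved list applies: the CSR_ILP32F_LP64F and CSR_ILP32D_LP64D sets defined in RISCVCallingConv.td earlier in this patch are consumed by the register info, which is not part of this excerpt. A sketch of that selection, with the switch shape assumed rather than quoted (TableGen emits the *_SaveList arrays from the CalleeSavedRegs definitions):

const MCPhysReg *
RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
  switch (MF->getSubtarget<RISCVSubtarget>().getTargetABI()) {
  default:
    llvm_unreachable("Unrecognized ABI");
  case RISCVABI::ABI_ILP32:
  case RISCVABI::ABI_LP64:
    return CSR_ILP32_LP64_SaveList;   // x1, x3, x4, x8, x9, x18-x27
  case RISCVABI::ABI_ILP32F:
  case RISCVABI::ABI_LP64F:
    return CSR_ILP32F_LP64F_SaveList; // ...plus f8, f9, f18-f27 as FPR32
  case RISCVABI::ABI_ILP32D:
  case RISCVABI::ABI_LP64D:
    return CSR_ILP32D_LP64D_SaveList; // ...plus f8, f9, f18-f27 as FPR64
  }
}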
@@ -81,10 +100,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (Subtarget.is64Bit()) { - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::ANY_EXTEND); + setOperationAction(ISD::SHL, MVT::i32, Custom); + setOperationAction(ISD::SRA, MVT::i32, Custom); + setOperationAction(ISD::SRL, MVT::i32, Custom); } if (!Subtarget.hasStdExtM()) { @@ -97,14 +115,20 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UREM, XLenVT, Expand); } + if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) { + setOperationAction(ISD::SDIV, MVT::i32, Custom); + setOperationAction(ISD::UDIV, MVT::i32, Custom); + setOperationAction(ISD::UREM, MVT::i32, Custom); + } + setOperationAction(ISD::SDIVREM, XLenVT, Expand); setOperationAction(ISD::UDIVREM, XLenVT, Expand); setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand); setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand); - setOperationAction(ISD::SHL_PARTS, XLenVT, Expand); - setOperationAction(ISD::SRL_PARTS, XLenVT, Expand); - setOperationAction(ISD::SRA_PARTS, XLenVT, Expand); + setOperationAction(ISD::SHL_PARTS, XLenVT, Custom); + setOperationAction(ISD::SRL_PARTS, XLenVT, Custom); + setOperationAction(ISD::SRA_PARTS, XLenVT, Custom); setOperationAction(ISD::ROTL, XLenVT, Expand); setOperationAction(ISD::ROTR, XLenVT, Expand); @@ -114,9 +138,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, XLenVT, Expand); ISD::CondCode FPCCToExtend[] = { - ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETO, ISD::SETUEQ, - ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, - ISD::SETGT, ISD::SETGE, ISD::SETNE}; + ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT, + ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT, + ISD::SETGE, ISD::SETNE}; ISD::NodeType FPOpToExtend[] = { ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM}; @@ -133,6 +157,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::f32, Expand); } + if (Subtarget.hasStdExtF() && Subtarget.is64Bit()) + setOperationAction(ISD::BITCAST, MVT::i32, Custom); + if (Subtarget.hasStdExtD()) { setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); @@ -151,6 +178,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BlockAddress, XLenVT, Custom); setOperationAction(ISD::ConstantPool, XLenVT, Custom); + setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom); + + // TODO: On M-mode only targets, the cycle[h] CSR may not be present. + // Unfortunately this can't be determined just from the ISA naming string. + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, + Subtarget.is64Bit() ? Legal : Custom); + if (Subtarget.hasStdExtA()) { setMaxAtomicSizeInBitsSupported(Subtarget.getXLen()); setMinCmpXchgSizeInBits(32); @@ -276,6 +310,11 @@ bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const { return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64; } +bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const { + return (VT == MVT::f32 && Subtarget.hasStdExtF()) || + (VT == MVT::f64 && Subtarget.hasStdExtD()); +} + // Changes the condition code and swaps operands if necessary, so the SetCC // operation matches one of the comparisons supported directly in the RISC-V // ISA. 
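RISC-V's native integer comparisons are eq/ne, lt/ge and ltu/geu, so gt/le and their unsigned forms are handled by swapping the operands. The helper body this comment introduces is elided from the hunk; it is essentially:

static void normaliseSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
  switch (CC) {
  default:
    break;
  case ISD::SETGT:
  case ISD::SETLE:
  case ISD::SETUGT:
  case ISD::SETULE:
    // Flip GT/LE (and unsigned variants) to the supported LT/GE forms by
    // swapping the comparison's operands.
    CC = ISD::getSetCCSwappedOperands(CC);
    std::swap(LHS, RHS);
    break;
  }
}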
@@ -326,6 +365,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return lowerBlockAddress(Op, DAG);
   case ISD::ConstantPool:
     return lowerConstantPool(Op, DAG);
+  case ISD::GlobalTLSAddress:
+    return lowerGlobalTLSAddress(Op, DAG);
   case ISD::SELECT:
     return lowerSELECT(Op, DAG);
   case ISD::VASTART:
@@ -334,6 +375,81 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return lowerFRAMEADDR(Op, DAG);
   case ISD::RETURNADDR:
     return lowerRETURNADDR(Op, DAG);
+  case ISD::SHL_PARTS:
+    return lowerShiftLeftParts(Op, DAG);
+  case ISD::SRA_PARTS:
+    return lowerShiftRightParts(Op, DAG, true);
+  case ISD::SRL_PARTS:
+    return lowerShiftRightParts(Op, DAG, false);
+  case ISD::BITCAST: {
+    assert(Subtarget.is64Bit() && Subtarget.hasStdExtF() &&
+           "Unexpected custom legalisation");
+    SDLoc DL(Op);
+    SDValue Op0 = Op.getOperand(0);
+    if (Op.getValueType() != MVT::f32 || Op0.getValueType() != MVT::i32)
+      return SDValue();
+    SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
+    SDValue FPConv = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
+    return FPConv;
+  }
+  }
+}
+
+static SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty,
+                             SelectionDAG &DAG, unsigned Flags) {
+  return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags);
+}
+
+static SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty,
+                             SelectionDAG &DAG, unsigned Flags) {
+  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(),
+                                   Flags);
+}
+
+static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty,
+                             SelectionDAG &DAG, unsigned Flags) {
+  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+                                   N->getOffset(), Flags);
+}
+
+template <class NodeTy>
+SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
+                                     bool IsLocal) const {
+  SDLoc DL(N);
+  EVT Ty = getPointerTy(DAG.getDataLayout());
+
+  if (isPositionIndependent()) {
+    SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
+    if (IsLocal)
+      // Use PC-relative addressing to access the symbol. This generates the
+      // pattern (PseudoLLA sym), which expands to (addi (auipc %pcrel_hi(sym))
+      // %pcrel_lo(auipc)).
+      return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
+
+    // Use PC-relative addressing to access the GOT for this symbol, then load
+    // the address from the GOT. This generates the pattern (PseudoLA sym),
+    // which expands to (ld (addi (auipc %got_pcrel_hi(sym)) %pcrel_lo(auipc))).
+    return SDValue(DAG.getMachineNode(RISCV::PseudoLA, DL, Ty, Addr), 0);
+  }
+
+  switch (getTargetMachine().getCodeModel()) {
+  default:
+    report_fatal_error("Unsupported code model for lowering");
+  case CodeModel::Small: {
+    // Generate a sequence for accessing addresses within the first 2 GiB of
+    // address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
+    SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI);
+    SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO);
+    SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
+    return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, AddrLo), 0);
+  }
+  case CodeModel::Medium: {
+    // Generate a sequence for accessing addresses within any 2 GiB range
+    // within the address space. This generates the pattern (PseudoLLA sym),
+    // which expands to (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)).
+ SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0); + return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0); + } } } @@ -342,67 +458,145 @@ SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op, SDLoc DL(Op); EVT Ty = Op.getValueType(); GlobalAddressSDNode *N = cast(Op); - const GlobalValue *GV = N->getGlobal(); int64_t Offset = N->getOffset(); MVT XLenVT = Subtarget.getXLenVT(); - if (isPositionIndependent()) - report_fatal_error("Unable to lowerGlobalAddress"); + const GlobalValue *GV = N->getGlobal(); + bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); + SDValue Addr = getAddr(N, DAG, IsLocal); + // In order to maximise the opportunity for common subexpression elimination, // emit a separate ADD node for the global address offset instead of folding // it in the global address node. Later peephole optimisations may choose to // fold it back in when profitable. - SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_HI); - SDValue GALo = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_LO); - SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, GAHi), 0); - SDValue MNLo = - SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, GALo), 0); if (Offset != 0) - return DAG.getNode(ISD::ADD, DL, Ty, MNLo, + return DAG.getNode(ISD::ADD, DL, Ty, Addr, DAG.getConstant(Offset, DL, XLenVT)); - return MNLo; + return Addr; } SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT Ty = Op.getValueType(); BlockAddressSDNode *N = cast(Op); - const BlockAddress *BA = N->getBlockAddress(); - int64_t Offset = N->getOffset(); - - if (isPositionIndependent()) - report_fatal_error("Unable to lowerBlockAddress"); - SDValue BAHi = DAG.getTargetBlockAddress(BA, Ty, Offset, RISCVII::MO_HI); - SDValue BALo = DAG.getTargetBlockAddress(BA, Ty, Offset, RISCVII::MO_LO); - SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, BAHi), 0); - SDValue MNLo = - SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, BALo), 0); - return MNLo; + return getAddr(N, DAG); } SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op, SelectionDAG &DAG) const { + ConstantPoolSDNode *N = cast(Op); + + return getAddr(N, DAG); +} + +SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, + SelectionDAG &DAG, + bool UseGOT) const { + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + const GlobalValue *GV = N->getGlobal(); + MVT XLenVT = Subtarget.getXLenVT(); + + if (UseGOT) { + // Use PC-relative addressing to access the GOT for this TLS symbol, then + // load the address from the GOT and add the thread pointer. This generates + // the pattern (PseudoLA_TLS_IE sym), which expands to + // (ld (auipc %tls_ie_pcrel_hi(sym)) %pcrel_lo(auipc)). + SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0); + SDValue Load = + SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_IE, DL, Ty, Addr), 0); + + // Add the thread pointer. + SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT); + return DAG.getNode(ISD::ADD, DL, Ty, Load, TPReg); + } + + // Generate a sequence for accessing the address relative to the thread + // pointer, with the appropriate adjustment for the thread pointer offset. 
+ // This generates the pattern + // (add (add_tprel (lui %tprel_hi(sym)) tp %tprel_add(sym)) %tprel_lo(sym)) + SDValue AddrHi = + DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_HI); + SDValue AddrAdd = + DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_ADD); + SDValue AddrLo = + DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_LO); + + SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0); + SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT); + SDValue MNAdd = SDValue( + DAG.getMachineNode(RISCV::PseudoAddTPRel, DL, Ty, MNHi, TPReg, AddrAdd), + 0); + return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNAdd, AddrLo), 0); +} + +SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, + SelectionDAG &DAG) const { + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + IntegerType *CallTy = Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits()); + const GlobalValue *GV = N->getGlobal(); + + // Use a PC-relative addressing mode to access the global dynamic GOT address. + // This generates the pattern (PseudoLA_TLS_GD sym), which expands to + // (addi (auipc %tls_gd_pcrel_hi(sym)) %pcrel_lo(auipc)). + SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0); + SDValue Load = + SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_GD, DL, Ty, Addr), 0); + + // Prepare argument list to generate call. + ArgListTy Args; + ArgListEntry Entry; + Entry.Node = Load; + Entry.Ty = CallTy; + Args.push_back(Entry); + + // Setup call to __tls_get_addr. + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::C, CallTy, + DAG.getExternalSymbol("__tls_get_addr", Ty), + std::move(Args)); + + return LowerCallTo(CLI).first; +} + +SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { SDLoc DL(Op); EVT Ty = Op.getValueType(); - ConstantPoolSDNode *N = cast(Op); - const Constant *CPA = N->getConstVal(); + GlobalAddressSDNode *N = cast(Op); int64_t Offset = N->getOffset(); - unsigned Alignment = N->getAlignment(); - - if (!isPositionIndependent()) { - SDValue CPAHi = - DAG.getTargetConstantPool(CPA, Ty, Alignment, Offset, RISCVII::MO_HI); - SDValue CPALo = - DAG.getTargetConstantPool(CPA, Ty, Alignment, Offset, RISCVII::MO_LO); - SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, CPAHi), 0); - SDValue MNLo = - SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, CPALo), 0); - return MNLo; - } else { - report_fatal_error("Unable to lowerConstantPool"); + MVT XLenVT = Subtarget.getXLenVT(); + + // Non-PIC TLS lowering should always use the LocalExec model. + TLSModel::Model Model = isPositionIndependent() + ? getTargetMachine().getTLSModel(N->getGlobal()) + : TLSModel::LocalExec; + + SDValue Addr; + switch (Model) { + case TLSModel::LocalExec: + Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/false); + break; + case TLSModel::InitialExec: + Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/true); + break; + case TLSModel::LocalDynamic: + case TLSModel::GeneralDynamic: + Addr = getDynamicTLSAddr(N, DAG); + break; } + + // In order to maximise the opportunity for common subexpression elimination, + // emit a separate ADD node for the global address offset instead of folding + // it in the global address node. Later peephole optimisations may choose to + // fold it back in when profitable. 
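Concretely, keeping the offset in a separate ADD means two accesses to different fields of the same global can share one base materialisation; roughly:

struct S { int a; int b; };
extern S g;
int sum() { return g.a + g.b; }
// With the offset kept out of the address node, RV32 codegen can emit:
//   lui   a0, %hi(g)
//   addi  a0, a0, %lo(g)   ; base computed once, CSE'd across both loads
//   lw    a1, 0(a0)
//   lw    a0, 4(a0)        ; offsets folded back by later peepholes
//   add   a0, a0, a1

Hence the offset is applied with a plain ADD node: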
+ if (Offset != 0) + return DAG.getNode(ISD::ADD, DL, Ty, Addr, + DAG.getConstant(Offset, DL, XLenVT)); + return Addr; } SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -513,29 +707,184 @@ SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op, return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT); } -// Return true if the given node is a shift with a non-constant shift amount. -static bool isVariableShift(SDValue Val) { - switch (Val.getOpcode()) { +SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + EVT VT = Lo.getValueType(); + + // if Shamt-XLEN < 0: // Shamt < XLEN + // Lo = Lo << Shamt + // Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 - Shamt)) + // else: + // Lo = 0 + // Hi = Lo << (Shamt-XLEN) + + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT); + SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT); + SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen); + SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt); + + SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt); + SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One); + SDValue ShiftRightLo = + DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, XLenMinus1Shamt); + SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt); + SDValue HiTrue = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo); + SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusXLen); + + SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT); + + Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, Zero); + Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse); + + SDValue Parts[2] = {Lo, Hi}; + return DAG.getMergeValues(Parts, DL); +} + +SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, + bool IsSRA) const { + SDLoc DL(Op); + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + EVT VT = Lo.getValueType(); + + // SRA expansion: + // if Shamt-XLEN < 0: // Shamt < XLEN + // Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt)) + // Hi = Hi >>s Shamt + // else: + // Lo = Hi >>s (Shamt-XLEN); + // Hi = Hi >>s (XLEN-1) + // + // SRL expansion: + // if Shamt-XLEN < 0: // Shamt < XLEN + // Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt)) + // Hi = Hi >>u Shamt + // else: + // Lo = Hi >>u (Shamt-XLEN); + // Hi = 0; + + unsigned ShiftRightOp = IsSRA ? 
ISD::SRA : ISD::SRL;
+
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  SDValue One = DAG.getConstant(1, DL, VT);
+  SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
+  SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
+  SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
+  SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);
+
+  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt);
+  SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One);
+  SDValue ShiftLeftHi =
+      DAG.getNode(ISD::SHL, DL, VT, ShiftLeftHi1, XLenMinus1Shamt);
+  SDValue LoTrue = DAG.getNode(ISD::OR, DL, VT, ShiftRightLo, ShiftLeftHi);
+  SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt);
+  SDValue LoFalse = DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusXLen);
+  SDValue HiFalse =
+      IsSRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, XLenMinus1) : Zero;
+
+  SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);
+
+  Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, LoFalse);
+  Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);
+
+  SDValue Parts[2] = {Lo, Hi};
+  return DAG.getMergeValues(Parts, DL);
+}
+
+// Returns the opcode of the target-specific SDNode that implements the 32-bit
+// form of the given Opcode.
+static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
+  switch (Opcode) {
   default:
-    return false;
+    llvm_unreachable("Unexpected opcode");
   case ISD::SHL:
+    return RISCVISD::SLLW;
   case ISD::SRA:
+    return RISCVISD::SRAW;
   case ISD::SRL:
-    return Val.getOperand(1).getOpcode() != ISD::Constant;
+    return RISCVISD::SRLW;
+  case ISD::SDIV:
+    return RISCVISD::DIVW;
+  case ISD::UDIV:
+    return RISCVISD::DIVUW;
+  case ISD::UREM:
+    return RISCVISD::REMUW;
   }
 }
 
-// Returns true if the given node is an sdiv, udiv, or urem with non-constant
-// operands.
-static bool isVariableSDivUDivURem(SDValue Val) {
-  switch (Val.getOpcode()) {
+// Converts the given 32-bit operation to a target-specific SelectionDAG node.
+// Because i32 isn't a legal type for RV64, these operations would otherwise
+// be promoted to i64, making it difficult to select the SLLW/DIVUW/.../*W
+// instructions later, because the fact that the operation was originally of
+// type i32 is lost.
+static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());
+  SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+  SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+  SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
+  // ReplaceNodeResults requires we maintain the same type for the return value.
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
+}
+
+void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue> &Results,
+                                             SelectionDAG &DAG) const {
+  SDLoc DL(N);
+  switch (N->getOpcode()) {
   default:
-    return false;
+    llvm_unreachable("Don't know how to custom type legalize this operation!");
+  case ISD::READCYCLECOUNTER: {
+    assert(!Subtarget.is64Bit() &&
+           "READCYCLECOUNTER only has custom type legalization on riscv32");
+
+    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+    SDValue RCW =
+        DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, VTs, N->getOperand(0));
+
+    Results.push_back(RCW);
+    Results.push_back(RCW.getValue(1));
+    Results.push_back(RCW.getValue(2));
+    break;
+  }
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+           "Unexpected custom legalisation");
+    if (N->getOperand(1).getOpcode() == ISD::Constant)
+      return;
+    Results.push_back(customLegalizeToWOp(N, DAG));
+    break;
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::UREM:
-    return Val.getOperand(0).getOpcode() != ISD::Constant &&
-           Val.getOperand(1).getOpcode() != ISD::Constant;
+    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+           Subtarget.hasStdExtM() && "Unexpected custom legalisation");
+    if (N->getOperand(0).getOpcode() == ISD::Constant ||
+        N->getOperand(1).getOpcode() == ISD::Constant)
+      return;
+    Results.push_back(customLegalizeToWOp(N, DAG));
+    break;
+  case ISD::BITCAST: {
+    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+           Subtarget.hasStdExtF() && "Unexpected custom legalisation");
+    SDLoc DL(N);
+    SDValue Op0 = N->getOperand(0);
+    if (Op0.getValueType() != MVT::f32)
+      return;
+    SDValue FPConv =
+        DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0);
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
+    break;
+  }
   }
 }
 
@@ -546,51 +895,225 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
   default:
     break;
-  case ISD::SHL:
-  case ISD::SRL:
-  case ISD::SRA: {
-    assert(Subtarget.getXLen() == 64 && "Combine should be 64-bit only");
-    if (!DCI.isBeforeLegalize())
-      break;
-    SDValue RHS = N->getOperand(1);
-    if (N->getValueType(0) != MVT::i32 || RHS->getOpcode() == ISD::Constant ||
-        (RHS->getOpcode() == ISD::AssertZext &&
-         cast<VTSDNode>(RHS->getOperand(1))->getVT().getSizeInBits() <= 5))
-      break;
-    SDValue LHS = N->getOperand(0);
-    SDLoc DL(N);
-    SDValue NewRHS =
-        DAG.getNode(ISD::AssertZext, DL, RHS.getValueType(), RHS,
-                    DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 5)));
-    return DCI.CombineTo(
-        N, DAG.getNode(N->getOpcode(), DL, LHS.getValueType(), LHS, NewRHS));
-  }
-  case ISD::ANY_EXTEND: {
-    // If any-extending an i32 variable-length shift or sdiv/udiv/urem to i64,
-    // then instead sign-extend in order to increase the chance of being able
-    // to select the sllw/srlw/sraw/divw/divuw/remuw instructions.
-    SDValue Src = N->getOperand(0);
-    if (N->getValueType(0) != MVT::i64 || Src.getValueType() != MVT::i32)
-      break;
-    if (!isVariableShift(Src) &&
-        !(Subtarget.hasStdExtM() && isVariableSDivUDivURem(Src)))
-      break;
-    SDLoc DL(N);
-    return DCI.CombineTo(N, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Src));
-  }
   case RISCVISD::SplitF64: {
+    SDValue Op0 = N->getOperand(0);
     // If the input to SplitF64 is just BuildPairF64 then the operation is
     // redundant. Instead, use BuildPairF64's operands directly.
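Two simplifications follow in this case. First, SplitF64 (BuildPairF64 lo, hi) forwards lo and hi directly. Second, a constant f64 operand is split at compile time into its two 32-bit halves; for example, 1.0 has the bit pattern 0x3FF0000000000000, so on RV32 (register assignment shown for illustration):

double one() { return 1.0; }
// can lower to two integer materialisations instead of a constant-pool
// load plus a stack round-trip:
//   lui  a1, 261888   ; hi half: 261888 << 12 == 0x3FF00000
//   li   a0, 0        ; lo half: 0x00000000

Both simplifications follow: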
+ if (Op0->getOpcode() == RISCVISD::BuildPairF64) + return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1)); + + SDLoc DL(N); + + // It's cheaper to materialise two 32-bit integers than to load a double + // from the constant pool and transfer it to integer registers through the + // stack. + if (ConstantFPSDNode *C = dyn_cast(Op0)) { + APInt V = C->getValueAPF().bitcastToAPInt(); + SDValue Lo = DAG.getConstant(V.trunc(32), DL, MVT::i32); + SDValue Hi = DAG.getConstant(V.lshr(32).trunc(32), DL, MVT::i32); + return DCI.CombineTo(N, Lo, Hi); + } + + // This is a target-specific version of a DAGCombine performed in + // DAGCombiner::visitBITCAST. It performs the equivalent of: + // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) + // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) + if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) || + !Op0.getNode()->hasOneUse()) + break; + SDValue NewSplitF64 = + DAG.getNode(RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), + Op0.getOperand(0)); + SDValue Lo = NewSplitF64.getValue(0); + SDValue Hi = NewSplitF64.getValue(1); + APInt SignBit = APInt::getSignMask(32); + if (Op0.getOpcode() == ISD::FNEG) { + SDValue NewHi = DAG.getNode(ISD::XOR, DL, MVT::i32, Hi, + DAG.getConstant(SignBit, DL, MVT::i32)); + return DCI.CombineTo(N, Lo, NewHi); + } + assert(Op0.getOpcode() == ISD::FABS); + SDValue NewHi = DAG.getNode(ISD::AND, DL, MVT::i32, Hi, + DAG.getConstant(~SignBit, DL, MVT::i32)); + return DCI.CombineTo(N, Lo, NewHi); + } + case RISCVISD::SLLW: + case RISCVISD::SRAW: + case RISCVISD::SRLW: { + // Only the lower 32 bits of LHS and lower 5 bits of RHS are read. + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32); + APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5); + if ((SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI))) + return SDValue(); + break; + } + case RISCVISD::FMV_X_ANYEXTW_RV64: { + SDLoc DL(N); SDValue Op0 = N->getOperand(0); - if (Op0->getOpcode() != RISCVISD::BuildPairF64) + // If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the + // conversion is unnecessary and can be replaced with an ANY_EXTEND + // of the FMV_W_X_RV64 operand. + if (Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) { + SDValue AExtOp = + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0.getOperand(0)); + return DCI.CombineTo(N, AExtOp); + } + + // This is a target-specific version of a DAGCombine performed in + // DAGCombiner::visitBITCAST. 
It performs the equivalent of: + // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) + // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) + if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) || + !Op0.getNode()->hasOneUse()) break; - return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1)); + SDValue NewFMV = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, + Op0.getOperand(0)); + APInt SignBit = APInt::getSignMask(32).sext(64); + if (Op0.getOpcode() == ISD::FNEG) { + return DCI.CombineTo(N, + DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV, + DAG.getConstant(SignBit, DL, MVT::i64))); + } + assert(Op0.getOpcode() == ISD::FABS); + return DCI.CombineTo(N, + DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV, + DAG.getConstant(~SignBit, DL, MVT::i64))); } } return SDValue(); } +bool RISCVTargetLowering::isDesirableToCommuteWithShift( + const SDNode *N, CombineLevel Level) const { + // The following folds are only desirable if `(OP _, c1 << c2)` can be + // materialised in fewer instructions than `(OP _, c1)`: + // + // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) + // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2) + SDValue N0 = N->getOperand(0); + EVT Ty = N0.getValueType(); + if (Ty.isScalarInteger() && + (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR)) { + auto *C1 = dyn_cast(N0->getOperand(1)); + auto *C2 = dyn_cast(N->getOperand(1)); + if (C1 && C2) { + APInt C1Int = C1->getAPIntValue(); + APInt ShiftedC1Int = C1Int << C2->getAPIntValue(); + + // We can materialise `c1 << c2` into an add immediate, so it's "free", + // and the combine should happen, to potentially allow further combines + // later. + if (isLegalAddImmediate(ShiftedC1Int.getSExtValue())) + return true; + + // We can materialise `c1` in an add immediate, so it's "free", and the + // combine should be prevented. + if (isLegalAddImmediate(C1Int.getSExtValue())) + return false; + + // Neither constant will fit into an immediate, so find materialisation + // costs. + int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(), + Subtarget.is64Bit()); + int ShiftedC1Cost = RISCVMatInt::getIntMatCost( + ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit()); + + // Materialising `c1` is cheaper than materialising `c1 << c2`, so the + // combine should be prevented. + if (C1Cost < ShiftedC1Cost) + return false; + } + } + return true; +} + +unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + unsigned Depth) const { + switch (Op.getOpcode()) { + default: + break; + case RISCVISD::SLLW: + case RISCVISD::SRAW: + case RISCVISD::SRLW: + case RISCVISD::DIVW: + case RISCVISD::DIVUW: + case RISCVISD::REMUW: + // TODO: As the result is sign-extended, this is conservatively correct. A + // more precise answer could be calculated for SRAW depending on known + // bits in the shift amount. + return 33; + } + + return 1; +} + +MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, + MachineBasicBlock *BB) { + assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction"); + + // To read the 64-bit cycle CSR on a 32-bit target, we read the two halves. + // Should the count have wrapped while it was being read, we need to try + // again. + // ... + // read: + // rdcycleh x3 # load high word of cycle + // rdcycle x2 # load low word of cycle + // rdcycleh x4 # load high word of cycle + // bne x3, x4, read # check if high word reads match, otherwise try again + // ... 
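For clarity, the retry loop sketched in the comment above corresponds to the following C, where the rdcycle/rdcycleh wrappers use RISC-V inline asm (illustrative only; the pass constructs the machine IR directly):

#include <stdint.h>

static inline uint32_t rdcycle(void) {
  uint32_t x;
  asm volatile("rdcycle %0" : "=r"(x));
  return x;
}

static inline uint32_t rdcycleh(void) {
  uint32_t x;
  asm volatile("rdcycleh %0" : "=r"(x));
  return x;
}

uint64_t read_cycle64(void) {
  uint32_t hi, lo, hi2;
  do {
    hi = rdcycleh();   // high word, first read
    lo = rdcycle();    // low word
    hi2 = rdcycleh();  // high word, second read
  } while (hi != hi2); // retry if the low word wrapped in between
  return ((uint64_t)hi << 32) | lo;
}

The expansion below builds this loop out of machine basic blocks: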
+ + MachineFunction &MF = *BB->getParent(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MF.insert(It, LoopMBB); + + MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MF.insert(It, DoneMBB); + + // Transfer the remainder of BB and its successor edges to DoneMBB. + DoneMBB->splice(DoneMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + DoneMBB->transferSuccessorsAndUpdatePHIs(BB); + + BB->addSuccessor(LoopMBB); + + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + unsigned LoReg = MI.getOperand(0).getReg(); + unsigned HiReg = MI.getOperand(1).getReg(); + DebugLoc DL = MI.getDebugLoc(); + + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), HiReg) + .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding) + .addReg(RISCV::X0); + BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), LoReg) + .addImm(RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding) + .addReg(RISCV::X0); + BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), ReadAgainReg) + .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding) + .addReg(RISCV::X0); + + BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) + .addReg(HiReg) + .addReg(ReadAgainReg) + .addMBB(LoopMBB); + + LoopMBB->addSuccessor(LoopMBB); + LoopMBB->addSuccessor(DoneMBB); + + MI.eraseFromParent(); + + return DoneMBB; +} + static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, MachineBasicBlock *BB) { assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction"); @@ -655,24 +1178,21 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, return BB; } -MachineBasicBlock * -RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, - MachineBasicBlock *BB) const { +static bool isSelectPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { default: - llvm_unreachable("Unexpected instr type to insert"); + return false; case RISCV::Select_GPR_Using_CC_GPR: case RISCV::Select_FPR32_Using_CC_GPR: case RISCV::Select_FPR64_Using_CC_GPR: - break; - case RISCV::BuildPairF64Pseudo: - return emitBuildPairF64Pseudo(MI, BB); - case RISCV::SplitF64Pseudo: - return emitSplitF64Pseudo(MI, BB); + return true; } +} - // To "insert" a SELECT instruction, we actually have to insert the triangle - // control-flow pattern. The incoming instruction knows the destination vreg +static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, + MachineBasicBlock *BB) { + // To "insert" Select_* instructions, we actually have to insert the triangle + // control-flow pattern. The incoming instructions know the destination vreg // to set, the condition code register to branch on, the true/false values to // select between, and the condcode to use to select the appropriate branch. // @@ -682,6 +1202,54 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // | IfFalseMBB // | / // TailMBB + // + // When we find a sequence of selects we attempt to optimize their emission + // by sharing the control flow. Currently we only handle cases where we have + // multiple selects with the exact same condition (same LHS, RHS and CC). + // The selects may be interleaved with other instructions if the other + // instructions meet some requirements we deem safe: + // - They are debug instructions. 
Otherwise,
+  //   - They do not have side-effects, do not access memory and their inputs do
+  //     not depend on the results of the select pseudo-instructions.
+  // The TrueV/FalseV operands of the selects cannot depend on the result of
+  // previous selects in the sequence.
+  // These conditions could be further relaxed. See the X86 target for a
+  // related approach and more information.
+  unsigned LHS = MI.getOperand(1).getReg();
+  unsigned RHS = MI.getOperand(2).getReg();
+  auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
+
+  SmallVector<MachineInstr *, 4> SelectDebugValues;
+  SmallSet<unsigned, 4> SelectDests;
+  SelectDests.insert(MI.getOperand(0).getReg());
+
+  MachineInstr *LastSelectPseudo = &MI;
+
+  for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI);
+       SequenceMBBI != E; ++SequenceMBBI) {
+    if (SequenceMBBI->isDebugInstr())
+      continue;
+    else if (isSelectPseudo(*SequenceMBBI)) {
+      if (SequenceMBBI->getOperand(1).getReg() != LHS ||
+          SequenceMBBI->getOperand(2).getReg() != RHS ||
+          SequenceMBBI->getOperand(3).getImm() != CC ||
+          SelectDests.count(SequenceMBBI->getOperand(4).getReg()) ||
+          SelectDests.count(SequenceMBBI->getOperand(5).getReg()))
+        break;
+      LastSelectPseudo = &*SequenceMBBI;
+      SequenceMBBI->collectDebugValues(SelectDebugValues);
+      SelectDests.insert(SequenceMBBI->getOperand(0).getReg());
+    } else {
+      if (SequenceMBBI->hasUnmodeledSideEffects() ||
+          SequenceMBBI->mayLoadOrStore())
+        break;
+      if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
+            return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
+          }))
+        break;
+    }
+  }
+
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   DebugLoc DL = MI.getDebugLoc();
@@ -694,20 +1262,23 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   F->insert(I, IfFalseMBB);
   F->insert(I, TailMBB);
-  // Move all remaining instructions to TailMBB.
-  TailMBB->splice(TailMBB->begin(), HeadMBB,
-                  std::next(MachineBasicBlock::iterator(MI)), HeadMBB->end());
+
+  // Transfer debug instructions associated with the selects to TailMBB.
+  for (MachineInstr *DebugInstr : SelectDebugValues) {
+    TailMBB->push_back(DebugInstr->removeFromParent());
+  }
+
+  // Move all instructions after the sequence to TailMBB.
+  TailMBB->splice(TailMBB->end(), HeadMBB,
+                  std::next(LastSelectPseudo->getIterator()), HeadMBB->end());
 
   // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
+  // block to the new block which will contain the Phi nodes for the selects.
   TailMBB->transferSuccessorsAndUpdatePHIs(HeadMBB);
 
   // Set the successors for HeadMBB.
   HeadMBB->addSuccessor(IfFalseMBB);
   HeadMBB->addSuccessor(TailMBB);
 
   // Insert appropriate branch.
-  unsigned LHS = MI.getOperand(1).getReg();
-  unsigned RHS = MI.getOperand(2).getReg();
-  auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
   unsigned Opcode = getBranchOpcodeForIntCondCode(CC);
 
   BuildMI(HeadMBB, DL, TII.get(Opcode))
@@ -718,18 +1289,50 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   // IfFalseMBB just falls through to TailMBB.
   IfFalseMBB->addSuccessor(TailMBB);
 
-  // %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ]
-  BuildMI(*TailMBB, TailMBB->begin(), DL, TII.get(RISCV::PHI),
-          MI.getOperand(0).getReg())
-      .addReg(MI.getOperand(4).getReg())
-      .addMBB(HeadMBB)
-      .addReg(MI.getOperand(5).getReg())
-      .addMBB(IfFalseMBB);
+  // Create PHIs for all of the select pseudo-instructions.
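A source-level picture of the optimisation implemented here: both selects below compare the same operands, so a single branch diamond is emitted and TailMBB receives one PHI per select.

int lo, hi;
void minmax(int a, int b) {
  lo = a < b ? a : b; // first Select_GPR_Using_CC_GPR
  hi = a < b ? b : a; // second select, same LHS/RHS/CC: shares the diamond
}

The loop below walks the recorded sequence and emits those PHIs: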
+ auto SelectMBBI = MI.getIterator(); + auto SelectEnd = std::next(LastSelectPseudo->getIterator()); + auto InsertionPoint = TailMBB->begin(); + while (SelectMBBI != SelectEnd) { + auto Next = std::next(SelectMBBI); + if (isSelectPseudo(*SelectMBBI)) { + // %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ] + BuildMI(*TailMBB, InsertionPoint, SelectMBBI->getDebugLoc(), + TII.get(RISCV::PHI), SelectMBBI->getOperand(0).getReg()) + .addReg(SelectMBBI->getOperand(4).getReg()) + .addMBB(HeadMBB) + .addReg(SelectMBBI->getOperand(5).getReg()) + .addMBB(IfFalseMBB); + SelectMBBI->eraseFromParent(); + } + SelectMBBI = Next; + } - MI.eraseFromParent(); // The pseudo instruction is gone now. + F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs); return TailMBB; } +MachineBasicBlock * +RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instr type to insert"); + case RISCV::ReadCycleWide: + assert(!Subtarget.is64Bit() && + "ReadCycleWrite is only to be used on riscv32"); + return emitReadCycleWidePseudo(MI, BB); + case RISCV::Select_GPR_Using_CC_GPR: + case RISCV::Select_FPR32_Using_CC_GPR: + case RISCV::Select_FPR64_Using_CC_GPR: + return emitSelectPseudo(MI, BB); + case RISCV::BuildPairF64Pseudo: + return emitBuildPairF64Pseudo(MI, BB); + case RISCV::SplitF64Pseudo: + return emitSplitF64Pseudo(MI, BB); + } +} + // Calling Convention Implementation. // The expectations for frontend ABI lowering vary from target to target. // Ideally, an LLVM frontend would be able to avoid worrying about many ABI @@ -759,6 +1362,14 @@ static const MCPhysReg ArgGPRs[] = { RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17 }; +static const MCPhysReg ArgFPR32s[] = { + RISCV::F10_32, RISCV::F11_32, RISCV::F12_32, RISCV::F13_32, + RISCV::F14_32, RISCV::F15_32, RISCV::F16_32, RISCV::F17_32 +}; +static const MCPhysReg ArgFPR64s[] = { + RISCV::F10_64, RISCV::F11_64, RISCV::F12_64, RISCV::F13_64, + RISCV::F14_64, RISCV::F15_64, RISCV::F16_64, RISCV::F17_64 +}; // Pass a 2*XLEN argument that has been split into two XLEN values through // registers or the stack as necessary. @@ -799,22 +1410,59 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, } // Implements the RISC-V calling convention. Returns true upon failure. -static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) { +static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, + MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, + bool IsRet, Type *OrigTy) { unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); assert(XLen == 32 || XLen == 64); MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; - if (ValVT == MVT::f32) { - LocVT = MVT::i32; - LocInfo = CCValAssign::BCvt; - } // Any return value split in to more than two values can't be returned // directly. if (IsRet && ValNo > 1) return true; + // UseGPRForF32 if targeting one of the soft-float ABIs, if passing a + // variadic argument, or if no F32 argument registers are available. + bool UseGPRForF32 = true; + // UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a + // variadic argument, or if no F64 argument registers are available. 
+ bool UseGPRForF64 = true; + + switch (ABI) { + default: + llvm_unreachable("Unexpected ABI"); + case RISCVABI::ABI_ILP32: + case RISCVABI::ABI_LP64: + break; + case RISCVABI::ABI_ILP32F: + case RISCVABI::ABI_LP64F: + UseGPRForF32 = !IsFixed; + break; + case RISCVABI::ABI_ILP32D: + case RISCVABI::ABI_LP64D: + UseGPRForF32 = !IsFixed; + UseGPRForF64 = !IsFixed; + break; + } + + if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s)) + UseGPRForF32 = true; + if (State.getFirstUnallocated(ArgFPR64s) == array_lengthof(ArgFPR64s)) + UseGPRForF64 = true; + + // From this point on, rely on UseGPRForF32, UseGPRForF64 and similar local + // variables rather than directly checking against the target ABI. + + if (UseGPRForF32 && ValVT == MVT::f32) { + LocVT = XLenVT; + LocInfo = CCValAssign::BCvt; + } else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) { + LocVT = MVT::i64; + LocInfo = CCValAssign::BCvt; + } + // If this is a variadic argument, the RISC-V calling convention requires // that it is assigned an 'even' or 'aligned' register if it has 8-byte // alignment (RV32) or 16-byte alignment (RV64). An aligned register should @@ -838,8 +1486,9 @@ static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT, assert(PendingLocs.size() == PendingArgFlags.size() && "PendingLocs and PendingArgFlags out of sync"); - // Handle passing f64 on RV32D with a soft float ABI. - if (XLen == 32 && ValVT == MVT::f64) { + // Handle passing f64 on RV32D with a soft float ABI or when floating point + // registers are exhausted. + if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) { assert(!ArgFlags.isSplit() && PendingLocs.empty() && "Can't lower f64 if it is split"); // Depending on available argument GPRS, f64 may be passed in a pair of @@ -888,7 +1537,13 @@ static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT, } // Allocate to a register if possible, or else a stack slot. - unsigned Reg = State.AllocateReg(ArgGPRs); + unsigned Reg; + if (ValVT == MVT::f32 && !UseGPRForF32) + Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s); + else if (ValVT == MVT::f64 && !UseGPRForF64) + Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s); + else + Reg = State.AllocateReg(ArgGPRs); unsigned StackOffset = Reg ? 0 : State.AllocateStack(XLen / 8, XLen / 8); // If we reach this point and PendingLocs is non-empty, we must be at the @@ -909,15 +1564,17 @@ static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT, return false; } - assert(LocVT == XLenVT && "Expected an XLenVT at this stage"); + assert((!UseGPRForF32 || !UseGPRForF64 || LocVT == XLenVT) && + "Expected an XLenVT at this stage"); if (Reg) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } - if (ValVT == MVT::f32) { - LocVT = MVT::f32; + // When an f32 or f64 is passed on the stack, no bit-conversion is needed. 
+ if (ValVT == MVT::f32 || ValVT == MVT::f64) { + LocVT = ValVT; + LocInfo = CCValAssign::Full; } State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); @@ -940,7 +1597,8 @@ void RISCVTargetLowering::analyzeInputArgs( else if (Ins[i].isOrigArg()) ArgTy = FType->getParamType(Ins[i].getOrigArgIndex()); - if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full, + RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI(); + if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, /*IsRet=*/true, IsRet, ArgTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << '\n'); @@ -960,7 +1618,8 @@ void RISCVTargetLowering::analyzeOutputArgs( ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr; - if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full, + RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI(); + if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << "\n"); @@ -979,6 +1638,10 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, case CCValAssign::Full: break; case CCValAssign::BCvt: + if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) { + Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); + break; + } Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } @@ -993,8 +1656,24 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain, MachineRegisterInfo &RegInfo = MF.getRegInfo(); EVT LocVT = VA.getLocVT(); SDValue Val; + const TargetRegisterClass *RC; + + switch (LocVT.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("Unexpected register type"); + case MVT::i32: + case MVT::i64: + RC = &RISCV::GPRRegClass; + break; + case MVT::f32: + RC = &RISCV::FPR32RegClass; + break; + case MVT::f64: + RC = &RISCV::FPR64RegClass; + break; + } - unsigned VReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + unsigned VReg = RegInfo.createVirtualRegister(RC); RegInfo.addLiveIn(VA.getLocReg(), VReg); Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT); @@ -1014,6 +1693,10 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, case CCValAssign::Full: break; case CCValAssign::BCvt: + if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) { + Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val); + break; + } Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val); break; } @@ -1040,6 +1723,7 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain, llvm_unreachable("Unexpected CCValAssign::LocInfo"); case CCValAssign::Full: case CCValAssign::Indirect: + case CCValAssign::BCvt: ExtType = ISD::NON_EXTLOAD; break; } @@ -1227,12 +1911,12 @@ SDValue RISCVTargetLowering::LowerFormalArguments( return Chain; } -/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// isEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. /// Note: This is modelled after ARM's IsEligibleForTailCallOptimization.
-bool RISCVTargetLowering::IsEligibleForTailCallOptimization( - CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF, - const SmallVector<CCValAssign, 16> &ArgLocs) const { +bool RISCVTargetLowering::isEligibleForTailCallOptimization( + CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF, + const SmallVector<CCValAssign, 16> &ArgLocs) const { auto &Callee = CLI.Callee; auto CalleeCC = CLI.CallConv; @@ -1335,8 +2019,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Check if it's really possible to do a tail call. if (IsTailCall) - IsTailCall = IsEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, - ArgLocs); + IsTailCall = isEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, ArgLocs); if (IsTailCall) ++NumTailCalls; @@ -1482,9 +2165,21 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // TargetGlobalAddress/TargetExternalSymbol node so that legalize won't // split it and then direct call can be matched by PseudoCALL. if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) { - Callee = DAG.getTargetGlobalAddress(S->getGlobal(), DL, PtrVT, 0, 0); + const GlobalValue *GV = S->getGlobal(); + + unsigned OpFlags = RISCVII::MO_CALL; + if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) + OpFlags = RISCVII::MO_PLT; + + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, 0); + unsigned OpFlags = RISCVII::MO_CALL; + + if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(), + nullptr)) + OpFlags = RISCVII::MO_PLT; + + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags); } // The first call operand is the chain and the second is the target address. @@ -1567,8 +2262,9 @@ bool RISCVTargetLowering::CanLowerReturn( for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (CC_RISCV(MF.getDataLayout(), i, VT, VT, CCValAssign::Full, ArgFlags, - CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr)) + RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI(); + if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, + ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr)) return false; } return true; @@ -1679,6 +2375,24 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { return "RISCVISD::SplitF64"; case RISCVISD::TAIL: return "RISCVISD::TAIL"; + case RISCVISD::SLLW: + return "RISCVISD::SLLW"; + case RISCVISD::SRAW: + return "RISCVISD::SRAW"; + case RISCVISD::SRLW: + return "RISCVISD::SRLW"; + case RISCVISD::DIVW: + return "RISCVISD::DIVW"; + case RISCVISD::DIVUW: + return "RISCVISD::DIVUW"; + case RISCVISD::REMUW: + return "RISCVISD::REMUW"; + case RISCVISD::FMV_W_X_RV64: + return "RISCVISD::FMV_W_X_RV64"; + case RISCVISD::FMV_X_ANYEXTW_RV64: + return "RISCVISD::FMV_X_ANYEXTW_RV64"; + case RISCVISD::READ_CYCLE_WIDE: + return "RISCVISD::READ_CYCLE_WIDE"; } return nullptr; } @@ -1701,6 +2415,44 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } +void RISCVTargetLowering::LowerAsmOperandForConstraint( + SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, + SelectionDAG &DAG) const { + // Currently only support length 1 constraints. + if (Constraint.length() == 1) { + switch (Constraint[0]) { + case 'I': + // Validate & create a 12-bit signed immediate operand.
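+ // For example, asm volatile("addi %0, %1, %2" : "=r"(out) : "r"(in), "I"(42)) + // matches this path; a constant outside [-2048, 2047] is not pushed into + // Ops and is then rejected as an invalid operand for the constraint.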
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) { + uint64_t CVal = C->getSExtValue(); + if (isInt<12>(CVal)) + Ops.push_back( + DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT())); + } + return; + case 'J': + // Validate & create an integer zero operand. + if (auto *C = dyn_cast<ConstantSDNode>(Op)) + if (C->getZExtValue() == 0) + Ops.push_back( + DAG.getTargetConstant(0, SDLoc(Op), Subtarget.getXLenVT())); + return; + case 'K': + // Validate & create a 5-bit unsigned immediate operand. + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { + uint64_t CVal = C->getZExtValue(); + if (isUInt<5>(CVal)) + Ops.push_back( + DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT())); + } + return; + default: + break; + } + } + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { @@ -1721,6 +2473,12 @@ Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder, TargetLowering::AtomicExpansionKind RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + // atomicrmw {fadd,fsub} must be expanded to use compare-exchange, as floating + // point operations can't be used in an lr/sc sequence without breaking the + // forward-progress guarantee. + if (AI->isFloatingPointOperation()) + return AtomicExpansionKind::CmpXChg; + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); if (Size == 8 || Size == 16) return AtomicExpansionKind::MaskedIntrinsic; @@ -1728,37 +2486,74 @@ RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { } static Intrinsic::ID -getIntrinsicForMaskedAtomicRMWBinOp32(AtomicRMWInst::BinOp BinOp) { - switch (BinOp) { - default: - llvm_unreachable("Unexpected AtomicRMW BinOp"); - case AtomicRMWInst::Xchg: - return Intrinsic::riscv_masked_atomicrmw_xchg_i32; - case AtomicRMWInst::Add: - return Intrinsic::riscv_masked_atomicrmw_add_i32; - case AtomicRMWInst::Sub: - return Intrinsic::riscv_masked_atomicrmw_sub_i32; - case AtomicRMWInst::Nand: - return Intrinsic::riscv_masked_atomicrmw_nand_i32; - case AtomicRMWInst::Max: - return Intrinsic::riscv_masked_atomicrmw_max_i32; - case AtomicRMWInst::Min: - return Intrinsic::riscv_masked_atomicrmw_min_i32; - case AtomicRMWInst::UMax: - return Intrinsic::riscv_masked_atomicrmw_umax_i32; - case AtomicRMWInst::UMin: - return Intrinsic::riscv_masked_atomicrmw_umin_i32; +getIntrinsicForMaskedAtomicRMWBinOp(unsigned XLen, AtomicRMWInst::BinOp BinOp) { + if (XLen == 32) { + switch (BinOp) { + default: + llvm_unreachable("Unexpected AtomicRMW BinOp"); + case AtomicRMWInst::Xchg: + return Intrinsic::riscv_masked_atomicrmw_xchg_i32; + case AtomicRMWInst::Add: + return Intrinsic::riscv_masked_atomicrmw_add_i32; + case AtomicRMWInst::Sub: + return Intrinsic::riscv_masked_atomicrmw_sub_i32; + case AtomicRMWInst::Nand: + return Intrinsic::riscv_masked_atomicrmw_nand_i32; + case AtomicRMWInst::Max: + return Intrinsic::riscv_masked_atomicrmw_max_i32; + case AtomicRMWInst::Min: + return Intrinsic::riscv_masked_atomicrmw_min_i32; + case AtomicRMWInst::UMax: + return Intrinsic::riscv_masked_atomicrmw_umax_i32; + case AtomicRMWInst::UMin: + return Intrinsic::riscv_masked_atomicrmw_umin_i32; + } + } + + if (XLen == 64) { + switch (BinOp) { + default: + llvm_unreachable("Unexpected AtomicRMW BinOp"); + case AtomicRMWInst::Xchg: + return Intrinsic::riscv_masked_atomicrmw_xchg_i64; + case AtomicRMWInst::Add: + return Intrinsic::riscv_masked_atomicrmw_add_i64; + case AtomicRMWInst::Sub: + return
Intrinsic::riscv_masked_atomicrmw_sub_i64; + case AtomicRMWInst::Nand: + return Intrinsic::riscv_masked_atomicrmw_nand_i64; + case AtomicRMWInst::Max: + return Intrinsic::riscv_masked_atomicrmw_max_i64; + case AtomicRMWInst::Min: + return Intrinsic::riscv_masked_atomicrmw_min_i64; + case AtomicRMWInst::UMax: + return Intrinsic::riscv_masked_atomicrmw_umax_i64; + case AtomicRMWInst::UMin: + return Intrinsic::riscv_masked_atomicrmw_umin_i64; + } } + + llvm_unreachable("Unexpected XLen\n"); } Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic( IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr, Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const { - Value *Ordering = Builder.getInt32(static_cast<uint32_t>(AI->getOrdering())); + unsigned XLen = Subtarget.getXLen(); + Value *Ordering = + Builder.getIntN(XLen, static_cast<uint64_t>(AI->getOrdering())); Type *Tys[] = {AlignedAddr->getType()}; Function *LrwOpScwLoop = Intrinsic::getDeclaration( AI->getModule(), - getIntrinsicForMaskedAtomicRMWBinOp32(AI->getOperation()), Tys); + getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys); + + if (XLen == 64) { + Incr = Builder.CreateSExt(Incr, Builder.getInt64Ty()); + Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty()); + ShiftAmt = Builder.CreateSExt(ShiftAmt, Builder.getInt64Ty()); + } + + Value *Result; // Must pass the shift amount needed to sign extend the loaded value prior // to performing a signed comparison for min/max. ShiftAmt is the number of @@ -1770,13 +2565,18 @@ Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic( const DataLayout &DL = AI->getModule()->getDataLayout(); unsigned ValWidth = DL.getTypeStoreSizeInBits(AI->getValOperand()->getType()); - Value *SextShamt = Builder.CreateSub( - Builder.getInt32(Subtarget.getXLen() - ValWidth), ShiftAmt); - return Builder.CreateCall(LrwOpScwLoop, - {AlignedAddr, Incr, Mask, SextShamt, Ordering}); + Value *SextShamt = + Builder.CreateSub(Builder.getIntN(XLen, XLen - ValWidth), ShiftAmt); + Result = Builder.CreateCall(LrwOpScwLoop, + {AlignedAddr, Incr, Mask, SextShamt, Ordering}); + } else { + Result = + Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering}); } - return Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering}); + if (XLen == 64) + Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); + return Result; } TargetLowering::AtomicExpansionKind @@ -1791,10 +2591,31 @@ RISCVTargetLowering::shouldExpandAtomicCmpXchgInIR( Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr, Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const { - Value *Ordering = Builder.getInt32(static_cast<uint32_t>(Ord)); + unsigned XLen = Subtarget.getXLen(); + Value *Ordering = Builder.getIntN(XLen, static_cast<uint64_t>(Ord)); + Intrinsic::ID CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i32; + if (XLen == 64) { + CmpVal = Builder.CreateSExt(CmpVal, Builder.getInt64Ty()); + NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty()); + Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty()); + CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64; + } Type *Tys[] = {AlignedAddr->getType()}; - Function *MaskedCmpXchg = Intrinsic::getDeclaration( - CI->getModule(), Intrinsic::riscv_masked_cmpxchg_i32, Tys); - return Builder.CreateCall(MaskedCmpXchg, - {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); + Function *MaskedCmpXchg = + Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys); + Value *Result = Builder.CreateCall( +
MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); + if (XLen == 64) + Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); + return Result; +} + +unsigned RISCVTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + return RISCV::X10; +} + +unsigned RISCVTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + return RISCV::X11; } diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h index 6970900bb062..17db03bbb69e 100644 --- a/lib/Target/RISCV/RISCVISelLowering.h +++ b/lib/Target/RISCV/RISCVISelLowering.h @@ -1,9 +1,8 @@ //===-- RISCVISelLowering.h - RISCV DAG Lowering Interface ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -32,7 +31,27 @@ enum NodeType : unsigned { SELECT_CC, BuildPairF64, SplitF64, - TAIL + TAIL, + // RV64I shifts, directly matching the semantics of the named RISC-V + // instructions. + SLLW, + SRAW, + SRLW, + // 32-bit operations from RV64M that can't be simply matched with a pattern + // at instruction selection time. + DIVW, + DIVUW, + REMUW, + // FPR32<->GPR transfer operations for RV64. Needed as an i32<->f32 bitcast + // is not legal on RV64. FMV_W_X_RV64 matches the semantics of the FMV.W.X. + // FMV_X_ANYEXTW_RV64 is similar to FMV.X.W but has an any-extended result. + // This is a more convenient semantic for producing dagcombines that remove + // unnecessary GPR->FPR->GPR moves. + FMV_W_X_RV64, + FMV_X_ANYEXTW_RV64, + // READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target + // (returns (Lo, Hi)). It takes a chain operand. + READ_CYCLE_WIDE }; } @@ -56,11 +75,20 @@ public: bool isZExtFree(SDValue Val, EVT VT2) const override; bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; + bool hasBitPreservingFPLogic(EVT VT) const override; + // Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth) const override; + // This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override; @@ -68,6 +96,10 @@ public: getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; @@ -75,6 +107,10 @@ public: EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; + bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { + return VT.isScalarInteger(); + } + bool shouldInsertFencesForAtomic(const Instruction *I) const override { return isa<LoadInst>(I) || isa<StoreInst>(I); } @@ -83,6 +119,28 @@ public: Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const override; + ISD::NodeType getExtendForAtomicOps() const override { + return ISD::SIGN_EXTEND; + } + + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { + if (DAG.getMachineFunction().getFunction().hasMinSize()) + return false; + return true; + } + bool isDesirableToCommuteWithShift(const SDNode *N, + CombineLevel Level) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -110,17 +168,29 @@ private: Type *Ty) const override { return true; } + + template <class NodeTy> + SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const; + + SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, + bool UseGOT) const; + SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const; + + bool shouldConsiderGEPOffsetSplit() const override { return true; } SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const; - bool IsEligibleForTailCallOptimization(CCState &CCInfo, - CallLoweringInfo &CLI, MachineFunction &MF, - const SmallVector<CCValAssign, 16> &ArgLocs) const; + bool isEligibleForTailCallOptimization( + CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF, + const SmallVector<CCValAssign, 16> &ArgLocs) const; TargetLowering::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; diff --git a/lib/Target/RISCV/RISCVInstrFormats.td b/lib/Target/RISCV/RISCVInstrFormats.td index ebd676a6056e..7229ebfe1db0 100644 --- a/lib/Target/RISCV/RISCVInstrFormats.td +++ b/lib/Target/RISCV/RISCVInstrFormats.td @@ -1,9 +1,8 @@ //===-- RISCVInstrFormats.td - RISCV Instruction Formats ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure
-// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -109,6 +108,35 @@ class Pseudo<dag outs, dag ins, list<dag> pattern, string opcodestr = "", string let isCodeGenOnly = 1; } +// Pseudo load instructions. +class PseudoLoad<string opcodestr, RegisterClass rdty = GPR> + : Pseudo<(outs rdty:$rd), (ins bare_symbol:$addr), [], opcodestr, "$rd, $addr"> { + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; + let isCodeGenOnly = 0; + let isAsmParserOnly = 1; +} + +class PseudoFloatLoad<string opcodestr, RegisterClass rdty = GPR> + : Pseudo<(outs rdty:$rd, GPR:$tmp), (ins bare_symbol:$addr), [], opcodestr, "$rd, $addr, $tmp"> { + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; + let isCodeGenOnly = 0; + let isAsmParserOnly = 1; +} + +// Pseudo store instructions. +class PseudoStore<string opcodestr, RegisterClass rsty = GPR> + : Pseudo<(outs rsty:$rs, GPR:$tmp), (ins bare_symbol:$addr), [], opcodestr, "$rs, $addr, $tmp"> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 1; + let isCodeGenOnly = 0; + let isAsmParserOnly = 1; +} + // Instruction formats are listed in the order they appear in the RISC-V // instruction set manual (R, I, S, B, U, J) with sub-formats (e.g. RVInstR4, // RVInstRAtomic) sorted alphabetically. diff --git a/lib/Target/RISCV/RISCVInstrFormatsC.td b/lib/Target/RISCV/RISCVInstrFormatsC.td index bda8bbb558eb..690bec5181e2 100644 --- a/lib/Target/RISCV/RISCVInstrFormatsC.td +++ b/lib/Target/RISCV/RISCVInstrFormatsC.td @@ -1,9 +1,8 @@ //===-- RISCVInstrFormatsC.td - RISCV C Instruction Formats --*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/RISCV/RISCVInstrInfo.cpp b/lib/Target/RISCV/RISCVInstrInfo.cpp index 76c74368ca11..99c8d2ef73de 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- RISCVInstrInfo.cpp - RISCV Instruction Information ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -291,9 +290,9 @@ unsigned RISCVInstrInfo::removeBranch(MachineBasicBlock &MBB, return 0; // Remove the branch. - I->eraseFromParent(); if (BytesRemoved) *BytesRemoved += getInstSizeInBytes(*I); + I->eraseFromParent(); I = MBB.end(); @@ -304,9 +303,9 @@ unsigned RISCVInstrInfo::removeBranch(MachineBasicBlock &MBB, return 1; // Remove the branch.
- I->eraseFromParent(); if (BytesRemoved) *BytesRemoved += getInstSizeInBytes(*I); + I->eraseFromParent(); return 2; } @@ -383,8 +382,8 @@ unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, .addMBB(&DestBB, RISCVII::MO_LO); RS->enterBasicBlockEnd(MBB); - unsigned Scav = RS->scavengeRegisterBackwards( - RISCV::GPRRegClass, MachineBasicBlock::iterator(LuiMI), false, 0); + unsigned Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass, + LuiMI.getIterator(), false, 0); MRI.replaceRegWith(ScratchReg, Scav); MRI.clearVirtRegs(); RS->setRegUsed(Scav); @@ -437,10 +436,16 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { case TargetOpcode::KILL: case TargetOpcode::DBG_VALUE: return 0; + case RISCV::PseudoCALLReg: case RISCV::PseudoCALL: case RISCV::PseudoTAIL: + case RISCV::PseudoLLA: + case RISCV::PseudoLA: + case RISCV::PseudoLA_TLS_IE: + case RISCV::PseudoLA_TLS_GD: return 8; - case TargetOpcode::INLINEASM: { + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: { const MachineFunction &MF = *MI.getParent()->getParent(); const auto &TM = static_cast<const RISCVTargetMachine &>(MF.getTarget()); return getInlineAsmLength(MI.getOperand(0).getSymbolName(), @@ -448,3 +453,16 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { } } } + +bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { + const unsigned Opcode = MI.getOpcode(); + switch(Opcode) { + default: + break; + case RISCV::ADDI: + case RISCV::ORI: + case RISCV::XORI: + return (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0); + } + return MI.isAsCheapAsAMove(); +} diff --git a/lib/Target/RISCV/RISCVInstrInfo.h b/lib/Target/RISCV/RISCVInstrInfo.h index 1d3279c3d31e..ff098e660d19 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.h +++ b/lib/Target/RISCV/RISCVInstrInfo.h @@ -1,9 +1,8 @@ //===-- RISCVInstrInfo.h - RISCV Instruction Information --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -79,6 +78,8 @@ public: bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override; + + bool isAsCheapAsAMove(const MachineInstr &MI) const override; }; } #endif diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td index d7cc13d4fabd..69bde15f1218 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.td +++ b/lib/Target/RISCV/RISCVInstrInfo.td @@ -1,9 +1,8 @@ //===-- RISCVInstrInfo.td - Target Description for RISCV ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,42 +10,48 @@ // //===----------------------------------------------------------------------===// -include "RISCVInstrFormats.td" - //===----------------------------------------------------------------------===// // RISC-V specific DAG Nodes. //===----------------------------------------------------------------------===// -def SDT_RISCVCall : SDTypeProfile<0, -1, [SDTCisVT<0, XLenVT>]>; -def SDT_RISCVCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, - SDTCisVT<1, i32>]>; -def SDT_RISCVCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, - SDTCisVT<1, i32>]>; -def SDT_RISCVSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>, - SDTCisSameAs<0, 4>, - SDTCisSameAs<4, 5>]>; - - -def Call : SDNode<"RISCVISD::CALL", SDT_RISCVCall, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPVariadic]>; -def CallSeqStart : SDNode<"ISD::CALLSEQ_START", SDT_RISCVCallSeqStart, - [SDNPHasChain, SDNPOutGlue]>; -def CallSeqEnd : SDNode<"ISD::CALLSEQ_END", SDT_RISCVCallSeqEnd, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def RetFlag : SDNode<"RISCVISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def URetFlag : SDNode<"RISCVISD::URET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; -def SRetFlag : SDNode<"RISCVISD::SRET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; -def MRetFlag : SDNode<"RISCVISD::MRET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; -def SelectCC : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC, - [SDNPInGlue]>; -def Tail : SDNode<"RISCVISD::TAIL", SDT_RISCVCall, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPVariadic]>; +// Target-independent type requirements, but with target-specific formats. +def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; +def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; + +// Target-dependent type requirements. +def SDT_RISCVCall : SDTypeProfile<0, -1, [SDTCisVT<0, XLenVT>]>; +def SDT_RISCVSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>, + SDTCisSameAs<0, 4>, + SDTCisSameAs<4, 5>]>; + +// Target-independent nodes, but with target-specific formats. +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +// Target-dependent nodes. +def riscv_call : SDNode<"RISCVISD::CALL", SDT_RISCVCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def riscv_ret_flag : SDNode<"RISCVISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def riscv_uret_flag : SDNode<"RISCVISD::URET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; +def riscv_sret_flag : SDNode<"RISCVISD::SRET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; +def riscv_mret_flag : SDNode<"RISCVISD::MRET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; +def riscv_selectcc : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC, + [SDNPInGlue]>; +def riscv_tail : SDNode<"RISCVISD::TAIL", SDT_RISCVCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def riscv_sllw : SDNode<"RISCVISD::SLLW", SDTIntShiftOp>; +def riscv_sraw : SDNode<"RISCVISD::SRAW", SDTIntShiftOp>; +def riscv_srlw : SDNode<"RISCVISD::SRLW", SDTIntShiftOp>; //===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. 
@@ -185,6 +190,30 @@ def bare_symbol : Operand<XLenVT> { let ParserMatchClass = BareSymbol; } +def CallSymbol : AsmOperandClass { + let Name = "CallSymbol"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidCallSymbol"; + let ParserMethod = "parseCallSymbol"; +} + +// A bare symbol used in call/tail only. +def call_symbol : Operand<XLenVT> { + let ParserMatchClass = CallSymbol; +} + +def TPRelAddSymbol : AsmOperandClass { + let Name = "TPRelAddSymbol"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidTPRelAddSymbol"; + let ParserMethod = "parseOperandWithModifier"; +} + +// A bare symbol with the %tprel_add variant. +def tprel_add_symbol : Operand<XLenVT> { + let ParserMatchClass = TPRelAddSymbol; +} + def CSRSystemRegister : AsmOperandClass { let Name = "CSRSystemRegister"; let ParserMethod = "parseCSRSystemRegister"; @@ -233,6 +262,12 @@ def HI20 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(((N->getZExtValue()+0x800) >> 12) & 0xfffff, SDLoc(N), N->getValueType(0)); }]>; +//===----------------------------------------------------------------------===// +// Instruction Formats +//===----------------------------------------------------------------------===// + +include "RISCVInstrFormats.td" + //===----------------------------------------------------------------------===// // Instruction Class Templates //===----------------------------------------------------------------------===// @@ -307,7 +342,8 @@ class Priv<string opcodestr, bits<7> funct7> // Instructions //===----------------------------------------------------------------------===// -let hasSideEffects = 0, isReMaterializable = 1, mayLoad = 0, mayStore = 0 in { +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1 in def LUI : RVInstU<OPC_LUI, (outs GPR:$rd), (ins uimm20_lui:$imm20), "lui", "$rd, $imm20">; @@ -321,7 +357,7 @@ def JAL : RVInstJ<OPC_JAL, (outs GPR:$rd), (ins simm21_lsb0_jal:$imm20), "jal", "$rd, $imm20">; def JALR : RVInstI<0b000, OPC_JALR, (outs GPR:$rd), (ins GPR:$rs1, simm12:$imm12), - "jalr", "$rd, $rs1, $imm12">; + "jalr", "$rd, ${imm12}(${rs1})">; } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 def BEQ : BranchCC_rri<0b000, "beq">; @@ -343,13 +379,17 @@ def SW : Store_rri<0b010, "sw">; // ADDI isn't always rematerializable, but isReMaterializable will be used as // a hint which is verified in isReallyTriviallyReMaterializable.
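// For example, "addi a0, zero, 1" can always be rematerialized, whereas // "addi a0, sp, 16" may only be recomputed while sp still holds the same value.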
-let isReMaterializable = 1 in +let isReMaterializable = 1, isAsCheapAsAMove = 1 in def ADDI : ALU_ri<0b000, "addi">; def SLTI : ALU_ri<0b010, "slti">; def SLTIU : ALU_ri<0b011, "sltiu">; + +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def XORI : ALU_ri<0b100, "xori">; def ORI : ALU_ri<0b110, "ori">; +} + def ANDI : ALU_ri<0b111, "andi">; def SLLI : Shift_ri<0, 0b001, "slli">; @@ -485,12 +525,6 @@ def SFENCE_VMA : RVInstR<0b0001001, 0b000, OPC_SYSTEM, (outs), // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) //===----------------------------------------------------------------------===// -// TODO la -// TODO lb lh lw -// TODO RV64I: ld -// TODO sb sh sw -// TODO RV64I: sd - def : InstAlias<"nop", (ADDI X0, X0, 0)>; // Note that the size is 32 because up to 8 32-bit instructions are needed to @@ -502,6 +536,22 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 32, def PseudoLI : Pseudo<(outs GPR:$rd), (ins ixlenimm_li:$imm), [], "li", "$rd, $imm">; +def PseudoLB : PseudoLoad<"lb">; +def PseudoLBU : PseudoLoad<"lbu">; +def PseudoLH : PseudoLoad<"lh">; +def PseudoLHU : PseudoLoad<"lhu">; +def PseudoLW : PseudoLoad<"lw">; + +def PseudoSB : PseudoStore<"sb">; +def PseudoSH : PseudoStore<"sh">; +def PseudoSW : PseudoStore<"sw">; + +let Predicates = [IsRV64] in { +def PseudoLWU : PseudoLoad<"lwu">; +def PseudoLD : PseudoLoad<"ld">; +def PseudoSD : PseudoStore<"sd">; +} // Predicates = [IsRV64] + def : InstAlias<"mv $rd, $rs", (ADDI GPR:$rd, GPR:$rs, 0)>; def : InstAlias<"not $rd, $rs", (XORI GPR:$rd, GPR:$rs, -1)>; def : InstAlias<"neg $rd, $rs", (SUB GPR:$rd, X0, GPR:$rs)>; @@ -547,27 +597,36 @@ def : InstAlias<"bgtu $rs, $rt, $offset", def : InstAlias<"bleu $rs, $rt, $offset", (BGEU GPR:$rt, GPR:$rs, simm13_lsb0:$offset), 0>; -// "ret" has more weight since "ret" and "jr" alias the same "jalr" instruction. -def : InstAlias<"j $offset", (JAL X0, simm21_lsb0_jal:$offset)>; -def : InstAlias<"jal $offset", (JAL X1, simm21_lsb0_jal:$offset)>; -def : InstAlias<"jr $rs", (JALR X0, GPR:$rs, 0)>; -def : InstAlias<"jalr $rs", (JALR X1, GPR:$rs, 0)>; -def : InstAlias<"ret", (JALR X0, X1, 0), 2>; +def : InstAlias<"j $offset", (JAL X0, simm21_lsb0_jal:$offset)>; +def : InstAlias<"jal $offset", (JAL X1, simm21_lsb0_jal:$offset)>; + +// Non-zero offset aliases of "jalr" are the lowest weight, followed by the +// two-register form, then the one-register forms and finally "ret". +def : InstAlias<"jr $rs", (JALR X0, GPR:$rs, 0), 3>; +def : InstAlias<"jr ${offset}(${rs})", (JALR X0, GPR:$rs, simm12:$offset)>; +def : InstAlias<"jalr $rs", (JALR X1, GPR:$rs, 0), 3>; +def : InstAlias<"jalr ${offset}(${rs})", (JALR X1, GPR:$rs, simm12:$offset)>; +def : InstAlias<"jalr $rd, $rs", (JALR GPR:$rd, GPR:$rs, 0), 2>; +def : InstAlias<"ret", (JALR X0, X1, 0), 4>; + +// Non-canonical forms for jump targets also accepted by the assembler. 
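+// For example, "jr t0, 16" is accepted as another spelling of +// "jalr x0, 16(t0)", and "jalr t1, t2, 8" of "jalr t1, 8(t2)".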
+def : InstAlias<"jr $rs, $offset", (JALR X0, GPR:$rs, simm12:$offset), 0>; +def : InstAlias<"jalr $rs, $offset", (JALR X1, GPR:$rs, simm12:$offset), 0>; +def : InstAlias<"jalr $rd, $rs, $offset", (JALR GPR:$rd, GPR:$rs, simm12:$offset), 0>; + // TODO call // TODO tail def : InstAlias<"fence", (FENCE 0xF, 0xF)>; // 0xF == iorw -// CSR Addresses: 0xC00 == cycle, 0xC01 == time, 0xC02 == instret -// 0xC80 == cycleh, 0xC81 == timeh, 0xC82 == instreth -def : InstAlias<"rdinstret $rd", (CSRRS GPR:$rd, 0xC02, X0)>; -def : InstAlias<"rdcycle $rd", (CSRRS GPR:$rd, 0xC00, X0)>; -def : InstAlias<"rdtime $rd", (CSRRS GPR:$rd, 0xC01, X0)>; +def : InstAlias<"rdinstret $rd", (CSRRS GPR:$rd, INSTRET.Encoding, X0)>; +def : InstAlias<"rdcycle $rd", (CSRRS GPR:$rd, CYCLE.Encoding, X0)>; +def : InstAlias<"rdtime $rd", (CSRRS GPR:$rd, TIME.Encoding, X0)>; let Predicates = [IsRV32] in { -def : InstAlias<"rdinstreth $rd", (CSRRS GPR:$rd, 0xC82, X0)>; -def : InstAlias<"rdcycleh $rd", (CSRRS GPR:$rd, 0xC80, X0)>; -def : InstAlias<"rdtimeh $rd", (CSRRS GPR:$rd, 0xC81, X0)>; +def : InstAlias<"rdinstreth $rd", (CSRRS GPR:$rd, INSTRETH.Encoding, X0)>; +def : InstAlias<"rdcycleh $rd", (CSRRS GPR:$rd, CYCLEH.Encoding, X0)>; +def : InstAlias<"rdtimeh $rd", (CSRRS GPR:$rd, TIMEH.Encoding, X0)>; } // Predicates = [IsRV32] def : InstAlias<"csrr $rd, $csr", (CSRRS GPR:$rd, csr_sysreg:$csr, X0)>; @@ -593,6 +652,24 @@ def : InstAlias<"sfence.vma", (SFENCE_VMA X0, X0)>; def : InstAlias<"sfence.vma $rs", (SFENCE_VMA GPR:$rs, X0)>; let EmitPriority = 0 in { +def : InstAlias<"lb $rd, (${rs1})", + (LB GPR:$rd, GPR:$rs1, 0)>; +def : InstAlias<"lh $rd, (${rs1})", + (LH GPR:$rd, GPR:$rs1, 0)>; +def : InstAlias<"lw $rd, (${rs1})", + (LW GPR:$rd, GPR:$rs1, 0)>; +def : InstAlias<"lbu $rd, (${rs1})", + (LBU GPR:$rd, GPR:$rs1, 0)>; +def : InstAlias<"lhu $rd, (${rs1})", + (LHU GPR:$rd, GPR:$rs1, 0)>; + +def : InstAlias<"sb $rs2, (${rs1})", + (SB GPR:$rs2, GPR:$rs1, 0)>; +def : InstAlias<"sh $rs2, (${rs1})", + (SH GPR:$rs2, GPR:$rs1, 0)>; +def : InstAlias<"sw $rs2, (${rs1})", + (SW GPR:$rs2, GPR:$rs1, 0)>; + def : InstAlias<"add $rd, $rs1, $imm12", (ADDI GPR:$rd, GPR:$rs1, simm12:$imm12)>; def : InstAlias<"and $rd, $rs1, $imm12", @@ -608,6 +685,13 @@ def : InstAlias<"srl $rd, $rs1, $shamt", def : InstAlias<"sra $rd, $rs1, $shamt", (SRAI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt)>; let Predicates = [IsRV64] in { +def : InstAlias<"lwu $rd, (${rs1})", + (LWU GPR:$rd, GPR:$rs1, 0)>; +def : InstAlias<"ld $rd, (${rs1})", + (LD GPR:$rd, GPR:$rs1, 0)>; +def : InstAlias<"sd $rs2, (${rs1})", + (SD GPR:$rs2, GPR:$rs1, 0)>; + def : InstAlias<"addw $rd, $rs1, $imm12", (ADDIW GPR:$rd, GPR:$rs1, simm12:$imm12)>; def : InstAlias<"sllw $rd, $rs1, $shamt", @@ -663,21 +747,9 @@ def sexti32 : PatFrags<(ops node:$src), def assertzexti32 : PatFrag<(ops node:$src), (assertzext node:$src), [{ return cast(N->getOperand(1))->getVT() == MVT::i32; }]>; -def assertzexti5 : PatFrag<(ops node:$src), (assertzext node:$src), [{ - return cast(N->getOperand(1))->getVT().getSizeInBits() <= 5; -}]>; def zexti32 : PatFrags<(ops node:$src), [(and node:$src, 0xffffffff), (assertzexti32 node:$src)]>; -// Defines a legal mask for (assertzexti5 (and src, mask)) to be combinable -// with a shiftw operation. The mask mustn't modify the lower 5 bits or the -// upper 32 bits. 
-def shiftwamt_mask : ImmLeaf<XLenVT, [{ - return countTrailingOnes<uint64_t>(Imm) >= 5 && isUInt<32>(Imm); -}]>; -def shiftwamt : PatFrags<(ops node:$src), - [(assertzexti5 (and node:$src, shiftwamt_mask)), - (assertzexti5 node:$src)]>; /// Immediates @@ -714,6 +786,15 @@ def : PatGprGpr<shiftop<shl>, SLL>; def : PatGprGpr<shiftop<srl>, SRL>; def : PatGprGpr<shiftop<sra>, SRA>; +// This is a special case of the ADD instruction used to facilitate the use of a +// fourth operand to emit a relocation on a symbol relating to this instruction. +// The relocation does not affect any bits of the instruction itself but is used +// as a hint to the linker. +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0 in +def PseudoAddTPRel : Pseudo<(outs GPR:$rd), + (ins GPR:$rs1, GPR:$rs2, tprel_add_symbol:$src), [], + "add", "$rd, $rs1, $rs2, $src">; + /// FrameIndex calculations def : Pat<(add (i32 AddrFI:$Rs), simm12:$imm12), @@ -732,8 +813,12 @@ def : PatGprSimm12<setult, SLTIU>; // handled by a RISC-V instruction. def : Pat<(seteq GPR:$rs1, 0), (SLTIU GPR:$rs1, 1)>; def : Pat<(seteq GPR:$rs1, GPR:$rs2), (SLTIU (XOR GPR:$rs1, GPR:$rs2), 1)>; +def : Pat<(seteq GPR:$rs1, simm12:$imm12), + (SLTIU (XORI GPR:$rs1, simm12:$imm12), 1)>; def : Pat<(setne GPR:$rs1, 0), (SLTU X0, GPR:$rs1)>; def : Pat<(setne GPR:$rs1, GPR:$rs2), (SLTU X0, (XOR GPR:$rs1, GPR:$rs2))>; +def : Pat<(setne GPR:$rs1, simm12:$imm12), + (SLTU X0, (XORI GPR:$rs1, simm12:$imm12))>; def : Pat<(setugt GPR:$rs1, GPR:$rs2), (SLTU GPR:$rs2, GPR:$rs1)>; def : Pat<(setuge GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs1, GPR:$rs2), 1)>; def : Pat<(setule GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs2, GPR:$rs1), 1)>; @@ -746,7 +831,7 @@ class SelectCC_rrirr<RegisterClass valty, RegisterClass cmpty> : Pseudo<(outs valty:$dst), (ins cmpty:$lhs, cmpty:$rhs, ixlenimm:$imm, valty:$truev, valty:$falsev), - [(set valty:$dst, (SelectCC cmpty:$lhs, cmpty:$rhs, + [(set valty:$dst, (riscv_selectcc cmpty:$lhs, cmpty:$rhs, (XLenVT imm:$imm), valty:$truev, valty:$falsev))]>; def Select_GPR_Using_CC_GPR : SelectCC_rrirr<GPR, GPR>; @@ -794,6 +879,17 @@ def : Pat<(brind GPR:$rs1), (PseudoBRIND GPR:$rs1, 0)>; def : Pat<(brind (add GPR:$rs1, simm12:$imm12)), (PseudoBRIND GPR:$rs1, simm12:$imm12)>; +// PseudoCALLReg is a generic pseudo instruction for calls which will eventually +// expand to auipc and jalr while encoding, with any given register used as the +// destination. +// Define AsmString to print "call" when compiling with the -S flag. +// Define isCodeGenOnly = 0 to support parsing assembly "call" instruction. +let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, hasSideEffects = 0, + mayStore = 0, mayLoad = 0 in +def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []> { + let AsmString = "call\t$rd, $func"; +} + // PseudoCALL is a pseudo instruction which will eventually expand to auipc // and jalr while encoding. This is desirable, as an auipc+jalr pair with // R_RISCV_CALL and R_RISCV_RELAX relocations can be relaxed by the linker // Define AsmString to print "call" when compiling with the -S flag. // Define isCodeGenOnly = 0 to support parsing assembly "call" instruction.
let isCall = 1, Defs = [X1], isCodeGenOnly = 0 in -def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func), - [(Call tglobaladdr:$func)]> { +def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []> { let AsmString = "call\t$func"; } -def : Pat<(Call texternalsym:$func), (PseudoCALL texternalsym:$func)>; +def : Pat<(riscv_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; +def : Pat<(riscv_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; -def : Pat<(URetFlag), (URET X0, X0)>; -def : Pat<(SRetFlag), (SRET X0, X0)>; -def : Pat<(MRetFlag), (MRET X0, X0)>; +def : Pat<(riscv_uret_flag), (URET X0, X0)>; +def : Pat<(riscv_sret_flag), (SRET X0, X0)>; +def : Pat<(riscv_mret_flag), (MRET X0, X0)>; let isCall = 1, Defs = [X1] in -def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rs1), [(Call GPR:$rs1)]>, +def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rs1), + [(riscv_call GPR:$rs1)]>, PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>; let isBarrier = 1, isReturn = 1, isTerminator = 1 in -def PseudoRET : Pseudo<(outs), (ins), [(RetFlag)]>, +def PseudoRET : Pseudo<(outs), (ins), [(riscv_ret_flag)]>, PseudoInstExpansion<(JALR X0, X1, 0)>; // PseudoTAIL is a pseudo instruction similar to PseudoCALL and will eventually @@ -825,17 +922,18 @@ def PseudoRET : Pseudo<(outs), (ins), [(RetFlag)]>, // Define AsmString to print "tail" when compiling with the -S flag. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2], isCodeGenOnly = 0 in -def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst), []> { +def PseudoTAIL : Pseudo<(outs), (ins call_symbol:$dst), []> { let AsmString = "tail\t$dst"; } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2] in -def PseudoTAILIndirect : Pseudo<(outs), (ins GPRTC:$rs1), [(Tail GPRTC:$rs1)]>, +def PseudoTAILIndirect : Pseudo<(outs), (ins GPRTC:$rs1), + [(riscv_tail GPRTC:$rs1)]>, PseudoInstExpansion<(JALR X0, GPR:$rs1, 0)>; -def : Pat<(Tail (iPTR tglobaladdr:$dst)), +def : Pat<(riscv_tail (iPTR tglobaladdr:$dst)), (PseudoTAIL texternalsym:$dst)>; -def : Pat<(Tail (iPTR texternalsym:$dst)), +def : Pat<(riscv_tail (iPTR texternalsym:$dst)), (PseudoTAIL texternalsym:$dst)>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0, @@ -843,6 +941,21 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0, def PseudoLLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "lla", "$dst, $src">; +let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, + isAsmParserOnly = 1 in +def PseudoLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la", "$dst, $src">; + +let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, + isAsmParserOnly = 1 in +def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.ie", "$dst, $src">; + +let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, + isAsmParserOnly = 1 in +def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.gd", "$dst, $src">; + /// Loads multiclass LdPat<PatFrag LoadOp, RVInst Inst> { @@ -906,9 +1019,9 @@ def : Pat<(atomic_fence (XLenVT 7), (imm)), (FENCE 0b11, 0b11)>; // Pessimistically assume the stack pointer will be clobbered let Defs = [X2], Uses = [X2] in { def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(CallSeqStart timm:$amt1, timm:$amt2)]>; + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(CallSeqEnd timm:$amt1, timm:$amt2)]>; + [(callseq_end timm:$amt1, timm:$amt2)]>; }
// Defs = [X2], Uses = [X2] /// RV64 patterns @@ -935,28 +1048,9 @@ def : Pat<(sext_inreg (shl GPR:$rs1, uimm5:$shamt), i32), def : Pat<(sra (sext_inreg GPR:$rs1, i32), uimm5:$shamt), (SRAIW GPR:$rs1, uimm5:$shamt)>; -// For variable-length shifts, we rely on assertzexti5 being inserted during -// lowering (see RISCVTargetLowering::PerformDAGCombine). This enables us to -// guarantee that selecting a 32-bit variable shift is legal (as the variable -// shift is known to be <= 32). We must also be careful not to create -// semantically incorrect patterns. For instance, selecting SRLW for -// (srl (zexti32 GPR:$rs1), (shiftwamt GPR:$rs2)), -// is not guaranteed to be safe, as we don't know whether the upper 32-bits of -// the result are used or not (in the case where rs2=0, this is a -// sign-extension operation). - -def : Pat<(sext_inreg (shl GPR:$rs1, (shiftwamt GPR:$rs2)), i32), - (SLLW GPR:$rs1, GPR:$rs2)>; -def : Pat<(zexti32 (shl GPR:$rs1, (shiftwamt GPR:$rs2))), - (SRLI (SLLI (SLLW GPR:$rs1, GPR:$rs2), 32), 32)>; - -def : Pat<(sext_inreg (srl (zexti32 GPR:$rs1), (shiftwamt GPR:$rs2)), i32), - (SRLW GPR:$rs1, GPR:$rs2)>; -def : Pat<(zexti32 (srl (zexti32 GPR:$rs1), (shiftwamt GPR:$rs2))), - (SRLI (SLLI (SRLW GPR:$rs1, GPR:$rs2), 32), 32)>; - -def : Pat<(sra (sexti32 GPR:$rs1), (shiftwamt GPR:$rs2)), - (SRAW GPR:$rs1, GPR:$rs2)>; +def : PatGprGpr<riscv_sllw, SLLW>; +def : PatGprGpr<riscv_sraw, SRAW>; +def : PatGprGpr<riscv_srlw, SRLW>; /// Loads @@ -971,6 +1065,16 @@ defm : StPat<truncstorei32, SW, GPR>; defm : StPat<store, SD, GPR>; } // Predicates = [IsRV64] /// readcyclecounter // On RV64, we can directly read the 64-bit "cycle" CSR. let Predicates = [IsRV64] in def : Pat<(readcyclecounter), (CSRRS CYCLE.Encoding, X0)>; // On RV32, ReadCycleWide will be expanded to the suggested loop reading both // halves of the 64-bit "cycle" CSR. let Predicates = [IsRV32], usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, hasNoSchedulingInfo = 1 in def ReadCycleWide : Pseudo<(outs GPR:$lo, GPR:$hi), (ins), [], "", "">; //===----------------------------------------------------------------------===// // Standard extensions //===----------------------------------------------------------------------===// diff --git a/lib/Target/RISCV/RISCVInstrInfoA.td b/lib/Target/RISCV/RISCVInstrInfoA.td index 9cb1d2f0b627..b768c9347b38 100644 --- a/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/lib/Target/RISCV/RISCVInstrInfoA.td @@ -1,9 +1,8 @@ //===-- RISCVInstrInfoA.td - RISC-V 'A' instructions -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -85,7 +84,7 @@ defm AMOMIN_D : AMO_rr_aq_rl<0b10000, 0b011, "amomin.d">; defm AMOMAX_D : AMO_rr_aq_rl<0b10100, 0b011, "amomax.d">; defm AMOMINU_D : AMO_rr_aq_rl<0b11000, 0b011, "amominu.d">; defm AMOMAXU_D : AMO_rr_aq_rl<0b11100, 0b011, "amomaxu.d">; -} // Predicates = [HasStedExtA, IsRV64] +} // Predicates = [HasStdExtA, IsRV64] //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns @@ -235,7 +234,7 @@ def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin_i32, PseudoMaskedAtomicLoadUMin32>; def PseudoCmpXchg32 : Pseudo<(outs GPR:$res, GPR:$scratch), - (ins GPR:$addr, GPR:$cmpval, GPR:$newval, i32imm:$ordering), []> { + (ins GPR:$addr, GPR:$cmpval, GPR:$newval, ixlenimm:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; @@ -263,7 +262,7 @@ defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>; def PseudoMaskedCmpXchg32 : Pseudo<(outs GPR:$res, GPR:$scratch), (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, - i32imm:$ordering), []> { + ixlenimm:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; @@ -276,3 +275,79 @@ def : Pat<(int_riscv_masked_cmpxchg_i32 GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>; } // Predicates = [HasStdExtA] + +let Predicates = [HasStdExtA, IsRV64] in { + +/// 64-bit atomic loads and stores + +// Fences will be inserted for atomic load/stores according to the logic in +// RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}. +defm : LdPat<atomic_load_64, LD>; +defm : AtomicStPat<atomic_store_64, SD, GPR>; + +defm : AMOPat<"atomic_swap_64", "AMOSWAP_D">; +defm : AMOPat<"atomic_load_add_64", "AMOADD_D">; +defm : AMOPat<"atomic_load_and_64", "AMOAND_D">; +defm : AMOPat<"atomic_load_or_64", "AMOOR_D">; +defm : AMOPat<"atomic_load_xor_64", "AMOXOR_D">; +defm : AMOPat<"atomic_load_max_64", "AMOMAX_D">; +defm : AMOPat<"atomic_load_min_64", "AMOMIN_D">; +defm : AMOPat<"atomic_load_umax_64", "AMOMAXU_D">; +defm : AMOPat<"atomic_load_umin_64", "AMOMINU_D">; + +/// 64-bit AMOs + +def : Pat<(atomic_load_sub_64_monotonic GPR:$addr, GPR:$incr), + (AMOADD_D GPR:$addr, (SUB X0, GPR:$incr))>; +def : Pat<(atomic_load_sub_64_acquire GPR:$addr, GPR:$incr), + (AMOADD_D_AQ GPR:$addr, (SUB X0, GPR:$incr))>; +def : Pat<(atomic_load_sub_64_release GPR:$addr, GPR:$incr), + (AMOADD_D_RL GPR:$addr, (SUB X0, GPR:$incr))>; +def : Pat<(atomic_load_sub_64_acq_rel GPR:$addr, GPR:$incr), + (AMOADD_D_AQ_RL GPR:$addr, (SUB X0, GPR:$incr))>; +def : Pat<(atomic_load_sub_64_seq_cst GPR:$addr, GPR:$incr), + (AMOADD_D_AQ_RL GPR:$addr, (SUB X0, GPR:$incr))>; + +/// 64-bit pseudo AMOs + +def PseudoAtomicLoadNand64 : PseudoAMO; +// Ordering constants must be kept in sync with the AtomicOrdering enum in +// AtomicOrdering.h.
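+// In that enum, monotonic = 2, acquire = 4, release = 5, acq_rel = 6 and +// seq_cst = 7, which is where the immediate operands in the patterns below +// come from.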
+def : Pat<(atomic_load_nand_64_monotonic GPR:$addr, GPR:$incr), + (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 2)>; +def : Pat<(atomic_load_nand_64_acquire GPR:$addr, GPR:$incr), + (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 4)>; +def : Pat<(atomic_load_nand_64_release GPR:$addr, GPR:$incr), + (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 5)>; +def : Pat<(atomic_load_nand_64_acq_rel GPR:$addr, GPR:$incr), + (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 6)>; +def : Pat<(atomic_load_nand_64_seq_cst GPR:$addr, GPR:$incr), + (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 7)>; + +def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_xchg_i64, PseudoMaskedAtomicSwap32>; +def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_add_i64, PseudoMaskedAtomicLoadAdd32>; +def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_sub_i64, PseudoMaskedAtomicLoadSub32>; +def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_nand_i64, PseudoMaskedAtomicLoadNand32>; +def : PseudoMaskedAMOMinMaxPat<int_riscv_masked_atomicrmw_max_i64, PseudoMaskedAtomicLoadMax32>; +def : PseudoMaskedAMOMinMaxPat<int_riscv_masked_atomicrmw_min_i64, PseudoMaskedAtomicLoadMin32>; +def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umax_i64, PseudoMaskedAtomicLoadUMax32>; +def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin_i64, PseudoMaskedAtomicLoadUMin32>; + +/// 64-bit compare and exchange + +def PseudoCmpXchg64 : PseudoCmpXchg; +defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64>; + +def : Pat<(int_riscv_masked_cmpxchg_i64 + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering), + (PseudoMaskedCmpXchg32 + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>; +} // Predicates = [HasStdExtA, IsRV64] diff --git a/lib/Target/RISCV/RISCVInstrInfoC.td b/lib/Target/RISCV/RISCVInstrInfoC.td index ad68b5a7dc97..94477341eea7 100644 --- a/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/lib/Target/RISCV/RISCVInstrInfoC.td @@ -1,9 +1,8 @@ //===- RISCVInstrInfoC.td - Compressed RISCV instructions -*- tblgen-*-----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -523,6 +522,56 @@ def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther> { } // Predicates = [HasStdExtC] +//===----------------------------------------------------------------------===// +// Assembler Pseudo Instructions +//===----------------------------------------------------------------------===// + +let EmitPriority = 0 in { +let Predicates = [HasStdExtC, HasStdExtD] in +def : InstAlias<"c.fld $rd, (${rs1})", (C_FLD FPR64C:$rd, GPRC:$rs1, 0)>; + +def : InstAlias<"c.lw $rd, (${rs1})", (C_LW GPRC:$rd, GPRC:$rs1, 0)>; + +let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in +def : InstAlias<"c.flw $rd, (${rs1})", (C_FLW FPR32C:$rd, GPRC:$rs1, 0)>; + +let Predicates = [HasStdExtC, IsRV64] in +def : InstAlias<"c.ld $rd, (${rs1})", (C_LD GPRC:$rd, GPRC:$rs1, 0)>; + +let Predicates = [HasStdExtC, HasStdExtD] in +def : InstAlias<"c.fsd $rs2, (${rs1})", (C_FSD FPR64C:$rs2, GPRC:$rs1, 0)>; + +def : InstAlias<"c.sw $rs2, (${rs1})", (C_SW GPRC:$rs2, GPRC:$rs1, 0)>; + +let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in +def : InstAlias<"c.fsw $rs2, (${rs1})", (C_FSW FPR32C:$rs2, GPRC:$rs1, 0)>; + +let Predicates = [HasStdExtC, IsRV64] in +def : InstAlias<"c.sd $rs2, (${rs1})", (C_SD GPRC:$rs2, GPRC:$rs1, 0)>; + +let Predicates = [HasStdExtC, HasStdExtD] in +def : InstAlias<"c.fldsp $rd, (${rs1})", (C_FLDSP FPR64C:$rd, SP:$rs1, 0)>; + +def : InstAlias<"c.lwsp $rd, (${rs1})", (C_LWSP GPRC:$rd, SP:$rs1, 0)>; + +let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in +def : InstAlias<"c.flwsp $rd, (${rs1})", (C_FLWSP FPR32C:$rd, SP:$rs1, 0)>; + +let Predicates = [HasStdExtC, IsRV64] in +def : InstAlias<"c.ldsp $rd, (${rs1})", (C_LDSP GPRC:$rd, SP:$rs1, 0)>; + +let Predicates = [HasStdExtC, HasStdExtD] in +def : InstAlias<"c.fsdsp $rs2, (${rs1})", (C_FSDSP FPR64C:$rs2, SP:$rs1, 0)>; + +def : InstAlias<"c.swsp $rs2, (${rs1})", (C_SWSP GPRC:$rs2, SP:$rs1, 0)>; + +let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in +def : InstAlias<"c.fswsp $rs2, (${rs1})", (C_FSWSP FPR32C:$rs2, SP:$rs1, 0)>; + +let Predicates = [HasStdExtC, IsRV64] in +def : InstAlias<"c.sdsp $rs2, (${rs1})", (C_SDSP GPRC:$rs2, SP:$rs1, 0)>; +} + //===----------------------------------------------------------------------===// // Compress Instruction tablegen backend. //===----------------------------------------------------------------------===// diff --git a/lib/Target/RISCV/RISCVInstrInfoD.td b/lib/Target/RISCV/RISCVInstrInfoD.td index 9f1cd50de595..fe38c4ff02d3 100644 --- a/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/lib/Target/RISCV/RISCVInstrInfoD.td @@ -1,9 +1,8 @@ //===-- RISCVInstrInfoD.td - RISC-V 'D' instructions -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -179,8 +178,8 @@ def FMV_D_X : FPUnaryOp_r<0b1111001, 0b000, FPR64, GPR, "fmv.d.x"> { //===----------------------------------------------------------------------===// let Predicates = [HasStdExtD] in { -// TODO fld -// TODO fsd +def : InstAlias<"fld $rd, (${rs1})", (FLD FPR64:$rd, GPR:$rs1, 0), 0>; +def : InstAlias<"fsd $rs2, (${rs1})", (FSD FPR64:$rs2, GPR:$rs1, 0), 0>; def : InstAlias<"fmv.d $rd, $rs", (FSGNJ_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>; def : InstAlias<"fabs.d $rd, $rs", (FSGNJX_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>; @@ -192,6 +191,9 @@ def : InstAlias<"fgt.d $rd, $rs, $rt", (FLT_D GPR:$rd, FPR64:$rt, FPR64:$rs), 0>; def : InstAlias<"fge.d $rd, $rs, $rt", (FLE_D GPR:$rd, FPR64:$rt, FPR64:$rs), 0>; + +def PseudoFLD : PseudoFloatLoad<"fld", FPR64>; +def PseudoFSD : PseudoStore<"fsd", FPR64>; } // Predicates = [HasStdExtD] //===----------------------------------------------------------------------===// @@ -268,6 +270,10 @@ def : PatFpr64Fpr64; // handled by a RISC-V instruction and aren't expanded in the SelectionDAG // Legalizer. +def : Pat<(seto FPR64:$rs1, FPR64:$rs2), + (AND (FEQ_D FPR64:$rs1, FPR64:$rs1), + (FEQ_D FPR64:$rs2, FPR64:$rs2))>; + def : Pat<(setuo FPR64:$rs1, FPR64:$rs2), (SLTIU (AND (FEQ_D FPR64:$rs1, FPR64:$rs1), (FEQ_D FPR64:$rs2, FPR64:$rs2)), @@ -308,3 +314,26 @@ def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_WU_D FPR64:$rs1, 0b001)>; def : Pat<(sint_to_fp GPR:$rs1), (FCVT_D_W GPR:$rs1)>; def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_WU GPR:$rs1)>; } // Predicates = [HasStdExtD, IsRV32] + +let Predicates = [HasStdExtD, IsRV64] in { +def : Pat<(bitconvert GPR:$rs1), (FMV_D_X GPR:$rs1)>; +def : Pat<(bitconvert FPR64:$rs1), (FMV_X_D FPR64:$rs1)>; + +// FP->[u]int32 is mostly handled by the FP->[u]int64 patterns. This is safe +// because fpto[u|s]i produce poison if the value can't fit into the target. +// We match the single case below because fcvt.wu.d sign-extends its result so +// is cheaper than fcvt.lu.d+sext.w. +def : Pat<(sext_inreg (zexti32 (fp_to_uint FPR64:$rs1)), i32), + (FCVT_WU_D $rs1, 0b001)>; + +// [u]int32->fp +def : Pat<(sint_to_fp (sext_inreg GPR:$rs1, i32)), (FCVT_D_W $rs1)>; +def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_D_WU $rs1)>; + +def : Pat<(fp_to_sint FPR64:$rs1), (FCVT_L_D FPR64:$rs1, 0b001)>; +def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_LU_D FPR64:$rs1, 0b001)>; + +// [u]int64->fp. Match GCC and default to using dynamic rounding mode. +def : Pat<(sint_to_fp GPR:$rs1), (FCVT_D_L GPR:$rs1, 0b111)>; +def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_LU GPR:$rs1, 0b111)>; +} // Predicates = [HasStdExtD, IsRV64] diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td index 03bdac45873d..032642942f2b 100644 --- a/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/lib/Target/RISCV/RISCVInstrInfoF.td @@ -1,9 +1,8 @@ //===-- RISCVInstrInfoF.td - RISC-V 'F' instructions -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -12,6 +11,20 @@
 //
 //===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_RISCVFMV_W_X_RV64
+    : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>;
+def SDT_RISCVFMV_X_ANYEXTW_RV64
+    : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>;
+
+def riscv_fmv_w_x_rv64
+    : SDNode<"RISCVISD::FMV_W_X_RV64", SDT_RISCVFMV_W_X_RV64>;
+def riscv_fmv_x_anyextw_rv64
+    : SDNode<"RISCVISD::FMV_X_ANYEXTW_RV64", SDT_RISCVFMV_X_ANYEXTW_RV64>;
+
 //===----------------------------------------------------------------------===//
 // Operand and SDNode transformation definitions.
 //===----------------------------------------------------------------------===//
@@ -193,8 +206,8 @@ def : FPUnaryOpDynFrmAlias;
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasStdExtF] in {
-// TODO flw
-// TODO fsw
+def : InstAlias<"flw $rd, (${rs1})", (FLW FPR32:$rd, GPR:$rs1, 0), 0>;
+def : InstAlias<"fsw $rs2, (${rs1})", (FSW FPR32:$rs2, GPR:$rs1, 0), 0>;
 
 def : InstAlias<"fmv.s $rd, $rs", (FSGNJ_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
 def : InstAlias<"fabs.s $rd, $rs", (FSGNJX_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
@@ -209,28 +222,30 @@ def : InstAlias<"fge.s $rd, $rs, $rt",
                 (FLE_S GPR:$rd, FPR32:$rt, FPR32:$rs), 0>;
 
 // The following csr instructions actually alias instructions from the base ISA.
 // However, it only makes sense to support them when the F extension is enabled.
-// CSR Addresses: 0x003 == fcsr, 0x002 == frm, 0x001 == fflags
 // NOTE: "frcsr", "frrm", and "frflags" are more specialized versions of "csrr".
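// (FFLAGS, FRM and FCSR used in the aliases that follow are the named SysReg
// definitions from RISCVSystemOperands.td; their Encoding fields hold the
// same CSR addresses 0x001, 0x002 and 0x003 previously spelled as literals.)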
-def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, 0x003, X0), 2>; -def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, 0x003, GPR:$rs)>; -def : InstAlias<"fscsr $rs", (CSRRW X0, 0x003, GPR:$rs), 2>; - -def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, 0x002, X0), 2>; -def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, 0x002, GPR:$rs)>; -def : InstAlias<"fsrm $rs", (CSRRW X0, 0x002, GPR:$rs), 2>; -def : InstAlias<"fsrmi $rd, $imm", (CSRRWI GPR:$rd, 0x002, uimm5:$imm)>; -def : InstAlias<"fsrmi $imm", (CSRRWI X0, 0x002, uimm5:$imm), 2>; - -def : InstAlias<"frflags $rd", (CSRRS GPR:$rd, 0x001, X0), 2>; -def : InstAlias<"fsflags $rd, $rs", (CSRRW GPR:$rd, 0x001, GPR:$rs)>; -def : InstAlias<"fsflags $rs", (CSRRW X0, 0x001, GPR:$rs), 2>; -def : InstAlias<"fsflagsi $rd, $imm", (CSRRWI GPR:$rd, 0x001, uimm5:$imm)>; -def : InstAlias<"fsflagsi $imm", (CSRRWI X0, 0x001, uimm5:$imm), 2>; +def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 2>; +def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs)>; +def : InstAlias<"fscsr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 2>; + +def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, FRM.Encoding, X0), 2>; +def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, FRM.Encoding, GPR:$rs)>; +def : InstAlias<"fsrm $rs", (CSRRW X0, FRM.Encoding, GPR:$rs), 2>; +def : InstAlias<"fsrmi $rd, $imm", (CSRRWI GPR:$rd, FRM.Encoding, uimm5:$imm)>; +def : InstAlias<"fsrmi $imm", (CSRRWI X0, FRM.Encoding, uimm5:$imm), 2>; + +def : InstAlias<"frflags $rd", (CSRRS GPR:$rd, FFLAGS.Encoding, X0), 2>; +def : InstAlias<"fsflags $rd, $rs", (CSRRW GPR:$rd, FFLAGS.Encoding, GPR:$rs)>; +def : InstAlias<"fsflags $rs", (CSRRW X0, FFLAGS.Encoding, GPR:$rs), 2>; +def : InstAlias<"fsflagsi $rd, $imm", (CSRRWI GPR:$rd, FFLAGS.Encoding, uimm5:$imm)>; +def : InstAlias<"fsflagsi $imm", (CSRRWI X0, FFLAGS.Encoding, uimm5:$imm), 2>; // fmv.w.x and fmv.x.w were previously known as fmv.s.x and fmv.x.s. Both // spellings should be supported by standard tools. def : MnemonicAlias<"fmv.s.x", "fmv.w.x">; def : MnemonicAlias<"fmv.x.s", "fmv.x.w">; + +def PseudoFLW : PseudoFloatLoad<"flw", FPR32>; +def PseudoFSW : PseudoStore<"fsw", FPR32>; } // Predicates = [HasStdExtF] //===----------------------------------------------------------------------===// @@ -308,6 +323,10 @@ def : PatFpr32Fpr32; // handled by a RISC-V instruction and aren't expanded in the SelectionDAG // Legalizer. +def : Pat<(seto FPR32:$rs1, FPR32:$rs2), + (AND (FEQ_S FPR32:$rs1, FPR32:$rs1), + (FEQ_S FPR32:$rs2, FPR32:$rs2))>; + def : Pat<(setuo FPR32:$rs1, FPR32:$rs2), (SLTIU (AND (FEQ_S FPR32:$rs1, FPR32:$rs1), (FEQ_S FPR32:$rs2, FPR32:$rs2)), @@ -334,3 +353,37 @@ def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>; def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_W $rs1, 0b111)>; def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>; } // Predicates = [HasStdExtF, IsRV32] + +let Predicates = [HasStdExtF, IsRV32] in { +// FP->[u]int. Round-to-zero must be used +def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>; +def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>; + +// [u]int->fp. Match GCC and default to using dynamic rounding mode. 
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_W $rs1, 0b111)>; +def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>; +} // Predicates = [HasStdExtF, IsRV32] + +let Predicates = [HasStdExtF, IsRV64] in { +def : Pat<(riscv_fmv_w_x_rv64 GPR:$src), (FMV_W_X GPR:$src)>; +def : Pat<(riscv_fmv_x_anyextw_rv64 FPR32:$src), (FMV_X_W FPR32:$src)>; +def : Pat<(sexti32 (riscv_fmv_x_anyextw_rv64 FPR32:$src)), + (FMV_X_W FPR32:$src)>; + +// FP->[u]int32 is mostly handled by the FP->[u]int64 patterns. This is safe +// because fpto[u|s]i produces poison if the value can't fit into the target. +// We match the single case below because fcvt.wu.s sign-extends its result so +// is cheaper than fcvt.lu.s+sext.w. +def : Pat<(sext_inreg (assertzexti32 (fp_to_uint FPR32:$rs1)), i32), + (FCVT_WU_S $rs1, 0b001)>; + +// FP->[u]int64 +def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_L_S $rs1, 0b001)>; +def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_LU_S $rs1, 0b001)>; + +// [u]int->fp. Match GCC and default to using dynamic rounding mode. +def : Pat<(sint_to_fp (sext_inreg GPR:$rs1, i32)), (FCVT_S_W $rs1, 0b111)>; +def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_S_WU $rs1, 0b111)>; +def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_L $rs1, 0b111)>; +def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_LU $rs1, 0b111)>; +} // Predicates = [HasStdExtF, IsRV64] diff --git a/lib/Target/RISCV/RISCVInstrInfoM.td b/lib/Target/RISCV/RISCVInstrInfoM.td index 05dd3311ad54..e75151ba99c7 100644 --- a/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/lib/Target/RISCV/RISCVInstrInfoM.td @@ -1,9 +1,8 @@ //===-- RISCVInstrInfoM.td - RISC-V 'M' instructions -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,6 +11,14 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// RISC-V specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def riscv_divw : SDNode<"RISCVISD::DIVW", SDTIntBinOp>; +def riscv_divuw : SDNode<"RISCVISD::DIVUW", SDTIntBinOp>; +def riscv_remuw : SDNode<"RISCVISD::REMUW", SDTIntBinOp>; + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -53,18 +60,19 @@ def : PatGprGpr; let Predicates = [HasStdExtM, IsRV64] in { def : Pat<(sext_inreg (mul GPR:$rs1, GPR:$rs2), i32), (MULW GPR:$rs1, GPR:$rs2)>; -def : Pat<(sext_inreg (sdiv (sexti32 GPR:$rs1), - (sexti32 GPR:$rs2)), i32), - (DIVW GPR:$rs1, GPR:$rs2)>; -def : Pat<(zexti32 (sdiv (sexti32 GPR:$rs1), - (sexti32 GPR:$rs2))), - (SRLI (SLLI (DIVW GPR:$rs1, GPR:$rs2), 32), 32)>; -def : Pat<(sext_inreg (udiv (zexti32 GPR:$rs1), (zexti32 GPR:$rs2)), i32), - (DIVUW GPR:$rs1, GPR:$rs2)>; -// It's cheaper to perform a divuw and zero-extend the result than to -// zero-extend both inputs to a udiv. 
-def : Pat<(udiv (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff)),
-          (SRLI (SLLI (DIVUW GPR:$rs1, GPR:$rs2), 32), 32)>;
+
+def : PatGprGpr<riscv_divw, DIVW>;
+def : PatGprGpr<riscv_divuw, DIVUW>;
+def : PatGprGpr<riscv_remuw, REMUW>;
+
+// Handle the specific cases where using DIVU/REMU would be correct and result
+// in fewer instructions than emitting DIVUW/REMUW then zero-extending the
+// result.
+def : Pat<(zexti32 (riscv_divuw (zexti32 GPR:$rs1), (zexti32 GPR:$rs2))),
+          (DIVU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(zexti32 (riscv_remuw (zexti32 GPR:$rs1), (zexti32 GPR:$rs2))),
+          (REMU GPR:$rs1, GPR:$rs2)>;
+
 // Although the sexti32 operands may not have originated from an i32 srem,
 // this pattern is safe as it is impossible for two sign extended inputs to
 // produce a result where res[63:32]=0 and res[31]=1.
@@ -73,10 +81,4 @@ def : Pat<(srem (sexti32 GPR:$rs1), (sexti32 GPR:$rs2)),
           (REMW GPR:$rs1, GPR:$rs2)>;
 def : Pat<(sext_inreg (srem (sexti32 GPR:$rs1), (sexti32 GPR:$rs2)), i32),
           (REMW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sext_inreg (urem (zexti32 GPR:$rs1), (zexti32 GPR:$rs2)), i32),
-          (REMUW GPR:$rs1, GPR:$rs2)>;
-// It's cheaper to perform a remuw and zero-extend the result than to
-// zero-extend both inputs to a urem.
-def : Pat<(urem (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff)),
-          (SRLI (SLLI (REMUW GPR:$rs1, GPR:$rs2), 32), 32)>;
 } // Predicates = [HasStdExtM, IsRV64]
diff --git a/lib/Target/RISCV/RISCVMCInstLower.cpp b/lib/Target/RISCV/RISCVMCInstLower.cpp
index e0100b1679be..b1dbcfa7f738 100644
--- a/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -1,9 +1,8 @@
 //===-- RISCVMCInstLower.cpp - Convert RISCV MachineInstr to an MCInst ------=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -37,12 +36,42 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, case RISCVII::MO_None: Kind = RISCVMCExpr::VK_RISCV_None; break; + case RISCVII::MO_CALL: + Kind = RISCVMCExpr::VK_RISCV_CALL; + break; + case RISCVII::MO_PLT: + Kind = RISCVMCExpr::VK_RISCV_CALL_PLT; + break; case RISCVII::MO_LO: Kind = RISCVMCExpr::VK_RISCV_LO; break; case RISCVII::MO_HI: Kind = RISCVMCExpr::VK_RISCV_HI; break; + case RISCVII::MO_PCREL_LO: + Kind = RISCVMCExpr::VK_RISCV_PCREL_LO; + break; + case RISCVII::MO_PCREL_HI: + Kind = RISCVMCExpr::VK_RISCV_PCREL_HI; + break; + case RISCVII::MO_GOT_HI: + Kind = RISCVMCExpr::VK_RISCV_GOT_HI; + break; + case RISCVII::MO_TPREL_LO: + Kind = RISCVMCExpr::VK_RISCV_TPREL_LO; + break; + case RISCVII::MO_TPREL_HI: + Kind = RISCVMCExpr::VK_RISCV_TPREL_HI; + break; + case RISCVII::MO_TPREL_ADD: + Kind = RISCVMCExpr::VK_RISCV_TPREL_ADD; + break; + case RISCVII::MO_TLS_GOT_HI: + Kind = RISCVMCExpr::VK_RISCV_TLS_GOT_HI; + break; + case RISCVII::MO_TLS_GD_HI: + Kind = RISCVMCExpr::VK_RISCV_TLS_GD_HI; + break; } const MCExpr *ME = diff --git a/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/lib/Target/RISCV/RISCVMachineFunctionInfo.h index 2fea3a1bdd2f..585bff2bc20a 100644 --- a/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -1,9 +1,8 @@ //=- RISCVMachineFunctionInfo.h - RISCV machine function info -----*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,8 +32,6 @@ private: int MoveF64FrameIndex = -1; public: - // RISCVMachineFunctionInfo() = default; - RISCVMachineFunctionInfo(MachineFunction &MF) : MF(MF) {} int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } diff --git a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index cea009c5447d..82b1209cb8e7 100644 --- a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -1,9 +1,8 @@ //===----- RISCVMergeBaseOffset.cpp - Optimise address calculations ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp index 3ed1dec434ce..e6a126e3e513 100644 --- a/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- RISCVRegisterInfo.cpp - RISCV Register Information ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -33,17 +32,32 @@ RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode)
 const MCPhysReg *
 RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  auto &Subtarget = MF->getSubtarget<RISCVSubtarget>();
   if (MF->getFunction().hasFnAttribute("interrupt")) {
-    if (MF->getSubtarget<RISCVSubtarget>().hasStdExtD())
+    if (Subtarget.hasStdExtD())
       return CSR_XLEN_F64_Interrupt_SaveList;
-    if (MF->getSubtarget<RISCVSubtarget>().hasStdExtF())
+    if (Subtarget.hasStdExtF())
       return CSR_XLEN_F32_Interrupt_SaveList;
     return CSR_Interrupt_SaveList;
   }
-  return CSR_SaveList;
+
+  switch (Subtarget.getTargetABI()) {
+  default:
+    llvm_unreachable("Unrecognized ABI");
+  case RISCVABI::ABI_ILP32:
+  case RISCVABI::ABI_LP64:
+    return CSR_ILP32_LP64_SaveList;
+  case RISCVABI::ABI_ILP32F:
+  case RISCVABI::ABI_LP64F:
+    return CSR_ILP32F_LP64F_SaveList;
+  case RISCVABI::ABI_ILP32D:
+  case RISCVABI::ABI_LP64D:
+    return CSR_ILP32D_LP64D_SaveList;
+  }
 }
 
 BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = getFrameLowering(MF);
   BitVector Reserved(getNumRegs());
 
   // Use markSuperRegs to ensure any register aliases are also reserved
@@ -52,7 +66,8 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   markSuperRegs(Reserved, RISCV::X2); // sp
   markSuperRegs(Reserved, RISCV::X3); // gp
   markSuperRegs(Reserved, RISCV::X4); // tp
-  markSuperRegs(Reserved, RISCV::X8); // fp
+  if (TFI->hasFP(MF))
+    markSuperRegs(Reserved, RISCV::X8); // fp
   assert(checkAllSuperRegsMarked(Reserved));
   return Reserved;
 }
@@ -109,7 +124,7 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
 }
 
-unsigned RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = getFrameLowering(MF);
   return TFI->hasFP(MF) ? RISCV::X8 : RISCV::X2;
 }
@@ -117,12 +132,26 @@ unsigned RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
 const uint32_t *
 RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF,
                                         CallingConv::ID /*CC*/) const {
+  auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
   if (MF.getFunction().hasFnAttribute("interrupt")) {
-    if (MF.getSubtarget<RISCVSubtarget>().hasStdExtD())
+    if (Subtarget.hasStdExtD())
      return CSR_XLEN_F64_Interrupt_RegMask;
-    if (MF.getSubtarget<RISCVSubtarget>().hasStdExtF())
+    if (Subtarget.hasStdExtF())
      return CSR_XLEN_F32_Interrupt_RegMask;
     return CSR_Interrupt_RegMask;
   }
-  return CSR_RegMask;
+
+  switch (Subtarget.getTargetABI()) {
+  default:
+    llvm_unreachable("Unrecognized ABI");
+  case RISCVABI::ABI_ILP32:
+  case RISCVABI::ABI_LP64:
+    return CSR_ILP32_LP64_RegMask;
+  case RISCVABI::ABI_ILP32F:
+  case RISCVABI::ABI_LP64F:
+    return CSR_ILP32F_LP64F_RegMask;
+  case RISCVABI::ABI_ILP32D:
+  case RISCVABI::ABI_LP64D:
+    return CSR_ILP32D_LP64D_RegMask;
+  }
 }
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.h b/lib/Target/RISCV/RISCVRegisterInfo.h
index cbbb70079dd1..4f339475508f 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -1,9 +1,8 @@
 //===-- RISCVRegisterInfo.h - RISCV Register Information Impl ---*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -40,7 +39,7 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
                            unsigned FIOperandNum,
                            RegScavenger *RS = nullptr) const override;
 
-  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  Register getFrameRegister(const MachineFunction &MF) const override;
 
   bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.td b/lib/Target/RISCV/RISCVRegisterInfo.td
index 4be8ff9200e9..79f8ab12f6c0 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -1,9 +1,8 @@
 //===-- RISCVRegisterInfo.td - RISC-V Register defs --------*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -56,7 +55,7 @@ let RegAltNameIndices = [ABIRegAltName] in { def X6 : RISCVReg<6, "x6", ["t1"]>, DwarfRegNum<[6]>; def X7 : RISCVReg<7, "x7", ["t2"]>, DwarfRegNum<[7]>; } - def X8 : RISCVReg<8, "x8", ["s0"]>, DwarfRegNum<[8]>; + def X8 : RISCVReg<8, "x8", ["s0", "fp"]>, DwarfRegNum<[8]>; def X9 : RISCVReg<9, "x9", ["s1"]>, DwarfRegNum<[9]>; def X10 : RISCVReg<10,"x10", ["a0"]>, DwarfRegNum<[10]>; def X11 : RISCVReg<11,"x11", ["a1"]>, DwarfRegNum<[11]>; diff --git a/lib/Target/RISCV/RISCVSubtarget.cpp b/lib/Target/RISCV/RISCVSubtarget.cpp index b221ea84a33c..6902ed75d852 100644 --- a/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/lib/Target/RISCV/RISCVSubtarget.cpp @@ -1,9 +1,8 @@ //===-- RISCVSubtarget.cpp - RISCV Subtarget Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -26,10 +25,10 @@ using namespace llvm; void RISCVSubtarget::anchor() {} -RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(StringRef CPU, - StringRef FS, - bool Is64Bit) { +RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies( + const Triple &TT, StringRef CPU, StringRef FS, StringRef ABIName) { // Determine default and user-specified characteristics + bool Is64Bit = TT.isArch64Bit(); std::string CPUName = CPU; if (CPUName.empty()) CPUName = Is64Bit ? "generic-rv64" : "generic-rv32"; @@ -38,11 +37,14 @@ RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(StringRef CPU, XLenVT = MVT::i64; XLen = 64; } + + TargetABI = RISCVABI::computeTargetABI(TT, getFeatureBits(), ABIName); + RISCVFeatures::validate(TT, getFeatureBits()); return *this; } -RISCVSubtarget::RISCVSubtarget(const Triple &TT, const std::string &CPU, - const std::string &FS, const TargetMachine &TM) +RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + StringRef ABIName, const TargetMachine &TM) : RISCVGenSubtargetInfo(TT, CPU, FS), - FrameLowering(initializeSubtargetDependencies(CPU, FS, TT.isArch64Bit())), + FrameLowering(initializeSubtargetDependencies(TT, CPU, FS, ABIName)), InstrInfo(), RegInfo(getHwMode()), TLInfo(TM, *this) {} diff --git a/lib/Target/RISCV/RISCVSubtarget.h b/lib/Target/RISCV/RISCVSubtarget.h index 0e09391e7829..106ff49f021a 100644 --- a/lib/Target/RISCV/RISCVSubtarget.h +++ b/lib/Target/RISCV/RISCVSubtarget.h @@ -1,9 +1,8 @@ //===-- RISCVSubtarget.h - Define Subtarget for the RISCV -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "RISCVFrameLowering.h" #include "RISCVISelLowering.h" #include "RISCVInstrInfo.h" +#include "Utils/RISCVBaseInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" @@ -36,9 +36,11 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { bool HasStdExtD = false; bool HasStdExtC = false; bool HasRV64 = false; + bool IsRV32E = false; bool EnableLinkerRelax = false; unsigned XLen = 32; MVT XLenVT = MVT::i32; + RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown; RISCVFrameLowering FrameLowering; RISCVInstrInfo InstrInfo; RISCVRegisterInfo RegInfo; @@ -47,13 +49,14 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { /// Initializes using the passed in CPU and feature strings so that we can /// use initializer lists for subtarget initialization. - RISCVSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS, - bool Is64Bit); + RISCVSubtarget &initializeSubtargetDependencies(const Triple &TT, + StringRef CPU, StringRef FS, + StringRef ABIName); public: // Initializes the data members to match that of the specified triple. - RISCVSubtarget(const Triple &TT, const std::string &CPU, - const std::string &FS, const TargetMachine &TM); + RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + StringRef ABIName, const TargetMachine &TM); // Parses features string setting specified subtarget options. The // definition of this function is auto-generated by tblgen. @@ -78,9 +81,11 @@ public: bool hasStdExtD() const { return HasStdExtD; } bool hasStdExtC() const { return HasStdExtC; } bool is64Bit() const { return HasRV64; } + bool isRV32E() const { return IsRV32E; } bool enableLinkerRelax() const { return EnableLinkerRelax; } MVT getXLenVT() const { return XLenVT; } unsigned getXLen() const { return XLen; } + RISCVABI::ABI getTargetABI() const { return TargetABI; } }; } // End llvm namespace diff --git a/lib/Target/RISCV/RISCVSystemOperands.td b/lib/Target/RISCV/RISCVSystemOperands.td index f1b7984ffe6b..a46a32c4e7f2 100644 --- a/lib/Target/RISCV/RISCVSystemOperands.td +++ b/lib/Target/RISCV/RISCVSystemOperands.td @@ -1,9 +1,8 @@ //===- RISCVSystemOperands.td ----------------------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -72,18 +71,16 @@ def : SysReg<"uip", 0x044>; // User Floating-Point CSRs //===-------------------------- -let FeaturesRequired = [{ {RISCV::FeatureStdExtF} }] in { -def : SysReg<"fflags", 0x001>; -def : SysReg<"frm", 0x002>; -def : SysReg<"fcsr", 0x003>; -} +def FFLAGS : SysReg<"fflags", 0x001>; +def FRM : SysReg<"frm", 0x002>; +def FCSR : SysReg<"fcsr", 0x003>; //===-------------------------- // User Counter/Timers //===-------------------------- -def : SysReg<"cycle", 0xC00>; -def : SysReg<"time", 0xC01>; -def : SysReg<"instret", 0xC02>; +def CYCLE : SysReg<"cycle", 0xC00>; +def TIME : SysReg<"time", 0xC01>; +def INSTRET : SysReg<"instret", 0xC02>; def : SysReg<"hpmcounter3", 0xC03>; def : SysReg<"hpmcounter4", 0xC04>; @@ -116,9 +113,9 @@ def : SysReg<"hpmcounter30", 0xC1E>; def : SysReg<"hpmcounter31", 0xC1F>; let isRV32Only = 1 in { -def: SysReg<"cycleh", 0xC80>; -def: SysReg<"timeh", 0xC81>; -def: SysReg<"instreth", 0xC82>; +def CYCLEH : SysReg<"cycleh", 0xC80>; +def TIMEH : SysReg<"timeh", 0xC81>; +def INSTRETH : SysReg<"instreth", 0xC82>; def: SysReg<"hpmcounter3h", 0xC83>; def: SysReg<"hpmcounter4h", 0xC84>; diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp index 8937ec200bd7..f4e6ed9f6284 100644 --- a/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- RISCVTargetMachine.cpp - Define TargetMachine for RISCV -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -11,10 +10,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "RISCV.h"
 #include "RISCVTargetMachine.h"
+#include "RISCV.h"
 #include "RISCVTargetObjectFile.h"
+#include "RISCVTargetTransformInfo.h"
+#include "TargetInfo/RISCVTargetInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -31,7 +33,7 @@ extern "C" void LLVMInitializeRISCVTarget() {
   initializeRISCVExpandPseudoPass(*PR);
 }
 
-static std::string computeDataLayout(const Triple &TT) {
+static StringRef computeDataLayout(const Triple &TT) {
   if (TT.isArch64Bit()) {
     return "e-m:e-p:64:64-i64:64-i128:128-n64-S128";
   } else {
@@ -57,10 +59,15 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
                         getEffectiveRelocModel(TT, RM),
                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TLOF(make_unique<RISCVELFTargetObjectFile>()),
-      Subtarget(TT, CPU, FS, *this) {
+      Subtarget(TT, CPU, FS, Options.MCOptions.getABIName(), *this) {
   initAsmInfo();
 }
 
+TargetTransformInfo
+RISCVTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(RISCVTTIImpl(this, F));
+}
+
 namespace {
 class RISCVPassConfig : public TargetPassConfig {
 public:
diff --git a/lib/Target/RISCV/RISCVTargetMachine.h b/lib/Target/RISCV/RISCVTargetMachine.h
index 02361dddebf7..ebf3f3c07955 100644
--- a/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/lib/Target/RISCV/RISCVTargetMachine.h
@@ -1,9 +1,8 @@
 //===-- RISCVTargetMachine.h - Define TargetMachine for RISCV ---*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -40,6 +39,8 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 };
 }
diff --git a/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/lib/Target/RISCV/RISCVTargetObjectFile.cpp
index 46e81b628b65..bbd45c970d3d 100644
--- a/lib/Target/RISCV/RISCVTargetObjectFile.cpp
+++ b/lib/Target/RISCV/RISCVTargetObjectFile.cpp
@@ -1,14 +1,16 @@
 //===-- RISCVTargetObjectFile.cpp - RISCV Object Info -----------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
 #include "RISCVTargetObjectFile.h"
 #include "RISCVTargetMachine.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
 
 using namespace llvm;
 
@@ -16,4 +18,97 @@ void RISCVELFTargetObjectFile::Initialize(MCContext &Ctx,
                                           const TargetMachine &TM) {
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
   InitializeELF(TM.Options.UseInitArray);
+
+  SmallDataSection = getContext().getELFSection(
+      ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                                               ELF::SHF_WRITE | ELF::SHF_ALLOC);
+}
+
+// An address must be loaded from a small section if its size is less than the
+// small section size threshold. Data in this section could be addressed by
+// using the gp_rel operator.
+bool RISCVELFTargetObjectFile::isInSmallSection(uint64_t Size) const {
+  // gcc has traditionally not treated zero-sized objects as small data, so this
+  // is effectively part of the ABI.
+  return Size > 0 && Size <= SSThreshold;
+}
+
+// Return true if this global address should be placed into small data/bss
+// section.
+bool RISCVELFTargetObjectFile::isGlobalInSmallSection(
+    const GlobalObject *GO, const TargetMachine &TM) const {
+  // Only global variables, not functions.
+  const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GO);
+  if (!GVA)
+    return false;
+
+  // If the variable has an explicit section, it is placed in that section.
+  if (GVA->hasSection()) {
+    StringRef Section = GVA->getSection();
+
+    // Explicitly placing any variable in the small data section overrides
+    // the global -G value.
+    if (Section == ".sdata" || Section == ".sbss")
+      return true;
+
+    // Otherwise reject placing the variable in the small section if it has an
+    // explicit section name.
+    return false;
+  }
+
+  if (((GVA->hasExternalLinkage() && GVA->isDeclaration()) ||
+       GVA->hasCommonLinkage()))
+    return false;
+
+  Type *Ty = GVA->getValueType();
+  // It is possible that the type of the global is unsized, i.e. a declaration
+  // of an extern struct. In this case don't presume it is in the small data
+  // section. This happens e.g. when building the FreeBSD kernel.
+  if (!Ty->isSized())
+    return false;
+
+  return isInSmallSection(
+      GVA->getParent()->getDataLayout().getTypeAllocSize(Ty));
+}
+
+MCSection *RISCVELFTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  // Handle Small Section classification here.
+  if (Kind.isBSS() && isGlobalInSmallSection(GO, TM))
+    return SmallBSSSection;
+  if (Kind.isData() && isGlobalInSmallSection(GO, TM))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+void RISCVELFTargetObjectFile::getModuleMetadata(Module &M) {
+  SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
+  M.getModuleFlagsMetadata(ModuleFlags);
+
+  for (const auto &MFE : ModuleFlags) {
+    StringRef Key = MFE.Key->getString();
+    if (Key == "SmallDataLimit") {
+      SSThreshold = mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue();
+      break;
+    }
+  }
+}
+
+/// Return true if this constant should be placed into small data section.
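+/// For example, with the default SSThreshold of 8, an 8-byte double constant
+/// qualifies for the small data section while a 16-byte value does not.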
+bool RISCVELFTargetObjectFile::isConstantInSmallSection( + const DataLayout &DL, const Constant *CN) const { + return isInSmallSection(DL.getTypeAllocSize(CN->getType())); +} + +MCSection *RISCVELFTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C, + unsigned &Align) const { + if (isConstantInSmallSection(DL, C)) + return SmallDataSection; + + // Otherwise, we work the same as ELF. + return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align); } diff --git a/lib/Target/RISCV/RISCVTargetObjectFile.h b/lib/Target/RISCV/RISCVTargetObjectFile.h index 5467220301c1..b2daaaa9d364 100644 --- a/lib/Target/RISCV/RISCVTargetObjectFile.h +++ b/lib/Target/RISCV/RISCVTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- RISCVTargetObjectFile.h - RISCV Object Info -*- C++ ---------*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -17,7 +16,31 @@ class RISCVTargetMachine; /// This implementation is used for RISCV ELF targets. class RISCVELFTargetObjectFile : public TargetLoweringObjectFileELF { + MCSection *SmallDataSection; + MCSection *SmallBSSSection; + unsigned SSThreshold = 8; + +public: void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + /// Return true if this global address should be placed into small data/bss + /// section. + bool isGlobalInSmallSection(const GlobalObject *GO, + const TargetMachine &TM) const; + + MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, + const TargetMachine &TM) const override; + + /// Return true if this constant should be placed into small data section. + bool isConstantInSmallSection(const DataLayout &DL, const Constant *CN) const; + + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, + const Constant *C, + unsigned &Align) const override; + + void getModuleMetadata(Module &M) override; + + bool isInSmallSection(uint64_t Size) const; }; } // end namespace llvm diff --git a/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/lib/Target/RISCV/RISCVTargetTransformInfo.cpp new file mode 100644 index 000000000000..2c6400cbb1eb --- /dev/null +++ b/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -0,0 +1,92 @@ +//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RISCVTargetTransformInfo.h" +#include "Utils/RISCVMatInt.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/CodeGen/TargetLowering.h" +using namespace llvm; + +#define DEBUG_TYPE "riscvtti" + +int RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { + assert(Ty->isIntegerTy() && + "getIntImmCost can only estimate cost of materialising integers"); + + // We have a Zero register, so 0 is always free. 
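+  // (For example, materialising 0x12345678 on RV64 takes LUI+ADDIW and so
+  // would be costed at 2, while zero is simply a read of register x0.)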
+ if (Imm == 0) + return TTI::TCC_Free; + + // Otherwise, we check how many instructions it will take to materialise. + const DataLayout &DL = getDataLayout(); + return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), + getST()->is64Bit()); +} + +int RISCVTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { + assert(Ty->isIntegerTy() && + "getIntImmCost can only estimate cost of materialising integers"); + + // We have a Zero register, so 0 is always free. + if (Imm == 0) + return TTI::TCC_Free; + + // Some instructions in RISC-V can take a 12-bit immediate. Some of these are + // commutative, in others the immediate comes from a specific argument index. + bool Takes12BitImm = false; + unsigned ImmArgIdx = ~0U; + + switch (Opcode) { + case Instruction::GetElementPtr: + // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will + // split up large offsets in GEP into better parts than ConstantHoisting + // can. + return TTI::TCC_Free; + case Instruction::Add: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Mul: + Takes12BitImm = true; + break; + case Instruction::Sub: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + Takes12BitImm = true; + ImmArgIdx = 1; + break; + default: + break; + } + + if (Takes12BitImm) { + // Check immediate is the correct argument... + if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) { + // ... and fits into the 12-bit immediate. + if (Imm.getMinSignedBits() <= 64 && + getTLI()->isLegalAddImmediate(Imm.getSExtValue())) { + return TTI::TCC_Free; + } + } + + // Otherwise, use the full materialisation cost. + return getIntImmCost(Imm, Ty); + } + + // By default, prevent hoisting. + return TTI::TCC_Free; +} + +int RISCVTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { + // Prevent hoisting in unknown cases. + return TTI::TCC_Free; +} diff --git a/lib/Target/RISCV/RISCVTargetTransformInfo.h b/lib/Target/RISCV/RISCVTargetTransformInfo.h new file mode 100644 index 000000000000..f361b25a0c70 --- /dev/null +++ b/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -0,0 +1,52 @@ +//===- RISCVTargetTransformInfo.h - RISC-V specific TTI ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines a TargetTransformInfo::Concept conforming object specific +/// to the RISC-V target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. 
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_RISCV_RISCVTARGETTRANSFORMINFO_H
+
+#include "RISCVSubtarget.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/Function.h"
+
+namespace llvm {
+
+class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
+  using BaseT = BasicTTIImplBase<RISCVTTIImpl>;
+  using TTI = TargetTransformInfo;
+
+  friend BaseT;
+
+  const RISCVSubtarget *ST;
+  const RISCVTargetLowering *TLI;
+
+  const RISCVSubtarget *getST() const { return ST; }
+  const RISCVTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit RISCVTTIImpl(const RISCVTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  int getIntImmCost(const APInt &Imm, Type *Ty);
+  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                    Type *Ty);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_RISCV_RISCVTARGETTRANSFORMINFO_H
\ No newline at end of file
diff --git a/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp b/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
index 0f369d960fe1..e44984a3fcc5 100644
--- a/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
+++ b/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
@@ -1,26 +1,24 @@
 //===-- RISCVTargetInfo.cpp - RISCV Target Implementation -----------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
+#include "TargetInfo/RISCVTargetInfo.h"
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
-namespace llvm {
-Target &getTheRISCV32Target() {
+Target &llvm::getTheRISCV32Target() {
   static Target TheRISCV32Target;
   return TheRISCV32Target;
 }
 
-Target &getTheRISCV64Target() {
+Target &llvm::getTheRISCV64Target() {
   static Target TheRISCV64Target;
   return TheRISCV64Target;
 }
-}
 
 extern "C" void LLVMInitializeRISCVTargetInfo() {
   RegisterTarget<Triple::riscv32> X(getTheRISCV32Target(), "riscv32",
diff --git a/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.h b/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.h
new file mode 100644
index 000000000000..ef3d9d116efa
--- /dev/null
+++ b/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.h
@@ -0,0 +1,21 @@
+//===-- RISCVTargetInfo.h - RISCV Target Implementation ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_TARGETINFO_RISCVTARGETINFO_H
+#define LLVM_LIB_TARGET_RISCV_TARGETINFO_RISCVTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheRISCV32Target();
+Target &getTheRISCV64Target();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_RISCV_TARGETINFO_RISCVTARGETINFO_H
diff --git a/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp b/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
index 964af1f74cec..bc5395768ca1 100644
--- a/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
+++ b/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
@@ -1,9 +1,80 @@
 #include "RISCVBaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 namespace RISCVSysReg {
 #define GET_SysRegsList_IMPL
 #include "RISCVGenSystemOperands.inc"
 } // namespace RISCVSysReg
+
+namespace RISCVABI {
+ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits,
+                     StringRef ABIName) {
+  auto TargetABI = StringSwitch<ABI>(ABIName)
+                       .Case("ilp32", ABI_ILP32)
+                       .Case("ilp32f", ABI_ILP32F)
+                       .Case("ilp32d", ABI_ILP32D)
+                       .Case("ilp32e", ABI_ILP32E)
+                       .Case("lp64", ABI_LP64)
+                       .Case("lp64f", ABI_LP64F)
+                       .Case("lp64d", ABI_LP64D)
+                       .Default(ABI_Unknown);
+
+  bool IsRV64 = TT.isArch64Bit();
+  bool IsRV32E = FeatureBits[RISCV::FeatureRV32E];
+
+  if (!ABIName.empty() && TargetABI == ABI_Unknown) {
+    errs()
+        << "'" << ABIName
+        << "' is not a recognized ABI for this target (ignoring target-abi)\n";
+  } else if (ABIName.startswith("ilp32") && IsRV64) {
+    errs() << "32-bit ABIs are not supported for 64-bit targets (ignoring "
+              "target-abi)\n";
+    TargetABI = ABI_Unknown;
+  } else if (ABIName.startswith("lp64") && !IsRV64) {
+    errs() << "64-bit ABIs are not supported for 32-bit targets (ignoring "
+              "target-abi)\n";
+    TargetABI = ABI_Unknown;
+  } else if (ABIName.endswith("f") && !FeatureBits[RISCV::FeatureStdExtF]) {
+    errs() << "Hard-float 'f' ABI can't be used for a target that "
+              "doesn't support the F instruction set extension (ignoring "
+              "target-abi)\n";
+    TargetABI = ABI_Unknown;
+  } else if (ABIName.endswith("d") && !FeatureBits[RISCV::FeatureStdExtD]) {
+    errs() << "Hard-float 'd' ABI can't be used for a target that "
+              "doesn't support the D instruction set extension (ignoring "
+              "target-abi)\n";
+    TargetABI = ABI_Unknown;
+  } else if (IsRV32E && TargetABI != ABI_ILP32E && TargetABI != ABI_Unknown) {
+    errs()
+        << "Only the ilp32e ABI is supported for RV32E (ignoring target-abi)\n";
+    TargetABI = ABI_Unknown;
+  }
+
+  if (TargetABI != ABI_Unknown)
+    return TargetABI;
+
+  // For now, default to the ilp32/ilp32e/lp64 ABI if no explicit ABI is given
+  // or an invalid/unrecognised string is given. In the future, it might be
+  // worth changing this to default to ilp32f/lp64f and ilp32d/lp64d when
+  // hardware support for floating point is present.
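+  // (Illustratively: a plain rv32 target then defaults to ilp32, an rv32e
+  // target to ilp32e, and an rv64 target to lp64.)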
+ if (IsRV32E) + return ABI_ILP32E; + if (IsRV64) + return ABI_LP64; + return ABI_ILP32; +} +} // namespace RISCVABI + +namespace RISCVFeatures { + +void validate(const Triple &TT, const FeatureBitset &FeatureBits) { + if (TT.isArch64Bit() && FeatureBits[RISCV::FeatureRV32E]) + report_fatal_error("RV32E can't be enabled for an RV64 target"); +} + +} // namespace RISCVFeatures + } // namespace llvm diff --git a/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/lib/Target/RISCV/Utils/RISCVBaseInfo.h index 372e0e80bbaf..c33c72f24319 100644 --- a/lib/Target/RISCV/Utils/RISCVBaseInfo.h +++ b/lib/Target/RISCV/Utils/RISCVBaseInfo.h @@ -1,9 +1,8 @@ //===-- RISCVBaseInfo.h - Top level definitions for RISCV MC ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -49,9 +48,18 @@ enum { enum { MO_None, + MO_CALL, + MO_PLT, MO_LO, MO_HI, + MO_PCREL_LO, MO_PCREL_HI, + MO_GOT_HI, + MO_TPREL_LO, + MO_TPREL_HI, + MO_TPREL_ADD, + MO_TLS_GOT_HI, + MO_TLS_GD_HI, }; } // namespace RISCVII @@ -153,6 +161,34 @@ struct SysReg { #include "RISCVGenSystemOperands.inc" } // end namespace RISCVSysReg +namespace RISCVABI { + +enum ABI { + ABI_ILP32, + ABI_ILP32F, + ABI_ILP32D, + ABI_ILP32E, + ABI_LP64, + ABI_LP64F, + ABI_LP64D, + ABI_Unknown +}; + +// Returns the target ABI, or else a StringError if the requested ABIName is +// not supported for the given TT and FeatureBits combination. +ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits, + StringRef ABIName); + +} // namespace RISCVABI + +namespace RISCVFeatures { + +// Validates if the given combination of features are valid for the target +// triple. Exits with report_fatal_error if not. +void validate(const Triple &TT, const FeatureBitset &FeatureBits); + +} // namespace RISCVFeatures + } // namespace llvm #endif diff --git a/lib/Target/RISCV/Utils/RISCVMatInt.cpp b/lib/Target/RISCV/Utils/RISCVMatInt.cpp index 3dc298246bc5..f390ddb89e3c 100644 --- a/lib/Target/RISCV/Utils/RISCVMatInt.cpp +++ b/lib/Target/RISCV/Utils/RISCVMatInt.cpp @@ -1,9 +1,8 @@ //===- RISCVMatInt.cpp - Immediate materialisation -------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -17,7 +16,7 @@ namespace llvm { namespace RISCVMatInt { -void generateInstSeq(int64_t Val, bool Is64Bit, InstSeq &Res) { +void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res) { if (isInt<32>(Val)) { // Depending on the active bits in the immediate Value v, the following // instruction sequences are emitted: @@ -33,13 +32,13 @@ void generateInstSeq(int64_t Val, bool Is64Bit, InstSeq &Res) { Res.push_back(Inst(RISCV::LUI, Hi20)); if (Lo12 || Hi20 == 0) { - unsigned AddiOpc = (Is64Bit && Hi20) ? RISCV::ADDIW : RISCV::ADDI; + unsigned AddiOpc = (IsRV64 && Hi20) ? 
RISCV::ADDIW : RISCV::ADDI;
       Res.push_back(Inst(AddiOpc, Lo12));
     }
     return;
   }
 
-  assert(Is64Bit && "Can't emit >32-bit imm for non-RV64 target");
+  assert(IsRV64 && "Can't emit >32-bit imm for non-RV64 target");
 
   // In the worst case, for a full 64-bit constant, a sequence of 8 instructions
   // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emitted. Note
@@ -65,15 +64,30 @@ void generateInstSeq(int64_t Val, bool Is64Bit, InstSeq &Res) {
   // performed when the recursion returns.
 
   int64_t Lo12 = SignExtend64<12>(Val);
-  int64_t Hi52 = (Val + 0x800) >> 12;
+  int64_t Hi52 = ((uint64_t)Val + 0x800ull) >> 12;
   int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52);
   Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount);
 
-  generateInstSeq(Hi52, Is64Bit, Res);
+  generateInstSeq(Hi52, IsRV64, Res);
 
   Res.push_back(Inst(RISCV::SLLI, ShiftAmount));
   if (Lo12)
     Res.push_back(Inst(RISCV::ADDI, Lo12));
 }
+
+int getIntMatCost(const APInt &Val, unsigned Size, bool IsRV64) {
+  int PlatRegSize = IsRV64 ? 64 : 32;
+
+  // Split the constant into platform register sized chunks, and calculate cost
+  // of each chunk.
+  int Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < Size; ShiftVal += PlatRegSize) {
+    APInt Chunk = Val.ashr(ShiftVal).sextOrTrunc(PlatRegSize);
+    InstSeq MatSeq;
+    generateInstSeq(Chunk.getSExtValue(), IsRV64, MatSeq);
+    Cost += MatSeq.size();
+  }
+  return std::max(1, Cost);
+}
 } // namespace RISCVMatInt
 } // namespace llvm
diff --git a/lib/Target/RISCV/Utils/RISCVMatInt.h b/lib/Target/RISCV/Utils/RISCVMatInt.h
index 49d1d89adc7a..b12ae2eade99 100644
--- a/lib/Target/RISCV/Utils/RISCVMatInt.h
+++ b/lib/Target/RISCV/Utils/RISCVMatInt.h
@@ -1,15 +1,15 @@
 //===- RISCVMatInt.h - Immediate materialisation ---------------*- C++ -*--===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_LIB_TARGET_RISCV_MATINT_H
 #define LLVM_LIB_TARGET_RISCV_MATINT_H
 
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/MachineValueType.h"
 #include <cstdint>
@@ -31,6 +31,14 @@ using InstSeq = SmallVector<Inst, 8>;
 // order to allow this helper to be used from both the MC layer and during
 // instruction selection.
 void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res);
+
+// Helper to estimate the number of instructions required to materialise the
+// given immediate value into a register. This estimate does not account for
+// `Val` possibly fitting into an immediate, and so may over-estimate.
+//
+// This will attempt to produce instructions to materialise `Val` as an
+// `Size`-bit immediate. `IsRV64` should match the target architecture.
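+//
+// For example, on RV32 a 64-bit value is costed as two 32-bit chunks, so
+// 0x1234567800000000 would cost 2 (LUI+ADDI for the high word) plus 1 (ADDI
+// for the zero low word), giving 3.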
+int getIntMatCost(const APInt &Val, unsigned Size, bool IsRV64); } // namespace RISCVMatInt } // namespace llvm #endif diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 691421e533ea..15453ae59a4f 100644 --- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -1,14 +1,14 @@ //===-- SparcAsmParser.cpp - Parse Sparc assembly to MCInst instructions --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "MCTargetDesc/SparcMCExpr.h" #include "MCTargetDesc/SparcMCTargetDesc.h" +#include "TargetInfo/SparcTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -646,7 +646,8 @@ bool SparcAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, return Error(StartLoc, "invalid register name"); } -static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features, +static void applyMnemonicAliases(StringRef &Mnemonic, + const FeatureBitset &Features, unsigned VariantID); bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info, diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index 6290e5a15a8b..f1ca8e18c228 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -1,9 +1,8 @@ //===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index 0045e63a824e..bee331874e96 100644 --- a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -1,9 +1,8 @@ //===- SparcDisassembler.cpp - Disassembler for Sparc -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/SparcMCTargetDesc.h" +#include "TargetInfo/SparcTargetInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" @@ -41,12 +41,6 @@ public: }; } -namespace llvm { -Target &getTheSparcTarget(); -Target &getTheSparcV9Target(); -Target &getTheSparcelTarget(); -} - static MCDisassembler *createSparcDisassembler(const Target &T, const MCSubtargetInfo &STI, MCContext &Ctx) { diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp deleted file mode 100644 index d152efae6d1f..000000000000 --- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp +++ /dev/null @@ -1,220 +0,0 @@ -//===-- SparcInstPrinter.cpp - Convert Sparc MCInst to assembly syntax -----==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an Sparc MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#include "SparcInstPrinter.h" -#include "Sparc.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -// The generated AsmMatcher SparcGenAsmWriter uses "Sparc" as the target -// namespace. But SPARC backend uses "SP" as its namespace. 
-namespace llvm { -namespace Sparc { - using namespace SP; -} -} - -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "SparcGenAsmWriter.inc" - -bool SparcInstPrinter::isV9(const MCSubtargetInfo &STI) const { - return (STI.getFeatureBits()[Sparc::FeatureV9]) != 0; -} - -void SparcInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const -{ - OS << '%' << StringRef(getRegisterName(RegNo)).lower(); -} - -void SparcInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { - if (!printAliasInstr(MI, STI, O) && !printSparcAliasInstr(MI, STI, O)) - printInstruction(MI, STI, O); - printAnnotation(O, Annot); -} - -bool SparcInstPrinter::printSparcAliasInstr(const MCInst *MI, - const MCSubtargetInfo &STI, - raw_ostream &O) { - switch (MI->getOpcode()) { - default: return false; - case SP::JMPLrr: - case SP::JMPLri: { - if (MI->getNumOperands() != 3) - return false; - if (!MI->getOperand(0).isReg()) - return false; - switch (MI->getOperand(0).getReg()) { - default: return false; - case SP::G0: // jmp $addr | ret | retl - if (MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 8) { - switch(MI->getOperand(1).getReg()) { - default: break; - case SP::I7: O << "\tret"; return true; - case SP::O7: O << "\tretl"; return true; - } - } - O << "\tjmp "; printMemOperand(MI, 1, STI, O); - return true; - case SP::O7: // call $addr - O << "\tcall "; printMemOperand(MI, 1, STI, O); - return true; - } - } - case SP::V9FCMPS: case SP::V9FCMPD: case SP::V9FCMPQ: - case SP::V9FCMPES: case SP::V9FCMPED: case SP::V9FCMPEQ: { - if (isV9(STI) - || (MI->getNumOperands() != 3) - || (!MI->getOperand(0).isReg()) - || (MI->getOperand(0).getReg() != SP::FCC0)) - return false; - // if V8, skip printing %fcc0. - switch(MI->getOpcode()) { - default: - case SP::V9FCMPS: O << "\tfcmps "; break; - case SP::V9FCMPD: O << "\tfcmpd "; break; - case SP::V9FCMPQ: O << "\tfcmpq "; break; - case SP::V9FCMPES: O << "\tfcmpes "; break; - case SP::V9FCMPED: O << "\tfcmped "; break; - case SP::V9FCMPEQ: O << "\tfcmpeq "; break; - } - printOperand(MI, 1, STI, O); - O << ", "; - printOperand(MI, 2, STI, O); - return true; - } - } -} - -void SparcInstPrinter::printOperand(const MCInst *MI, int opNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand (opNum); - - if (MO.isReg()) { - printRegName(O, MO.getReg()); - return ; - } - - if (MO.isImm()) { - switch (MI->getOpcode()) { - default: - O << (int)MO.getImm(); - return; - - case SP::TICCri: // Fall through - case SP::TICCrr: // Fall through - case SP::TRAPri: // Fall through - case SP::TRAPrr: // Fall through - case SP::TXCCri: // Fall through - case SP::TXCCrr: // Fall through - // Only seven-bit values up to 127. - O << ((int) MO.getImm() & 0x7f); - return; - } - } - - assert(MO.isExpr() && "Unknown operand kind in printOperand"); - MO.getExpr()->print(O, &MAI); -} - -void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum, - const MCSubtargetInfo &STI, - raw_ostream &O, const char *Modifier) { - printOperand(MI, opNum, STI, O); - - // If this is an ADD operand, emit it like normal operands. 
- if (Modifier && !strcmp(Modifier, "arith")) { - O << ", "; - printOperand(MI, opNum+1, STI, O); - return; - } - const MCOperand &MO = MI->getOperand(opNum+1); - - if (MO.isReg() && MO.getReg() == SP::G0) - return; // don't print "+%g0" - if (MO.isImm() && MO.getImm() == 0) - return; // don't print "+0" - - O << "+"; - - printOperand(MI, opNum+1, STI, O); -} - -void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - int CC = (int)MI->getOperand(opNum).getImm(); - switch (MI->getOpcode()) { - default: break; - case SP::FBCOND: - case SP::FBCONDA: - case SP::BPFCC: - case SP::BPFCCA: - case SP::BPFCCNT: - case SP::BPFCCANT: - case SP::MOVFCCrr: case SP::V9MOVFCCrr: - case SP::MOVFCCri: case SP::V9MOVFCCri: - case SP::FMOVS_FCC: case SP::V9FMOVS_FCC: - case SP::FMOVD_FCC: case SP::V9FMOVD_FCC: - case SP::FMOVQ_FCC: case SP::V9FMOVQ_FCC: - // Make sure CC is a fp conditional flag. - CC = (CC < 16) ? (CC + 16) : CC; - break; - case SP::CBCOND: - case SP::CBCONDA: - // Make sure CC is a cp conditional flag. - CC = (CC < 32) ? (CC + 32) : CC; - break; - } - O << SPARCCondCodeToString((SPCC::CondCodes)CC); -} - -bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX."); - return true; -} - -void SparcInstPrinter::printMembarTag(const MCInst *MI, int opNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static const char *const TagNames[] = { - "#LoadLoad", "#StoreLoad", "#LoadStore", "#StoreStore", - "#Lookaside", "#MemIssue", "#Sync"}; - - unsigned Imm = MI->getOperand(opNum).getImm(); - - if (Imm > 127) { - O << Imm; - return; - } - - bool First = true; - for (unsigned i = 0; i < sizeof(TagNames) / sizeof(char *); i++) { - if (Imm & (1 << i)) { - O << (First ? "" : " | ") << TagNames[i]; - First = false; - } - } -} diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h deleted file mode 100644 index 89015eb137c2..000000000000 --- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h +++ /dev/null @@ -1,57 +0,0 @@ -//===-- SparcInstPrinter.h - Convert Sparc MCInst to assembly syntax ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an Sparc MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H -#define LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H - -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { - -class SparcInstPrinter : public MCInstPrinter { -public: - SparcInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - bool printSparcAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &OS); - bool isV9(const MCSubtargetInfo &STI) const; - - // Autogenerated by tblgen. 
- void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, - const MCSubtargetInfo &STI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, - raw_ostream &OS); - void printMemOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, - raw_ostream &OS, const char *Modifier = nullptr); - void printCCOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, - raw_ostream &OS); - bool printGetPCX(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &OS); - void printMembarTag(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, - raw_ostream &O); -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/Sparc/LeonFeatures.td b/lib/Target/Sparc/LeonFeatures.td index 61e5f16e0a1e..e0ea4e9c7645 100755 --- a/lib/Target/Sparc/LeonFeatures.td +++ b/lib/Target/Sparc/LeonFeatures.td @@ -1,9 +1,8 @@ //===-- LeonFeatures.td - Describe the Leon Features -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/LeonPasses.cpp b/lib/Target/Sparc/LeonPasses.cpp index 5ce00db365ab..e9d3aaeb9cfe 100755 --- a/lib/Target/Sparc/LeonPasses.cpp +++ b/lib/Target/Sparc/LeonPasses.cpp @@ -1,9 +1,8 @@ //===------ LeonPasses.cpp - Define passes specific to LEON ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/LeonPasses.h b/lib/Target/Sparc/LeonPasses.h index 1b3d9a7a32f9..154a2b467e16 100755 --- a/lib/Target/Sparc/LeonPasses.h +++ b/lib/Target/Sparc/LeonPasses.h @@ -1,9 +1,8 @@ //===------- LeonPasses.h - Define passes specific to LEON ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index d7f1e3a1ab1d..2e8fa0dbaf4c 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -1,9 +1,8 @@ //===-- SparcAsmBackend.cpp - Sparc Assembler Backend ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 5a730947796e..88547075c5ae 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- SparcELFObjectWriter.cpp - Sparc ELF Writer -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h index 99aa63fe2290..b5fac0264019 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h @@ -1,9 +1,8 @@ //===-- SparcFixupKinds.h - Sparc Specific Fixup Entries --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp new file mode 100644 index 000000000000..c479459786d7 --- /dev/null +++ b/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp @@ -0,0 +1,219 @@ +//===-- SparcInstPrinter.cpp - Convert Sparc MCInst to assembly syntax -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a Sparc MCInst to a .s file.
+// +//===----------------------------------------------------------------------===// + +#include "SparcInstPrinter.h" +#include "Sparc.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// The generated AsmMatcher SparcGenAsmWriter uses "Sparc" as the target +// namespace. But SPARC backend uses "SP" as its namespace. +namespace llvm { +namespace Sparc { + using namespace SP; +} +} + +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "SparcGenAsmWriter.inc" + +bool SparcInstPrinter::isV9(const MCSubtargetInfo &STI) const { + return (STI.getFeatureBits()[Sparc::FeatureV9]) != 0; +} + +void SparcInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const +{ + OS << '%' << StringRef(getRegisterName(RegNo)).lower(); +} + +void SparcInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + if (!printAliasInstr(MI, STI, O) && !printSparcAliasInstr(MI, STI, O)) + printInstruction(MI, STI, O); + printAnnotation(O, Annot); +} + +bool SparcInstPrinter::printSparcAliasInstr(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { + switch (MI->getOpcode()) { + default: return false; + case SP::JMPLrr: + case SP::JMPLri: { + if (MI->getNumOperands() != 3) + return false; + if (!MI->getOperand(0).isReg()) + return false; + switch (MI->getOperand(0).getReg()) { + default: return false; + case SP::G0: // jmp $addr | ret | retl + if (MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 8) { + switch(MI->getOperand(1).getReg()) { + default: break; + case SP::I7: O << "\tret"; return true; + case SP::O7: O << "\tretl"; return true; + } + } + O << "\tjmp "; printMemOperand(MI, 1, STI, O); + return true; + case SP::O7: // call $addr + O << "\tcall "; printMemOperand(MI, 1, STI, O); + return true; + } + } + case SP::V9FCMPS: case SP::V9FCMPD: case SP::V9FCMPQ: + case SP::V9FCMPES: case SP::V9FCMPED: case SP::V9FCMPEQ: { + if (isV9(STI) + || (MI->getNumOperands() != 3) + || (!MI->getOperand(0).isReg()) + || (MI->getOperand(0).getReg() != SP::FCC0)) + return false; + // if V8, skip printing %fcc0. + switch(MI->getOpcode()) { + default: + case SP::V9FCMPS: O << "\tfcmps "; break; + case SP::V9FCMPD: O << "\tfcmpd "; break; + case SP::V9FCMPQ: O << "\tfcmpq "; break; + case SP::V9FCMPES: O << "\tfcmpes "; break; + case SP::V9FCMPED: O << "\tfcmped "; break; + case SP::V9FCMPEQ: O << "\tfcmpeq "; break; + } + printOperand(MI, 1, STI, O); + O << ", "; + printOperand(MI, 2, STI, O); + return true; + } + } +} + +void SparcInstPrinter::printOperand(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand (opNum); + + if (MO.isReg()) { + printRegName(O, MO.getReg()); + return ; + } + + if (MO.isImm()) { + switch (MI->getOpcode()) { + default: + O << (int)MO.getImm(); + return; + + case SP::TICCri: // Fall through + case SP::TICCrr: // Fall through + case SP::TRAPri: // Fall through + case SP::TRAPrr: // Fall through + case SP::TXCCri: // Fall through + case SP::TXCCrr: // Fall through + // Only seven-bit values up to 127. 
+ O << ((int) MO.getImm() & 0x7f); + return; + } + } + + assert(MO.isExpr() && "Unknown operand kind in printOperand"); + MO.getExpr()->print(O, &MAI); +} + +void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, + raw_ostream &O, const char *Modifier) { + printOperand(MI, opNum, STI, O); + + // If this is an ADD operand, emit it like normal operands. + if (Modifier && !strcmp(Modifier, "arith")) { + O << ", "; + printOperand(MI, opNum+1, STI, O); + return; + } + const MCOperand &MO = MI->getOperand(opNum+1); + + if (MO.isReg() && MO.getReg() == SP::G0) + return; // don't print "+%g0" + if (MO.isImm() && MO.getImm() == 0) + return; // don't print "+0" + + O << "+"; + + printOperand(MI, opNum+1, STI, O); +} + +void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int CC = (int)MI->getOperand(opNum).getImm(); + switch (MI->getOpcode()) { + default: break; + case SP::FBCOND: + case SP::FBCONDA: + case SP::BPFCC: + case SP::BPFCCA: + case SP::BPFCCNT: + case SP::BPFCCANT: + case SP::MOVFCCrr: case SP::V9MOVFCCrr: + case SP::MOVFCCri: case SP::V9MOVFCCri: + case SP::FMOVS_FCC: case SP::V9FMOVS_FCC: + case SP::FMOVD_FCC: case SP::V9FMOVD_FCC: + case SP::FMOVQ_FCC: case SP::V9FMOVQ_FCC: + // Make sure CC is a fp conditional flag. + CC = (CC < 16) ? (CC + 16) : CC; + break; + case SP::CBCOND: + case SP::CBCONDA: + // Make sure CC is a cp conditional flag. + CC = (CC < 32) ? (CC + 32) : CC; + break; + } + O << SPARCCondCodeToString((SPCC::CondCodes)CC); +} + +bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX."); + return true; +} + +void SparcInstPrinter::printMembarTag(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + static const char *const TagNames[] = { + "#LoadLoad", "#StoreLoad", "#LoadStore", "#StoreStore", + "#Lookaside", "#MemIssue", "#Sync"}; + + unsigned Imm = MI->getOperand(opNum).getImm(); + + if (Imm > 127) { + O << Imm; + return; + } + + bool First = true; + for (unsigned i = 0; i < sizeof(TagNames) / sizeof(char *); i++) { + if (Imm & (1 << i)) { + O << (First ? "" : " | ") << TagNames[i]; + First = false; + } + } +} diff --git a/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h b/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h new file mode 100644 index 000000000000..499bcadb0d4d --- /dev/null +++ b/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h @@ -0,0 +1,56 @@ +//===-- SparcInstPrinter.h - Convert Sparc MCInst to assembly syntax ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a Sparc MCInst to a .s file.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCINSTPRINTER_H +#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class SparcInstPrinter : public MCInstPrinter { +public: + SparcInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + bool printSparcAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &OS); + bool isV9(const MCSubtargetInfo &STI) const; + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, + const MCSubtargetInfo &STI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, + raw_ostream &OS); + void printMemOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, + raw_ostream &OS, const char *Modifier = nullptr); + void printCCOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, + raw_ostream &OS); + bool printGetPCX(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &OS); + void printMembarTag(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, + raw_ostream &O); +}; +} // end namespace llvm + +#endif diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp index 50e8825b15e8..1a2a040990ae 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===- SparcMCAsmInfo.cpp - Sparc asm properties --------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h index 5e8d0cb50312..c9162f2dc8a5 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h @@ -1,9 +1,8 @@ //===- SparcMCAsmInfo.h - Sparc asm properties -----------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index 647be159a151..7e908011bd50 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- SparcMCCodeEmitter.cpp - Convert Sparc code to machine code -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -84,9 +83,10 @@ public: const MCSubtargetInfo &STI) const; private: - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp index 4ddb72643a91..00f319fc37e1 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp @@ -1,9 +1,8 @@ //===-- SparcMCExpr.cpp - Sparc specific MC expression classes --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h index cf2db067749c..c2467faca257 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h @@ -1,9 +1,8 @@ //====- SparcMCExpr.h - Sparc specific MC expression classes --*- C++ -*-=====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index bd6596faee5d..ce593bb66770 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- SparcMCTargetDesc.cpp - Sparc Target Descriptions -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,9 +11,10 @@ //===----------------------------------------------------------------------===// #include "SparcMCTargetDesc.h" -#include "InstPrinter/SparcInstPrinter.h" +#include "SparcInstPrinter.h" #include "SparcMCAsmInfo.h" #include "SparcTargetStreamer.h" +#include "TargetInfo/SparcTargetInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h index 3cd24104c443..e5699bb1c133 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- SparcMCTargetDesc.h - Sparc Target Descriptions ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,10 +32,6 @@ class StringRef; class raw_pwrite_stream; class raw_ostream; -Target &getTheSparcTarget(); -Target &getTheSparcV9Target(); -Target &getTheSparcelTarget(); - MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); diff --git a/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp index 94af791e0e75..a322d49adb87 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp @@ -1,9 +1,8 @@ //===-- SparcTargetStreamer.cpp - Sparc Target Streamer Methods -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "SparcTargetStreamer.h" -#include "InstPrinter/SparcInstPrinter.h" +#include "SparcInstPrinter.h" #include "llvm/Support/FormattedStream.h" using namespace llvm; diff --git a/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h index 8bb418e39ab4..9f729a6c2cf4 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h @@ -1,9 +1,8 @@ //===-- SparcTargetStreamer.h - Sparc Target Streamer ----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h index 0cea53b359eb..967c463f5281 100644 --- a/lib/Target/Sparc/Sparc.h +++ b/lib/Target/Sparc/Sparc.h @@ -1,9 +1,8 @@ //===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td index 0412215be8ab..ca6147edc46b 100644 --- a/lib/Target/Sparc/Sparc.td +++ b/lib/Target/Sparc/Sparc.td @@ -1,9 +1,8 @@ //===-- Sparc.td - Describe the Sparc Target Machine -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp index 5f0e359a3b00..4d5cbfbadc9d 100644 --- a/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,12 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "InstPrinter/SparcInstPrinter.h" +#include "MCTargetDesc/SparcInstPrinter.h" #include "MCTargetDesc/SparcMCExpr.h" #include "MCTargetDesc/SparcTargetStreamer.h" #include "Sparc.h" #include "SparcInstrInfo.h" #include "SparcTargetMachine.h" +#include "TargetInfo/SparcTargetInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" @@ -60,11 +60,9 @@ namespace { } bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; void LowerGETPCXAndEmitMCInsts(const MachineInstr *MI, const MCSubtargetInfo &STI); @@ -360,7 +358,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum, MO.getMBB()->getSymbol()->print(O, MAI); return; case MachineOperand::MO_GlobalAddress: - getSymbol(MO.getGlobal())->print(O, MAI); + PrintSymbolOperand(MO, O); break; case MachineOperand::MO_BlockAddress: O << GetBlockAddressSymbol(MO.getBlockAddress())->getName(); @@ -406,7 +404,6 @@ void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum, /// PrintAsmOperand - Print out an operand for an inline asm expression. /// bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0]) { @@ -415,7 +412,7 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, switch (ExtraCode[0]) { default: // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); case 'f': case 'r': break; @@ -428,7 +425,7 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNo, unsigned AsmVariant, + unsigned OpNo, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0]) diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td index 0aa29d186dc1..4be432211f1d 100644 --- a/lib/Target/Sparc/SparcCallingConv.td +++ b/lib/Target/Sparc/SparcCallingConv.td @@ -1,9 +1,8 @@ //===-- SparcCallingConv.td - Calling Conventions Sparc ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index 9f6c7d65592d..1834a6fd861d 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -1,9 +1,8 @@ //===-- SparcFrameLowering.cpp - Sparc Frame Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index 6098afa68985..8e6001da05db 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -1,9 +1,8 @@ //===-- SparcFrameLowering.h - Define frame lowering for Sparc --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp index f845c41ede45..8cff50d19ed4 100644 --- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -313,7 +312,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){ SelectInlineAsmMemoryOperands(AsmNodeOperands, SDLoc(N)); - SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), + SDValue New = CurDAG->getNode(N->getOpcode(), SDLoc(N), CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); New->setNodeId(-1); ReplaceNode(N, New.getNode()); @@ -329,7 +328,8 @@ void SparcDAGToDAGISel::Select(SDNode *N) { switch (N->getOpcode()) { default: break; - case ISD::INLINEASM: { + case ISD::INLINEASM: + case ISD::INLINEASM_BR: { if (tryInlineAsm(N)) return; break; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index ae2257618a55..a6d440fa8aa2 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -1,9 +1,8 @@ //===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -18,6 +17,7 @@ #include "SparcRegisterInfo.h" #include "SparcTargetMachine.h" #include "SparcTargetObjectFile.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -3258,6 +3258,8 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'r': if (VT == MVT::v2i32) return std::make_pair(0U, &SP::IntPairRegClass); + else if (Subtarget->is64Bit()) + return std::make_pair(0U, &SP::I64RegsRegClass); else return std::make_pair(0U, &SP::IntRegsRegClass); case 'f': diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index 718851db25bf..8d557a4225e5 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -1,9 +1,8 @@ //===-- SparcISelLowering.h - Sparc DAG Lowering Interface ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td index 0b94c6b614eb..2d4f687f72d2 100644 --- a/lib/Target/Sparc/SparcInstr64Bit.td +++ b/lib/Target/Sparc/SparcInstr64Bit.td @@ -1,9 +1,8 @@ //===-- SparcInstr64Bit.td - 64-bit instructions for Sparc Target ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td index 35987390d7ba..d4d056ea0af6 100644 --- a/lib/Target/Sparc/SparcInstrAliases.td +++ b/lib/Target/Sparc/SparcInstrAliases.td @@ -1,9 +1,8 @@ //===-- SparcInstrAliases.td - Instruction Aliases for Sparc Target -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td index 76366c6695f4..fbf08b49d60c 100644 --- a/lib/Target/Sparc/SparcInstrFormats.td +++ b/lib/Target/Sparc/SparcInstrFormats.td @@ -1,9 +1,8 @@ //===-- SparcInstrFormats.td - Sparc Instruction Formats ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index 47b42444b94d..ad343fe6f80a 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- SparcInstrInfo.cpp - Sparc Instruction Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h index 524b5d054163..b587b28c25fc 100644 --- a/lib/Target/Sparc/SparcInstrInfo.h +++ b/lib/Target/Sparc/SparcInstrInfo.h @@ -1,9 +1,8 @@ //===-- SparcInstrInfo.h - Sparc Instruction Information --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 558b37aeebcb..8474c7abffb3 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -1,9 +1,8 @@ //===-- SparcInstrInfo.td - Target Description for Sparc Target -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcInstrVIS.td b/lib/Target/Sparc/SparcInstrVIS.td index d9adf3e8b0f5..bdefc70869d7 100644 --- a/lib/Target/Sparc/SparcInstrVIS.td +++ b/lib/Target/Sparc/SparcInstrVIS.td @@ -1,9 +1,8 @@ //===---- SparcInstrVIS.td - Visual Instruction Set extensions (VIS) -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcMCInstLower.cpp b/lib/Target/Sparc/SparcMCInstLower.cpp index a784124ff688..8ea317fdd453 100644 --- a/lib/Target/Sparc/SparcMCInstLower.cpp +++ b/lib/Target/Sparc/SparcMCInstLower.cpp @@ -1,9 +1,8 @@ //===-- SparcMCInstLower.cpp - Convert Sparc MachineInstr to MCInst -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.cpp b/lib/Target/Sparc/SparcMachineFunctionInfo.cpp index e7442826e78b..7c36c4ab865f 100644 --- a/lib/Target/Sparc/SparcMachineFunctionInfo.cpp +++ b/lib/Target/Sparc/SparcMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- SparcMachineFunctionInfo.cpp - Sparc Machine Function Info --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.h b/lib/Target/Sparc/SparcMachineFunctionInfo.h index 104744279d9d..fe5705878693 100644 --- a/lib/Target/Sparc/SparcMachineFunctionInfo.h +++ b/lib/Target/Sparc/SparcMachineFunctionInfo.h @@ -1,9 +1,8 @@ //===- SparcMachineFunctionInfo.h - Sparc Machine Function Info -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp index 33caa66154ff..ce11a423d10e 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- SparcRegisterInfo.cpp - SPARC Register Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -189,7 +188,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri)) .addReg(FrameReg).addImm(0).addReg(SrcEvenReg); - replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg); + replaceFI(MF, *StMI, *StMI, dl, 0, Offset, FrameReg); MI.setDesc(TII.get(SP::STDFri)); MI.getOperand(2).setReg(SrcOddReg); Offset += 8; @@ -198,10 +197,10 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned DestReg = MI.getOperand(0).getReg(); unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64); unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64); - MachineInstr *StMI = + MachineInstr *LdMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg) .addReg(FrameReg).addImm(0); - replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg); + replaceFI(MF, *LdMI, *LdMI, dl, 1, Offset, FrameReg); MI.setDesc(TII.get(SP::LDDFri)); MI.getOperand(0).setReg(DestOddReg); @@ -213,7 +212,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } -unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return SP::I6; } diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h index 8dd2569d10de..118ef9d80fae 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.h +++ b/lib/Target/Sparc/SparcRegisterInfo.h @@ -1,9 +1,8 @@ //===-- SparcRegisterInfo.h - Sparc Register Information Impl ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -39,7 +38,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { int SPAdj, unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; bool canRealignStack(const MachineFunction &MF) const override; diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td index 6625eaafd992..98959d512955 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.td +++ b/lib/Target/Sparc/SparcRegisterInfo.td @@ -1,9 +1,8 @@ //===-- SparcRegisterInfo.td - Sparc Register defs ---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/SparcSchedule.td b/lib/Target/Sparc/SparcSchedule.td index f243546b029b..31e43c9bd95d 100755 --- a/lib/Target/Sparc/SparcSchedule.td +++ b/lib/Target/Sparc/SparcSchedule.td @@ -1,9 +1,8 @@ //===-- SparcSchedule.td - Describe the Sparc Itineraries ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index 5301fc30a006..075a002a358d 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -1,9 +1,8 @@ //===-- SparcSubtarget.cpp - SPARC Subtarget Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index 24ea41a266e7..db19f99e3c9c 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -1,9 +1,8 @@ //===-- SparcSubtarget.h - Define Subtarget for the SPARC -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 5b467235f809..195cff79de03 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,6 +13,7 @@ #include "LeonPasses.h" #include "Sparc.h" #include "SparcTargetObjectFile.h" +#include "TargetInfo/SparcTargetInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LegacyPassManager.h" @@ -75,9 +75,9 @@ getEffectiveSparcCodeModel(Optional<CodeModel::Model> CM, Reloc::Model RM, bool Is64Bit, bool JIT) { if (CM) { if (*CM == CodeModel::Tiny) - report_fatal_error("Target does not support the tiny CodeModel"); + report_fatal_error("Target does not support the tiny CodeModel", false); if (*CM == CodeModel::Kernel) - report_fatal_error("Target does not support the kernel CodeModel"); + report_fatal_error("Target does not support the kernel CodeModel", false); return *CM; } if (Is64Bit) { diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index d1eb1d329a4c..4083f61433b1 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -1,9 +1,8 @@ //===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp index d0db854f7849..e6ad4d2d67aa 100644 --- a/lib/Target/Sparc/SparcTargetObjectFile.cpp +++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp @@ -1,9 +1,8 @@ //===------- SparcTargetObjectFile.cpp - Sparc Object Info Impl -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/SparcTargetObjectFile.h b/lib/Target/Sparc/SparcTargetObjectFile.h index 3b1b345c3b19..9bbe602b32b3 100644 --- a/lib/Target/Sparc/SparcTargetObjectFile.h +++ b/lib/Target/Sparc/SparcTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- SparcTargetObjectFile.h - Sparc Object Info -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp index d030bd9f232d..eafa2b4b2f13 100644 --- a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp +++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp @@ -1,14 +1,12 @@ //===-- SparcTargetInfo.cpp - Sparc Target Implementation -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "Sparc.h" -#include "llvm/IR/Module.h" +#include "TargetInfo/SparcTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.h b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.h new file mode 100644 index 000000000000..e02ff59fdac3 --- /dev/null +++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.h @@ -0,0 +1,22 @@ +//===-- SparcTargetInfo.h - Sparc Target Implementation ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPARC_TARGETINFO_SPARCTARGETINFO_H +#define LLVM_LIB_TARGET_SPARC_TARGETINFO_SPARCTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheSparcTarget(); +Target &getTheSparcV9Target(); +Target &getTheSparcelTarget(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_SPARC_TARGETINFO_SPARCTARGETINFO_H diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 91959b4151b3..a259ba3433d6 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -1,14 +1,14 @@ //===-- SystemZAsmParser.cpp - Parse SystemZ assembly instructions --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
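
The new TargetInfo/SparcTargetInfo.h above only declares the three accessors. A sketch of the conventional definition side, assuming the usual TargetRegistry pattern (the RegisterTarget string arguments here are illustrative):

    #include "TargetInfo/SparcTargetInfo.h"
    #include "llvm/ADT/Triple.h"
    #include "llvm/Support/TargetRegistry.h"
    using namespace llvm;

    Target &llvm::getTheSparcTarget() {
      static Target TheSparcTarget; // filled in by the registry at init time
      return TheSparcTarget;
    }

    extern "C" void LLVMInitializeSparcTargetInfo() {
      RegisterTarget<Triple::sparc, /*HasJIT=*/true> X(
          getTheSparcTarget(), "sparc", "Sparc", "Sparc");
    }
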
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "InstPrinter/SystemZInstPrinter.h" +#include "MCTargetDesc/SystemZInstPrinter.h" #include "MCTargetDesc/SystemZMCTargetDesc.h" +#include "TargetInfo/SystemZTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -651,7 +651,6 @@ static void printMCExpr(const MCExpr *E, raw_ostream &OS) { void SystemZOperand::print(raw_ostream &OS) const { switch (Kind) { - break; case KindToken: OS << "Token:" << getToken(); break; @@ -1181,8 +1180,10 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, // features to be available during the operand check, or else we will fail to // find the custom parser, and then we will later get an InvalidOperand error // instead of a MissingFeature errror. - uint64_t AvailableFeatures = getAvailableFeatures(); - setAvailableFeatures(~(uint64_t)0); + FeatureBitset AvailableFeatures = getAvailableFeatures(); + FeatureBitset All; + All.set(); + setAvailableFeatures(All); OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); setAvailableFeatures(AvailableFeatures); if (ResTy == MatchOperand_Success) @@ -1233,7 +1234,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, return false; } -static std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS, +static std::string SystemZMnemonicSpellCheck(StringRef S, + const FeatureBitset &FBS, unsigned VariantID = 0); bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -1244,8 +1246,9 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst Inst; unsigned MatchResult; + FeatureBitset MissingFeatures; MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, - MatchingInlineAsm); + MissingFeatures, MatchingInlineAsm); switch (MatchResult) { case Match_Success: Inst.setLoc(IDLoc); @@ -1253,17 +1256,15 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return false; case Match_MissingFeature: { - assert(ErrorInfo && "Unknown missing feature!"); + assert(MissingFeatures.any() && "Unknown missing feature!"); // Special case the error message for the very common case where only // a single subtarget feature is missing std::string Msg = "instruction requires:"; - uint64_t Mask = 1; - for (unsigned I = 0; I < sizeof(ErrorInfo) * 8 - 1; ++I) { - if (ErrorInfo & Mask) { + for (unsigned I = 0, E = MissingFeatures.size(); I != E; ++I) { + if (MissingFeatures[I]) { Msg += " "; - Msg += getSubtargetFeatureName(ErrorInfo & Mask); + Msg += getSubtargetFeatureName(I); } - Mask <<= 1; } return Error(IDLoc, Msg); } @@ -1282,7 +1283,7 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } case Match_MnemonicFail: { - uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); std::string Suggestion = SystemZMnemonicSpellCheck( ((SystemZOperand &)*Operands[0]).getToken(), FBS); return Error(IDLoc, "invalid instruction" + Suggestion, diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index 8903b57ffd0b..70c26db33ced 100644 --- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -1,14 +1,14 @@ //===-- SystemZDisassembler.cpp - Disassembler for SystemZ 
------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "MCTargetDesc/SystemZMCTargetDesc.h" #include "SystemZ.h" +#include "TargetInfo/SystemZTargetInfo.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp deleted file mode 100644 index 6cd12e13e220..000000000000 --- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp +++ /dev/null @@ -1,234 +0,0 @@ -//===- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "SystemZInstPrinter.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include -#include - -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#include "SystemZGenAsmWriter.inc" - -void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp, - unsigned Index, raw_ostream &O) { - O << Disp; - if (Base || Index) { - O << '('; - if (Index) { - O << '%' << getRegisterName(Index); - if (Base) - O << ','; - } - if (Base) - O << '%' << getRegisterName(Base); - O << ')'; - } -} - -void SystemZInstPrinter::printOperand(const MCOperand &MO, const MCAsmInfo *MAI, - raw_ostream &O) { - if (MO.isReg()) - O << '%' << getRegisterName(MO.getReg()); - else if (MO.isImm()) - O << MO.getImm(); - else if (MO.isExpr()) - MO.getExpr()->print(O, MAI); - else - llvm_unreachable("Invalid operand"); -} - -void SystemZInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, - const MCSubtargetInfo &STI) { - printInstruction(MI, O); - printAnnotation(O, Annot); -} - -void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { - O << '%' << getRegisterName(RegNo); -} - -template -static void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - int64_t Value = MI->getOperand(OpNum).getImm(); - assert(isUInt(Value) && "Invalid uimm argument"); - O << Value; -} - -template -static void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - int64_t Value = MI->getOperand(OpNum).getImm(); - assert(isInt(Value) && "Invalid simm argument"); - O << Value; -} - -void SystemZInstPrinter::printU1ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printUImmOperand<1>(MI, OpNum, O); -} - -void SystemZInstPrinter::printU2ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printUImmOperand<2>(MI, OpNum, O); -} - -void SystemZInstPrinter::printU3ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printUImmOperand<3>(MI, OpNum, O); -} - -void SystemZInstPrinter::printU4ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - 
printUImmOperand<4>(MI, OpNum, O); -} - -void SystemZInstPrinter::printU6ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printUImmOperand<6>(MI, OpNum, O); -} - -void SystemZInstPrinter::printS8ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printSImmOperand<8>(MI, OpNum, O); -} - -void SystemZInstPrinter::printU8ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printUImmOperand<8>(MI, OpNum, O); -} - -void SystemZInstPrinter::printU12ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printUImmOperand<12>(MI, OpNum, O); -} - -void SystemZInstPrinter::printS16ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printSImmOperand<16>(MI, OpNum, O); -} - -void SystemZInstPrinter::printU16ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printUImmOperand<16>(MI, OpNum, O); -} - -void SystemZInstPrinter::printS32ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printSImmOperand<32>(MI, OpNum, O); -} - -void SystemZInstPrinter::printU32ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printUImmOperand<32>(MI, OpNum, O); -} - -void SystemZInstPrinter::printU48ImmOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printUImmOperand<48>(MI, OpNum, O); -} - -void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - if (MO.isImm()) { - O << "0x"; - O.write_hex(MO.getImm()); - } else - MO.getExpr()->print(O, &MAI); -} - -void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - // Output the PC-relative operand. - printPCRelOperand(MI, OpNum, O); - - // Output the TLS marker if present. - if ((unsigned)OpNum + 1 < MI->getNumOperands()) { - const MCOperand &MO = MI->getOperand(OpNum + 1); - const MCSymbolRefExpr &refExp = cast(*MO.getExpr()); - switch (refExp.getKind()) { - case MCSymbolRefExpr::VK_TLSGD: - O << ":tls_gdcall:"; - break; - case MCSymbolRefExpr::VK_TLSLDM: - O << ":tls_ldcall:"; - break; - default: - llvm_unreachable("Unexpected symbol kind"); - } - O << refExp.getSymbol().getName(); - } -} - -void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printOperand(MI->getOperand(OpNum), &MAI, O); -} - -void SystemZInstPrinter::printBDAddrOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printAddress(MI->getOperand(OpNum).getReg(), - MI->getOperand(OpNum + 1).getImm(), 0, O); -} - -void SystemZInstPrinter::printBDXAddrOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - printAddress(MI->getOperand(OpNum).getReg(), - MI->getOperand(OpNum + 1).getImm(), - MI->getOperand(OpNum + 2).getReg(), O); -} - -void SystemZInstPrinter::printBDLAddrOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); - uint64_t Disp = MI->getOperand(OpNum + 1).getImm(); - uint64_t Length = MI->getOperand(OpNum + 2).getImm(); - O << Disp << '(' << Length; - if (Base) - O << ",%" << getRegisterName(Base); - O << ')'; -} - -void SystemZInstPrinter::printBDRAddrOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); - uint64_t Disp = MI->getOperand(OpNum + 1).getImm(); - unsigned Length = MI->getOperand(OpNum + 2).getReg(); - O << Disp << "(%" << getRegisterName(Length); - if (Base) - O << ",%" << getRegisterName(Base); - O << ')'; -} - -void SystemZInstPrinter::printBDVAddrOperand(const MCInst *MI, int OpNum, - raw_ostream &O) { - 
printAddress(MI->getOperand(OpNum).getReg(), - MI->getOperand(OpNum + 1).getImm(), - MI->getOperand(OpNum + 2).getReg(), O); -} - -void SystemZInstPrinter::printCond4Operand(const MCInst *MI, int OpNum, - raw_ostream &O) { - static const char *const CondNames[] = { - "o", "h", "nle", "l", "nhe", "lh", "ne", - "e", "nlh", "he", "nl", "le", "nh", "no" - }; - uint64_t Imm = MI->getOperand(OpNum).getImm(); - assert(Imm > 0 && Imm < 15 && "Invalid condition"); - O << CondNames[Imm - 1]; -} diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h deleted file mode 100644 index d65c661545eb..000000000000 --- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h +++ /dev/null @@ -1,78 +0,0 @@ -//==- SystemZInstPrinter.h - Convert SystemZ MCInst to assembly --*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints a SystemZ MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H -#define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H - -#include "llvm/MC/MCInstPrinter.h" -#include - -namespace llvm { - -class MCOperand; - -class SystemZInstPrinter : public MCInstPrinter { -public: - SystemZInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - // Automatically generated by tblgen. - void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - // Print an address with the given base, displacement and index. - static void printAddress(unsigned Base, int64_t Disp, unsigned Index, - raw_ostream &O); - - // Print the given operand. - static void printOperand(const MCOperand &MO, const MCAsmInfo *MAI, - raw_ostream &O); - - // Override MCInstPrinter. - void printRegName(raw_ostream &O, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - -private: - // Print various types of operand. 
- void printOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printBDAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printBDXAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printBDRAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printBDVAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU1ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU2ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU3ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU4ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU6ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printS8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU12ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printS16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printU48ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O); - void printPCRelTLSOperand(const MCInst *MI, int OpNum, raw_ostream &O); - - // Print the mnemonic for a condition-code mask ("ne", "lh", etc.) - // This forms part of the instruction name rather than the operand list. - void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O); -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp new file mode 100644 index 000000000000..91cb35dd72f2 --- /dev/null +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp @@ -0,0 +1,233 @@ +//===- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SystemZInstPrinter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#include "SystemZGenAsmWriter.inc" + +void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp, + unsigned Index, raw_ostream &O) { + O << Disp; + if (Base || Index) { + O << '('; + if (Index) { + O << '%' << getRegisterName(Index); + if (Base) + O << ','; + } + if (Base) + O << '%' << getRegisterName(Base); + O << ')'; + } +} + +void SystemZInstPrinter::printOperand(const MCOperand &MO, const MCAsmInfo *MAI, + raw_ostream &O) { + if (MO.isReg()) + O << '%' << getRegisterName(MO.getReg()); + else if (MO.isImm()) + O << MO.getImm(); + else if (MO.isExpr()) + MO.getExpr()->print(O, MAI); + else + llvm_unreachable("Invalid operand"); +} + +void SystemZInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, + const MCSubtargetInfo &STI) { + printInstruction(MI, O); + printAnnotation(O, Annot); +} + +void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { + O << '%' << getRegisterName(RegNo); +} + +template +static void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { + int64_t Value = MI->getOperand(OpNum).getImm(); + assert(isUInt(Value) && "Invalid uimm argument"); + O << Value; +} + +template +static void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { + int64_t Value = MI->getOperand(OpNum).getImm(); + assert(isInt(Value) && "Invalid simm argument"); + O << Value; +} + +void SystemZInstPrinter::printU1ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<1>(MI, OpNum, O); +} + +void SystemZInstPrinter::printU2ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<2>(MI, OpNum, O); +} + +void SystemZInstPrinter::printU3ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<3>(MI, OpNum, O); +} + +void SystemZInstPrinter::printU4ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<4>(MI, OpNum, O); +} + +void SystemZInstPrinter::printU6ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<6>(MI, OpNum, O); +} + +void SystemZInstPrinter::printS8ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printSImmOperand<8>(MI, OpNum, O); +} + +void SystemZInstPrinter::printU8ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<8>(MI, OpNum, O); +} + +void SystemZInstPrinter::printU12ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<12>(MI, OpNum, O); +} + +void SystemZInstPrinter::printS16ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printSImmOperand<16>(MI, OpNum, O); +} + +void SystemZInstPrinter::printU16ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<16>(MI, OpNum, O); +} + +void SystemZInstPrinter::printS32ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printSImmOperand<32>(MI, OpNum, O); +} + +void SystemZInstPrinter::printU32ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<32>(MI, OpNum, O); +} + +void 
SystemZInstPrinter::printU48ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printUImmOperand<48>(MI, OpNum, O); +} + +void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + if (MO.isImm()) { + O << "0x"; + O.write_hex(MO.getImm()); + } else + MO.getExpr()->print(O, &MAI); +} + +void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + // Output the PC-relative operand. + printPCRelOperand(MI, OpNum, O); + + // Output the TLS marker if present. + if ((unsigned)OpNum + 1 < MI->getNumOperands()) { + const MCOperand &MO = MI->getOperand(OpNum + 1); + const MCSymbolRefExpr &refExp = cast(*MO.getExpr()); + switch (refExp.getKind()) { + case MCSymbolRefExpr::VK_TLSGD: + O << ":tls_gdcall:"; + break; + case MCSymbolRefExpr::VK_TLSLDM: + O << ":tls_ldcall:"; + break; + default: + llvm_unreachable("Unexpected symbol kind"); + } + O << refExp.getSymbol().getName(); + } +} + +void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printOperand(MI->getOperand(OpNum), &MAI, O); +} + +void SystemZInstPrinter::printBDAddrOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printAddress(MI->getOperand(OpNum).getReg(), + MI->getOperand(OpNum + 1).getImm(), 0, O); +} + +void SystemZInstPrinter::printBDXAddrOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printAddress(MI->getOperand(OpNum).getReg(), + MI->getOperand(OpNum + 1).getImm(), + MI->getOperand(OpNum + 2).getReg(), O); +} + +void SystemZInstPrinter::printBDLAddrOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + unsigned Base = MI->getOperand(OpNum).getReg(); + uint64_t Disp = MI->getOperand(OpNum + 1).getImm(); + uint64_t Length = MI->getOperand(OpNum + 2).getImm(); + O << Disp << '(' << Length; + if (Base) + O << ",%" << getRegisterName(Base); + O << ')'; +} + +void SystemZInstPrinter::printBDRAddrOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + unsigned Base = MI->getOperand(OpNum).getReg(); + uint64_t Disp = MI->getOperand(OpNum + 1).getImm(); + unsigned Length = MI->getOperand(OpNum + 2).getReg(); + O << Disp << "(%" << getRegisterName(Length); + if (Base) + O << ",%" << getRegisterName(Base); + O << ')'; +} + +void SystemZInstPrinter::printBDVAddrOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printAddress(MI->getOperand(OpNum).getReg(), + MI->getOperand(OpNum + 1).getImm(), + MI->getOperand(OpNum + 2).getReg(), O); +} + +void SystemZInstPrinter::printCond4Operand(const MCInst *MI, int OpNum, + raw_ostream &O) { + static const char *const CondNames[] = { + "o", "h", "nle", "l", "nhe", "lh", "ne", + "e", "nlh", "he", "nl", "le", "nh", "no" + }; + uint64_t Imm = MI->getOperand(OpNum).getImm(); + assert(Imm > 0 && Imm < 15 && "Invalid condition"); + O << CondNames[Imm - 1]; +} diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h b/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h new file mode 100644 index 000000000000..4235d4e21792 --- /dev/null +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h @@ -0,0 +1,77 @@ +//==- SystemZInstPrinter.h - Convert SystemZ MCInst to assembly --*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
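
A usage note on printAddress from the relocated printer above: it renders the z/Architecture D(X,B) form, index before base, omitting the parenthesized part when both registers are zero. A sketch with hypothetical register ids (R1D/R15D stand in for the real SystemZ::R1D/SystemZ::R15D):

    #include "MCTargetDesc/SystemZInstPrinter.h"
    #include "llvm/Support/raw_ostream.h"

    void demoPrintAddress(llvm::raw_ostream &OS, unsigned R1D, unsigned R15D) {
      // Base=%r15, Disp=8, Index=%r1 emits "8(%r1,%r15)";
      // with Base and Index both 0 it would emit just "8".
      llvm::SystemZInstPrinter::printAddress(/*Base=*/R15D, /*Disp=*/8,
                                             /*Index=*/R1D, OS);
    }
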
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints a SystemZ MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZINSTPRINTER_H +#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" +#include + +namespace llvm { + +class MCOperand; + +class SystemZInstPrinter : public MCInstPrinter { +public: + SystemZInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + // Automatically generated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + // Print an address with the given base, displacement and index. + static void printAddress(unsigned Base, int64_t Disp, unsigned Index, + raw_ostream &O); + + // Print the given operand. + static void printOperand(const MCOperand &MO, const MCAsmInfo *MAI, + raw_ostream &O); + + // Override MCInstPrinter. + void printRegName(raw_ostream &O, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + +private: + // Print various types of operand. + void printOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printBDAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printBDXAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printBDRAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printBDVAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU1ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU2ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU3ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU4ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU6ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printS8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU12ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printS16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU48ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printPCRelTLSOperand(const MCInst *MI, int OpNum, raw_ostream &O); + + // Print the mnemonic for a condition-code mask ("ne", "lh", etc.) + // This forms part of the instruction name rather than the operand list. 
+ void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZINSTPRINTER_H diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index 2146832f7794..23d8585095cc 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -1,9 +1,8 @@ //===-- SystemZMCAsmBackend.cpp - SystemZ assembler backend ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp index 6e00981939b6..d6cdacfcab92 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- SystemZMCAsmInfo.cpp - SystemZ asm properties ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h index 800f89232063..b8818a65f9e3 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h @@ -1,9 +1,8 @@ //====-- SystemZMCAsmInfo.h - SystemZ asm properties -----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index d188f56512ab..a5ccf4f68ffd 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- SystemZMCCodeEmitter.cpp - Convert SystemZ code to machine code ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -144,9 +143,10 @@ private: } private: - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h index c012accc14dd..14f6198183b9 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h @@ -1,9 +1,8 @@ //===-- SystemZMCFixups.h - SystemZ-specific fixup entries ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp index 888be519fb16..8d8ba5644e10 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- SystemZMCObjectWriter.cpp - SystemZ ELF writer --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -37,8 +36,8 @@ protected: } // end anonymous namespace SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI) - : MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390, - /*HasRelocationAddend=*/ true) {} + : MCELFObjectTargetWriter(/*Is64Bit_=*/true, OSABI, ELF::EM_S390, + /*HasRelocationAddend_=*/ true) {} // Return the relocation type for an absolute value of MCFixupKind Kind. static unsigned getAbsoluteReloc(unsigned Kind) { diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 05688ed8efbb..3c0300cfd8f0 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -1,15 +1,16 @@ //===-- SystemZMCTargetDesc.cpp - SystemZ target descriptions -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
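
Several hunks above (the asm parser's MissingFeatures handling, SystemZMnemonicSpellCheck, and the code emitter's computeAvailableFeatures/verifyInstructionPredicates) replace uint64_t feature masks with FeatureBitset, lifting the 64-feature ceiling. The diagnostic loop now iterates set bits instead of shifting a mask; a minimal sketch of that shape (the name lookup is elided — the real code maps each index through the tblgen'd getSubtargetFeatureName):

    #include "llvm/MC/SubtargetFeature.h"
    #include <string>

    std::string listMissingFeatures(const llvm::FeatureBitset &Missing) {
      std::string Msg = "instruction requires:";
      for (unsigned I = 0, E = Missing.size(); I != E; ++I)
        if (Missing[I])
          Msg += " feature#" + std::to_string(I); // placeholder for a name
      return Msg;
    }
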
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "SystemZMCTargetDesc.h" -#include "InstPrinter/SystemZInstPrinter.h" +#include "SystemZInstPrinter.h" #include "SystemZMCAsmInfo.h" +#include "TargetInfo/SystemZTargetInfo.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index 1617a807e65a..8f720c5abb34 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- SystemZMCTargetDesc.h - SystemZ target descriptions -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -30,8 +29,6 @@ class Triple; class raw_pwrite_stream; class raw_ostream; -Target &getTheSystemZTarget(); - namespace SystemZMC { // How many bytes are in the ABI-defined, caller-allocated part of // a stack frame. diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h index fdbde3d8dbc3..2b0f90182d7f 100644 --- a/lib/Target/SystemZ/SystemZ.h +++ b/lib/Target/SystemZ/SystemZ.h @@ -1,9 +1,8 @@ //==- SystemZ.h - Top-Level Interface for SystemZ representation -*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -195,6 +194,7 @@ FunctionPass *createSystemZExpandPseudoPass(SystemZTargetMachine &TM); FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM); FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM); FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM); +FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM); FunctionPass *createSystemZTDCPass(); } // end namespace llvm diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td index 3800f7a26b79..ebbc6ffd2f1e 100644 --- a/lib/Target/SystemZ/SystemZ.td +++ b/lib/Target/SystemZ/SystemZ.td @@ -1,9 +1,8 @@ //===-- SystemZ.td - Describe the SystemZ target machine -----*- tblgen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp index e2de721be568..ef378e4ade7a 100644 --- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- SystemZAsmPrinter.cpp - SystemZ LLVM assembly printer -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,9 +12,10 @@ //===----------------------------------------------------------------------===// #include "SystemZAsmPrinter.h" -#include "InstPrinter/SystemZInstPrinter.h" +#include "MCTargetDesc/SystemZInstPrinter.h" #include "SystemZConstantPoolValue.h" #include "SystemZMCInstLower.h" +#include "TargetInfo/SystemZTargetInfo.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/Mangler.h" @@ -80,6 +80,27 @@ static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) { Context); } +// MI is an instruction that accepts an optional alignment hint, +// and which was already lowered to LoweredMI. If the alignment +// of the original memory operand is known, update LoweredMI to +// an instruction with the corresponding hint set. +static void lowerAlignmentHint(const MachineInstr *MI, MCInst &LoweredMI, + unsigned Opcode) { + if (!MI->hasOneMemOperand()) + return; + const MachineMemOperand *MMO = *MI->memoperands_begin(); + unsigned AlignmentHint = 0; + if (MMO->getAlignment() >= 16) + AlignmentHint = 4; + else if (MMO->getAlignment() >= 8) + AlignmentHint = 3; + if (AlignmentHint == 0) + return; + + LoweredMI.setOpcode(Opcode); + LoweredMI.addOperand(MCOperand::createImm(AlignmentHint)); +} + // MI loads the high part of a vector from memory. Return an instruction // that uses replicating vector load Opcode to do the same thing. 
static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) { @@ -351,6 +372,26 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) { .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg())); break; + case SystemZ::VL: + Lower.lower(MI, LoweredMI); + lowerAlignmentHint(MI, LoweredMI, SystemZ::VLAlign); + break; + + case SystemZ::VST: + Lower.lower(MI, LoweredMI); + lowerAlignmentHint(MI, LoweredMI, SystemZ::VSTAlign); + break; + + case SystemZ::VLM: + Lower.lower(MI, LoweredMI); + lowerAlignmentHint(MI, LoweredMI, SystemZ::VLMAlign); + break; + + case SystemZ::VSTM: + Lower.lower(MI, LoweredMI); + lowerAlignmentHint(MI, LoweredMI, SystemZ::VSTMAlign); + break; + case SystemZ::VL32: LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF); break; @@ -618,26 +659,19 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { OutStreamer->EmitValue(Expr, Size); } -bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI, - unsigned OpNo, - unsigned AsmVariant, +bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS) { - if (ExtraCode && *ExtraCode == 'n') { - if (!MI->getOperand(OpNo).isImm()) - return true; - OS << -int64_t(MI->getOperand(OpNo).getImm()); - } else { - SystemZMCInstLower Lower(MF->getContext(), *this); - MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo))); - SystemZInstPrinter::printOperand(MO, MAI, OS); - } + if (ExtraCode) + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS); + SystemZMCInstLower Lower(MF->getContext(), *this); + MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo))); + SystemZInstPrinter::printOperand(MO, MAI, OS); return false; } bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) { SystemZInstPrinter::printAddress(MI->getOperand(OpNo).getReg(), diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h index cb88ec32f83a..aa5d3ca78e61 100644 --- a/lib/Target/SystemZ/SystemZAsmPrinter.h +++ b/lib/Target/SystemZ/SystemZAsmPrinter.h @@ -1,9 +1,8 @@ //===-- SystemZAsmPrinter.h - SystemZ LLVM assembly printer ----*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
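
The lowerAlignmentHint helper above fires only when the instruction carries exactly one memory operand, and encodes the vector load/store alignment-hint immediate that the VL/VST/VLM/VSTM cases append when switching to the *Align opcodes: 3 for 8-byte alignment, 4 for 16-byte, 0 for no hint. The mapping in isolation:

    #include <cstdint>

    // Same mapping as lowerAlignmentHint: the hint immediate appended to
    // VLAlign/VSTAlign/VLMAlign/VSTMAlign, or 0 when no hint applies.
    unsigned alignmentHint(uint64_t AlignInBytes) {
      if (AlignInBytes >= 16) return 4;
      if (AlignInBytes >= 8)  return 3;
      return 0;
    }
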
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -37,11 +36,9 @@ public: void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override; void EmitEndOfAsmFile(Module &M) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; bool doInitialization(Module &M) override { SM.reset(); diff --git a/lib/Target/SystemZ/SystemZCallingConv.cpp b/lib/Target/SystemZ/SystemZCallingConv.cpp index 72da51f74b10..91c7fae17a75 100644 --- a/lib/Target/SystemZ/SystemZCallingConv.cpp +++ b/lib/Target/SystemZ/SystemZCallingConv.cpp @@ -1,9 +1,8 @@ //===-- SystemZCallingConv.cpp - Calling conventions for SystemZ ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZCallingConv.h b/lib/Target/SystemZ/SystemZCallingConv.h index b5523e586f4c..82f29b6361f1 100644 --- a/lib/Target/SystemZ/SystemZCallingConv.h +++ b/lib/Target/SystemZ/SystemZCallingConv.h @@ -1,9 +1,8 @@ //===-- SystemZCallingConv.h - Calling conventions for SystemZ --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td index deba27fee7fe..bbd51546ac9f 100644 --- a/lib/Target/SystemZ/SystemZCallingConv.td +++ b/lib/Target/SystemZ/SystemZCallingConv.td @@ -1,9 +1,8 @@ //=- SystemZCallingConv.td - Calling conventions for SystemZ -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This describes the calling conventions for the SystemZ ABI. 
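
The inline-asm hooks above lose their AsmVariant parameter, and any ExtraCode modifier is now forwarded to the AsmPrinter base class, which supplies the generic handling (including the 'n' negation case SystemZ used to open-code). The override shape after this change, sketched for a hypothetical target:

    // MyAsmPrinter is a hypothetical subclass; SystemZAsmPrinter above
    // follows the same shape.
    bool MyAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       const char *ExtraCode, raw_ostream &OS) {
      if (ExtraCode) // 'n' and friends now handled generically by the base
        return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS);
      OS << MI->getOperand(OpNo); // target-specific default printing here
      return false;               // false means the operand was printed
    }
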
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp index 4a6beb67f182..ffeee4da95cc 100644 --- a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp +++ b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp @@ -1,9 +1,8 @@ //===-- SystemZConstantPoolValue.cpp - SystemZ constant-pool value --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.h b/lib/Target/SystemZ/SystemZConstantPoolValue.h index a71b595560d2..6cb7710abdfe 100644 --- a/lib/Target/SystemZ/SystemZConstantPoolValue.h +++ b/lib/Target/SystemZ/SystemZConstantPoolValue.h @@ -1,9 +1,8 @@ //===- SystemZConstantPoolValue.h - SystemZ constant-pool value -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp index 668a77ac014f..9cbf6b320504 100644 --- a/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -1,9 +1,8 @@ //===-- SystemZElimCompare.cpp - Eliminate comparison instructions --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -147,6 +146,9 @@ static bool resultTests(MachineInstr &MI, unsigned Reg) { // Describe the references to Reg or any of its aliases in MI. Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) { Reference Ref; + if (MI.isDebugInstr()) + return Ref; + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &MO = MI.getOperand(I); if (MO.isReg()) { @@ -523,9 +525,9 @@ bool SystemZElimCompare::fuseCompareOperations( // SrcReg2 is the register if the source operand is a register, // 0 if the source operand is immediate, and the base register // if the source operand is memory (index is not supported). - unsigned SrcReg = Compare.getOperand(0).getReg(); - unsigned SrcReg2 = - Compare.getOperand(1).isReg() ? Compare.getOperand(1).getReg() : 0; + Register SrcReg = Compare.getOperand(0).getReg(); + Register SrcReg2 = + Compare.getOperand(1).isReg() ? 
Compare.getOperand(1).getReg() : Register(); MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch; for (++MBBI; MBBI != MBBE; ++MBBI) if (MBBI->modifiesRegister(SrcReg, TRI) || diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp index 67c80899d491..09708fb4241c 100644 --- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp +++ b/lib/Target/SystemZ/SystemZExpandPseudo.cpp @@ -1,9 +1,8 @@ //==-- SystemZExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index beff45dba81d..dae795e845b0 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -1,9 +1,8 @@ //===-- SystemZ.td - SystemZ processors and features ---------*- tblgen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -240,6 +239,51 @@ def Arch12NewFeatures : SystemZFeatureList<[ FeatureInsertReferenceBitsMultiple ]>; +//===----------------------------------------------------------------------===// +// +// New features added in the Thirteenth Edition of the z/Architecture +// +//===----------------------------------------------------------------------===// + +def FeatureMiscellaneousExtensions3 : SystemZFeature< + "miscellaneous-extensions-3", "MiscellaneousExtensions3", + "Assume that the miscellaneous-extensions facility 3 is installed" +>; + +def FeatureMessageSecurityAssist9 : SystemZFeature< + "message-security-assist-extension9", "MessageSecurityAssist9", + "Assume that the message-security-assist extension facility 9 is installed" +>; + +def FeatureVectorEnhancements2 : SystemZFeature< + "vector-enhancements-2", "VectorEnhancements2", + "Assume that the vector enhancements facility 2 is installed" +>; + +def FeatureVectorPackedDecimalEnhancement : SystemZFeature< + "vector-packed-decimal-enhancement", "VectorPackedDecimalEnhancement", + "Assume that the vector packed decimal enhancement facility is installed" +>; + +def FeatureEnhancedSort : SystemZFeature< + "enhanced-sort", "EnhancedSort", + "Assume that the enhanced-sort facility is installed" +>; + +def FeatureDeflateConversion : SystemZFeature< + "deflate-conversion", "DeflateConversion", + "Assume that the deflate-conversion facility is installed" +>; + +def Arch13NewFeatures : SystemZFeatureList<[ + FeatureMiscellaneousExtensions3, + FeatureMessageSecurityAssist9, + FeatureVectorEnhancements2, + FeatureVectorPackedDecimalEnhancement, + FeatureEnhancedSort, + FeatureDeflateConversion +]>; + //===----------------------------------------------------------------------===// // // Cumulative supported and unsupported feature sets @@ -256,9 
+300,13 @@ def Arch11SupportedFeatures : SystemZFeatureAdd; def Arch12SupportedFeatures : SystemZFeatureAdd; +def Arch13SupportedFeatures + : SystemZFeatureAdd; -def Arch12UnsupportedFeatures +def Arch13UnsupportedFeatures : SystemZFeatureList<[]>; +def Arch12UnsupportedFeatures + : SystemZFeatureAdd; def Arch11UnsupportedFeatures : SystemZFeatureAdd; def Arch10UnsupportedFeatures diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index 565299c90139..da28faebb326 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -1,9 +1,8 @@ //===-- SystemZFrameLowering.cpp - Frame lowering for SystemZ -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h index 08c84c785cc0..71ef3e4dc240 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/lib/Target/SystemZ/SystemZFrameLowering.h @@ -1,9 +1,8 @@ //===-- SystemZFrameLowering.h - Frame lowering for SystemZ -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index 8726b56bc94f..e2af02227999 100644 --- a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -1,9 +1,8 @@ //=-- SystemZHazardRecognizer.h - SystemZ Hazard Recognizer -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.h b/lib/Target/SystemZ/SystemZHazardRecognizer.h index 6292feefbfea..38bf41ebe96a 100644 --- a/lib/Target/SystemZ/SystemZHazardRecognizer.h +++ b/lib/Target/SystemZ/SystemZHazardRecognizer.h @@ -1,9 +1,8 @@ //=-- SystemZHazardRecognizer.h - SystemZ Hazard Recognizer -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
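
For the Arch13 feature definitions above: each SystemZFeature def names both the -mattr string and the Subtarget member, so the TableGen-generated subtarget code ends up exposing a predicate per feature — the same predicate the SystemZISelDAGToDAG hunk later queries. Roughly, as a hand-written equivalent (not generated output):

    // What "miscellaneous-extensions-3" expands to on SystemZSubtarget:
    class SystemZSubtargetSketch {
      bool MiscellaneousExtensions3 = false; // set by ParseSubtargetFeatures
    public:
      bool hasMiscellaneousExtensions3() const {
        return MiscellaneousExtensions3;
      }
    };
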
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 5bc2ab0ef2d8..9dc4512255cc 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
 //===-- SystemZISelDAGToDAG.cpp - A dag to dag inst selector for SystemZ --===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -12,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZTargetMachine.h"
+#include "SystemZISelLowering.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Support/Debug.h"
@@ -304,6 +304,9 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   void splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
                            uint64_t UpperVal, uint64_t LowerVal);
 
+  void loadVectorConstant(const SystemZVectorConstantInfo &VCI,
+                          SDNode *Node);
+
   // Try to use gather instruction Opcode to implement vector insertion N.
   bool tryGather(SDNode *N, unsigned Opcode);
 
@@ -1132,6 +1135,35 @@ void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
   SelectCode(Or.getNode());
 }
 
+void SystemZDAGToDAGISel::loadVectorConstant(
+    const SystemZVectorConstantInfo &VCI, SDNode *Node) {
+  assert((VCI.Opcode == SystemZISD::BYTE_MASK ||
+          VCI.Opcode == SystemZISD::REPLICATE ||
+          VCI.Opcode == SystemZISD::ROTATE_MASK) &&
+         "Bad opcode!");
+  assert(VCI.VecVT.getSizeInBits() == 128 && "Expected a vector type");
+  EVT VT = Node->getValueType(0);
+  SDLoc DL(Node);
+  SmallVector<SDValue, 2> Ops;
+  for (unsigned OpVal : VCI.OpVals)
+    Ops.push_back(CurDAG->getConstant(OpVal, DL, MVT::i32));
+  SDValue Op = CurDAG->getNode(VCI.Opcode, DL, VCI.VecVT, Ops);
+
+  if (VCI.VecVT == VT.getSimpleVT())
+    ReplaceNode(Node, Op.getNode());
+  else if (VT.getSizeInBits() == 128) {
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, DL, VT, Op);
+    ReplaceNode(Node, BitCast.getNode());
+    SelectCode(BitCast.getNode());
+  } else { // float or double
+    unsigned SubRegIdx =
+        (VT.getSizeInBits() == 32 ? SystemZ::subreg_h32 : SystemZ::subreg_h64);
+    ReplaceNode(
+        Node, CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, Op).getNode());
+  }
+  SelectCode(Op.getNode());
+}
+
 bool SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
   SDValue ElemV = N->getOperand(2);
   auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
@@ -1243,6 +1275,9 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
     InputChain = LoadNode->getChain();
   } else if (Chain.getOpcode() == ISD::TokenFactor) {
     SmallVector<SDValue, 4> ChainOps;
+    SmallVector<const SDNode *, 4> LoopWorklist;
+    SmallPtrSet<const SDNode *, 16> Visited;
+    const unsigned int Max = 1024;
     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
       SDValue Op = Chain.getOperand(i);
       if (Op == Load.getValue(1)) {
         ChainCheck = true;
         ChainOps.push_back(Load.getOperand(0));
         continue;
       }
-
-      // Make sure using Op as part of the chain would not cause a cycle here.
- // In theory, we could check whether the chain node is a predecessor of - // the load. But that can be very expensive. Instead visit the uses and - // make sure they all have smaller node id than the load. - int LoadId = LoadNode->getNodeId(); - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = UI->use_end(); UI != UE; ++UI) { - if (UI.getUse().getResNo() != 0) - continue; - if (UI->getNodeId() > LoadId) - return false; - } - + LoopWorklist.push_back(Op.getNode()); ChainOps.push_back(Op); } - if (ChainCheck) + if (ChainCheck) { + // Add the other operand of StoredVal to worklist. + for (SDValue Op : StoredVal->ops()) + if (Op.getNode() != LoadNode) + LoopWorklist.push_back(Op.getNode()); + + // Check if Load is reachable from any of the nodes in the worklist. + if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, + true)) + return false; + // Make a new TokenFactor with all the other input chains except // for the load. InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); + } } if (!ChainCheck) return false; @@ -1447,6 +1480,23 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) { Node->getOperand(0).getOpcode() != ISD::Constant) if (auto *Op1 = dyn_cast(Node->getOperand(1))) { uint64_t Val = Op1->getZExtValue(); + // Don't split the operation if we can match one of the combined + // logical operations provided by miscellaneous-extensions-3. + if (Subtarget->hasMiscellaneousExtensions3()) { + unsigned ChildOpcode = Node->getOperand(0).getOpcode(); + // Check whether this expression matches NAND/NOR/NXOR. + if (Val == (uint64_t)-1 && Opcode == ISD::XOR) + if (ChildOpcode == ISD::AND || ChildOpcode == ISD::OR || + ChildOpcode == ISD::XOR) + break; + // Check whether this expression matches OR-with-complement. + if (Opcode == ISD::OR && ChildOpcode == ISD::XOR) { + auto Op0 = Node->getOperand(0); + if (auto *Op0Op1 = dyn_cast(Op0->getOperand(1))) + if (Op0Op1->getZExtValue() == (uint64_t)-1) + break; + } + } if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val)) { splitLargeImmediate(Opcode, Node, Node->getOperand(0), Val - uint32_t(Val), uint32_t(Val)); @@ -1527,6 +1577,27 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) { break; } + case ISD::BUILD_VECTOR: { + auto *BVN = cast(Node); + SystemZVectorConstantInfo VCI(BVN); + if (VCI.isVectorConstantLegal(*Subtarget)) { + loadVectorConstant(VCI, Node); + return; + } + break; + } + + case ISD::ConstantFP: { + APFloat Imm = cast(Node)->getValueAPF(); + if (Imm.isZero() || Imm.isNegZero()) + break; + SystemZVectorConstantInfo VCI(Imm); + bool Success = VCI.isVectorConstantLegal(*Subtarget); (void)Success; + assert(Success && "Expected legal FP immediate"); + loadVectorConstant(VCI, Node); + return; + } + case ISD::STORE: { if (tryFoldLoadStoreIntoMemOperand(Node)) return; diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 2a825c1316f3..78820f511ab4 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1,9 +1,8 @@ //===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -250,8 +249,15 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // We have native support for a 64-bit CTLZ, via FLOGR. setOperationAction(ISD::CTLZ, MVT::i32, Promote); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote); setOperationAction(ISD::CTLZ, MVT::i64, Legal); + // On arch13 we have native support for a 64-bit CTPOP. + if (Subtarget.hasMiscellaneousExtensions3()) { + setOperationAction(ISD::CTPOP, MVT::i32, Promote); + setOperationAction(ISD::CTPOP, MVT::i64, Legal); + } + // Give LowerOperation the chance to replace 64-bit ORs with subregs. setOperationAction(ISD::OR, MVT::i64, Custom); @@ -377,6 +383,17 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal); } + if (Subtarget.hasVectorEnhancements2()) { + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4f32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4f32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4f32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4f32, Legal); + } + // Handle floating-point types. for (unsigned I = MVT::FIRST_FP_VALUETYPE; I <= MVT::LAST_FP_VALUETYPE; @@ -401,6 +418,24 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); + + // Handle constrained floating-point operations. + setOperationAction(ISD::STRICT_FADD, VT, Legal); + setOperationAction(ISD::STRICT_FSUB, VT, Legal); + setOperationAction(ISD::STRICT_FMUL, VT, Legal); + setOperationAction(ISD::STRICT_FDIV, VT, Legal); + setOperationAction(ISD::STRICT_FMA, VT, Legal); + setOperationAction(ISD::STRICT_FSQRT, VT, Legal); + setOperationAction(ISD::STRICT_FRINT, VT, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); + if (Subtarget.hasFPExtension()) { + setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); + setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); + setOperationAction(ISD::STRICT_FCEIL, VT, Legal); + setOperationAction(ISD::STRICT_FROUND, VT, Legal); + setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); + } } } @@ -432,6 +467,20 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); setOperationAction(ISD::FROUND, MVT::v2f64, Legal); + + // Handle constrained floating-point operations. 
+ setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal); } // The vector enhancements facility 1 has instructions for these. @@ -475,6 +524,25 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAXIMUM, MVT::f128, Legal); setOperationAction(ISD::FMINNUM, MVT::f128, Legal); setOperationAction(ISD::FMINIMUM, MVT::f128, Legal); + + // Handle constrained floating-point operations. + setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal); + for (auto VT : { MVT::f32, MVT::f64, MVT::f128, + MVT::v4f32, MVT::v2f64 }) { + setOperationAction(ISD::STRICT_FMAXNUM, VT, Legal); + setOperationAction(ISD::STRICT_FMINNUM, VT, Legal); + } } // We have fused multiply-addition for f32 and f64 but not f128. @@ -525,6 +593,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::FP_EXTEND); @@ -577,9 +646,127 @@ bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; } -bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { +// Return true if the constant can be generated with a vector instruction, +// such as VGM, VGMB or VREPI. +bool SystemZVectorConstantInfo::isVectorConstantLegal( + const SystemZSubtarget &Subtarget) { + const SystemZInstrInfo *TII = + static_cast(Subtarget.getInstrInfo()); + if (!Subtarget.hasVector() || + (isFP128 && !Subtarget.hasVectorEnhancements1())) + return false; + + // Try using VECTOR GENERATE BYTE MASK. This is the architecturally- + // preferred way of creating all-zero and all-one vectors so give it + // priority over other methods below. 
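A note on the byte-mask representation the loop below builds: VECTOR GENERATE BYTE MASK takes one mask bit per vector byte, each selecting 0x00 or 0xff, which is why only constants whose bytes are exactly 0x00 or 0xff qualify. A minimal standalone sketch of that encoding (the helper name expandByteMask is illustrative and not part of this patch; byte numbering follows the loop below):

#include <array>
#include <cstdint>

// Expand a 16-bit byte mask into the vector bytes it denotes: mask bit I
// set selects 0xff for byte I, clear selects 0x00.
static std::array<uint8_t, 16> expandByteMask(uint16_t Mask) {
  std::array<uint8_t, 16> Bytes{};
  for (unsigned I = 0; I < 16; ++I)
    Bytes[I] = (Mask & (1u << I)) ? 0xff : 0x00;
  return Bytes;
}
// expandByteMask(0xffff) is the all-ones vector; expandByteMask(0) all zeros.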
+ unsigned Mask = 0; + unsigned I = 0; + for (; I < SystemZ::VectorBytes; ++I) { + uint64_t Byte = IntBits.lshr(I * 8).trunc(8).getZExtValue(); + if (Byte == 0xff) + Mask |= 1ULL << I; + else if (Byte != 0) + break; + } + if (I == SystemZ::VectorBytes) { + Opcode = SystemZISD::BYTE_MASK; + OpVals.push_back(Mask); + VecVT = MVT::getVectorVT(MVT::getIntegerVT(8), 16); + return true; + } + + if (SplatBitSize > 64) + return false; + + auto tryValue = [&](uint64_t Value) -> bool { + // Try VECTOR REPLICATE IMMEDIATE + int64_t SignedValue = SignExtend64(Value, SplatBitSize); + if (isInt<16>(SignedValue)) { + OpVals.push_back(((unsigned) SignedValue)); + Opcode = SystemZISD::REPLICATE; + VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize), + SystemZ::VectorBits / SplatBitSize); + return true; + } + // Try VECTOR GENERATE MASK + unsigned Start, End; + if (TII->isRxSBGMask(Value, SplatBitSize, Start, End)) { + // isRxSBGMask returns the bit numbers for a full 64-bit value, with 0 + // denoting 1 << 63 and 63 denoting 1. Convert them to bit numbers for + // an SplatBitSize value, so that 0 denotes 1 << (SplatBitSize-1). + OpVals.push_back(Start - (64 - SplatBitSize)); + OpVals.push_back(End - (64 - SplatBitSize)); + Opcode = SystemZISD::ROTATE_MASK; + VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize), + SystemZ::VectorBits / SplatBitSize); + return true; + } + return false; + }; + + // First try assuming that any undefined bits above the highest set bit + // and below the lowest set bit are 1s. This increases the likelihood of + // being able to use a sign-extended element value in VECTOR REPLICATE + // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK. + uint64_t SplatBitsZ = SplatBits.getZExtValue(); + uint64_t SplatUndefZ = SplatUndef.getZExtValue(); + uint64_t Lower = + (SplatUndefZ & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1)); + uint64_t Upper = + (SplatUndefZ & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1)); + if (tryValue(SplatBitsZ | Upper | Lower)) + return true; + + // Now try assuming that any undefined bits between the first and + // last defined set bits are set. This increases the chances of + // using a non-wraparound mask. + uint64_t Middle = SplatUndefZ & ~Upper & ~Lower; + return tryValue(SplatBitsZ | Middle); +} + +SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) { + IntBits = FPImm.bitcastToAPInt().zextOrSelf(128); + isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad()); + + // Find the smallest splat. + SplatBits = FPImm.bitcastToAPInt(); + unsigned Width = SplatBits.getBitWidth(); + while (Width > 8) { + unsigned HalfSize = Width / 2; + APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize); + APInt LowValue = SplatBits.trunc(HalfSize); + + // If the two halves do not match, stop here. + if (HighValue != LowValue || 8 > HalfSize) + break; + + SplatBits = HighValue; + Width = HalfSize; + } + SplatUndef = 0; + SplatBitSize = Width; +} + +SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) { + assert(BVN->isConstant() && "Expected a constant BUILD_VECTOR"); + bool HasAnyUndefs; + + // Get IntBits by finding the 128 bit splat. + BVN->isConstantSplat(IntBits, SplatUndef, SplatBitSize, HasAnyUndefs, 128, + true); + + // Get SplatBits by finding the 8 bit or greater splat. 
+ BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 8, + true); +} + +bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { // We can load zero using LZ?R and negative zero using LZ?R;LC?BR. - return Imm.isZero() || Imm.isNegZero(); + if (Imm.isZero() || Imm.isNegZero()) + return true; + + return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget); } bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const { @@ -592,10 +779,8 @@ bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const { return isUInt<32>(Imm) || isUInt<32>(-Imm); } -bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned, - unsigned, - bool *Fast) const { +bool SystemZTargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const { // Unaligned accesses should never be slower than the expanded version. // We check specifically for aligned accesses in the few cases where // they are required. @@ -1642,6 +1827,20 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { CCValid = SystemZ::CCMASK_ANY; return true; + case Intrinsic::s390_vstrsb: + case Intrinsic::s390_vstrsh: + case Intrinsic::s390_vstrsf: + Opcode = SystemZISD::VSTRS_CC; + CCValid = SystemZ::CCMASK_ANY; + return true; + + case Intrinsic::s390_vstrszb: + case Intrinsic::s390_vstrszh: + case Intrinsic::s390_vstrszf: + Opcode = SystemZISD::VSTRSZ_CC; + CCValid = SystemZ::CCMASK_ANY; + return true; + case Intrinsic::s390_vfcedbs: case Intrinsic::s390_vfcesbs: Opcode = SystemZISD::VFCMPES; @@ -2511,9 +2710,8 @@ SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG, break; } if (Invert) { - SDValue Mask = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, - DAG.getConstant(65535, DL, MVT::i32)); - Mask = DAG.getNode(ISD::BITCAST, DL, VT, Mask); + SDValue Mask = + DAG.getSplatBuildVector(VT, DL, DAG.getConstant(-1, DL, MVT::i64)); Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask); } return Cmp; @@ -3261,6 +3459,18 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op, return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC); } +static bool isAddCarryChain(SDValue Carry) { + while (Carry.getOpcode() == ISD::ADDCARRY) + Carry = Carry.getOperand(2); + return Carry.getOpcode() == ISD::UADDO; +} + +static bool isSubBorrowChain(SDValue Carry) { + while (Carry.getOpcode() == ISD::SUBCARRY) + Carry = Carry.getOperand(2); + return Carry.getOpcode() == ISD::USUBO; +} + // Lower ADDCARRY/SUBCARRY nodes. 
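The two chain walkers above only accept carry chains that bottom out in UADDO/USUBO. In source terms such chains typically come from multi-word arithmetic; an illustrative sketch using the GCC/Clang __int128 extension (whether the DAG takes exactly this shape depends on type legalization):

// On a 64-bit target, this addition is legalized into an UADDO on the low
// halves plus an ADDCARRY on the high halves that consumes the carry --
// the chain shape isAddCarryChain() accepts.
unsigned __int128 add128(unsigned __int128 A, unsigned __int128 B) {
  return A + B;
}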
SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) const { @@ -3283,11 +3493,17 @@ SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op, switch (Op.getOpcode()) { default: llvm_unreachable("Unknown instruction!"); case ISD::ADDCARRY: + if (!isAddCarryChain(Carry)) + return SDValue(); + BaseOp = SystemZISD::ADDCARRY; CCValid = SystemZ::CCMASK_LOGICAL; CCMask = SystemZ::CCMASK_LOGICAL_CARRY; break; case ISD::SUBCARRY: + if (!isSubBorrowChain(Carry)) + return SDValue(); + BaseOp = SystemZISD::SUBCARRY; CCValid = SystemZ::CCMASK_LOGICAL; CCMask = SystemZ::CCMASK_LOGICAL_BORROW; @@ -3331,14 +3547,14 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op, break; } case 32: { - SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, - DAG.getConstant(0, DL, MVT::i32)); + SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL, + DAG.getConstant(0, DL, MVT::i32)); Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp); break; } case 64: { - SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, - DAG.getConstant(0, DL, MVT::i32)); + SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL, + DAG.getConstant(0, DL, MVT::i32)); Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp); Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp); break; @@ -3602,6 +3818,27 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op, return SDValue(); } +MachineMemOperand::Flags +SystemZTargetLowering::getMMOFlags(const Instruction &I) const { + // Because of how we convert atomic_load and atomic_store to normal loads and + // stores in the DAG, we need to ensure that the MMOs are marked volatile + // since DAGCombine hasn't been updated to account for atomic, but non + // volatile loads. (See D57601) + if (auto *SI = dyn_cast(&I)) + if (SI->isAtomic()) + return MachineMemOperand::MOVolatile; + if (auto *LI = dyn_cast(&I)) + if (LI->isAtomic()) + return MachineMemOperand::MOVolatile; + if (auto *AI = dyn_cast(&I)) + if (AI->isAtomic()) + return MachineMemOperand::MOVolatile; + if (auto *AI = dyn_cast(&I)) + if (AI->isAtomic()) + return MachineMemOperand::MOVolatile; + return MachineMemOperand::MONone; +} + SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -4260,78 +4497,6 @@ static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0, return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Op0, Op1); } -// Try to represent constant BUILD_VECTOR node BVN using a -// SystemZISD::BYTE_MASK-style mask. Store the mask value in Mask -// on success. -static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) { - EVT ElemVT = BVN->getValueType(0).getVectorElementType(); - unsigned BytesPerElement = ElemVT.getStoreSize(); - for (unsigned I = 0, E = BVN->getNumOperands(); I != E; ++I) { - SDValue Op = BVN->getOperand(I); - if (!Op.isUndef()) { - uint64_t Value; - if (Op.getOpcode() == ISD::Constant) - Value = cast(Op)->getZExtValue(); - else if (Op.getOpcode() == ISD::ConstantFP) - Value = (cast(Op)->getValueAPF().bitcastToAPInt() - .getZExtValue()); - else - return false; - for (unsigned J = 0; J < BytesPerElement; ++J) { - uint64_t Byte = (Value >> (J * 8)) & 0xff; - if (Byte == 0xff) - Mask |= 1ULL << ((E - I - 1) * BytesPerElement + J); - else if (Byte != 0) - return false; - } - } - } - return true; -} - -// Try to load a vector constant in which BitsPerElement-bit value Value -// is replicated to fill the vector. 
VT is the type of the resulting -// constant, which may have elements of a different size from BitsPerElement. -// Return the SDValue of the constant on success, otherwise return -// an empty value. -static SDValue tryBuildVectorReplicate(SelectionDAG &DAG, - const SystemZInstrInfo *TII, - const SDLoc &DL, EVT VT, uint64_t Value, - unsigned BitsPerElement) { - // Signed 16-bit values can be replicated using VREPI. - // Mark the constants as opaque or DAGCombiner will convert back to - // BUILD_VECTOR. - int64_t SignedValue = SignExtend64(Value, BitsPerElement); - if (isInt<16>(SignedValue)) { - MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), - SystemZ::VectorBits / BitsPerElement); - SDValue Op = DAG.getNode( - SystemZISD::REPLICATE, DL, VecVT, - DAG.getConstant(SignedValue, DL, MVT::i32, false, true /*isOpaque*/)); - return DAG.getNode(ISD::BITCAST, DL, VT, Op); - } - // See whether rotating the constant left some N places gives a value that - // is one less than a power of 2 (i.e. all zeros followed by all ones). - // If so we can use VGM. - unsigned Start, End; - if (TII->isRxSBGMask(Value, BitsPerElement, Start, End)) { - // isRxSBGMask returns the bit numbers for a full 64-bit value, - // with 0 denoting 1 << 63 and 63 denoting 1. Convert them to - // bit numbers for an BitsPerElement value, so that 0 denotes - // 1 << (BitsPerElement-1). - Start -= 64 - BitsPerElement; - End -= 64 - BitsPerElement; - MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), - SystemZ::VectorBits / BitsPerElement); - SDValue Op = DAG.getNode( - SystemZISD::ROTATE_MASK, DL, VecVT, - DAG.getConstant(Start, DL, MVT::i32, false, true /*isOpaque*/), - DAG.getConstant(End, DL, MVT::i32, false, true /*isOpaque*/)); - return DAG.getNode(ISD::BITCAST, DL, VT, Op); - } - return SDValue(); -} - // If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually // better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for // the non-EXTRACT_VECTOR_ELT elements. See if the given BUILD_VECTOR @@ -4385,9 +4550,18 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, return GS.getNode(DAG, SDLoc(BVN)); } +bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const { + if (Op.getOpcode() == ISD::LOAD && cast(Op)->isUnindexed()) + return true; + if (Subtarget.hasVectorEnhancements2() && Op.getOpcode() == SystemZISD::LRV) + return true; + return false; +} + // Combine GPR scalar values Elems into a vector of type VT. -static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, - SmallVectorImpl &Elems) { +SDValue +SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, + SmallVectorImpl &Elems) const { // See whether there is a single replicated value. SDValue Single; unsigned int NumElements = Elems.size(); @@ -4416,13 +4590,13 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, // we would need 2 instructions to replicate it: VLVGP followed by VREPx. // This is only a win if the single defined element is used more than once. // In other cases we're better off using a single VLVGx. - if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD)) + if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single))) return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single); // If all elements are loads, use VLREP/VLEs (below). 
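For orientation, the splat-of-a-load case served by the REPLICATE path above looks like this at source level (a sketch only; whether the optimizer actually forms a single splat BUILD_VECTOR here depends on vectorization):

#include <cstdint>

// One unindexed scalar load feeding every lane of a splat: the case
// isVectorElementLoad() recognizes and VLREP materializes in one instruction.
void splat4(const uint32_t *Src, uint32_t *Dst) {
  uint32_t V = *Src;
  for (int I = 0; I < 4; ++I)
    Dst[I] = V;
}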
bool AllLoads = true; for (auto Elem : Elems) - if (Elem.getOpcode() != ISD::LOAD || cast(Elem)->isIndexed()) { + if (!isVectorElementLoad(Elem)) { AllLoads = false; break; } @@ -4494,8 +4668,7 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, std::map UseCounts; SDNode *LoadMaxUses = nullptr; for (unsigned I = 0; I < NumElements; ++I) - if (Elems[I].getOpcode() == ISD::LOAD && - cast(Elems[I])->isUnindexed()) { + if (isVectorElementLoad(Elems[I])) { SDNode *Ld = Elems[I].getNode(); UseCounts[Ld]++; if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld]) @@ -4532,56 +4705,13 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { - const SystemZInstrInfo *TII = - static_cast(Subtarget.getInstrInfo()); auto *BVN = cast(Op.getNode()); SDLoc DL(Op); EVT VT = Op.getValueType(); if (BVN->isConstant()) { - // Try using VECTOR GENERATE BYTE MASK. This is the architecturally- - // preferred way of creating all-zero and all-one vectors so give it - // priority over other methods below. - uint64_t Mask = 0; - if (tryBuildVectorByteMask(BVN, Mask)) { - SDValue Op = DAG.getNode( - SystemZISD::BYTE_MASK, DL, MVT::v16i8, - DAG.getConstant(Mask, DL, MVT::i32, false, true /*isOpaque*/)); - return DAG.getNode(ISD::BITCAST, DL, VT, Op); - } - - // Try using some form of replication. - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, - 8, true) && - SplatBitSize <= 64) { - // First try assuming that any undefined bits above the highest set bit - // and below the lowest set bit are 1s. This increases the likelihood of - // being able to use a sign-extended element value in VECTOR REPLICATE - // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK. - uint64_t SplatBitsZ = SplatBits.getZExtValue(); - uint64_t SplatUndefZ = SplatUndef.getZExtValue(); - uint64_t Lower = (SplatUndefZ - & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1)); - uint64_t Upper = (SplatUndefZ - & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1)); - uint64_t Value = SplatBitsZ | Upper | Lower; - SDValue Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, - SplatBitSize); - if (Op.getNode()) - return Op; - - // Now try assuming that any undefined bits between the first and - // last defined set bits are set. This increases the chances of - // using a non-wraparound mask. - uint64_t Middle = SplatUndefZ & ~Upper & ~Lower; - Value = SplatBitsZ | Middle; - Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, SplatBitSize); - if (Op.getNode()) - return Op; - } + if (SystemZVectorConstantInfo(BVN).isVectorConstantLegal(Subtarget)) + return Op; // Fall back to loading it from memory. 
return SDValue(); @@ -5074,6 +5204,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(VISTR_CC); OPCODE(VSTRC_CC); OPCODE(VSTRCZ_CC); + OPCODE(VSTRS_CC); + OPCODE(VSTRSZ_CC); OPCODE(TDC); OPCODE(ATOMIC_SWAPW); OPCODE(ATOMIC_LOADW_ADD); @@ -5093,6 +5225,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(ATOMIC_CMP_SWAP_128); OPCODE(LRV); OPCODE(STRV); + OPCODE(VLER); + OPCODE(VSTER); OPCODE(PREFETCH); } return nullptr; @@ -5340,8 +5474,7 @@ SDValue SystemZTargetLowering::combineMERGE( SDValue Op1 = N->getOperand(1); if (Op0.getOpcode() == ISD::BITCAST) Op0 = Op0.getOperand(0); - if (Op0.getOpcode() == SystemZISD::BYTE_MASK && - cast(Op0.getOperand(0))->getZExtValue() == 0) { + if (ISD::isBuildVectorAllZeros(Op0.getNode())) { // (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF // for v4f32. if (Op1 == N->getOperand(0)) @@ -5407,6 +5540,31 @@ SDValue SystemZTargetLowering::combineLOAD( return SDValue(N, 0); } +bool SystemZTargetLowering::canLoadStoreByteSwapped(EVT VT) const { + if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) + return true; + if (Subtarget.hasVectorEnhancements2()) + if (VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) + return true; + return false; +} + +static bool isVectorElementSwap(ArrayRef M, EVT VT) { + if (!VT.isVector() || !VT.isSimple() || + VT.getSizeInBits() != 128 || + VT.getScalarSizeInBits() % 8 != 0) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) continue; // ignore UNDEF indices + if ((unsigned) M[i] != NumElts - 1 - i) + return false; + } + + return true; +} + SDValue SystemZTargetLowering::combineSTORE( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -5428,13 +5586,11 @@ SDValue SystemZTargetLowering::combineSTORE( SN->getMemOperand()); } } - // Combine STORE (BSWAP) into STRVH/STRV/STRVG + // Combine STORE (BSWAP) into STRVH/STRV/STRVG/VSTBR if (!SN->isTruncatingStore() && Op1.getOpcode() == ISD::BSWAP && Op1.getNode()->hasOneUse() && - (Op1.getValueType() == MVT::i16 || - Op1.getValueType() == MVT::i32 || - Op1.getValueType() == MVT::i64)) { + canLoadStoreByteSwapped(Op1.getValueType())) { SDValue BSwapOp = Op1.getOperand(0); @@ -5449,15 +5605,97 @@ SDValue SystemZTargetLowering::combineSTORE( DAG.getMemIntrinsicNode(SystemZISD::STRV, SDLoc(N), DAG.getVTList(MVT::Other), Ops, MemVT, SN->getMemOperand()); } + // Combine STORE (element-swap) into VSTER + if (!SN->isTruncatingStore() && + Op1.getOpcode() == ISD::VECTOR_SHUFFLE && + Op1.getNode()->hasOneUse() && + Subtarget.hasVectorEnhancements2()) { + ShuffleVectorSDNode *SVN = cast(Op1.getNode()); + ArrayRef ShuffleMask = SVN->getMask(); + if (isVectorElementSwap(ShuffleMask, Op1.getValueType())) { + SDValue Ops[] = { + N->getOperand(0), Op1.getOperand(0), N->getOperand(2) + }; + + return DAG.getMemIntrinsicNode(SystemZISD::VSTER, SDLoc(N), + DAG.getVTList(MVT::Other), + Ops, MemVT, SN->getMemOperand()); + } + } + + return SDValue(); +} + +SDValue SystemZTargetLowering::combineVECTOR_SHUFFLE( + SDNode *N, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + // Combine element-swap (LOAD) into VLER + if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && + N->getOperand(0).hasOneUse() && + Subtarget.hasVectorEnhancements2()) { + ShuffleVectorSDNode *SVN = cast(N); + ArrayRef ShuffleMask = SVN->getMask(); + if (isVectorElementSwap(ShuffleMask, N->getValueType(0))) { + SDValue Load = 
N->getOperand(0); + LoadSDNode *LD = cast(Load); + + // Create the element-swapping load. + SDValue Ops[] = { + LD->getChain(), // Chain + LD->getBasePtr() // Ptr + }; + SDValue ESLoad = + DAG.getMemIntrinsicNode(SystemZISD::VLER, SDLoc(N), + DAG.getVTList(LD->getValueType(0), MVT::Other), + Ops, LD->getMemoryVT(), LD->getMemOperand()); + + // First, combine the VECTOR_SHUFFLE away. This makes the value produced + // by the load dead. + DCI.CombineTo(N, ESLoad); + + // Next, combine the load away, we give it a bogus result value but a real + // chain result. The result value is dead because the shuffle is dead. + DCI.CombineTo(Load.getNode(), ESLoad, ESLoad.getValue(1)); + + // Return N so it doesn't get rechecked! + return SDValue(N, 0); + } + } + return SDValue(); } SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT( SDNode *N, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; if (!Subtarget.hasVector()) return SDValue(); + // Look through bitcasts that retain the number of vector elements. + SDValue Op = N->getOperand(0); + if (Op.getOpcode() == ISD::BITCAST && + Op.getValueType().isVector() && + Op.getOperand(0).getValueType().isVector() && + Op.getValueType().getVectorNumElements() == + Op.getOperand(0).getValueType().getVectorNumElements()) + Op = Op.getOperand(0); + + // Pull BSWAP out of a vector extraction. + if (Op.getOpcode() == ISD::BSWAP && Op.hasOneUse()) { + EVT VecVT = Op.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), EltVT, + Op.getOperand(0), N->getOperand(1)); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Op); + if (EltVT != N->getValueType(0)) { + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op); + } + return Op; + } + // Try to simplify a vector extraction. if (auto *IndexN = dyn_cast(N->getOperand(1))) { SDValue Op0 = N->getOperand(0); @@ -5480,6 +5718,10 @@ SDValue SystemZTargetLowering::combineJOIN_DWORDS( SDValue SystemZTargetLowering::combineFP_ROUND( SDNode *N, DAGCombinerInfo &DCI) const { + + if (!Subtarget.hasVector()) + return SDValue(); + // (fpround (extract_vector_elt X 0)) // (fpround (extract_vector_elt X 1)) -> // (extract_vector_elt (VROUND X) 0) @@ -5527,6 +5769,10 @@ SDValue SystemZTargetLowering::combineFP_ROUND( SDValue SystemZTargetLowering::combineFP_EXTEND( SDNode *N, DAGCombinerInfo &DCI) const { + + if (!Subtarget.hasVector()) + return SDValue(); + // (fpextend (extract_vector_elt X 0)) // (fpextend (extract_vector_elt X 2)) -> // (extract_vector_elt (VEXTEND X) 0) @@ -5575,11 +5821,10 @@ SDValue SystemZTargetLowering::combineFP_EXTEND( SDValue SystemZTargetLowering::combineBSWAP( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; - // Combine BSWAP (LOAD) into LRVH/LRV/LRVG + // Combine BSWAP (LOAD) into LRVH/LRV/LRVG/VLBR if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && N->getOperand(0).hasOneUse() && - (N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i32 || - N->getValueType(0) == MVT::i64)) { + canLoadStoreByteSwapped(N->getValueType(0))) { SDValue Load = N->getOperand(0); LoadSDNode *LD = cast(Load); @@ -5612,61 +5857,170 @@ SDValue SystemZTargetLowering::combineBSWAP( // Return N so it doesn't get rechecked! return SDValue(N, 0); } + + // Look through bitcasts that retain the number of vector elements. 
+ SDValue Op = N->getOperand(0); + if (Op.getOpcode() == ISD::BITCAST && + Op.getValueType().isVector() && + Op.getOperand(0).getValueType().isVector() && + Op.getValueType().getVectorNumElements() == + Op.getOperand(0).getValueType().getVectorNumElements()) + Op = Op.getOperand(0); + + // Push BSWAP into a vector insertion if at least one side then simplifies. + if (Op.getOpcode() == ISD::INSERT_VECTOR_ELT && Op.hasOneUse()) { + SDValue Vec = Op.getOperand(0); + SDValue Elt = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (DAG.isConstantIntBuildVectorOrConstantInt(Vec) || + Vec.getOpcode() == ISD::BSWAP || Vec.isUndef() || + DAG.isConstantIntBuildVectorOrConstantInt(Elt) || + Elt.getOpcode() == ISD::BSWAP || Elt.isUndef() || + (canLoadStoreByteSwapped(N->getValueType(0)) && + ISD::isNON_EXTLoad(Elt.getNode()) && Elt.hasOneUse())) { + EVT VecVT = N->getValueType(0); + EVT EltVT = N->getValueType(0).getVectorElementType(); + if (VecVT != Vec.getValueType()) { + Vec = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Vec); + DCI.AddToWorklist(Vec.getNode()); + } + if (EltVT != Elt.getValueType()) { + Elt = DAG.getNode(ISD::BITCAST, SDLoc(N), EltVT, Elt); + DCI.AddToWorklist(Elt.getNode()); + } + Vec = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Vec); + DCI.AddToWorklist(Vec.getNode()); + Elt = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Elt); + DCI.AddToWorklist(Elt.getNode()); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VecVT, + Vec, Elt, Idx); + } + } + + // Push BSWAP into a vector shuffle if at least one side then simplifies. + ShuffleVectorSDNode *SV = dyn_cast(Op); + if (SV && Op.hasOneUse()) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) || + Op0.getOpcode() == ISD::BSWAP || Op0.isUndef() || + DAG.isConstantIntBuildVectorOrConstantInt(Op1) || + Op1.getOpcode() == ISD::BSWAP || Op1.isUndef()) { + EVT VecVT = N->getValueType(0); + if (VecVT != Op0.getValueType()) { + Op0 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op0); + DCI.AddToWorklist(Op0.getNode()); + } + if (VecVT != Op1.getValueType()) { + Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op1); + DCI.AddToWorklist(Op1.getNode()); + } + Op0 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op0); + DCI.AddToWorklist(Op0.getNode()); + Op1 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op1); + DCI.AddToWorklist(Op1.getNode()); + return DAG.getVectorShuffle(VecVT, SDLoc(N), Op0, Op1, SV->getMask()); + } + } + return SDValue(); } static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) { // We have a SELECT_CCMASK or BR_CCMASK comparing the condition code // set by the CCReg instruction using the CCValid / CCMask masks, - // If the CCReg instruction is itself a (ICMP (SELECT_CCMASK)) testing - // the condition code set by some other instruction, see whether we - // can directly use that condition code. - bool Invert = false; + // If the CCReg instruction is itself a ICMP testing the condition + // code set by some other instruction, see whether we can directly + // use that condition code. - // Verify that we have an appropriate mask for a EQ or NE comparison. + // Verify that we have an ICMP against some constant. if (CCValid != SystemZ::CCMASK_ICMP) return false; - if (CCMask == SystemZ::CCMASK_CMP_NE) - Invert = !Invert; - else if (CCMask != SystemZ::CCMASK_CMP_EQ) - return false; - - // Verify that we have an ICMP that is the user of a SELECT_CCMASK. 
-  SDNode *ICmp = CCReg.getNode();
+  auto *ICmp = CCReg.getNode();
   if (ICmp->getOpcode() != SystemZISD::ICMP)
     return false;
-  SDNode *Select = ICmp->getOperand(0).getNode();
-  if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
+  auto *CompareLHS = ICmp->getOperand(0).getNode();
+  auto *CompareRHS = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
+  if (!CompareRHS)
     return false;
 
-  // Verify that the ICMP compares against one of select values.
-  auto *CompareVal = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
-  if (!CompareVal)
-    return false;
-  auto *TrueVal = dyn_cast<ConstantSDNode>(Select->getOperand(0));
-  if (!TrueVal)
-    return false;
-  auto *FalseVal = dyn_cast<ConstantSDNode>(Select->getOperand(1));
-  if (!FalseVal)
-    return false;
-  if (CompareVal->getZExtValue() == FalseVal->getZExtValue())
-    Invert = !Invert;
-  else if (CompareVal->getZExtValue() != TrueVal->getZExtValue())
-    return false;
+  // Optimize the case where CompareLHS is a SELECT_CCMASK.
+  if (CompareLHS->getOpcode() == SystemZISD::SELECT_CCMASK) {
+    // Verify that we have an appropriate mask for a EQ or NE comparison.
+    bool Invert = false;
+    if (CCMask == SystemZ::CCMASK_CMP_NE)
+      Invert = !Invert;
+    else if (CCMask != SystemZ::CCMASK_CMP_EQ)
+      return false;
 
-  // Compute the effective CC mask for the new branch or select.
-  auto *NewCCValid = dyn_cast<ConstantSDNode>(Select->getOperand(2));
-  auto *NewCCMask = dyn_cast<ConstantSDNode>(Select->getOperand(3));
-  if (!NewCCValid || !NewCCMask)
-    return false;
-  CCValid = NewCCValid->getZExtValue();
-  CCMask = NewCCMask->getZExtValue();
-  if (Invert)
-    CCMask ^= CCValid;
+    // Verify that the ICMP compares against one of select values.
+    auto *TrueVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(0));
+    if (!TrueVal)
+      return false;
+    auto *FalseVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
+    if (!FalseVal)
+      return false;
+    if (CompareRHS->getZExtValue() == FalseVal->getZExtValue())
+      Invert = !Invert;
+    else if (CompareRHS->getZExtValue() != TrueVal->getZExtValue())
+      return false;
 
-  // Return the updated CCReg link.
-  CCReg = Select->getOperand(4);
-  return true;
+    // Compute the effective CC mask for the new branch or select.
+    auto *NewCCValid = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(2));
+    auto *NewCCMask = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(3));
+    if (!NewCCValid || !NewCCMask)
+      return false;
+    CCValid = NewCCValid->getZExtValue();
+    CCMask = NewCCMask->getZExtValue();
+    if (Invert)
+      CCMask ^= CCValid;
+
+    // Return the updated CCReg link.
+    CCReg = CompareLHS->getOperand(4);
+    return true;
+  }
+
+  // Optimize the case where CompareLHS is (SRA (SHL (IPM))).
+  if (CompareLHS->getOpcode() == ISD::SRA) {
+    auto *SRACount = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
+    if (!SRACount || SRACount->getZExtValue() != 30)
+      return false;
+    auto *SHL = CompareLHS->getOperand(0).getNode();
+    if (SHL->getOpcode() != ISD::SHL)
+      return false;
+    auto *SHLCount = dyn_cast<ConstantSDNode>(SHL->getOperand(1));
+    if (!SHLCount || SHLCount->getZExtValue() != 30 - SystemZ::IPM_CC)
+      return false;
+    auto *IPM = SHL->getOperand(0).getNode();
+    if (IPM->getOpcode() != SystemZISD::IPM)
+      return false;
+
+    // Avoid introducing CC spills (because SRA would clobber CC).
+    if (!CompareLHS->hasOneUse())
+      return false;
+    // Verify that the ICMP compares against zero.
+    if (CompareRHS->getZExtValue() != 0)
+      return false;
+
+    // Compute the effective CC mask for the new branch or select.
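The shift counts verified above pin down the bit layout being matched: IPM places the 2-bit condition code at bits 29:28 of its result (SystemZ::IPM_CC is 28). A bit-level sketch of the value the DAG computes, and of why the switch that follows swaps the relational masks:

#include <cstdint>

// (SRA (SHL (IPM X), 30 - IPM_CC), 30): shift CC into the sign bits, then
// sign-extend, mapping CC = 0, 1, 2, 3 to 0, 1, -2, -1. For integer compares
// CC 3 does not occur, so the result is negative exactly when CC is 2; a
// source-level "< 0" test therefore corresponds to the GT condition mask,
// which is why LT/GT and LE/GE are exchanged in the switch below.
int signExtendCC(uint32_t IPMResult) {
  return (int32_t)(IPMResult << 2) >> 30;
}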
+ switch (CCMask) { + case SystemZ::CCMASK_CMP_EQ: break; + case SystemZ::CCMASK_CMP_NE: break; + case SystemZ::CCMASK_CMP_LT: CCMask = SystemZ::CCMASK_CMP_GT; break; + case SystemZ::CCMASK_CMP_GT: CCMask = SystemZ::CCMASK_CMP_LT; break; + case SystemZ::CCMASK_CMP_LE: CCMask = SystemZ::CCMASK_CMP_GE; break; + case SystemZ::CCMASK_CMP_GE: CCMask = SystemZ::CCMASK_CMP_LE; break; + default: return false; + } + + // Return the updated CCReg link. + CCReg = IPM->getOperand(0); + return true; + } + + return false; } SDValue SystemZTargetLowering::combineBR_CCMASK( @@ -5770,12 +6124,18 @@ SDValue SystemZTargetLowering::combineIntDIVREM( // since it is not Legal but Custom it can only happen before // legalization. Therefore we must scalarize this early before Combine // 1. For widened vectors, this is already the result of type legalization. - if (VT.isVector() && isTypeLegal(VT) && + if (DCI.Level == BeforeLegalizeTypes && VT.isVector() && isTypeLegal(VT) && DAG.isConstantIntBuildVectorOrConstantInt(N->getOperand(1))) return DAG.UnrollVectorOp(N); return SDValue(); } +SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const { + if (N->getOpcode() == SystemZISD::PCREL_WRAPPER) + return N->getOperand(0); + return N; +} + SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch(N->getOpcode()) { @@ -5787,6 +6147,7 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, case SystemZISD::MERGE_LOW: return combineMERGE(N, DCI); case ISD::LOAD: return combineLOAD(N, DCI); case ISD::STORE: return combineSTORE(N, DCI); + case ISD::VECTOR_SHUFFLE: return combineVECTOR_SHUFFLE(N, DCI); case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI); case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI); case ISD::FP_ROUND: return combineFP_ROUND(N, DCI); @@ -5977,12 +6338,10 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case Intrinsic::s390_vuplhw: case Intrinsic::s390_vuplf: { SDValue SrcOp = Op.getOperand(1); - unsigned SrcBitWidth = SrcOp.getScalarValueSizeInBits(); APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0); Known = DAG.computeKnownBits(SrcOp, SrcDemE, Depth + 1); if (IsLogical) { - Known = Known.zext(BitWidth); - Known.Zero.setBitsFrom(SrcBitWidth); + Known = Known.zext(BitWidth, true); } else Known = Known.sext(BitWidth); break; @@ -6011,7 +6370,7 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, // Known has the width of the source operand(s). Adjust if needed to match // the passed bitwidth. if (Known.getBitWidth() != BitWidth) - Known = Known.zextOrTrunc(BitWidth); + Known = Known.zextOrTrunc(BitWidth, false); } static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts, @@ -6125,7 +6484,7 @@ static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI, } // Force base value Base into a register before MI. Return the register. 
-static unsigned forceReg(MachineInstr &MI, MachineOperand &Base, +static Register forceReg(MachineInstr &MI, MachineOperand &Base, const SystemZInstrInfo *TII) { if (Base.isReg()) return Base.getReg(); @@ -6134,7 +6493,7 @@ static unsigned forceReg(MachineInstr &MI, MachineOperand &Base, MachineFunction &MF = *MBB->getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg) .add(Base) .addImm(0) @@ -6213,7 +6572,8 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, // destination registers, and the registers that went into the PHI. DenseMap> RegRewriteTable; - for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; + MIIt = skipDebugInstructionsForward(++MIIt, MIItEnd)) { unsigned DestReg = MIIt->getOperand(0).getReg(); unsigned TrueReg = MIIt->getOperand(1).getReg(); unsigned FalseReg = MIIt->getOperand(2).getReg(); @@ -6237,6 +6597,8 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, // Add this PHI to the rewrite table. RegRewriteTable[DestReg] = std::make_pair(TrueReg, FalseReg); } + + MF->getProperties().reset(MachineFunctionProperties::Property::NoPHIs); } // Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI. @@ -6254,8 +6616,8 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, // same condition code value, we want to expand all of them into // a single pair of basic blocks using the same condition. MachineInstr *LastMI = &MI; - MachineBasicBlock::iterator NextMIIt = - std::next(MachineBasicBlock::iterator(MI)); + MachineBasicBlock::iterator NextMIIt = skipDebugInstructionsForward( + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); if (isSelectPseudo(MI)) while (NextMIIt != MBB->end() && isSelectPseudo(*NextMIIt) && @@ -6263,7 +6625,7 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, (NextMIIt->getOperand(4).getImm() == CCMask || NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask))) { LastMI = &*NextMIIt; - ++NextMIIt; + NextMIIt = skipDebugInstructionsForward(++NextMIIt, MBB->end()); } MachineBasicBlock *StartMBB = MBB; @@ -6296,8 +6658,8 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, // ... MBB = JoinMBB; MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); - MachineBasicBlock::iterator MIItEnd = - std::next(MachineBasicBlock::iterator(LastMI)); + MachineBasicBlock::iterator MIItEnd = skipDebugInstructionsForward( + std::next(MachineBasicBlock::iterator(LastMI)), MBB->end()); createPHIsForSelects(MIItBegin, MIItEnd, StartMBB, FalseMBB, MBB); StartMBB->erase(MIItBegin, MIItEnd); @@ -6415,8 +6777,8 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); MachineOperand Src2 = earlyUseOperand(MI.getOperand(3)); - unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0); - unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0); + Register BitShift = IsSubWord ? MI.getOperand(4).getReg() : Register(); + Register NegBitShift = IsSubWord ? 
MI.getOperand(5).getReg() : Register(); DebugLoc DL = MI.getDebugLoc(); if (IsSubWord) BitSize = MI.getOperand(6).getImm(); @@ -6434,12 +6796,12 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( assert(LOpcode && CSOpcode && "Displacement out of range"); // Create virtual registers for temporary results. - unsigned OrigVal = MRI.createVirtualRegister(RC); - unsigned OldVal = MRI.createVirtualRegister(RC); - unsigned NewVal = (BinOpcode || IsSubWord ? + Register OrigVal = MRI.createVirtualRegister(RC); + Register OldVal = MRI.createVirtualRegister(RC); + Register NewVal = (BinOpcode || IsSubWord ? MRI.createVirtualRegister(RC) : Src2.getReg()); - unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal); - unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal); + Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal); + Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal); // Insert a basic block for the main loop. MachineBasicBlock *StartMBB = MBB; @@ -6532,9 +6894,9 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( unsigned Dest = MI.getOperand(0).getReg(); MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); - unsigned Src2 = MI.getOperand(3).getReg(); - unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0); - unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0); + Register Src2 = MI.getOperand(3).getReg(); + Register BitShift = (IsSubWord ? MI.getOperand(4).getReg() : Register()); + Register NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : Register()); DebugLoc DL = MI.getDebugLoc(); if (IsSubWord) BitSize = MI.getOperand(6).getImm(); @@ -6552,12 +6914,12 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( assert(LOpcode && CSOpcode && "Displacement out of range"); // Create virtual registers for temporary results. - unsigned OrigVal = MRI.createVirtualRegister(RC); - unsigned OldVal = MRI.createVirtualRegister(RC); - unsigned NewVal = MRI.createVirtualRegister(RC); - unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal); - unsigned RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2); - unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal); + Register OrigVal = MRI.createVirtualRegister(RC); + Register OldVal = MRI.createVirtualRegister(RC); + Register NewVal = MRI.createVirtualRegister(RC); + Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal); + Register RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2); + Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal); // Insert 3 basic blocks for the loop. MachineBasicBlock *StartMBB = MBB; @@ -6840,22 +7202,22 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( if (MI.getNumExplicitOperands() > 5) { bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); - uint64_t StartCountReg = MI.getOperand(5).getReg(); - uint64_t StartSrcReg = forceReg(MI, SrcBase, TII); - uint64_t StartDestReg = (HaveSingleBase ? StartSrcReg : + Register StartCountReg = MI.getOperand(5).getReg(); + Register StartSrcReg = forceReg(MI, SrcBase, TII); + Register StartDestReg = (HaveSingleBase ? StartSrcReg : forceReg(MI, DestBase, TII)); const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass; - uint64_t ThisSrcReg = MRI.createVirtualRegister(RC); - uint64_t ThisDestReg = (HaveSingleBase ? 
ThisSrcReg : + Register ThisSrcReg = MRI.createVirtualRegister(RC); + Register ThisDestReg = (HaveSingleBase ? ThisSrcReg : MRI.createVirtualRegister(RC)); - uint64_t NextSrcReg = MRI.createVirtualRegister(RC); - uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg : + Register NextSrcReg = MRI.createVirtualRegister(RC); + Register NextDestReg = (HaveSingleBase ? NextSrcReg : MRI.createVirtualRegister(RC)); RC = &SystemZ::GR64BitRegClass; - uint64_t ThisCountReg = MRI.createVirtualRegister(RC); - uint64_t NextCountReg = MRI.createVirtualRegister(RC); + Register ThisCountReg = MRI.createVirtualRegister(RC); + Register NextCountReg = MRI.createVirtualRegister(RC); MachineBasicBlock *StartMBB = MBB; MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 622da32e418d..23cdcc72bc42 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -1,9 +1,8 @@ //===-- SystemZISelLowering.h - SystemZ DAG lowering interface --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,6 +15,7 @@ #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZISELLOWERING_H #include "SystemZ.h" +#include "SystemZInstrInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" @@ -281,6 +281,8 @@ enum NodeType : unsigned { VISTR_CC, VSTRC_CC, VSTRCZ_CC, + VSTRS_CC, + VSTRSZ_CC, // Test Data Class. // @@ -340,6 +342,9 @@ enum NodeType : unsigned { // Byte swapping load/store. Same operands as regular load/store. LRV, STRV, + // Element swapping load/store. Same operands as regular load/store. + VLER, VSTER, + // Prefetch from the second operand using the 4-bit control code in // the first operand. The code is 1 for a load prefetch and 2 for // a store prefetch. 
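In source terms, the new element-swapping VLER/VSTER nodes cover a vector load or store whose lanes are immediately reversed. A sketch with Clang's vector extensions (illustration only; the combine additionally requires the vector-enhancements-2 feature, as gated in combineVECTOR_SHUFFLE above):

#include <cstdint>

typedef uint32_t V4U32 __attribute__((vector_size(16)));

// A load feeding a lane-reversing shuffle: the VECTOR_SHUFFLE-of-LOAD
// pattern that the new combine turns into one element-swapping load.
V4U32 loadReversed(const V4U32 *P) {
  V4U32 V = *P;
  return __builtin_shufflevector(V, V, 3, 2, 1, 0);
}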
@@ -396,10 +401,12 @@ public:
       return TypeWidenVector;
     return TargetLoweringBase::getPreferredVectorAction(VT);
   }
+  bool isCheapToSpeculateCtlz() const override { return true; }
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
                          EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
-  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+  bool isFPImmLegal(const APFloat &Imm, EVT VT,
+                    bool ForCodeSize) const override;
   bool isLegalICmpImmediate(int64_t Imm) const override;
   bool isLegalAddImmediate(int64_t Imm) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
@@ -407,6 +414,7 @@ public:
                              Instruction *I = nullptr) const override;
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+                                      MachineMemOperand::Flags Flags,
                                       bool *Fast) const override;
   bool isTruncateFree(Type *, Type *) const override;
   bool isTruncateFree(EVT, EVT) const override;
@@ -568,6 +576,9 @@ private:
   SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  bool isVectorElementLoad(SDValue Op) const;
+  SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                      SmallVectorImpl<SDValue> &Elems) const;
   SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -587,8 +598,10 @@ private:
   SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSIGN_EXTEND_INREG(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const;
+  bool canLoadStoreByteSwapped(EVT VT) const;
   SDValue combineLOAD(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineVECTOR_SHUFFLE(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -599,6 +612,8 @@ private:
   SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineIntDIVREM(SDNode *N, DAGCombinerInfo &DCI) const;
 
+  SDValue unwrapAddress(SDValue N) const override;
+
   // If the last instruction before MBBI in MBB was some form of COMPARE,
   // try to replace it with a COMPARE AND BRANCH just before MBBI.
   // CCMask and Target are the BRC-like operands for the branch.
@@ -639,8 +654,27 @@ private:
                                           MachineBasicBlock *MBB,
                                           unsigned Opcode) const;
 
+  MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
   const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
 };
+
+struct SystemZVectorConstantInfo {
+private:
+  APInt IntBits;    // The 128 bits as an integer.
+  APInt SplatBits;  // Smallest splat value.
+  APInt SplatUndef; // Bits corresponding to undef operands of the BVN.
+ unsigned SplatBitSize = 0; + bool isFP128 = false; + +public: + unsigned Opcode = 0; + SmallVector<unsigned, 2> OpVals; + MVT VecVT; + SystemZVectorConstantInfo(APFloat FPImm); + SystemZVectorConstantInfo(BuildVectorSDNode *BVN); + bool isVectorConstantLegal(const SystemZSubtarget &Subtarget); +}; + } // end namespace llvm #endif diff --git a/lib/Target/SystemZ/SystemZInstrBuilder.h b/lib/Target/SystemZ/SystemZInstrBuilder.h index 896b665d25eb..ec7639e71f81 100644 --- a/lib/Target/SystemZ/SystemZInstrBuilder.h +++ b/lib/Target/SystemZ/SystemZInstrBuilder.h @@ -1,9 +1,8 @@ //===-- SystemZInstrBuilder.h - Functions to aid building insts -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZInstrDFP.td b/lib/Target/SystemZ/SystemZInstrDFP.td index 08ab2d7bbc52..8d7a773ff4d9 100644 --- a/lib/Target/SystemZ/SystemZInstrDFP.td +++ b/lib/Target/SystemZ/SystemZInstrDFP.td @@ -1,9 +1,8 @@ //==- SystemZInstrDFP.td - Floating-point SystemZ instructions -*- tblgen-*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,7 +19,7 @@ //===----------------------------------------------------------------------===// // Load and test. -let Defs = [CC] in { +let Uses = [FPC], Defs = [CC] in { def LTDTR : UnaryRRE<"ltdtr", 0xB3D6, null_frag, FP64, FP64>; def LTXTR : UnaryRRE<"ltxtr", 0xB3DE, null_frag, FP128, FP128>; } @@ -32,25 +31,31 @@ let Defs = [CC] in { // Convert floating-point values to narrower representations. The destination // of LDXTR is a 128-bit value, but only the first register of the pair is used. -def LEDTR : TernaryRRFe<"ledtr", 0xB3D5, FP32, FP64>; -def LDXTR : TernaryRRFe<"ldxtr", 0xB3DD, FP128, FP128>; +let Uses = [FPC] in { + def LEDTR : TernaryRRFe<"ledtr", 0xB3D5, FP32, FP64>; + def LDXTR : TernaryRRFe<"ldxtr", 0xB3DD, FP128, FP128>; +} // Extend floating-point values to wider representations. -def LDETR : BinaryRRFd<"ldetr", 0xB3D4, FP64, FP32>; -def LXDTR : BinaryRRFd<"lxdtr", 0xB3DC, FP128, FP64>; +let Uses = [FPC] in { + def LDETR : BinaryRRFd<"ldetr", 0xB3D4, FP64, FP32>; + def LXDTR : BinaryRRFd<"lxdtr", 0xB3DC, FP128, FP64>; +} // Convert a signed integer value to a floating-point one.
-def CDGTR : UnaryRRE<"cdgtr", 0xB3F1, null_frag, FP64, GR64>; -def CXGTR : UnaryRRE<"cxgtr", 0xB3F9, null_frag, FP128, GR64>; -let Predicates = [FeatureFPExtension] in { - def CDGTRA : TernaryRRFe<"cdgtra", 0xB3F1, FP64, GR64>; - def CXGTRA : TernaryRRFe<"cxgtra", 0xB3F9, FP128, GR64>; - def CDFTR : TernaryRRFe<"cdftr", 0xB951, FP64, GR32>; - def CXFTR : TernaryRRFe<"cxftr", 0xB959, FP128, GR32>; +let Uses = [FPC] in { + def CDGTR : UnaryRRE<"cdgtr", 0xB3F1, null_frag, FP64, GR64>; + def CXGTR : UnaryRRE<"cxgtr", 0xB3F9, null_frag, FP128, GR64>; + let Predicates = [FeatureFPExtension] in { + def CDGTRA : TernaryRRFe<"cdgtra", 0xB3F1, FP64, GR64>; + def CXGTRA : TernaryRRFe<"cxgtra", 0xB3F9, FP128, GR64>; + def CDFTR : TernaryRRFe<"cdftr", 0xB951, FP64, GR32>; + def CXFTR : TernaryRRFe<"cxftr", 0xB959, FP128, GR32>; + } } // Convert an unsigned integer value to a floating-point one. -let Predicates = [FeatureFPExtension] in { +let Uses = [FPC], Predicates = [FeatureFPExtension] in { def CDLGTR : TernaryRRFe<"cdlgtr", 0xB952, FP64, GR64>; def CXLGTR : TernaryRRFe<"cxlgtr", 0xB95A, FP128, GR64>; def CDLFTR : TernaryRRFe<"cdlftr", 0xB953, FP64, GR32>; @@ -58,7 +63,7 @@ let Predicates = [FeatureFPExtension] in { } // Convert a floating-point value to a signed integer value. -let Defs = [CC] in { +let Uses = [FPC], Defs = [CC] in { def CGDTR : BinaryRRFe<"cgdtr", 0xB3E1, GR64, FP64>; def CGXTR : BinaryRRFe<"cgxtr", 0xB3E9, GR64, FP128>; let Predicates = [FeatureFPExtension] in { @@ -70,7 +75,7 @@ let Defs = [CC] in { } // Convert a floating-point value to an unsigned integer value. -let Defs = [CC] in { +let Uses = [FPC], Defs = [CC] in { let Predicates = [FeatureFPExtension] in { def CLGDTR : TernaryRRFe<"clgdtr", 0xB942, GR64, FP64>; def CLGXTR : TernaryRRFe<"clgxtr", 0xB94A, GR64, FP128>; @@ -108,7 +113,7 @@ let Predicates = [FeatureDFPPackedConversion] in { } // Perform floating-point operation. -let Defs = [CC, R1L, F0Q], Uses = [R0L, F4Q] in +let Defs = [CC, R1L, F0Q], Uses = [FPC, R0L, F4Q] in def PFPO : SideEffectInherentE<"pfpo", 0x010A>; @@ -118,8 +123,10 @@ let Defs = [CC, R1L, F0Q], Uses = [R0L, F4Q] in // Round to an integer, with the second operand (M3) specifying the rounding // mode. M4 can be set to 4 to suppress detection of inexact conditions. -def FIDTR : TernaryRRFe<"fidtr", 0xB3D7, FP64, FP64>; -def FIXTR : TernaryRRFe<"fixtr", 0xB3DF, FP128, FP128>; +let Uses = [FPC] in { + def FIDTR : TernaryRRFe<"fidtr", 0xB3D7, FP64, FP64>; + def FIXTR : TernaryRRFe<"fixtr", 0xB3DF, FP128, FP128>; +} // Extract biased exponent. def EEDTR : UnaryRRE<"eedtr", 0xB3E5, null_frag, FP64, FP64>; @@ -135,7 +142,7 @@ def ESXTR : UnaryRRE<"esxtr", 0xB3EF, null_frag, FP128, FP128>; //===----------------------------------------------------------------------===// // Addition. -let Defs = [CC] in { +let Uses = [FPC], Defs = [CC] in { let isCommutable = 1 in { def ADTR : BinaryRRFa<"adtr", 0xB3D2, null_frag, FP64, FP64, FP64>; def AXTR : BinaryRRFa<"axtr", 0xB3DA, null_frag, FP128, FP128, FP128>; @@ -147,7 +154,7 @@ let Defs = [CC] in { } // Subtraction. -let Defs = [CC] in { +let Uses = [FPC], Defs = [CC] in { def SDTR : BinaryRRFa<"sdtr", 0xB3D3, null_frag, FP64, FP64, FP64>; def SXTR : BinaryRRFa<"sxtr", 0xB3DB, null_frag, FP128, FP128, FP128>; let Predicates = [FeatureFPExtension] in { @@ -157,30 +164,38 @@ let Defs = [CC] in { } // Multiplication. 
-let isCommutable = 1 in { - def MDTR : BinaryRRFa<"mdtr", 0xB3D0, null_frag, FP64, FP64, FP64>; - def MXTR : BinaryRRFa<"mxtr", 0xB3D8, null_frag, FP128, FP128, FP128>; -} -let Predicates = [FeatureFPExtension] in { - def MDTRA : TernaryRRFa<"mdtra", 0xB3D0, FP64, FP64, FP64>; - def MXTRA : TernaryRRFa<"mxtra", 0xB3D8, FP128, FP128, FP128>; +let Uses = [FPC] in { + let isCommutable = 1 in { + def MDTR : BinaryRRFa<"mdtr", 0xB3D0, null_frag, FP64, FP64, FP64>; + def MXTR : BinaryRRFa<"mxtr", 0xB3D8, null_frag, FP128, FP128, FP128>; + } + let Predicates = [FeatureFPExtension] in { + def MDTRA : TernaryRRFa<"mdtra", 0xB3D0, FP64, FP64, FP64>; + def MXTRA : TernaryRRFa<"mxtra", 0xB3D8, FP128, FP128, FP128>; + } } // Division. -def DDTR : BinaryRRFa<"ddtr", 0xB3D1, null_frag, FP64, FP64, FP64>; -def DXTR : BinaryRRFa<"dxtr", 0xB3D9, null_frag, FP128, FP128, FP128>; -let Predicates = [FeatureFPExtension] in { - def DDTRA : TernaryRRFa<"ddtra", 0xB3D1, FP64, FP64, FP64>; - def DXTRA : TernaryRRFa<"dxtra", 0xB3D9, FP128, FP128, FP128>; +let Uses = [FPC] in { + def DDTR : BinaryRRFa<"ddtr", 0xB3D1, null_frag, FP64, FP64, FP64>; + def DXTR : BinaryRRFa<"dxtr", 0xB3D9, null_frag, FP128, FP128, FP128>; + let Predicates = [FeatureFPExtension] in { + def DDTRA : TernaryRRFa<"ddtra", 0xB3D1, FP64, FP64, FP64>; + def DXTRA : TernaryRRFa<"dxtra", 0xB3D9, FP128, FP128, FP128>; + } } // Quantize. -def QADTR : TernaryRRFb<"qadtr", 0xB3F5, FP64, FP64, FP64>; -def QAXTR : TernaryRRFb<"qaxtr", 0xB3FD, FP128, FP128, FP128>; +let Uses = [FPC] in { + def QADTR : TernaryRRFb<"qadtr", 0xB3F5, FP64, FP64, FP64>; + def QAXTR : TernaryRRFb<"qaxtr", 0xB3FD, FP128, FP128, FP128>; +} // Reround. -def RRDTR : TernaryRRFb<"rrdtr", 0xB3F7, FP64, FP64, FP64>; -def RRXTR : TernaryRRFb<"rrxtr", 0xB3FF, FP128, FP128, FP128>; +let Uses = [FPC] in { + def RRDTR : TernaryRRFb<"rrdtr", 0xB3F7, FP64, FP64, FP64>; + def RRXTR : TernaryRRFb<"rrxtr", 0xB3FF, FP128, FP128, FP128>; +} // Shift significand left/right. def SLDT : BinaryRXF<"sldt", 0xED40, null_frag, FP64, FP64, null_frag, 0>; @@ -198,13 +213,13 @@ def IEXTR : BinaryRRFb<"iextr", 0xB3FE, null_frag, FP128, FP128, FP128>; //===----------------------------------------------------------------------===// // Compare. -let Defs = [CC] in { +let Uses = [FPC], Defs = [CC] in { def CDTR : CompareRRE<"cdtr", 0xB3E4, null_frag, FP64, FP64>; def CXTR : CompareRRE<"cxtr", 0xB3EC, null_frag, FP128, FP128>; } // Compare and signal. -let Defs = [CC] in { +let Uses = [FPC], Defs = [CC] in { def KDTR : CompareRRE<"kdtr", 0xB3E0, null_frag, FP64, FP64>; def KXTR : CompareRRE<"kxtr", 0xB3E8, null_frag, FP128, FP128>; } diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 1374ee91fa29..19c7ec58ed3d 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -1,9 +1,8 @@ //==- SystemZInstrFP.td - Floating-point SystemZ instructions --*- tblgen-*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -53,7 +52,8 @@ let isCodeGenOnly = 1 in // Moves between two floating-point registers that also set the condition // codes. 
-let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { +let Uses = [FPC], mayRaiseFPException = 1, + Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { defm LTEBR : LoadAndTestRRE<"ltebr", 0xB302, FP32>; defm LTDBR : LoadAndTestRRE<"ltdbr", 0xB312, FP64>; defm LTXBR : LoadAndTestRRE<"ltxbr", 0xB342, FP128>; @@ -69,7 +69,8 @@ let Predicates = [FeatureNoVector] in { // Use a normal load-and-test for compare against zero in case of // vector support (via a pseudo to simplify instruction selection). -let Defs = [CC], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { +let Uses = [FPC], mayRaiseFPException = 1, + Defs = [CC], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>; def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>; def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>; @@ -174,56 +175,64 @@ let SimpleBDXStore = 1, mayStore = 1 in { // Convert floating-point values to narrower representations, rounding // according to the current mode. The destination of LEXBR and LDXBR // is a 128-bit value, but only the first register of the pair is used. -def LEDBR : UnaryRRE<"ledbr", 0xB344, fpround, FP32, FP64>; -def LEXBR : UnaryRRE<"lexbr", 0xB346, null_frag, FP128, FP128>; -def LDXBR : UnaryRRE<"ldxbr", 0xB345, null_frag, FP128, FP128>; - -def LEDBRA : TernaryRRFe<"ledbra", 0xB344, FP32, FP64>, - Requires<[FeatureFPExtension]>; -def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>, - Requires<[FeatureFPExtension]>; -def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>, - Requires<[FeatureFPExtension]>; +let Uses = [FPC], mayRaiseFPException = 1 in { + def LEDBR : UnaryRRE<"ledbr", 0xB344, any_fpround, FP32, FP64>; + def LEXBR : UnaryRRE<"lexbr", 0xB346, null_frag, FP128, FP128>; + def LDXBR : UnaryRRE<"ldxbr", 0xB345, null_frag, FP128, FP128>; + + def LEDBRA : TernaryRRFe<"ledbra", 0xB344, FP32, FP64>, + Requires<[FeatureFPExtension]>; + def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>, + Requires<[FeatureFPExtension]>; + def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>, + Requires<[FeatureFPExtension]>; +} let Predicates = [FeatureNoVectorEnhancements1] in { - def : Pat<(f32 (fpround FP128:$src)), + def : Pat<(f32 (any_fpround FP128:$src)), (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>; - def : Pat<(f64 (fpround FP128:$src)), + def : Pat<(f64 (any_fpround FP128:$src)), (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; } // Extend register floating-point values to wider representations. -def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; -def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>; -def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>; +let Uses = [FPC], mayRaiseFPException = 1 in { + def LDEBR : UnaryRRE<"ldebr", 0xB304, any_fpextend, FP64, FP32>; + def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>; + def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>; +} let Predicates = [FeatureNoVectorEnhancements1] in { - def : Pat<(f128 (fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>; - def : Pat<(f128 (fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>; + def : Pat<(f128 (any_fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>; + def : Pat<(f128 (any_fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>; } // Extend memory floating-point values to wider representations. 
-def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>; -def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>; -def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>; +let Uses = [FPC], mayRaiseFPException = 1 in { + def LDEB : UnaryRXE<"ldeb", 0xED04, any_extloadf32, FP64, 4>; + def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>; + def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>; +} let Predicates = [FeatureNoVectorEnhancements1] in { - def : Pat<(f128 (extloadf32 bdxaddr12only:$src)), + def : Pat<(f128 (any_extloadf32 bdxaddr12only:$src)), (LXEB bdxaddr12only:$src)>; - def : Pat<(f128 (extloadf64 bdxaddr12only:$src)), + def : Pat<(f128 (any_extloadf64 bdxaddr12only:$src)), (LXDB bdxaddr12only:$src)>; } // Convert a signed integer register value to a floating-point one. -def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>; -def CDFBR : UnaryRRE<"cdfbr", 0xB395, sint_to_fp, FP64, GR32>; -def CXFBR : UnaryRRE<"cxfbr", 0xB396, sint_to_fp, FP128, GR32>; - -def CEGBR : UnaryRRE<"cegbr", 0xB3A4, sint_to_fp, FP32, GR64>; -def CDGBR : UnaryRRE<"cdgbr", 0xB3A5, sint_to_fp, FP64, GR64>; -def CXGBR : UnaryRRE<"cxgbr", 0xB3A6, sint_to_fp, FP128, GR64>; +let Uses = [FPC], mayRaiseFPException = 1 in { + def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>; + def CDFBR : UnaryRRE<"cdfbr", 0xB395, sint_to_fp, FP64, GR32>; + def CXFBR : UnaryRRE<"cxfbr", 0xB396, sint_to_fp, FP128, GR32>; + + def CEGBR : UnaryRRE<"cegbr", 0xB3A4, sint_to_fp, FP32, GR64>; + def CDGBR : UnaryRRE<"cdgbr", 0xB3A5, sint_to_fp, FP64, GR64>; + def CXGBR : UnaryRRE<"cxgbr", 0xB3A6, sint_to_fp, FP128, GR64>; +} // The FP extension feature provides versions of the above that allow // specifying rounding mode and inexact-exception suppression flags. -let Predicates = [FeatureFPExtension] in { +let Uses = [FPC], mayRaiseFPException = 1, Predicates = [FeatureFPExtension] in { def CEFBRA : TernaryRRFe<"cefbra", 0xB394, FP32, GR32>; def CDFBRA : TernaryRRFe<"cdfbra", 0xB395, FP64, GR32>; def CXFBRA : TernaryRRFe<"cxfbra", 0xB396, FP128, GR32>; @@ -235,13 +244,15 @@ let Predicates = [FeatureFPExtension] in { // Convert an unsigned integer register value to a floating-point one. let Predicates = [FeatureFPExtension] in { - def CELFBR : TernaryRRFe<"celfbr", 0xB390, FP32, GR32>; - def CDLFBR : TernaryRRFe<"cdlfbr", 0xB391, FP64, GR32>; - def CXLFBR : TernaryRRFe<"cxlfbr", 0xB392, FP128, GR32>; - - def CELGBR : TernaryRRFe<"celgbr", 0xB3A0, FP32, GR64>; - def CDLGBR : TernaryRRFe<"cdlgbr", 0xB3A1, FP64, GR64>; - def CXLGBR : TernaryRRFe<"cxlgbr", 0xB3A2, FP128, GR64>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def CELFBR : TernaryRRFe<"celfbr", 0xB390, FP32, GR32>; + def CDLFBR : TernaryRRFe<"cdlfbr", 0xB391, FP64, GR32>; + def CXLFBR : TernaryRRFe<"cxlfbr", 0xB392, FP128, GR32>; + + def CELGBR : TernaryRRFe<"celgbr", 0xB3A0, FP32, GR64>; + def CDLGBR : TernaryRRFe<"cdlgbr", 0xB3A1, FP64, GR64>; + def CXLGBR : TernaryRRFe<"cxlgbr", 0xB3A2, FP128, GR64>; + } def : Pat<(f32 (uint_to_fp GR32:$src)), (CELFBR 0, GR32:$src, 0)>; def : Pat<(f64 (uint_to_fp GR32:$src)), (CDLFBR 0, GR32:$src, 0)>; @@ -254,7 +265,7 @@ let Predicates = [FeatureFPExtension] in { // Convert a floating-point register value to a signed integer value, // with the second operand (modifier M3) specifying the rounding mode.
-let Defs = [CC] in { +let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in { def CFEBR : BinaryRRFe<"cfebr", 0xB398, GR32, FP32>; def CFDBR : BinaryRRFe<"cfdbr", 0xB399, GR32, FP64>; def CFXBR : BinaryRRFe<"cfxbr", 0xB39A, GR32, FP128>; @@ -275,7 +286,8 @@ def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>; // The FP extension feature provides versions of the above that allow // also specifying the inexact-exception suppression flag. -let Predicates = [FeatureFPExtension], Defs = [CC] in { +let Uses = [FPC], mayRaiseFPException = 1, + Predicates = [FeatureFPExtension], Defs = [CC] in { def CFEBRA : TernaryRRFe<"cfebra", 0xB398, GR32, FP32>; def CFDBRA : TernaryRRFe<"cfdbra", 0xB399, GR32, FP64>; def CFXBRA : TernaryRRFe<"cfxbra", 0xB39A, GR32, FP128>; @@ -287,7 +299,7 @@ let Predicates = [FeatureFPExtension], Defs = [CC] in { // Convert a floating-point register value to an unsigned integer value. let Predicates = [FeatureFPExtension] in { - let Defs = [CC] in { + let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in { def CLFEBR : TernaryRRFe<"clfebr", 0xB39C, GR32, FP32>; def CLFDBR : TernaryRRFe<"clfdbr", 0xB39D, GR32, FP64>; def CLFXBR : TernaryRRFe<"clfxbr", 0xB39E, GR32, FP128>; @@ -353,59 +365,65 @@ let isCodeGenOnly = 1 in def LNDFR_32 : UnaryRRE<"lndfr", 0xB371, fnabs, FP32, FP32>; // Square root. -def SQEBR : UnaryRRE<"sqebr", 0xB314, fsqrt, FP32, FP32>; -def SQDBR : UnaryRRE<"sqdbr", 0xB315, fsqrt, FP64, FP64>; -def SQXBR : UnaryRRE<"sqxbr", 0xB316, fsqrt, FP128, FP128>; +let Uses = [FPC], mayRaiseFPException = 1 in { + def SQEBR : UnaryRRE<"sqebr", 0xB314, any_fsqrt, FP32, FP32>; + def SQDBR : UnaryRRE<"sqdbr", 0xB315, any_fsqrt, FP64, FP64>; + def SQXBR : UnaryRRE<"sqxbr", 0xB316, any_fsqrt, FP128, FP128>; -def SQEB : UnaryRXE<"sqeb", 0xED14, loadu, FP32, 4>; -def SQDB : UnaryRXE<"sqdb", 0xED15, loadu, FP64, 8>; + def SQEB : UnaryRXE<"sqeb", 0xED14, loadu, FP32, 4>; + def SQDB : UnaryRXE<"sqdb", 0xED15, loadu, FP64, 8>; +} // Round to an integer, with the second operand (modifier M3) specifying // the rounding mode. These forms always check for inexact conditions. -def FIEBR : BinaryRRFe<"fiebr", 0xB357, FP32, FP32>; -def FIDBR : BinaryRRFe<"fidbr", 0xB35F, FP64, FP64>; -def FIXBR : BinaryRRFe<"fixbr", 0xB347, FP128, FP128>; +let Uses = [FPC], mayRaiseFPException = 1 in { + def FIEBR : BinaryRRFe<"fiebr", 0xB357, FP32, FP32>; + def FIDBR : BinaryRRFe<"fidbr", 0xB35F, FP64, FP64>; + def FIXBR : BinaryRRFe<"fixbr", 0xB347, FP128, FP128>; +} // frint rounds according to the current mode (modifier 0) and detects // inexact conditions. -def : Pat<(frint FP32:$src), (FIEBR 0, FP32:$src)>; -def : Pat<(frint FP64:$src), (FIDBR 0, FP64:$src)>; -def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>; +def : Pat<(any_frint FP32:$src), (FIEBR 0, FP32:$src)>; +def : Pat<(any_frint FP64:$src), (FIDBR 0, FP64:$src)>; +def : Pat<(any_frint FP128:$src), (FIXBR 0, FP128:$src)>; let Predicates = [FeatureFPExtension] in { // Extended forms of the FIxBR instructions. M4 can be set to 4 // to suppress detection of inexact conditions. 
- def FIEBRA : TernaryRRFe<"fiebra", 0xB357, FP32, FP32>; - def FIDBRA : TernaryRRFe<"fidbra", 0xB35F, FP64, FP64>; - def FIXBRA : TernaryRRFe<"fixbra", 0xB347, FP128, FP128>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def FIEBRA : TernaryRRFe<"fiebra", 0xB357, FP32, FP32>; + def FIDBRA : TernaryRRFe<"fidbra", 0xB35F, FP64, FP64>; + def FIXBRA : TernaryRRFe<"fixbra", 0xB347, FP128, FP128>; + } // fnearbyint is like frint but does not detect inexact conditions. - def : Pat<(fnearbyint FP32:$src), (FIEBRA 0, FP32:$src, 4)>; - def : Pat<(fnearbyint FP64:$src), (FIDBRA 0, FP64:$src, 4)>; - def : Pat<(fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>; + def : Pat<(any_fnearbyint FP32:$src), (FIEBRA 0, FP32:$src, 4)>; + def : Pat<(any_fnearbyint FP64:$src), (FIDBRA 0, FP64:$src, 4)>; + def : Pat<(any_fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>; // floor is no longer allowed to raise an inexact condition, // so restrict it to the cases where the condition can be suppressed. // Mode 7 is round towards -inf. - def : Pat<(ffloor FP32:$src), (FIEBRA 7, FP32:$src, 4)>; - def : Pat<(ffloor FP64:$src), (FIDBRA 7, FP64:$src, 4)>; - def : Pat<(ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>; + def : Pat<(any_ffloor FP32:$src), (FIEBRA 7, FP32:$src, 4)>; + def : Pat<(any_ffloor FP64:$src), (FIDBRA 7, FP64:$src, 4)>; + def : Pat<(any_ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>; // Same idea for ceil, where mode 6 is round towards +inf. - def : Pat<(fceil FP32:$src), (FIEBRA 6, FP32:$src, 4)>; - def : Pat<(fceil FP64:$src), (FIDBRA 6, FP64:$src, 4)>; - def : Pat<(fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>; + def : Pat<(any_fceil FP32:$src), (FIEBRA 6, FP32:$src, 4)>; + def : Pat<(any_fceil FP64:$src), (FIDBRA 6, FP64:$src, 4)>; + def : Pat<(any_fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>; // Same idea for trunc, where mode 5 is round towards zero. - def : Pat<(ftrunc FP32:$src), (FIEBRA 5, FP32:$src, 4)>; - def : Pat<(ftrunc FP64:$src), (FIDBRA 5, FP64:$src, 4)>; - def : Pat<(ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>; + def : Pat<(any_ftrunc FP32:$src), (FIEBRA 5, FP32:$src, 4)>; + def : Pat<(any_ftrunc FP64:$src), (FIDBRA 5, FP64:$src, 4)>; + def : Pat<(any_ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>; // Same idea for round, where mode 1 is round towards nearest with // ties away from zero. - def : Pat<(fround FP32:$src), (FIEBRA 1, FP32:$src, 4)>; - def : Pat<(fround FP64:$src), (FIDBRA 1, FP64:$src, 4)>; - def : Pat<(fround FP128:$src), (FIXBRA 1, FP128:$src, 4)>; + def : Pat<(any_fround FP32:$src), (FIEBRA 1, FP32:$src, 4)>; + def : Pat<(any_fround FP64:$src), (FIDBRA 1, FP64:$src, 4)>; + def : Pat<(any_fround FP128:$src), (FIXBRA 1, FP128:$src, 4)>; } //===----------------------------------------------------------------------===// @@ -413,87 +431,103 @@ let Predicates = [FeatureFPExtension] in { //===----------------------------------------------------------------------===// // Addition. 
-let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { +let Uses = [FPC], mayRaiseFPException = 1, + Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { let isCommutable = 1 in { - def AEBR : BinaryRRE<"aebr", 0xB30A, fadd, FP32, FP32>; - def ADBR : BinaryRRE<"adbr", 0xB31A, fadd, FP64, FP64>; - def AXBR : BinaryRRE<"axbr", 0xB34A, fadd, FP128, FP128>; + def AEBR : BinaryRRE<"aebr", 0xB30A, any_fadd, FP32, FP32>; + def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>; + def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>; } - def AEB : BinaryRXE<"aeb", 0xED0A, fadd, FP32, load, 4>; - def ADB : BinaryRXE<"adb", 0xED1A, fadd, FP64, load, 8>; + def AEB : BinaryRXE<"aeb", 0xED0A, any_fadd, FP32, load, 4>; + def ADB : BinaryRXE<"adb", 0xED1A, any_fadd, FP64, load, 8>; } // Subtraction. -let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def SEBR : BinaryRRE<"sebr", 0xB30B, fsub, FP32, FP32>; - def SDBR : BinaryRRE<"sdbr", 0xB31B, fsub, FP64, FP64>; - def SXBR : BinaryRRE<"sxbr", 0xB34B, fsub, FP128, FP128>; - - def SEB : BinaryRXE<"seb", 0xED0B, fsub, FP32, load, 4>; - def SDB : BinaryRXE<"sdb", 0xED1B, fsub, FP64, load, 8>; +let Uses = [FPC], mayRaiseFPException = 1, + Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { + def SEBR : BinaryRRE<"sebr", 0xB30B, any_fsub, FP32, FP32>; + def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>; + def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>; + + def SEB : BinaryRXE<"seb", 0xED0B, any_fsub, FP32, load, 4>; + def SDB : BinaryRXE<"sdb", 0xED1B, any_fsub, FP64, load, 8>; } // Multiplication. -let isCommutable = 1 in { - def MEEBR : BinaryRRE<"meebr", 0xB317, fmul, FP32, FP32>; - def MDBR : BinaryRRE<"mdbr", 0xB31C, fmul, FP64, FP64>; - def MXBR : BinaryRRE<"mxbr", 0xB34C, fmul, FP128, FP128>; +let Uses = [FPC], mayRaiseFPException = 1 in { + let isCommutable = 1 in { + def MEEBR : BinaryRRE<"meebr", 0xB317, any_fmul, FP32, FP32>; + def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>; + def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>; + } + def MEEB : BinaryRXE<"meeb", 0xED17, any_fmul, FP32, load, 4>; + def MDB : BinaryRXE<"mdb", 0xED1C, any_fmul, FP64, load, 8>; } -def MEEB : BinaryRXE<"meeb", 0xED17, fmul, FP32, load, 4>; -def MDB : BinaryRXE<"mdb", 0xED1C, fmul, FP64, load, 8>; // f64 multiplication of two FP32 registers. -def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>; -def : Pat<(fmul (f64 (fpextend FP32:$src1)), (f64 (fpextend FP32:$src2))), +let Uses = [FPC], mayRaiseFPException = 1 in + def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>; +def : Pat<(any_fmul (f64 (fpextend FP32:$src1)), + (f64 (fpextend FP32:$src2))), (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32), FP32:$src2)>; // f64 multiplication of an FP32 register and an f32 memory. -def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>; -def : Pat<(fmul (f64 (fpextend FP32:$src1)), - (f64 (extloadf32 bdxaddr12only:$addr))), +let Uses = [FPC], mayRaiseFPException = 1 in + def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>; +def : Pat<(any_fmul (f64 (fpextend FP32:$src1)), + (f64 (extloadf32 bdxaddr12only:$addr))), (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32), bdxaddr12only:$addr)>; // f128 multiplication of two FP64 registers. 
-def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>; +let Uses = [FPC], mayRaiseFPException = 1 in + def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>; let Predicates = [FeatureNoVectorEnhancements1] in - def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), + def : Pat<(any_fmul (f128 (fpextend FP64:$src1)), + (f128 (fpextend FP64:$src2))), (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), FP64:$src2)>; // f128 multiplication of an FP64 register and an f64 memory. -def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>; +let Uses = [FPC], mayRaiseFPException = 1 in + def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>; let Predicates = [FeatureNoVectorEnhancements1] in - def : Pat<(fmul (f128 (fpextend FP64:$src1)), - (f128 (extloadf64 bdxaddr12only:$addr))), + def : Pat<(any_fmul (f128 (fpextend FP64:$src1)), + (f128 (extloadf64 bdxaddr12only:$addr))), (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), bdxaddr12only:$addr)>; // Fused multiply-add. -def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32, FP32>; -def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64, FP64>; +let Uses = [FPC], mayRaiseFPException = 1 in { + def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>; + def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>; -def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, FP32, load, 4>; -def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, FP64, load, 8>; + def MAEB : TernaryRXF<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>; + def MADB : TernaryRXF<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>; +} // Fused multiply-subtract. -def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32, FP32>; -def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64, FP64>; +let Uses = [FPC], mayRaiseFPException = 1 in { + def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>; + def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>; -def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, FP32, load, 4>; -def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, FP64, load, 8>; + def MSEB : TernaryRXF<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>; + def MSDB : TernaryRXF<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>; +} // Division. -def DEBR : BinaryRRE<"debr", 0xB30D, fdiv, FP32, FP32>; -def DDBR : BinaryRRE<"ddbr", 0xB31D, fdiv, FP64, FP64>; -def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>; +let Uses = [FPC], mayRaiseFPException = 1 in { + def DEBR : BinaryRRE<"debr", 0xB30D, any_fdiv, FP32, FP32>; + def DDBR : BinaryRRE<"ddbr", 0xB31D, any_fdiv, FP64, FP64>; + def DXBR : BinaryRRE<"dxbr", 0xB34D, any_fdiv, FP128, FP128>; -def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>; -def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>; + def DEB : BinaryRXE<"deb", 0xED0D, any_fdiv, FP32, load, 4>; + def DDB : BinaryRXE<"ddb", 0xED1D, any_fdiv, FP64, load, 8>; +} // Divide to integer. 
-let Defs = [CC] in { +let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in { def DIEBR : TernaryRRFb<"diebr", 0xB353, FP32, FP32, FP32>; def DIDBR : TernaryRRFb<"didbr", 0xB35B, FP64, FP64, FP64>; } @@ -502,7 +536,7 @@ let Defs = [CC] in { // Comparisons //===----------------------------------------------------------------------===// -let Defs = [CC], CCValues = 0xF in { +let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC], CCValues = 0xF in { def CEBR : CompareRRE<"cebr", 0xB309, z_fcmp, FP32, FP32>; def CDBR : CompareRRE<"cdbr", 0xB319, z_fcmp, FP64, FP64>; def CXBR : CompareRRE<"cxbr", 0xB349, z_fcmp, FP128, FP128>; @@ -532,20 +566,28 @@ let Defs = [CC], CCValues = 0xC in { let hasSideEffects = 1 in { let mayLoad = 1, mayStore = 1 in { // TODO: EFPC and SFPC do not touch memory at all - def EFPC : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>; - def STFPC : StoreInherentS<"stfpc", 0xB29C, storei, 4>; - - def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>; - def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu, 4>; + let Uses = [FPC] in { + def EFPC : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>; + def STFPC : StoreInherentS<"stfpc", 0xB29C, storei, 4>; + } + + let Defs = [FPC] in { + def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>; + def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu, 4>; + } } - def SFASR : SideEffectUnaryRRE<"sfasr", 0xB385, GR32, null_frag>; - def LFAS : SideEffectUnaryS<"lfas", 0xB2BD, null_frag, 4>; + let Defs = [FPC], mayRaiseFPException = 1 in { + def SFASR : SideEffectUnaryRRE<"sfasr", 0xB385, GR32, null_frag>; + def LFAS : SideEffectUnaryS<"lfas", 0xB2BD, null_frag, 4>; + } - def SRNMB : SideEffectAddressS<"srnmb", 0xB2B8, null_frag, shift12only>, - Requires<[FeatureFPExtension]>; - def SRNM : SideEffectAddressS<"srnm", 0xB299, null_frag, shift12only>; - def SRNMT : SideEffectAddressS<"srnmt", 0xB2B9, null_frag, shift12only>; + let Uses = [FPC], Defs = [FPC] in { + def SRNMB : SideEffectAddressS<"srnmb", 0xB2B8, null_frag, shift12only>, + Requires<[FeatureFPExtension]>; + def SRNM : SideEffectAddressS<"srnm", 0xB299, null_frag, shift12only>; + def SRNMT : SideEffectAddressS<"srnmt", 0xB2B9, null_frag, shift12only>; + } } //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 1e904a86ea79..2a1d14de3ddf 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -1,9 +1,8 @@ //==- SystemZInstrFormats.td - SystemZ Instruction Formats --*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -38,6 +37,12 @@ class InstSystemZ<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> bits<4> R1; bits<5> V2; bits<4> M3; + bits<4> M4; let Inst{47-40} = op{15-8}; let Inst{39-36} = R1; let Inst{35-32} = V2{3-0}; let Inst{31-24} = 0; let Inst{23-20} = M3; - let Inst{19-12} = 0; + let Inst{19-16} = M4; + let Inst{15-12} = 0; let Inst{11} = 0; let Inst{10} = V2{4}; let Inst{9-8} = 0; @@ -2410,11 +2427,16 @@ class LoadMultipleSSe<string mnemonic, bits<8> opcode, RegisterOperand cls> let mayLoad = 1; } -class LoadMultipleVRSa<string mnemonic, bits<16> opcode> - : InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2), mnemonic#"\t$V1, $V3, $BD2", []> { - let M4 = 0; - let mayLoad = 1; +multiclass LoadMultipleVRSaAlign<string mnemonic, bits<16> opcode> { + let mayLoad = 1 in { + def Align : InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2, imm32zx4:$M4), mnemonic#"\t$V1, $V3, $BD2, $M4", []>; + let M4 = 0 in + def "" : InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2), mnemonic#"\t$V1, $V3, $BD2", []>; + } } class StoreRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator, @@ -2469,12 +2491,29 @@ class StoreVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr, bits<5> bytes, bits<4> type = 0> : InstVRX<opcode, (outs), (ins tr.op:$V1, bdxaddr12only:$XBD2), mnemonic#"\t$V1, $XBD2", + [(operator (tr.vt tr.op:$V1), bdxaddr12only:$XBD2)]> { let M3 = type; let mayStore = 1; let AccessBytes = bytes; } +class StoreVRXGeneric<string mnemonic, bits<16> opcode> + : InstVRX<opcode, (outs), (ins VR128:$V1, bdxaddr12only:$XBD2, imm32zx4:$M3), mnemonic#"\t$V1, $XBD2, $M3", []> { + let mayStore = 1; +} + +multiclass StoreVRXAlign<string mnemonic, bits<16> opcode> { + let mayStore = 1, AccessBytes = 16 in { + def Align : InstVRX<opcode, (outs), (ins VR128:$V1, bdxaddr12only:$XBD2, imm32zx4:$M3), mnemonic#"\t$V1, $XBD2, $M3", []>; + let M3 = 0 in + def "" : InstVRX<opcode, (outs), (ins VR128:$V1, bdxaddr12only:$XBD2), mnemonic#"\t$V1, $XBD2", []>; + } +} + class StoreLengthVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator, bits<5> bytes> : InstVRSb ... rsOpcode, } } -class StoreMultipleVRSa<string mnemonic, bits<16> opcode> - : InstVRSa<opcode, (outs), (ins VR128:$V1, VR128:$V3, bdaddr12only:$BD2), mnemonic#"\t$V1, $V3, $BD2", []> { - let M4 = 0; - let mayStore = 1; +multiclass StoreMultipleVRSaAlign<string mnemonic, bits<16> opcode> { + let mayStore = 1 in { + def Align : InstVRSa<opcode, (outs), (ins VR128:$V1, VR128:$V3, bdaddr12only:$BD2, imm32zx4:$M4), mnemonic#"\t$V1, $V3, $BD2, $M4", []>; + let M4 = 0 in + def "" : InstVRSa<opcode, (outs), (ins VR128:$V1, VR128:$V3, bdaddr12only:$BD2), mnemonic#"\t$V1, $V3, $BD2", []>; + } } // StoreSI* instructions are used to store an integer to memory, but the @@ -2925,6 +2969,17 @@ class UnaryVRXGeneric<string mnemonic, bits<16> opcode> let mayLoad = 1; } +multiclass UnaryVRXAlign<string mnemonic, bits<16> opcode> { + let mayLoad = 1, AccessBytes = 16 in { + def Align : InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3), mnemonic#"\t$V1, $XBD2, $M3", []>; + let M3 = 0 in + def "" : InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2), mnemonic#"\t$V1, $XBD2", []>; + } +} + class SideEffectBinaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls> : InstRXa ... class BinaryRRFa<string mnemonic, bits<16> opcode, SDPatternOperator operator, ... mnemonic#"\t$R1, $R2, $R3", [(set cls1:$R1, (operator cls2:$R2, cls3:$R3))]> { let M4 = 0; + let OpKey = mnemonic#cls1; + let OpType = "reg"; } multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2, @@ -3074,9 +3131,9 @@ multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2, RegisterOperand cls2> { let NumOpsKey = mnemonic in { let NumOpsValue = "3" in - def K : BinaryRRFa, + def K : BinaryRRFa, Requires<[FeatureDistinctOps]>; - let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in + let NumOpsValue = "2" in def "" : BinaryRR; } } @@ -3086,9 +3143,9 @@ multiclass BinaryRREAndK<string mnemonic, bits<16> opcode1, bits<16> opcode2, RegisterOperand cls2> { let NumOpsKey = mnemonic in { let NumOpsValue = "3" in - def K : BinaryRRFa, + def K : BinaryRRFa, Requires<[FeatureDistinctOps]>; - let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in + let NumOpsValue = "2" in def "" : BinaryRRE; } } @@ -3102,6 +3159,11 @@ class BinaryRRFb<string mnemonic, bits<16> opcode, SDPatternOperator operator, let M4 = 0; } +class BinaryRRFc<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFc<opcode, (outs cls1:$R1), (ins cls2:$R2, imm32zx4:$M3), mnemonic#"\t$R1, $R2, $M3", []>; + class BinaryMemRRFc<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, Immediate imm> : InstRRFc ... multiclass CondBinaryRRFPair<string mnemonic, bits<16> opcode, ... def Asm : AsmCondBinaryRRF<mnemonic, opcode, cls1, cls2>; } +class CondBinaryRRFa<string mnemonic, bits<16> opcode, RegisterOperand cls1, + RegisterOperand cls2, RegisterOperand cls3> + : InstRRFa<opcode, (outs cls1:$R1), (ins cls3:$R3, cls2:$R2, cond4:$valid, cond4:$M4), mnemonic#"$M4\t$R1, $R2, $R3", [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls3:$R3, cond4:$valid, cond4:$M4))]> { + let CCMaskLast = 1; +} +
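Aside on the *Align multiclasses just introduced: each vector load/store gains an assembler-visible variant carrying an explicit alignment-hint operand (M3 for VL/VST, M4 for the multiple-register forms). A rough sketch of how a known byte alignment would map to that hint field; vectorAlignmentHint is a hypothetical helper written for this note, with the encodings taken from the z/Architecture definition of the VL/VST hints:

    #include <cstdint>

    // Hypothetical helper: translate a known power-of-two byte alignment into
    // the alignment-hint value carried by the "Align" instruction variants.
    static unsigned vectorAlignmentHint(uint64_t AlignBytes) {
      if (AlignBytes >= 16)
        return 4; // operand known to be quadword aligned
      if (AlignBytes >= 8)
        return 3; // operand known to be doubleword aligned
      return 0;   // no alignment assertion made
    }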
+// Like CondBinaryRRFa, but used for the raw assembly form. The condition-code // mask is the third operand rather than being part of the mnemonic. +class AsmCondBinaryRRFa<string mnemonic, bits<16> opcode, RegisterOperand cls1, + RegisterOperand cls2, RegisterOperand cls3> + : InstRRFa<opcode, (outs cls1:$R1), (ins cls2:$R2, cls3:$R3, imm32zx4:$M4), mnemonic#"\t$R1, $R2, $R3, $M4", []>; + +// Like CondBinaryRRFa, but with a fixed CC mask. +class FixedCondBinaryRRFa<CondVariant V, string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFa<opcode, (outs cls1:$R1), (ins cls2:$R2, cls3:$R3), mnemonic#V.suffix#"\t$R1, $R2, $R3", []> { + let isAsmParserOnly = V.alternate; + let M4 = V.ccmask; +} + +multiclass CondBinaryRRFaPair<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> { + let isCodeGenOnly = 1 in + def "" : CondBinaryRRFa<mnemonic, opcode, cls1, cls2, cls3>; + def Asm : AsmCondBinaryRRFa<mnemonic, opcode, cls1, cls2, cls3>; +} + class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator, RegisterOperand cls, Immediate imm> : InstRIa ... multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2, ... Immediate imm> { let NumOpsKey = mnemonic in { let NumOpsValue = "3" in - def K : BinaryRIE, + def K : BinaryRIE, Requires<[FeatureDistinctOps]>; - let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in + let NumOpsValue = "2" in def "" : BinaryRI; } } @@ -3266,9 +3363,9 @@ multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2, SDPatternOperator operator, RegisterOperand cls> { let NumOpsKey = mnemonic in { let NumOpsValue = "3" in - def K : BinaryRSY, + def K : BinaryRSY, Requires<[FeatureDistinctOps]>; - let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in + let NumOpsValue = "2" in def "" : BinaryRS; } } @@ -3563,7 +3660,9 @@ class BinaryVRRf<string mnemonic, bits<16> opcode, SDPatternOperator operator, class BinaryVRRi<string mnemonic, bits<16> opcode, RegisterOperand cls> : InstVRRi<opcode, (outs cls:$R1), (ins VR128:$V2, imm32zx4:$M3), - mnemonic#"\t$R1, $V2, $M3", []>; + mnemonic#"\t$R1, $V2, $M3", []> { + let M4 = 0; +} class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type> @@ -3941,6 +4040,17 @@ class SideEffectTernaryRRFa<string mnemonic, bits<16> opcode, let M4 = 0; } +class SideEffectTernaryMemMemRRFa<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFa<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src, cls3:$R3), mnemonic#"\t$R1, $R2, $R3", []> { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; + let M4 = 0; +} + class SideEffectTernaryRRFb<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, RegisterOperand cls3> @@ -4229,7 +4339,7 @@ class TernaryVRRcFloatGeneric<string mnemonic, bits<16> opcode> mnemonic#"\t$V1, $V2, $V3, $M4, $M5, $M6", []>; class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator, - TypedReg tr1, TypedReg tr2, bits<4> type = 0> + TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m6 = 0> : InstVRRd ... class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator, ... (tr2.vt tr2.op:$V3), (tr1.vt tr1.op:$V4)))]> { let M5 = type; - let M6 = 0; + let M6 = m6; } class TernaryVRRdGeneric<string mnemonic, bits<16> opcode> @@ -4247,6 +4357,34 @@ class TernaryVRRdGeneric<string mnemonic, bits<16> opcode> let M6 = 0; } +// Ternary operation where the assembler mnemonic has an extra operand to +// optionally allow specifying arbitrary M6 values.
+multiclass TernaryExtraVRRd<string mnemonic, bits<16> opcode, + SDPatternOperator operator, + TypedReg tr1, TypedReg tr2, bits<4> type> { + let M5 = type, Defs = [CC] in + def "" : InstVRRd<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4, imm32zx4:$M6), mnemonic#"\t$V1, $V2, $V3, $V4, $M6", []>; + def : Pat<(operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3), + (tr1.vt tr1.op:$V4)), + (!cast<Instruction>(NAME) tr2.op:$V2, tr2.op:$V3, tr1.op:$V4, 0)>; + def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4", + (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2, + tr2.op:$V3, tr1.op:$V4, 0)>; +} + +multiclass TernaryExtraVRRdGeneric<string mnemonic, bits<16> opcode> { + let Defs = [CC] in + def "" : InstVRRd<opcode, (outs VR128:$V1), (ins VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, imm32zx4:$M6), mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>; + def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4, $M5", + (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3, + VR128:$V4, imm32zx4:$M5, 0)>; +} + class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> m5 = 0, bits<4> type = 0> : InstVRRe ... opcode, SDPatternOperator operator, let M4 = type; } +class TernaryVRRi<string mnemonic, bits<16> opcode, RegisterOperand cls> + : InstVRRi<opcode, (outs cls:$R1), (ins VR128:$V2, imm32zx4:$M3, imm32zx4:$M4), mnemonic#"\t$R1, $V2, $M3, $M4", []>; + class TernaryVRSbGeneric<string mnemonic, bits<16> opcode> : InstVRSb ... { let NumOpsKey = key in { let NumOpsValue = "3" in - def K : BinaryRIEPseudo, + def K : BinaryRIEPseudo, Requires<[FeatureHighWord, FeatureDistinctOps]>; - let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in + let NumOpsValue = "2" in def "" : BinaryRIPseudo, Requires<[FeatureHighWord]>; } } +// A pseudo that is used during register allocation when folding a memory +// operand. The 3-address register instruction with a spilled source cannot +// be converted directly to a target 2-address reg/mem instruction. +// Mapping: <INSN>R -> MemFoldPseudo -> <INSN> +class MemFoldPseudo<string mnemonic, RegisterOperand cls, bits<5> bytes, + AddressingMode mode> + : Pseudo<(outs cls:$R1), (ins cls:$R2, mode:$XBD2), []> { + let OpKey = mnemonic#"rk"#cls; + let OpType = "mem"; + let MemKey = mnemonic#cls; + let MemType = "pseudo"; + let mayLoad = 1; + let AccessBytes = bytes; + let HasIndex = 1; + let hasNoSchedulingInfo = 1; +} + // Like CompareRI, but expanded after RA depending on the choice of register. class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls, Immediate imm> @@ -4639,6 +4799,17 @@ class CondBinaryRRFPseudo<RegisterOperand cls1, RegisterOperand cls2> let CCMaskLast = 1; } +// Like CondBinaryRRFa, but expanded after RA depending on the choice of +// register. +class CondBinaryRRFaPseudo<RegisterOperand cls1, RegisterOperand cls2, RegisterOperand cls3> + : Pseudo<(outs cls1:$R1), + (ins cls3:$R3, cls2:$R2, cond4:$valid, cond4:$M4), + [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls3:$R3, + cond4:$valid, cond4:$M4))]> { + let CCMaskLast = 1; +} + // Like CondBinaryRIE, but expanded after RA depending on the choice of // register. class CondBinaryRIEPseudo<RegisterOperand cls, Immediate imm> @@ -4776,58 +4947,6 @@ class AtomicLoadWBinaryReg<SDPatternOperator operator> class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm> : AtomicLoadWBinary<operator, (i32 imm:$src2)>; -// Define an instruction that operates on two fixed-length blocks of memory, -// and associated pseudo instructions for operating on blocks of any size. -// The Sequence form uses a straight-line sequence of instructions and -// the Loop form uses a loop of length-256 instructions followed by -// another instruction to handle the excess. -multiclass MemorySS<string mnemonic, bits<8> opcode, - SDPatternOperator sequence, SDPatternOperator loop> { - def "" : SideEffectBinarySSa<mnemonic, opcode>; - let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in { - def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length), - [(sequence bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length)]>; - def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length, GR64:$count256), - [(loop bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length, GR64:$count256)]>; - } -} -
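Aside on MemFoldPseudo above: it gives the register allocator a landing spot for folding the spilled source of a three-address instruction, recording the reg/reg origin (OpKey) and the eventual reg/mem form (MemKey). The constraint behind the fold is mirrored below as a simplified sketch; canFoldSpill and MemFold are names invented for this illustration, not the actual LLVM API:

    #include <optional>

    // Sketch: a 3-address "Dst = Src1 op Src2" can become the 2-address
    // reg/mem form "Dst op= [mem]" only if Dst and the surviving source
    // share a physical register after allocation.
    struct MemFold { bool NeedsCommute; };

    std::optional<MemFold> canFoldSpill(unsigned SpilledOpNo, unsigned DstPhys,
                                        unsigned Src1Phys, unsigned Src2Phys,
                                        bool Commutable) {
      if (SpilledOpNo == 2 && DstPhys == Src1Phys)
        return MemFold{false}; // fold directly: Dst = Dst op [mem]
      if (SpilledOpNo == 1 && Commutable && DstPhys == Src2Phys)
        return MemFold{true};  // commute the sources first, then fold
      return std::nullopt;     // keep the reload
    }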
-// The same, but setting a CC result as comparison operator. -multiclass CompareMemorySS<string mnemonic, bits<8> opcode, - SDPatternOperator sequence, SDPatternOperator loop> { - def "" : SideEffectBinarySSa<mnemonic, opcode>; - let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length), - [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length))]>; - def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length, GR64:$count256), - [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length, GR64:$count256))]>; - } -} - -// Define an instruction that operates on two strings, both terminated -// by the character in R0. The instruction processes a CPU-determined -// number of bytes at a time and sets CC to 3 if the instruction needs -// to be repeated. Also define a pseudo instruction that represents -// the full loop (the main instruction plus the branch on CC==3). -multiclass StringRRE<string mnemonic, bits<16> opcode, - SDPatternOperator operator> { - let Uses = [R0L] in - def "" : SideEffectBinaryMemMemRRE<mnemonic, opcode, GR64, GR64>; - let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in - def Loop : Pseudo<(outs GR64:$end), - (ins GR64:$start1, GR64:$start2, GR32:$char), - [(set GR64:$end, (operator GR64:$start1, GR64:$start2, - GR32:$char))]>; -} - // A pseudo instruction that is a direct alias of a real instruction. // These aliases are used in cases where a particular register operand is // fixed or where the same instruction is used with different register sizes. @@ -4893,3 +5012,90 @@ class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2> ... imm32zx6:$I5), []> { let Constraints = "$R1 = $R1src"; } + +//===----------------------------------------------------------------------===// +// Multiclasses that emit both real and pseudo instructions +//===----------------------------------------------------------------------===// + +multiclass BinaryRXYAndPseudo<string mnemonic, bits<16> opcode, + SDPatternOperator operator, RegisterOperand cls, + SDPatternOperator load, bits<5> bytes, + AddressingMode mode = bdxaddr20only> { + + def "" : BinaryRXY<mnemonic, opcode, operator, cls, load, bytes, mode> { + let MemKey = mnemonic#cls; + let MemType = "target"; + } + let Has20BitOffset = 1 in + def _MemFoldPseudo : MemFoldPseudo<mnemonic, cls, bytes, mode>; +} + +multiclass BinaryRXPairAndPseudo<string mnemonic, bits<8> rxOpcode, + bits<16> rxyOpcode, SDPatternOperator operator, + RegisterOperand cls, + SDPatternOperator load, bits<5> bytes> { + let DispKey = mnemonic ## #cls in { + def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bytes, bdxaddr12pair> { + let DispSize = "12"; + let MemKey = mnemonic#cls; + let MemType = "target"; + } + let DispSize = "20" in + def Y : BinaryRXY<mnemonic#"y", rxyOpcode, operator, cls, load, bytes, bdxaddr20pair>; + } + def _MemFoldPseudo : MemFoldPseudo<mnemonic, cls, bytes, bdxaddr12pair>; +} + +// Define an instruction that operates on two fixed-length blocks of memory, +// and associated pseudo instructions for operating on blocks of any size. +// The Sequence form uses a straight-line sequence of instructions and +// the Loop form uses a loop of length-256 instructions followed by +// another instruction to handle the excess. +multiclass MemorySS<string mnemonic, bits<8> opcode, + SDPatternOperator sequence, SDPatternOperator loop> { + def "" : SideEffectBinarySSa<mnemonic, opcode>; + let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in { + def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length), + [(sequence bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length)]>; + def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length, GR64:$count256), + [(loop bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length, GR64:$count256)]>; + } +}
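Aside on the Sequence and Loop pseudos defined by MemorySS above: they model how an arbitrary-length block operation is broken into SS-format instructions that each handle at most 256 bytes. Schematically, with std::memcpy standing in for one MVC (a plain C++ illustration of the byte accounting, not the emitted machine code):

    #include <cstddef>
    #include <cstring>

    // Schematic expansion of a length-N block move: the "Loop" pseudo covers
    // the full 256-byte chunks, and one trailing instruction handles the
    // excess after the loop.
    void blockMove(char *Dest, const char *Src, std::size_t Length) {
      while (Length > 256) {             // loop of full-size chunks
        std::memcpy(Dest, Src, 256);     // stands in for one 256-byte MVC
        Dest += 256; Src += 256; Length -= 256;
      }
      if (Length > 0)
        std::memcpy(Dest, Src, Length);  // the remainder
    }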
+// The same, but setting a CC result as comparison operator. +multiclass CompareMemorySS<string mnemonic, bits<8> opcode, + SDPatternOperator sequence, SDPatternOperator loop> { + def "" : SideEffectBinarySSa<mnemonic, opcode>; + let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length), + [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length))]>; + def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length, GR64:$count256), + [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length, GR64:$count256))]>; + } +} + +// Define an instruction that operates on two strings, both terminated +// by the character in R0. The instruction processes a CPU-determined +// number of bytes at a time and sets CC to 3 if the instruction needs +// to be repeated. Also define a pseudo instruction that represents +// the full loop (the main instruction plus the branch on CC==3). +multiclass StringRRE<string mnemonic, bits<16> opcode, + SDPatternOperator operator> { + let Uses = [R0L] in + def "" : SideEffectBinaryMemMemRRE<mnemonic, opcode, GR64, GR64>; + let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in + def Loop : Pseudo<(outs GR64:$end), + (ins GR64:$start1, GR64:$start2, GR32:$char), + [(set GR64:$end, (operator GR64:$start1, GR64:$start2, + GR32:$char))]>; +} diff --git a/lib/Target/SystemZ/SystemZInstrHFP.td b/lib/Target/SystemZ/SystemZInstrHFP.td index 6d5b4b92f650..2e3c9932d621 100644 --- a/lib/Target/SystemZ/SystemZInstrHFP.td +++ b/lib/Target/SystemZ/SystemZInstrHFP.td @@ -1,9 +1,8 @@ //==- SystemZInstrHFP.td - Floating-point SystemZ instructions -*- tblgen-*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index b03b4edaa4ab..57c1cf4ec70a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- SystemZInstrInfo.cpp - SystemZ instruction information ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -169,11 +168,13 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode, if (!DestIsHigh && !SrcIsHigh) MI.setDesc(get(LowOpcodeK)); else { - emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg, - SystemZ::LR, 32, MI.getOperand(1).isKill(), - MI.getOperand(1).isUndef()); + if (DestReg != SrcReg) { + emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg, + SystemZ::LR, 32, MI.getOperand(1).isKill(), + MI.getOperand(1).isUndef()); + MI.getOperand(1).setReg(DestReg); + } MI.setDesc(get(DestIsHigh ?
HighOpcode : LowOpcode)); - MI.getOperand(1).setReg(DestReg); MI.tieOperands(0, 1); } } @@ -222,6 +223,65 @@ void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode, // correctly. This change is deferred to the SystemZExpandPseudo pass. } +// MI is a select pseudo instruction. Replace it with LowOpcode if source +// and destination are all low GR32s and HighOpcode if source and destination +// are all high GR32s. Otherwise, use the two-operand MixedOpcode. +void SystemZInstrInfo::expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode, + unsigned HighOpcode, + unsigned MixedOpcode) const { + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned Src1Reg = MI.getOperand(1).getReg(); + unsigned Src2Reg = MI.getOperand(2).getReg(); + bool DestIsHigh = isHighReg(DestReg); + bool Src1IsHigh = isHighReg(Src1Reg); + bool Src2IsHigh = isHighReg(Src2Reg); + + // If sources and destination aren't all high or all low, we may be able to + // simplify the operation by moving one of the sources to the destination + // first. But only if this doesn't clobber the other source. + if (DestReg != Src1Reg && DestReg != Src2Reg) { + if (DestIsHigh != Src1IsHigh) { + emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src1Reg, + SystemZ::LR, 32, MI.getOperand(1).isKill(), + MI.getOperand(1).isUndef()); + MI.getOperand(1).setReg(DestReg); + Src1Reg = DestReg; + Src1IsHigh = DestIsHigh; + } else if (DestIsHigh != Src2IsHigh) { + emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src2Reg, + SystemZ::LR, 32, MI.getOperand(2).isKill(), + MI.getOperand(2).isUndef()); + MI.getOperand(2).setReg(DestReg); + Src2Reg = DestReg; + Src2IsHigh = DestIsHigh; + } + } + + // If the destination (now) matches one source, prefer this to be first. + if (DestReg != Src1Reg && DestReg == Src2Reg) { + commuteInstruction(MI, false, 1, 2); + std::swap(Src1Reg, Src2Reg); + std::swap(Src1IsHigh, Src2IsHigh); + } + + if (!DestIsHigh && !Src1IsHigh && !Src2IsHigh) + MI.setDesc(get(LowOpcode)); + else if (DestIsHigh && Src1IsHigh && Src2IsHigh) + MI.setDesc(get(HighOpcode)); + else { + // Given the simplification above, we must already have a two-operand case. + assert (DestReg == Src1Reg); + MI.setDesc(get(MixedOpcode)); + MI.tieOperands(0, 1); + LOCRMuxJumps++; + } + + // If we were unable to implement the pseudo with a single instruction, we + // need to convert it back into a branch sequence. This cannot be done here + // since the caller of expandPostRAPseudo does not handle changes to the CFG + // correctly. This change is deferred to the SystemZExpandPseudo pass. +} + // MI is an RR-style pseudo instruction that zero-extends the low Size bits // of one GRX32 into another. Replace it with LowOpcode if both operands // are low registers, otherwise use RISB[LH]G. @@ -311,6 +371,10 @@ MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI, }; switch (MI.getOpcode()) { + case SystemZ::SELRMux: + case SystemZ::SELFHR: + case SystemZ::SELR: + case SystemZ::SELGR: case SystemZ::LOCRMux: case SystemZ::LOCFHR: case SystemZ::LOCR: @@ -557,80 +621,6 @@ bool SystemZInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, return false; } -// If Reg is a virtual register, return its definition, otherwise return null. -static MachineInstr *getDef(unsigned Reg, - const MachineRegisterInfo *MRI) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) - return nullptr; - return MRI->getUniqueVRegDef(Reg); -} -
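Aside on the new SELR/SELGR family handled by expandSELRPseudo above: unlike LOCR, which conditionally overwrites its own first operand, SELR is a true three-operand select whose destination need not match either source; that is why the code first tries to steer one source into the destination's register bank and only falls back to the two-operand LOCRMux path when it cannot. The difference in plain C++ terms, as a conceptual sketch only (locr/selr are illustrative names, not machine semantics):

    // Two-operand form (LOCR): the result must live in R1's register, so an
    // extra copy is needed whenever the destination differs from R1.
    unsigned locr(unsigned R1, unsigned R2, bool CCMatches) {
      return CCMatches ? R2 : R1;
    }

    // Three-operand form (SELR): any destination register works, giving the
    // register allocator full freedom and avoiding the copy.
    unsigned selr(unsigned R2, unsigned R3, bool CCMatches) {
      return CCMatches ? R2 : R3;
    }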
-// Return true if MI is a shift of type Opcode by Imm bits. -static bool isShift(MachineInstr *MI, unsigned Opcode, int64_t Imm) { - return (MI->getOpcode() == Opcode && - !MI->getOperand(2).getReg() && - MI->getOperand(3).getImm() == Imm); -} - -// If the destination of MI has no uses, delete it as dead. -static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) { - if (MRI->use_nodbg_empty(MI->getOperand(0).getReg())) - MI->eraseFromParent(); -} - -// Compare compares SrcReg against zero. Check whether SrcReg contains -// the result of an IPM sequence whose input CC survives until Compare, -// and whether Compare is therefore redundant. Delete it and return -// true if so. -static bool removeIPMBasedCompare(MachineInstr &Compare, unsigned SrcReg, - const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI) { - MachineInstr *LGFR = nullptr; - MachineInstr *RLL = getDef(SrcReg, MRI); - if (RLL && RLL->getOpcode() == SystemZ::LGFR) { - LGFR = RLL; - RLL = getDef(LGFR->getOperand(1).getReg(), MRI); - } - if (!RLL || !isShift(RLL, SystemZ::RLL, 31)) - return false; - - MachineInstr *SRL = getDef(RLL->getOperand(1).getReg(), MRI); - if (!SRL || !isShift(SRL, SystemZ::SRL, SystemZ::IPM_CC)) - return false; - - MachineInstr *IPM = getDef(SRL->getOperand(1).getReg(), MRI); - if (!IPM || IPM->getOpcode() != SystemZ::IPM) - return false; - - // Check that there are no assignments to CC between the IPM and Compare, - if (IPM->getParent() != Compare.getParent()) - return false; - MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare.getIterator(); - for (++MBBI; MBBI != MBBE; ++MBBI) { - MachineInstr &MI = *MBBI; - if (MI.modifiesRegister(SystemZ::CC, TRI)) - return false; - } - - Compare.eraseFromParent(); - if (LGFR) - eraseIfDead(LGFR, MRI); - eraseIfDead(RLL, MRI); - eraseIfDead(SRL, MRI); - eraseIfDead(IPM, MRI); - - return true; -} - -bool SystemZInstrInfo::optimizeCompareInstr( - MachineInstr &Compare, unsigned SrcReg, unsigned SrcReg2, int Mask, - int Value, const MachineRegisterInfo *MRI) const { - assert(!SrcReg2 && "Only optimizing constant comparisons so far"); - bool IsLogical = (Compare.getDesc().TSFlags & SystemZII::IsLogical) != 0; - return Value == 0 && !IsLogical && - removeIPMBasedCompare(Compare, SrcReg, MRI, &RI); -} - bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Pred, unsigned TrueReg, unsigned FalseReg, @@ -679,7 +669,9 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB, unsigned Opc; if (SystemZ::GRX32BitRegClass.hasSubClassEq(RC)) { - if (STI.hasLoadStoreOnCond2()) + if (STI.hasMiscellaneousExtensions3()) + Opc = SystemZ::SELRMux; + else if (STI.hasLoadStoreOnCond2()) Opc = SystemZ::LOCRMux; else { Opc = SystemZ::LOCR; @@ -691,9 +683,12 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB, TrueReg = TReg; FalseReg = FReg; } - } else if (SystemZ::GR64BitRegClass.hasSubClassEq(RC)) - Opc = SystemZ::LOCGR; - else + } else if (SystemZ::GR64BitRegClass.hasSubClassEq(RC)) { + if (STI.hasMiscellaneousExtensions3()) + Opc = SystemZ::SELGR; + else + Opc = SystemZ::LOCGR; + } else llvm_unreachable("Invalid register class"); BuildMI(MBB, I, DL, get(Opc), DstReg) @@ -716,7 +711,11 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned NewUseOpc; unsigned UseIdx; int CommuteIdx = -1; + bool TieOps = false; switch (UseOpc) { + case SystemZ::SELRMux: + TieOps = true; + LLVM_FALLTHROUGH; case SystemZ::LOCRMux: if (!STI.hasLoadStoreOnCond2()) return false; @@ -728,6 +727,9 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI,
MachineInstr &DefMI, else return false; break; + case SystemZ::SELGR: + TieOps = true; + LLVM_FALLTHROUGH; case SystemZ::LOCGR: if (!STI.hasLoadStoreOnCond2()) return false; @@ -749,6 +751,8 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, bool DeleteDef = MRI->hasOneNonDBGUse(Reg); UseMI.setDesc(get(NewUseOpc)); + if (TieOps) + UseMI.tieOperands(0, 1); UseMI.getOperand(UseIdx).ChangeToImmediate(ImmVal); if (DeleteDef) DefMI.eraseFromParent(); @@ -1032,73 +1036,13 @@ static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) { } } -// Used to return from convertToThreeAddress after replacing two-address -// instruction OldMI with three-address instruction NewMI. -static MachineInstr *finishConvertToThreeAddress(MachineInstr *OldMI, - MachineInstr *NewMI, - LiveVariables *LV) { - if (LV) { - unsigned NumOps = OldMI->getNumOperands(); - for (unsigned I = 1; I < NumOps; ++I) { - MachineOperand &Op = OldMI->getOperand(I); - if (Op.isReg() && Op.isKill()) - LV->replaceKillInstruction(Op.getReg(), *OldMI, *NewMI); - } - } - transferDeadCC(OldMI, NewMI); - return NewMI; -} - MachineInstr *SystemZInstrInfo::convertToThreeAddress( MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const { MachineBasicBlock *MBB = MI.getParent(); - MachineFunction *MF = MBB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - unsigned Opcode = MI.getOpcode(); - unsigned NumOps = MI.getNumOperands(); - - // Try to convert something like SLL into SLLK, if supported. - // We prefer to keep the two-operand form where possible both - // because it tends to be shorter and because some instructions - // have memory forms that can be used during spilling. - if (STI.hasDistinctOps()) { - MachineOperand &Dest = MI.getOperand(0); - MachineOperand &Src = MI.getOperand(1); - unsigned DestReg = Dest.getReg(); - unsigned SrcReg = Src.getReg(); - // AHIMux is only really a three-operand instruction when both operands - // are low registers. Try to constrain both operands to be low if - // possible. - if (Opcode == SystemZ::AHIMux && - TargetRegisterInfo::isVirtualRegister(DestReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg) && - MRI.getRegClass(DestReg)->contains(SystemZ::R1L) && - MRI.getRegClass(SrcReg)->contains(SystemZ::R1L)) { - MRI.constrainRegClass(DestReg, &SystemZ::GR32BitRegClass); - MRI.constrainRegClass(SrcReg, &SystemZ::GR32BitRegClass); - } - int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode); - if (ThreeOperandOpcode >= 0) { - // Create three address instruction without adding the implicit - // operands. Those will instead be copied over from the original - // instruction by the loop below. - MachineInstrBuilder MIB( - *MF, MF->CreateMachineInstr(get(ThreeOperandOpcode), MI.getDebugLoc(), - /*NoImplicit=*/true)); - MIB.add(Dest); - // Keep the kill state, but drop the tied flag. - MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg()); - // Keep the remaining operands as-is. - for (unsigned I = 2; I < NumOps; ++I) - MIB.add(MI.getOperand(I)); - MBB->insert(MI, MIB); - return finishConvertToThreeAddress(&MI, MIB, LV); - } - } // Try to convert an AND into an RISBG-type instruction. - if (LogicOp And = interpretAndImmediate(Opcode)) { + // TODO: It might be beneficial to select RISBG and shorten to AND instead. + if (LogicOp And = interpretAndImmediate(MI.getOpcode())) { uint64_t Imm = MI.getOperand(2).getImm() << And.ImmLSB; // AND IMMEDIATE leaves the other bits of the register unchanged. 
Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB); @@ -1126,7 +1070,16 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress( .addImm(Start) .addImm(End + 128) .addImm(0); - return finishConvertToThreeAddress(&MI, MIB, LV); + if (LV) { + unsigned NumOps = MI.getNumOperands(); + for (unsigned I = 1; I < NumOps; ++I) { + MachineOperand &Op = MI.getOperand(I); + if (Op.isReg() && Op.isKill()) + LV->replaceKillInstruction(Op.getReg(), MI, *MIB); + } + } + transferDeadCC(&MI, MIB); + return MIB; } } return nullptr; @@ -1135,7 +1088,7 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress( MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, - LiveIntervals *LIS) const { + LiveIntervals *LIS, VirtRegMap *VRM) const { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Size = MFI.getObjectSize(FrameIndex); @@ -1263,7 +1216,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( // MVCs that turn out to be redundant. if (OpNum == 0 && MI.hasOneMemOperand()) { MachineMemOperand *MMO = *MI.memoperands_begin(); - if (MMO->getSize() == Size && !MMO->isVolatile()) { + if (MMO->getSize() == Size && !MMO->isVolatile() && !MMO->isAtomic()) { // Handle conversion of loads. if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXLoad)) { return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), @@ -1289,12 +1242,37 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( } } - // If the spilled operand is the final one, try to change R - // into . + // If the spilled operand is the final one or the instruction is + // commutable, try to change R into . + unsigned NumOps = MI.getNumExplicitOperands(); int MemOpcode = SystemZ::getMemOpcode(Opcode); + + // See if this is a 3-address instruction that is convertible to 2-address + // and suitable for folding below. Only try this with virtual registers + // and a provided VRM (during regalloc). + bool NeedsCommute = false; + if (SystemZ::getTwoOperandOpcode(Opcode) != -1 && MemOpcode != -1) { + if (VRM == nullptr) + MemOpcode = -1; + else { + assert(NumOps == 3 && "Expected two source registers."); + Register DstReg = MI.getOperand(0).getReg(); + Register DstPhys = + (TRI->isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg); + Register SrcReg = (OpNum == 2 ? MI.getOperand(1).getReg() + : ((OpNum == 1 && MI.isCommutable()) + ? 
MI.getOperand(2).getReg() + : Register())); + if (DstPhys && !SystemZ::GRH32BitRegClass.contains(DstPhys) && SrcReg && + TRI->isVirtualRegister(SrcReg) && DstPhys == VRM->getPhys(SrcReg)) + NeedsCommute = (OpNum == 1); + else + MemOpcode = -1; + } + } + if (MemOpcode >= 0) { - unsigned NumOps = MI.getNumExplicitOperands(); - if (OpNum == NumOps - 1) { + if ((OpNum == NumOps - 1) || NeedsCommute) { const MCInstrDesc &MemDesc = get(MemOpcode); uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags); assert(AccessBytes != 0 && "Size of access should be known"); @@ -1302,8 +1280,12 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( uint64_t Offset = Size - AccessBytes; MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(MemOpcode)); - for (unsigned I = 0; I < OpNum; ++I) - MIB.add(MI.getOperand(I)); + MIB.add(MI.getOperand(0)); + if (NeedsCommute) + MIB.add(MI.getOperand(2)); + else + for (unsigned I = 1; I < OpNum; ++I) + MIB.add(MI.getOperand(I)); MIB.addFrameIndex(FrameIndex).addImm(Offset); if (MemDesc.TSFlags & SystemZII::HasIndex) MIB.addReg(0); @@ -1380,6 +1362,11 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { expandLOCRPseudo(MI, SystemZ::LOCR, SystemZ::LOCFHR); return true; + case SystemZ::SELRMux: + expandSELRPseudo(MI, SystemZ::SELR, SystemZ::SELFHR, + SystemZ::LOCRMux); + return true; + case SystemZ::STCMux: expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH); return true; @@ -1506,7 +1493,7 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } unsigned SystemZInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { - if (MI.getOpcode() == TargetOpcode::INLINEASM) { + if (MI.isInlineAsm()) { const MachineFunction *MF = MI.getParent()->getParent(); const char *AsmStr = MI.getOperand(0).getSymbolName(); return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); @@ -1857,7 +1844,8 @@ void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB, } bool SystemZInstrInfo:: -areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, +areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, AliasAnalysis *AA) const { if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 216139eb7c79..2edde175542e 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -1,9 +1,8 @@ //===-- SystemZInstrInfo.h - SystemZ instruction information ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -142,6 +141,11 @@ enum FusedCompareType { } // end namespace SystemZII +namespace SystemZ { +int getTwoOperandOpcode(uint16_t Opcode); +int getTargetMemOpcode(uint16_t Opcode); +} + class SystemZInstrInfo : public SystemZGenInstrInfo { const SystemZRegisterInfo RI; SystemZSubtarget &STI; @@ -158,6 +162,8 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { unsigned HighOpcode) const; void expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode) const; + void expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode, + unsigned HighOpcode, unsigned MixedOpcode) const; void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned Size) const; void expandLoadStackGuard(MachineInstr *MI) const; @@ -208,9 +214,6 @@ public: int *BytesAdded = nullptr) const override; bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &Mask, int &Value) const override; - bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int Mask, int Value, - const MachineRegisterInfo *MRI) const override; bool canInsertSelect(const MachineBasicBlock&, ArrayRef Cond, unsigned, unsigned, int&, int&, int&) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -252,7 +255,8 @@ public: foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, - LiveIntervals *LIS = nullptr) const override; + LiveIntervals *LIS = nullptr, + VirtRegMap *VRM = nullptr) const override; MachineInstr *foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, @@ -317,7 +321,8 @@ public: // addresses. This function returns true if two MIs access different // memory addresses and false otherwise. bool - areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, + areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; }; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 8d3b1011d0a7..91856893e3bd 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -1,9 +1,8 @@ //===-- SystemZInstrInfo.td - General SystemZ instructions ----*- tblgen-*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -256,7 +255,7 @@ let isCall = 1, Defs = [CC] in { } // Regular calls. 
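// Aside on the hunk below (an inference from this patch, not stated in it):
// adding FPC, the floating-point control register, to the calls' Uses list
// pairs with the "Uses = [FPC], mayRaiseFPException = 1" annotations this
// patch introduces on the FP and vector patterns, so rounding-mode-dependent
// instructions cannot be reordered across calls that may read or update the
// floating-point environment.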
-let isCall = 1, Defs = [R14D, CC] in { +let isCall = 1, Defs = [R14D, CC], Uses = [FPC] in { def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops), [(z_call pcrel32:$I2)]>; def CallBASR : Alias<2, (outs), (ins ADDR64:$R2, variable_ops), @@ -362,9 +361,6 @@ defm CondStore64 : CondStores, - Requires<[FeatureHighWord]>; def LR : UnaryRR <"lr", 0x18, null_frag, GR32, GR32>; def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>; @@ -478,6 +474,11 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in { def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>; } +// Move right. +let Predicates = [FeatureMiscellaneousExtensions3], + mayLoad = 1, mayStore = 1, Uses = [R0L] in + def MVCRL : SideEffectBinarySSE<"mvcrl", 0xE50A>; + // String moves. let mayLoad = 1, mayStore = 1, Defs = [CC] in defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>; @@ -486,6 +487,29 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in // Conditional move instructions //===----------------------------------------------------------------------===// +let Predicates = [FeatureMiscellaneousExtensions3], Uses = [CC] in { + // Select. + let isCommutable = 1 in { + // Expands to SELR or SELFHR or a branch-and-move sequence, + // depending on the choice of registers. + def SELRMux : CondBinaryRRFaPseudo; + defm SELFHR : CondBinaryRRFaPair<"selfhr", 0xB9C0, GRH32, GRH32, GRH32>; + defm SELR : CondBinaryRRFaPair<"selr", 0xB9F0, GR32, GR32, GR32>; + defm SELGR : CondBinaryRRFaPair<"selgr", 0xB9E3, GR64, GR64, GR64>; + } + + // Define AsmParser extended mnemonics for each general condition-code mask. + foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE", + "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in { + def SELRAsm#V : FixedCondBinaryRRFa, "selr", 0xB9F0, + GR32, GR32, GR32>; + def SELFHRAsm#V : FixedCondBinaryRRFa, "selfhr", 0xB9C0, + GRH32, GRH32, GRH32>; + def SELGRAsm#V : FixedCondBinaryRRFa, "selgr", 0xB9E3, + GR64, GR64, GR64>; + } +} + let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in { // Load immediate on condition. Matched via DAG pattern and created // by the PeepholeOptimizer via FoldImmediate. @@ -920,11 +944,11 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Addition of memory. defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, asextloadi16, 2>; - defm A : BinaryRXPair<"a", 0x5A, 0xE35A, z_sadd, GR32, load, 4>; + defm A : BinaryRXPairAndPseudo<"a", 0x5A, 0xE35A, z_sadd, GR32, load, 4>; def AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, asextloadi16, 2>, Requires<[FeatureMiscellaneousExtensions2]>; def AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, asextloadi32, 4>; - def AG : BinaryRXY<"ag", 0xE308, z_sadd, GR64, load, 8>; + defm AG : BinaryRXYAndPseudo<"ag", 0xE308, z_sadd, GR64, load, 8>; // Addition to memory. def ASI : BinarySIY<"asi", 0xEB6A, add, imm32sx8>; @@ -962,9 +986,9 @@ let Defs = [CC] in { Requires<[FeatureHighWord]>; // Addition of memory. - defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>; + defm AL : BinaryRXPairAndPseudo<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>; def ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, azextloadi32, 4>; - def ALG : BinaryRXY<"alg", 0xE30A, z_uadd, GR64, load, 8>; + defm ALG : BinaryRXYAndPseudo<"alg", 0xE30A, z_uadd, GR64, load, 8>; // Addition to memory. def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>; @@ -1007,11 +1031,11 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Subtraction of memory. 
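// Aside on the *AndPseudo multiclasses used throughout these hunks (a
// reading of the patch, not a statement from it): each one defines the
// usual reg/memory instruction pair plus an extra pseudo, and the
// SystemZ::getTwoOperandOpcode() / SystemZ::getMemOpcode() mappings let the
// new commutable-folding path in foldMemoryOperandImpl() rewrite a
// three-address register form into the memory form when a source register
// has been spilled.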
defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, asextloadi16, 2>; - defm S : BinaryRXPair<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>; + defm S : BinaryRXPairAndPseudo<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>; def SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, asextloadi16, 2>, Requires<[FeatureMiscellaneousExtensions2]>; def SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, asextloadi32, 4>; - def SG : BinaryRXY<"sg", 0xE309, z_ssub, GR64, load, 8>; + defm SG : BinaryRXYAndPseudo<"sg", 0xE309, z_ssub, GR64, load, 8>; } defm : SXB; @@ -1033,6 +1057,14 @@ let AddedComplexity = 1 in { (AGFI GR64:$src1, imm64sx32n:$src2)>; } +// And vice versa in one special case, where we need to load a +// constant into a register in any case, but the negated constant +// requires fewer instructions to load. +def : Pat<(z_saddo GR64:$src1, imm64lh16n:$src2), + (SGR GR64:$src1, (LLILH imm64lh16n:$src2))>; +def : Pat<(z_saddo GR64:$src1, imm64lf32n:$src2), + (SGR GR64:$src1, (LLILF imm64lf32n:$src2))>; + // Subtraction producing a carry. let Defs = [CC] in { // Subtraction of a register. @@ -1051,9 +1083,9 @@ let Defs = [CC] in { def SLGFI : BinaryRIL<"slgfi", 0xC24, z_usub, GR64, imm64zx32>; // Subtraction of memory. - defm SL : BinaryRXPair<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>; + defm SL : BinaryRXPairAndPseudo<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>; def SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, azextloadi32, 4>; - def SLG : BinaryRXY<"slg", 0xE30B, z_usub, GR64, load, 8>; + defm SLG : BinaryRXYAndPseudo<"slg", 0xE30B, z_usub, GR64, load, 8>; } defm : ZXB; @@ -1128,8 +1160,8 @@ let Defs = [CC] in { // ANDs of memory. let CCValues = 0xC, CompareZeroCCMask = 0x8 in { - defm N : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load, 4>; - def NG : BinaryRXY<"ng", 0xE380, and, GR64, load, 8>; + defm N : BinaryRXPairAndPseudo<"n", 0x54, 0xE354, and, GR32, load, 4>; + defm NG : BinaryRXYAndPseudo<"ng", 0xE380, and, GR64, load, 8>; } // AND to memory @@ -1185,8 +1217,8 @@ let Defs = [CC] in { // ORs of memory. let CCValues = 0xC, CompareZeroCCMask = 0x8 in { - defm O : BinaryRXPair<"o", 0x56, 0xE356, or, GR32, load, 4>; - def OG : BinaryRXY<"og", 0xE381, or, GR64, load, 8>; + defm O : BinaryRXPairAndPseudo<"o", 0x56, 0xE356, or, GR32, load, 4>; + defm OG : BinaryRXYAndPseudo<"og", 0xE381, or, GR64, load, 8>; } // OR to memory @@ -1225,8 +1257,8 @@ let Defs = [CC] in { // XORs of memory. let CCValues = 0xC, CompareZeroCCMask = 0x8 in { - defm X : BinaryRXPair<"x",0x57, 0xE357, xor, GR32, load, 4>; - def XG : BinaryRXY<"xg", 0xE382, xor, GR64, load, 8>; + defm X : BinaryRXPairAndPseudo<"x",0x57, 0xE357, xor, GR32, load, 4>; + defm XG : BinaryRXYAndPseudo<"xg", 0xE382, xor, GR64, load, 8>; } // XOR to memory @@ -1239,6 +1271,43 @@ let Defs = [CC] in { defm : RMWIByte; defm : RMWIByte; +//===----------------------------------------------------------------------===// +// Combined logical operations +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureMiscellaneousExtensions3], + Defs = [CC] in { + // AND with complement. + let CCValues = 0xC, CompareZeroCCMask = 0x8 in { + def NCRK : BinaryRRFa<"ncrk", 0xB9F5, andc, GR32, GR32, GR32>; + def NCGRK : BinaryRRFa<"ncgrk", 0xB9E5, andc, GR64, GR64, GR64>; + } + + // OR with complement. + let CCValues = 0xC, CompareZeroCCMask = 0x8 in { + def OCRK : BinaryRRFa<"ocrk", 0xB975, orc, GR32, GR32, GR32>; + def OCGRK : BinaryRRFa<"ocgrk", 0xB965, orc, GR64, GR64, GR64>; + } + + // NAND. 
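// Illustrative sketch (plain C++, editorial) of the combined logical
// instructions in this block (NC/OC above, NN/NO/NX below); the matching
// PatFrags (andc, orc, nand, nor, nxor) are added to SystemZOperators.td
// later in this patch:
//
//   uint64_t ncgrk(uint64_t a, uint64_t b) { return a & ~b; }   // andc
//   uint64_t ocgrk(uint64_t a, uint64_t b) { return a | ~b; }   // orc
//   uint64_t nngrk(uint64_t a, uint64_t b) { return ~(a & b); } // nand
//   uint64_t nogrk(uint64_t a, uint64_t b) { return ~(a | b); } // nor
//   uint64_t nxgrk(uint64_t a, uint64_t b) { return ~(a ^ b); } // nxor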
+ let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in { + def NNRK : BinaryRRFa<"nnrk", 0xB974, nand, GR32, GR32, GR32>; + def NNGRK : BinaryRRFa<"nngrk", 0xB964, nand, GR64, GR64, GR64>; + } + + // NOR. + let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in { + def NORK : BinaryRRFa<"nork", 0xB976, nor, GR32, GR32, GR32>; + def NOGRK : BinaryRRFa<"nogrk", 0xB966, nor, GR64, GR64, GR64>; + } + + // NXOR. + let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in { + def NXRK : BinaryRRFa<"nxrk", 0xB977, nxor, GR32, GR32, GR32>; + def NXGRK : BinaryRRFa<"nxgrk", 0xB967, nxor, GR64, GR64, GR64>; + } +} + //===----------------------------------------------------------------------===// // Multiplication //===----------------------------------------------------------------------===// @@ -1833,6 +1902,9 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { let Predicates = [FeatureMessageSecurityAssist8] in def KMA : SideEffectTernaryMemMemMemRRFb<"kma", 0xB929, GR128, GR128, GR128>; + + let Predicates = [FeatureMessageSecurityAssist9] in + def KDSA : SideEffectBinaryMemRRE<"kdsa", 0xB93A, GR64, GR128>; } //===----------------------------------------------------------------------===// @@ -2013,7 +2085,12 @@ let Defs = [CC] in def : Pat<(ctlz GR64:$src), (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>; -// Population count. Counts bits set per byte. +// Population count. Counts bits set per byte or doubleword. +let Predicates = [FeatureMiscellaneousExtensions3] in { + let Defs = [CC] in + def POPCNTOpt : BinaryRRFc<"popcnt", 0xB9E1, GR64, GR64>; + def : Pat<(ctpop GR64:$src), (POPCNTOpt GR64:$src, 8)>; +} let Predicates = [FeaturePopulationCount], Defs = [CC] in def POPCNT : UnaryRRE<"popcnt", 0xB9E1, z_popcnt, GR64, GR64>; @@ -2044,6 +2121,17 @@ let mayLoad = 1, Defs = [CC] in let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>; +// Sort lists. +let Predicates = [FeatureEnhancedSort], + mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L, R1D] in + def SORTL : SideEffectBinaryMemMemRRE<"sortl", 0xB938, GR128, GR128>; + +// Deflate conversion call. +let Predicates = [FeatureDeflateConversion], + mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L, R1D] in + def DFLTCC : SideEffectTernaryMemMemRRFa<"dfltcc", 0xB939, + GR128, GR128, GR64>; + // Execute. let hasSideEffects = 1 in { def EX : SideEffectBinaryRX<"ex", 0x44, GR64>; @@ -2186,6 +2274,22 @@ let AddedComplexity = 4 in { (RLLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; } +// Substitute (x*64-s) with (-s), since shift/rotate instructions only +// use the last 6 bits of the second operand register (making it modulo 64). +let AddedComplexity = 4 in { + def : Pat<(shl GR64:$val, (sub imm32mod64, GR32:$shift)), + (SLLG GR64:$val, (LCR GR32:$shift), 0)>; + + def : Pat<(sra GR64:$val, (sub imm32mod64, GR32:$shift)), + (SRAG GR64:$val, (LCR GR32:$shift), 0)>; + + def : Pat<(srl GR64:$val, (sub imm32mod64, GR32:$shift)), + (SRLG GR64:$val, (LCR GR32:$shift), 0)>; + + def : Pat<(rotl GR64:$val, (sub imm32mod64, GR32:$shift)), + (RLLG GR64:$val, (LCR GR32:$shift), 0)>; +} + // Peepholes for turning scalar operations into block operations. 
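// Aside on the (x*64 - s) substitution block just above (a sketch assuming
// ordinary two's-complement wrap-around): SLLG/SRAG/SRLG/RLLG read only the
// low 6 bits of the shift-amount register, and for any s
//
//   (64 - s) & 63 == (0 - s) & 63
//
// so the subtraction from a multiple of 64 can be replaced by a single LCR
// (load complement) of the shift amount.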
defm : BlockLoadStore; diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td index c351577fa5bd..ecce16c9cd73 100644 --- a/lib/Target/SystemZ/SystemZInstrSystem.td +++ b/lib/Target/SystemZ/SystemZInstrSystem.td @@ -1,9 +1,8 @@ //==- SystemZInstrSystem.td - SystemZ system instructions -*- tblgen-*-----==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 6c97b85277c3..261727f89058 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -1,9 +1,8 @@ //==- SystemZInstrVector.td - SystemZ Vector instructions ------*- tblgen-*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -104,7 +103,7 @@ let Predicates = [FeatureVector] in { let Predicates = [FeatureVector] in { // Load. - def VL : UnaryVRX<"vl", 0xE706, null_frag, v128any, 16>; + defm VL : UnaryVRXAlign<"vl", 0xE706>; // Load to block boundary. The number of loaded bytes is only known // at run time. The instruction is really polymorphic, but v128b matches @@ -123,7 +122,7 @@ let Predicates = [FeatureVector] in { def VLL : BinaryVRSb<"vll", 0xE737, int_s390_vll, 0>; // Load multiple. - def VLM : LoadMultipleVRSa<"vlm", 0xE736>; + defm VLM : LoadMultipleVRSaAlign<"vlm", 0xE736>; // Load and replicate def VLREP : UnaryVRXGeneric<"vlrep", 0xE705>; @@ -208,13 +207,13 @@ defm : ReplicatePeephole; let Predicates = [FeatureVector] in { // Store. - def VST : StoreVRX<"vst", 0xE70E, null_frag, v128any, 16>; + defm VST : StoreVRXAlign<"vst", 0xE70E>; // Store with length. The number of stored bytes is only known at run time. def VSTL : StoreLengthVRSb<"vstl", 0xE73F, int_s390_vstl, 0>; // Store multiple. - def VSTM : StoreMultipleVRSa<"vstm", 0xE73E>; + defm VSTM : StoreMultipleVRSaAlign<"vstm", 0xE73E>; // Store element. def VSTEB : StoreBinaryVRX<"vsteb", 0xE708, z_vstei8, v128b, 1, imm32zx4>; @@ -249,6 +248,81 @@ let Predicates = [FeatureVectorPackedDecimal] in { def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>; } +//===----------------------------------------------------------------------===// +// Byte swaps +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorEnhancements2] in { + // Load byte-reversed elements. + def VLBR : UnaryVRXGeneric<"vlbr", 0xE606>; + def VLBRH : UnaryVRX<"vlbrh", 0xE606, z_loadbswap, v128h, 16, 1>; + def VLBRF : UnaryVRX<"vlbrf", 0xE606, z_loadbswap, v128f, 16, 2>; + def VLBRG : UnaryVRX<"vlbrg", 0xE606, z_loadbswap, v128g, 16, 3>; + def VLBRQ : UnaryVRX<"vlbrq", 0xE606, null_frag, v128q, 16, 4>; + + // Load elements reversed. 
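// Editorial clarification of the two load flavors (standard z/Architecture
// semantics, not spelled out in the patch): for a v4i32 whose elements sit
// in memory as {A,B,C,D}, VLBRF above loads {bswap(A),bswap(B),bswap(C),
// bswap(D)} -- bytes reversed within each element -- while VLERF, defined
// next, loads {D,C,B,A} -- element order reversed, bytes within each
// element untouched.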
+ def VLER : UnaryVRXGeneric<"vler", 0xE607>; + def VLERH : UnaryVRX<"vlerh", 0xE607, z_loadeswap, v128h, 16, 1>; + def VLERF : UnaryVRX<"vlerf", 0xE607, z_loadeswap, v128f, 16, 2>; + def VLERG : UnaryVRX<"vlerg", 0xE607, z_loadeswap, v128g, 16, 3>; + def : Pat<(v4f32 (z_loadeswap bdxaddr12only:$addr)), + (VLERF bdxaddr12only:$addr)>; + def : Pat<(v2f64 (z_loadeswap bdxaddr12only:$addr)), + (VLERG bdxaddr12only:$addr)>; + def : Pat<(v16i8 (z_loadeswap bdxaddr12only:$addr)), + (VLBRQ bdxaddr12only:$addr)>; + + // Load byte-reversed element. + def VLEBRH : TernaryVRX<"vlebrh", 0xE601, z_vlebri16, v128h, v128h, 2, imm32zx3>; + def VLEBRF : TernaryVRX<"vlebrf", 0xE603, z_vlebri32, v128f, v128f, 4, imm32zx2>; + def VLEBRG : TernaryVRX<"vlebrg", 0xE602, z_vlebri64, v128g, v128g, 8, imm32zx1>; + + // Load byte-reversed element and zero. + def VLLEBRZ : UnaryVRXGeneric<"vllebrz", 0xE604>; + def VLLEBRZH : UnaryVRX<"vllebrzh", 0xE604, z_vllebrzi16, v128h, 2, 1>; + def VLLEBRZF : UnaryVRX<"vllebrzf", 0xE604, z_vllebrzi32, v128f, 4, 2>; + def VLLEBRZG : UnaryVRX<"vllebrzg", 0xE604, z_vllebrzi64, v128g, 8, 3>; + def VLLEBRZE : UnaryVRX<"vllebrze", 0xE604, z_vllebrzli32, v128f, 4, 6>; + def : InstAlias<"lerv\t$V1, $XBD2", + (VLLEBRZE VR128:$V1, bdxaddr12only:$XBD2), 0>; + def : InstAlias<"ldrv\t$V1, $XBD2", + (VLLEBRZG VR128:$V1, bdxaddr12only:$XBD2), 0>; + + // Load byte-reversed element and replicate. + def VLBRREP : UnaryVRXGeneric<"vlbrrep", 0xE605>; + def VLBRREPH : UnaryVRX<"vlbrreph", 0xE605, z_replicate_loadbswapi16, v128h, 2, 1>; + def VLBRREPF : UnaryVRX<"vlbrrepf", 0xE605, z_replicate_loadbswapi32, v128f, 4, 2>; + def VLBRREPG : UnaryVRX<"vlbrrepg", 0xE605, z_replicate_loadbswapi64, v128g, 8, 3>; + + // Store byte-reversed elements. + def VSTBR : StoreVRXGeneric<"vstbr", 0xE60E>; + def VSTBRH : StoreVRX<"vstbrh", 0xE60E, z_storebswap, v128h, 16, 1>; + def VSTBRF : StoreVRX<"vstbrf", 0xE60E, z_storebswap, v128f, 16, 2>; + def VSTBRG : StoreVRX<"vstbrg", 0xE60E, z_storebswap, v128g, 16, 3>; + def VSTBRQ : StoreVRX<"vstbrq", 0xE60E, null_frag, v128q, 16, 4>; + + // Store elements reversed. + def VSTER : StoreVRXGeneric<"vster", 0xE60F>; + def VSTERH : StoreVRX<"vsterh", 0xE60F, z_storeeswap, v128h, 16, 1>; + def VSTERF : StoreVRX<"vsterf", 0xE60F, z_storeeswap, v128f, 16, 2>; + def VSTERG : StoreVRX<"vsterg", 0xE60F, z_storeeswap, v128g, 16, 3>; + def : Pat<(z_storeeswap (v4f32 VR128:$val), bdxaddr12only:$addr), + (VSTERF VR128:$val, bdxaddr12only:$addr)>; + def : Pat<(z_storeeswap (v2f64 VR128:$val), bdxaddr12only:$addr), + (VSTERG VR128:$val, bdxaddr12only:$addr)>; + def : Pat<(z_storeeswap (v16i8 VR128:$val), bdxaddr12only:$addr), + (VSTBRQ VR128:$val, bdxaddr12only:$addr)>; + + // Store byte-reversed element. 
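// Editorial guess, flagged as such: the sterv/stdrv aliases below look like
// the store-side counterparts of the lerv/ldrv aliases above, i.e. 4- and
// 8-byte byte-reversed FP load/store pairs analogous to what LRV/STRV
// provide for general registers.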
+ def VSTEBRH : StoreBinaryVRX<"vstebrh", 0xE609, z_vstebri16, v128h, 2, imm32zx3>; + def VSTEBRF : StoreBinaryVRX<"vstebrf", 0xE60B, z_vstebri32, v128f, 4, imm32zx2>; + def VSTEBRG : StoreBinaryVRX<"vstebrg", 0xE60A, z_vstebri64, v128g, 8, imm32zx1>; + def : InstAlias<"sterv\t$V1, $XBD2", + (VSTEBRF VR128:$V1, bdxaddr12only:$XBD2, 0), 0>; + def : InstAlias<"stdrv\t$V1, $XBD2", + (VSTEBRG VR128:$V1, bdxaddr12only:$XBD2, 0), 0>; +} + //===----------------------------------------------------------------------===// // Selects and permutes //===----------------------------------------------------------------------===// @@ -707,6 +781,10 @@ let Predicates = [FeatureVector] in { def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8:$z), (VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>; + // Shift left double by bit. + let Predicates = [FeatureVectorEnhancements2] in + def VSLD : TernaryVRId<"vsld", 0xE786, int_s390_vsld, v128b, v128b, 0>; + // Shift right arithmetic. def VSRA : BinaryVRRc<"vsra", 0xE77E, int_s390_vsra, v128b, v128b>; @@ -719,6 +797,10 @@ let Predicates = [FeatureVector] in { // Shift right logical by byte. def VSRLB : BinaryVRRc<"vsrlb", 0xE77D, int_s390_vsrlb, v128b, v128b>; + // Shift right double by bit. + let Predicates = [FeatureVectorEnhancements2] in + def VSRD : TernaryVRId<"vsrd", 0xE787, int_s390_vsrd, v128b, v128b, 0>; + // Subtract. def VS : BinaryVRRcGeneric<"vs", 0xE7F7>; def VSB : BinaryVRRc<"vsb", 0xE7F7, sub, v128b, v128b, 0>; @@ -925,126 +1007,190 @@ let Predicates = [FeatureVector] in { // See comments in SystemZInstrFP.td for the suppression flags and // rounding modes. multiclass VectorRounding { - def : FPConversion; - def : FPConversion; - def : FPConversion; - def : FPConversion; - def : FPConversion; - def : FPConversion; + def : FPConversion; + def : FPConversion; + def : FPConversion; + def : FPConversion; + def : FPConversion; + def : FPConversion; } let Predicates = [FeatureVector] in { // Add. - def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>; - def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>; - def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>; - let Predicates = [FeatureVectorEnhancements1] in { - def VFASB : BinaryVRRc<"vfasb", 0xE7E3, fadd, v128sb, v128sb, 2, 0>; - def WFASB : BinaryVRRc<"wfasb", 0xE7E3, fadd, v32sb, v32sb, 2, 8>; - def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, fadd, v128xb, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>; + def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>; + def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>; + def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8>; + def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>; + } } - // Convert from fixed 64-bit. - def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>; - def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>; - def WCDGB : TernaryVRRa<"wcdgb", 0xE7C3, null_frag, v64db, v64g, 3, 8>; + // Convert from fixed. 
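// Aside on the any_* operators used from here on (standard LLVM usage, not
// specific to this patch): any_fadd, any_fsub, etc. are fragments matching
// both the normal DAG node (fadd) and its constrained strict-FP twin
// (strict_fadd), so a single pattern serves both modes; together with
// "Uses = [FPC], mayRaiseFPException = 1" this models instructions that
// read the rounding mode and may trap.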
+ let Uses = [FPC], mayRaiseFPException = 1 in { + def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>; + def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>; + def WCDGB : TernaryVRRa<"wcdgb", 0xE7C3, null_frag, v64db, v64g, 3, 8>; + } def : FPConversion; + let Predicates = [FeatureVectorEnhancements2] in { + let Uses = [FPC], mayRaiseFPException = 1 in { + let isAsmParserOnly = 1 in + def VCFPS : TernaryVRRaFloatGeneric<"vcfps", 0xE7C3>; + def VCEFB : TernaryVRRa<"vcefb", 0xE7C3, null_frag, v128sb, v128g, 2, 0>; + def WCEFB : TernaryVRRa<"wcefb", 0xE7C3, null_frag, v32sb, v32f, 2, 8>; + } + def : FPConversion; + } - // Convert from logical 64-bit. - def VCDLG : TernaryVRRaFloatGeneric<"vcdlg", 0xE7C1>; - def VCDLGB : TernaryVRRa<"vcdlgb", 0xE7C1, null_frag, v128db, v128g, 3, 0>; - def WCDLGB : TernaryVRRa<"wcdlgb", 0xE7C1, null_frag, v64db, v64g, 3, 8>; + // Convert from logical. + let Uses = [FPC], mayRaiseFPException = 1 in { + def VCDLG : TernaryVRRaFloatGeneric<"vcdlg", 0xE7C1>; + def VCDLGB : TernaryVRRa<"vcdlgb", 0xE7C1, null_frag, v128db, v128g, 3, 0>; + def WCDLGB : TernaryVRRa<"wcdlgb", 0xE7C1, null_frag, v64db, v64g, 3, 8>; + } def : FPConversion; + let Predicates = [FeatureVectorEnhancements2] in { + let Uses = [FPC], mayRaiseFPException = 1 in { + let isAsmParserOnly = 1 in + def VCFPL : TernaryVRRaFloatGeneric<"vcfpl", 0xE7C1>; + def VCELFB : TernaryVRRa<"vcelfb", 0xE7C1, null_frag, v128sb, v128g, 2, 0>; + def WCELFB : TernaryVRRa<"wcelfb", 0xE7C1, null_frag, v32sb, v32f, 2, 8>; + } + def : FPConversion; + } - // Convert to fixed 64-bit. - def VCGD : TernaryVRRaFloatGeneric<"vcgd", 0xE7C2>; - def VCGDB : TernaryVRRa<"vcgdb", 0xE7C2, null_frag, v128g, v128db, 3, 0>; - def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>; + // Convert to fixed. + let Uses = [FPC], mayRaiseFPException = 1 in { + def VCGD : TernaryVRRaFloatGeneric<"vcgd", 0xE7C2>; + def VCGDB : TernaryVRRa<"vcgdb", 0xE7C2, null_frag, v128g, v128db, 3, 0>; + def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>; + } // Rounding mode should agree with SystemZInstrFP.td. def : FPConversion; + let Predicates = [FeatureVectorEnhancements2] in { + let Uses = [FPC], mayRaiseFPException = 1 in { + let isAsmParserOnly = 1 in + def VCSFP : TernaryVRRaFloatGeneric<"vcsfp", 0xE7C2>; + def VCFEB : TernaryVRRa<"vcfeb", 0xE7C2, null_frag, v128sb, v128g, 2, 0>; + def WCFEB : TernaryVRRa<"wcfeb", 0xE7C2, null_frag, v32sb, v32f, 2, 8>; + } + // Rounding mode should agree with SystemZInstrFP.td. + def : FPConversion; + } - // Convert to logical 64-bit. - def VCLGD : TernaryVRRaFloatGeneric<"vclgd", 0xE7C0>; - def VCLGDB : TernaryVRRa<"vclgdb", 0xE7C0, null_frag, v128g, v128db, 3, 0>; - def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>; + // Convert to logical. + let Uses = [FPC], mayRaiseFPException = 1 in { + def VCLGD : TernaryVRRaFloatGeneric<"vclgd", 0xE7C0>; + def VCLGDB : TernaryVRRa<"vclgdb", 0xE7C0, null_frag, v128g, v128db, 3, 0>; + def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>; + } // Rounding mode should agree with SystemZInstrFP.td. 
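// Editorial note on the FPConversion helpers (values recalled from
// SystemZInstrFP.td, so treat as a hedge rather than a quote): their
// trailing integer operands are an inexact-suppression flag and a rounding
// modifier; fp_to_sint/fp_to_uint use modifier 5, round toward zero, which
// is the agreement with SystemZInstrFP.td the comment above refers to.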
def : FPConversion; + let Predicates = [FeatureVectorEnhancements2] in { + let Uses = [FPC], mayRaiseFPException = 1 in { + let isAsmParserOnly = 1 in + def VCLFP : TernaryVRRaFloatGeneric<"vclfp", 0xE7C0>; + def VCLFEB : TernaryVRRa<"vclfeb", 0xE7C0, null_frag, v128sb, v128g, 2, 0>; + def WCLFEB : TernaryVRRa<"wclfeb", 0xE7C0, null_frag, v32sb, v32f, 2, 8>; + } + // Rounding mode should agree with SystemZInstrFP.td. + def : FPConversion; + } // Divide. - def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>; - def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>; - def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>; - let Predicates = [FeatureVectorEnhancements1] in { - def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, fdiv, v128sb, v128sb, 2, 0>; - def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, fdiv, v32sb, v32sb, 2, 8>; - def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, fdiv, v128xb, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>; + def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, any_fdiv, v128db, v128db, 3, 0>; + def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, any_fdiv, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, any_fdiv, v128sb, v128sb, 2, 0>; + def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, any_fdiv, v32sb, v32sb, 2, 8>; + def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, any_fdiv, v128xb, v128xb, 4, 8>; + } } // Load FP integer. - def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>; - def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, int_s390_vfidb, v128db, v128db, 3, 0>; - def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>; + def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, int_s390_vfidb, v128db, v128db, 3, 0>; + def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>; + } defm : VectorRounding; defm : VectorRounding; let Predicates = [FeatureVectorEnhancements1] in { - def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>; - def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>; - def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>; + def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>; + def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>; + } defm : VectorRounding; defm : VectorRounding; defm : VectorRounding; } // Load lengthened. 
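// Clarifying aside: "load lengthened" is the z/Architecture name for FP
// extension -- VLDEB/WLDEB widen each short (f32) element to long (f64) --
// hence the any_fpextend patterns attached to these definitions.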
- def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>; - def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>; - def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32sb, 2, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>; + def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>; + def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, any_fpextend, v64db, v32sb, 2, 8>; + } let Predicates = [FeatureVectorEnhancements1] in { - let isAsmParserOnly = 1 in { - def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>; - def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>; - def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + let isAsmParserOnly = 1 in { + def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>; + def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>; + def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>; + } + def WFLLD : UnaryVRRa<"wflld", 0xE7C4, any_fpextend, v128xb, v64db, 3, 8>; } - def WFLLD : UnaryVRRa<"wflld", 0xE7C4, fpextend, v128xb, v64db, 3, 8>; - def : Pat<(f128 (fpextend (f32 VR32:$src))), + def : Pat<(f128 (any_fpextend (f32 VR32:$src))), (WFLLD (WLDEB VR32:$src))>; } // Load rounded. - def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>; - def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; - def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>; + def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; + } def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>; - def : FPConversion; + def : FPConversion; let Predicates = [FeatureVectorEnhancements1] in { - let isAsmParserOnly = 1 in { - def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>; - def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; - def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + let isAsmParserOnly = 1 in { + def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>; + def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; + } + def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>; } - def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>; - def : FPConversion; - def : Pat<(f32 (fpround (f128 VR128:$src))), + def : FPConversion; + def : Pat<(f32 (any_fpround (f128 VR128:$src))), (WLEDB (WFLRX VR128:$src, 0, 3), 0, 0)>; } // Maximum. 
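// Aside on the f128 -> f32 truncation pattern just above (an
// interpretation, not from the patch text): the intermediate WFLRX uses
// rounding modifier 3, "round to prepare for shorter precision", so the
// two-step truncation through f64 still yields the correctly
// single-rounded f32 result.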
multiclass VectorMax { - def : FPMinMax; + def : FPMinMax; def : FPMinMax; } let Predicates = [FeatureVectorEnhancements1] in { - def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>; - def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb, - v128db, v128db, 3, 0>; - def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag, - v64db, v64db, 3, 8>; - def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb, - v128sb, v128sb, 2, 0>; - def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag, - v32sb, v32sb, 2, 8>; - def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag, - v128xb, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>; + def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb, + v128db, v128db, 3, 0>; + def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag, + v64db, v64db, 3, 8>; + def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb, + v128sb, v128sb, 2, 0>; + def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag, + v32sb, v32sb, 2, 8>; + def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag, + v128xb, v128xb, 4, 8>; + } defm : VectorMax; defm : VectorMax; defm : VectorMax; @@ -1054,21 +1200,23 @@ let Predicates = [FeatureVector] in { // Minimum. multiclass VectorMin { - def : FPMinMax; + def : FPMinMax; def : FPMinMax; } let Predicates = [FeatureVectorEnhancements1] in { - def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>; - def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb, - v128db, v128db, 3, 0>; - def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag, - v64db, v64db, 3, 8>; - def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb, - v128sb, v128sb, 2, 0>; - def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag, - v32sb, v32sb, 2, 8>; - def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag, - v128xb, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>; + def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb, + v128db, v128db, 3, 0>; + def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag, + v64db, v64db, 3, 8>; + def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb, + v128sb, v128sb, 2, 0>; + def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag, + v32sb, v32sb, 2, 8>; + def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag, + v128xb, v128xb, 4, 8>; + } defm : VectorMin; defm : VectorMin; defm : VectorMin; @@ -1077,53 +1225,61 @@ let Predicates = [FeatureVector] in { } // Multiply. 
- def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>; - def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>; - def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>; - let Predicates = [FeatureVectorEnhancements1] in { - def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, fmul, v128sb, v128sb, 2, 0>; - def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, fmul, v32sb, v32sb, 2, 8>; - def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, fmul, v128xb, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>; + def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, any_fmul, v128db, v128db, 3, 0>; + def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, any_fmul, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, any_fmul, v128sb, v128sb, 2, 0>; + def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, any_fmul, v32sb, v32sb, 2, 8>; + def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, any_fmul, v128xb, v128xb, 4, 8>; + } } // Multiply and add. - def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>; - def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>; - def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>; - let Predicates = [FeatureVectorEnhancements1] in { - def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, fma, v128sb, v128sb, 0, 2>; - def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, fma, v32sb, v32sb, 8, 2>; - def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, fma, v128xb, v128xb, 8, 4>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>; + def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>; + def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, any_fma, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>; + def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, any_fma, v32sb, v32sb, 8, 2>; + def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>; + } } // Multiply and subtract. - def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>; - def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>; - def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>; - let Predicates = [FeatureVectorEnhancements1] in { - def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, fms, v128sb, v128sb, 0, 2>; - def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, fms, v32sb, v32sb, 8, 2>; - def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, fms, v128xb, v128xb, 8, 4>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>; + def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, any_fms, v128db, v128db, 0, 3>; + def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, any_fms, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, any_fms, v128sb, v128sb, 0, 2>; + def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, any_fms, v32sb, v32sb, 8, 2>; + def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, any_fms, v128xb, v128xb, 8, 4>; + } } // Negative multiply and add. 
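// Illustrative sketch (plain C++, single-rounding fused semantics assumed)
// of the four multiply-accumulate families wired up above and below:
//
//   double vfma (double a, double b, double c) { return  (a * b + c); }
//   double vfms (double a, double b, double c) { return  (a * b - c); }
//   double vfnma(double a, double b, double c) { return -(a * b + c); }
//   double vfnms(double a, double b, double c) { return -(a * b - c); }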
- let Predicates = [FeatureVectorEnhancements1] in { + let Uses = [FPC], mayRaiseFPException = 1, + Predicates = [FeatureVectorEnhancements1] in { def VFNMA : TernaryVRReFloatGeneric<"vfnma", 0xE79F>; - def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, fnma, v128db, v128db, 0, 3>; - def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, fnma, v64db, v64db, 8, 3>; - def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, fnma, v128sb, v128sb, 0, 2>; - def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, fnma, v32sb, v32sb, 8, 2>; - def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, fnma, v128xb, v128xb, 8, 4>; + def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, any_fnma, v128db, v128db, 0, 3>; + def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, any_fnma, v64db, v64db, 8, 3>; + def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, any_fnma, v128sb, v128sb, 0, 2>; + def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, any_fnma, v32sb, v32sb, 8, 2>; + def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, any_fnma, v128xb, v128xb, 8, 4>; } // Negative multiply and subtract. - let Predicates = [FeatureVectorEnhancements1] in { + let Uses = [FPC], mayRaiseFPException = 1, + Predicates = [FeatureVectorEnhancements1] in { def VFNMS : TernaryVRReFloatGeneric<"vfnms", 0xE79E>; - def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, fnms, v128db, v128db, 0, 3>; - def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, fnms, v64db, v64db, 8, 3>; - def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, fnms, v128sb, v128sb, 0, 2>; - def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, fnms, v32sb, v32sb, 8, 2>; - def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, fnms, v128xb, v128xb, 8, 4>; + def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, any_fnms, v128db, v128db, 0, 3>; + def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, any_fnms, v64db, v64db, 8, 3>; + def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, any_fnms, v128sb, v128sb, 0, 2>; + def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, any_fnms, v32sb, v32sb, 8, 2>; + def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, any_fnms, v128xb, v128xb, 8, 4>; } // Perform sign operation. @@ -1164,23 +1320,27 @@ let Predicates = [FeatureVector] in { } // Square root. - def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>; - def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>; - def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>; - let Predicates = [FeatureVectorEnhancements1] in { - def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, fsqrt, v128sb, v128sb, 2, 0>; - def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, fsqrt, v32sb, v32sb, 2, 8>; - def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, fsqrt, v128xb, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>; + def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, any_fsqrt, v128db, v128db, 3, 0>; + def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, any_fsqrt, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, any_fsqrt, v128sb, v128sb, 2, 0>; + def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, any_fsqrt, v32sb, v32sb, 2, 8>; + def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, any_fsqrt, v128xb, v128xb, 4, 8>; + } } // Subtract. 
- def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>; - def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>; - def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>; - let Predicates = [FeatureVectorEnhancements1] in { - def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, fsub, v128sb, v128sb, 2, 0>; - def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, fsub, v32sb, v32sb, 2, 8>; - def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, fsub, v128xb, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>; + def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>; + def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, any_fsub, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>; + def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb, 2, 8>; + def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>; + } } // Test data class immediate. @@ -1202,7 +1362,7 @@ let Predicates = [FeatureVector] in { let Predicates = [FeatureVector] in { // Compare scalar. - let Defs = [CC] in { + let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in { def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>; def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>; let Predicates = [FeatureVectorEnhancements1] in { @@ -1212,7 +1372,7 @@ let Predicates = [FeatureVector] in { } // Compare and signal scalar. - let Defs = [CC] in { + let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in { def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>; def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>; let Predicates = [FeatureVectorEnhancements1] in { @@ -1222,22 +1382,25 @@ let Predicates = [FeatureVector] in { } // Compare equal. - def VFCE : BinaryVRRcSPairFloatGeneric<"vfce", 0xE7E8>; - defm VFCEDB : BinaryVRRcSPair<"vfcedb", 0xE7E8, z_vfcmpe, z_vfcmpes, - v128g, v128db, 3, 0>; - defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag, - v64g, v64db, 3, 8>; - let Predicates = [FeatureVectorEnhancements1] in { - defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes, - v128f, v128sb, 2, 0>; - defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag, - v32f, v32sb, 2, 8>; - defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag, - v128q, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFCE : BinaryVRRcSPairFloatGeneric<"vfce", 0xE7E8>; + defm VFCEDB : BinaryVRRcSPair<"vfcedb", 0xE7E8, z_vfcmpe, z_vfcmpes, + v128g, v128db, 3, 0>; + defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag, + v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes, + v128f, v128sb, 2, 0>; + defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } } // Compare and signal equal. - let Predicates = [FeatureVectorEnhancements1] in { + let Uses = [FPC], mayRaiseFPException = 1, + Predicates = [FeatureVectorEnhancements1] in { defm VFKEDB : BinaryVRRcSPair<"vfkedb", 0xE7E8, null_frag, null_frag, v128g, v128db, 3, 4>; defm WFKEDB : BinaryVRRcSPair<"wfkedb", 0xE7E8, null_frag, null_frag, @@ -1251,22 +1414,25 @@ let Predicates = [FeatureVector] in { } // Compare high. 
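// Clarifying aside (standard z/Architecture behavior): the plain compares
// (WFC*, VFCE*, VFCH*) are quiet -- they raise an invalid-operation
// exception only for signaling NaNs -- whereas the "compare and signal"
// forms (WFK*, VFKE*, VFKH*) also signal on quiet NaNs, matching IEEE 754
// signaling comparisons.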
- def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>; - defm VFCHDB : BinaryVRRcSPair<"vfchdb", 0xE7EB, z_vfcmph, z_vfcmphs, - v128g, v128db, 3, 0>; - defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag, - v64g, v64db, 3, 8>; - let Predicates = [FeatureVectorEnhancements1] in { - defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs, - v128f, v128sb, 2, 0>; - defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag, - v32f, v32sb, 2, 8>; - defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag, - v128q, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>; + defm VFCHDB : BinaryVRRcSPair<"vfchdb", 0xE7EB, z_vfcmph, z_vfcmphs, + v128g, v128db, 3, 0>; + defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag, + v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs, + v128f, v128sb, 2, 0>; + defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } } // Compare and signal high. - let Predicates = [FeatureVectorEnhancements1] in { + let Uses = [FPC], mayRaiseFPException = 1, + Predicates = [FeatureVectorEnhancements1] in { defm VFKHDB : BinaryVRRcSPair<"vfkhdb", 0xE7EB, null_frag, null_frag, v128g, v128db, 3, 4>; defm WFKHDB : BinaryVRRcSPair<"wfkhdb", 0xE7EB, null_frag, null_frag, @@ -1280,22 +1446,25 @@ let Predicates = [FeatureVector] in { } // Compare high or equal. - def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>; - defm VFCHEDB : BinaryVRRcSPair<"vfchedb", 0xE7EA, z_vfcmphe, z_vfcmphes, - v128g, v128db, 3, 0>; - defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag, - v64g, v64db, 3, 8>; - let Predicates = [FeatureVectorEnhancements1] in { - defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes, - v128f, v128sb, 2, 0>; - defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag, - v32f, v32sb, 2, 8>; - defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag, - v128q, v128xb, 4, 8>; + let Uses = [FPC], mayRaiseFPException = 1 in { + def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>; + defm VFCHEDB : BinaryVRRcSPair<"vfchedb", 0xE7EA, z_vfcmphe, z_vfcmphes, + v128g, v128db, 3, 0>; + defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag, + v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes, + v128f, v128sb, 2, 0>; + defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } } // Compare and signal high or equal. 
- let Predicates = [FeatureVectorEnhancements1] in { + let Uses = [FPC], mayRaiseFPException = 1, + Predicates = [FeatureVectorEnhancements1] in { defm VFKHEDB : BinaryVRRcSPair<"vfkhedb", 0xE7EA, null_frag, null_frag, v128g, v128db, 3, 4>; defm WFKHEDB : BinaryVRRcSPair<"wfkhedb", 0xE7EA, null_frag, null_frag, @@ -1520,6 +1689,24 @@ let Predicates = [FeatureVector] in { z_vstrcz_cc, v128f, v128f, 2, 2>; } +let Predicates = [FeatureVectorEnhancements2] in { + defm VSTRS : TernaryExtraVRRdGeneric<"vstrs", 0xE78B>; + defm VSTRSB : TernaryExtraVRRd<"vstrsb", 0xE78B, + z_vstrs_cc, v128b, v128b, 0>; + defm VSTRSH : TernaryExtraVRRd<"vstrsh", 0xE78B, + z_vstrs_cc, v128b, v128h, 1>; + defm VSTRSF : TernaryExtraVRRd<"vstrsf", 0xE78B, + z_vstrs_cc, v128b, v128f, 2>; + let Defs = [CC] in { + def VSTRSZB : TernaryVRRd<"vstrszb", 0xE78B, + z_vstrsz_cc, v128b, v128b, 0, 2>; + def VSTRSZH : TernaryVRRd<"vstrszh", 0xE78B, + z_vstrsz_cc, v128b, v128h, 1, 2>; + def VSTRSZF : TernaryVRRd<"vstrszf", 0xE78B, + z_vstrsz_cc, v128b, v128f, 2, 2>; + } +} + //===----------------------------------------------------------------------===// // Packed-decimal instructions //===----------------------------------------------------------------------===// @@ -1531,6 +1718,10 @@ let Predicates = [FeatureVectorPackedDecimal] in { def VUPKZ : StoreLengthVSI<"vupkz", 0xE63C, null_frag, 0>; let Defs = [CC] in { + let Predicates = [FeatureVectorPackedDecimalEnhancement] in { + def VCVBOpt : TernaryVRRi<"vcvb", 0xE650, GR32>; + def VCVBGOpt : TernaryVRRi<"vcvbg", 0xE652, GR64>; + } def VCVB : BinaryVRRi<"vcvb", 0xE650, GR32>; def VCVBG : BinaryVRRi<"vcvbg", 0xE652, GR64>; def VCVD : TernaryVRIi<"vcvd", 0xE658, GR32>; diff --git a/lib/Target/SystemZ/SystemZLDCleanup.cpp b/lib/Target/SystemZ/SystemZLDCleanup.cpp index f532e9e23b1f..06d893d043e9 100644 --- a/lib/Target/SystemZ/SystemZLDCleanup.cpp +++ b/lib/Target/SystemZ/SystemZLDCleanup.cpp @@ -1,9 +1,8 @@ //===-- SystemZLDCleanup.cpp - Clean up local-dynamic TLS accesses --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp index 802962bd4db0..95d7e22dec32 100644 --- a/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -1,9 +1,8 @@ //===-- SystemZLongBranch.cpp - Branch lengthening for SystemZ ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZMCInstLower.cpp b/lib/Target/SystemZ/SystemZMCInstLower.cpp index 2655e4866b20..ef39f80a94ef 100644 --- a/lib/Target/SystemZ/SystemZMCInstLower.cpp +++ b/lib/Target/SystemZ/SystemZMCInstLower.cpp @@ -1,9 +1,8 @@ //===-- SystemZMCInstLower.cpp - Lower MachineInstr to MCInst -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZMCInstLower.h b/lib/Target/SystemZ/SystemZMCInstLower.h index 7173cfa42959..14ad06488312 100644 --- a/lib/Target/SystemZ/SystemZMCInstLower.h +++ b/lib/Target/SystemZ/SystemZMCInstLower.h @@ -1,9 +1,8 @@ //===-- SystemZMCInstLower.h - Lower MachineInstr to MCInst ----*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp index 1a7c0d7f687a..9b6aa3593ce0 100644 --- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp +++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //=== SystemZMachineFunctionInfo.cpp - SystemZ machine function info ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h index 4f64f4c65f1d..9eec3f37bc28 100644 --- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h +++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h @@ -1,9 +1,8 @@ //=== SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 98e761ef87fe..0becfaa1d49c 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -1,9 +1,8 @@
 //-- SystemZMachineScheduler.cpp - SystemZ Scheduler Interface -*- C++ -*---==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h
index ab820e5d3e63..0d5cc2e03e8d 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -1,9 +1,8 @@
 //==- SystemZMachineScheduler.h - SystemZ Scheduler Interface ----*- C++ -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 7bf32bf19a4a..56632e1529a2 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -1,9 +1,8 @@
 //===-- SystemZOperands.td - SystemZ instruction operands ----*- tblgen-*--===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -189,6 +188,17 @@ def HF32 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
 }]>;
 
+// Negated variants.
+def NEGLH16 : SDNodeXForm<imm, [{
+  uint64_t Value = (-N->getZExtValue() & 0x00000000FFFF0000ULL) >> 16;
+  return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+def NEGLF32 : SDNodeXForm<imm, [{
+  uint64_t Value = -N->getZExtValue() & 0x00000000FFFFFFFFULL;
+  return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
 // Truncate an immediate to a 8-bit signed quantity.
 def SIMM8 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(int8_t(N->getZExtValue()), SDLoc(N),
@@ -431,6 +441,15 @@ def imm64hf32c : Immediate<i64, [{
   return SystemZ::isImmHF(uint64_t(~N->getZExtValue()));
 }], HF32, "U32Imm">;
 
+// Negated immediates that fit LF32 or LH16.
+def imm64lh16n : Immediate<i64, [{
+  return SystemZ::isImmLH(uint64_t(-N->getZExtValue()));
+}], NEGLH16, "U16Imm">;
+
+def imm64lf32n : Immediate<i64, [{
+  return SystemZ::isImmLF(uint64_t(-N->getZExtValue()));
+}], NEGLF32, "U32Imm">;
+
 // Short immediates.
def imm64sx8 : Immediate<i64, [{
  return isInt<8>(N->getSExtValue());
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 626675bfb70c..15bd12bc98a4 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -1,9 +1,8 @@
 //===-- SystemZOperators.td - SystemZ-specific operators ------*- tblgen-*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -192,6 +191,12 @@ def SDT_ZVecTernary : SDTypeProfile<1, 3,
                                     SDTCisSameAs<0, 1>,
                                     SDTCisSameAs<0, 2>,
                                     SDTCisSameAs<0, 3>]>;
+def SDT_ZVecTernaryConvCC : SDTypeProfile<2, 3,
+                                          [SDTCisVec<0>,
+                                           SDTCisVT<1, i32>,
+                                           SDTCisVec<2>,
+                                           SDTCisSameAs<2, 3>,
+                                           SDTCisSameAs<0, 4>]>;
 def SDT_ZVecTernaryInt : SDTypeProfile<1, 3,
                                        [SDTCisVec<0>,
                                         SDTCisSameAs<0, 1>,
@@ -279,6 +284,10 @@ def z_loadbswap : SDNode<"SystemZISD::LRV", SDTLoad,
                          [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def z_storebswap : SDNode<"SystemZISD::STRV", SDTStore,
                           [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def z_loadeswap : SDNode<"SystemZISD::VLER", SDTLoad,
+                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def z_storeeswap : SDNode<"SystemZISD::VSTER", SDTStore,
+                          [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest>;
 
@@ -338,6 +347,10 @@ def z_vstrc_cc : SDNode<"SystemZISD::VSTRC_CC",
                         SDT_ZVecQuaternaryIntCC>;
 def z_vstrcz_cc : SDNode<"SystemZISD::VSTRCZ_CC",
                          SDT_ZVecQuaternaryIntCC>;
+def z_vstrs_cc : SDNode<"SystemZISD::VSTRS_CC",
+                        SDT_ZVecTernaryConvCC>;
+def z_vstrsz_cc : SDNode<"SystemZISD::VSTRSZ_CC",
+                         SDT_ZVecTernaryConvCC>;
 def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvIntCC>;
 
 class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
@@ -662,22 +675,34 @@ def z_usub : PatFrags<(ops node:$src1, node:$src2),
                       [(z_usubo node:$src1, node:$src2),
                        (sub node:$src1, node:$src2)]>;
 
+// Combined logical operations.
+def andc : PatFrag<(ops node:$src1, node:$src2),
+                   (and node:$src1, (not node:$src2))>;
+def orc : PatFrag<(ops node:$src1, node:$src2),
+                  (or node:$src1, (not node:$src2))>;
+def nand : PatFrag<(ops node:$src1, node:$src2),
+                   (not (and node:$src1, node:$src2))>;
+def nor : PatFrag<(ops node:$src1, node:$src2),
+                  (not (or node:$src1, node:$src2))>;
+def nxor : PatFrag<(ops node:$src1, node:$src2),
+                   (not (xor node:$src1, node:$src2))>;
+
 // Fused multiply-subtract, using the natural operand order.
-def fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                  (fma node:$src1, node:$src2, (fneg node:$src3))>;
+def any_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                      (any_fma node:$src1, node:$src2, (fneg node:$src3))>;
 
 // Fused multiply-add and multiply-subtract, but with the order of the
 // operands matching SystemZ's MA and MS instructions.
-def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                    (fma node:$src2, node:$src3, node:$src1)>;
-def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                    (fma node:$src2, node:$src3, (fneg node:$src1))>;
+def z_any_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                        (any_fma node:$src2, node:$src3, node:$src1)>;
+def z_any_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                        (any_fma node:$src2, node:$src3, (fneg node:$src1))>;
 
 // Negative fused multiply-add and multiply-subtract.
-def fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                   (fneg (fma node:$src1, node:$src2, node:$src3))>;
-def fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                   (fneg (fms node:$src1, node:$src2, node:$src3))>;
+def any_fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                       (fneg (any_fma node:$src1, node:$src2, node:$src3))>;
+def any_fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                       (fneg (any_fms node:$src1, node:$src2, node:$src3))>;
 
 // Floating-point negative absolute.
 def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
@@ -709,9 +734,9 @@ class shiftop<SDPatternOperator operator>
   : PatFrags<(ops node:$val, node:$count),
              [(operator node:$val, node:$count),
               (operator node:$val, (and node:$count, imm32bottom6set))]>;
 
-// Vector representation of all-zeros and all-ones.
-def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>;
-def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>;
+def imm32mod64 : PatLeaf<(i32 imm), [{
+  return (N->getZExtValue() % 64 == 0);
+}]>;
 
 // Load a scalar and replicate it in all elements of a vector.
 class z_replicate_load<ValueType scalartype, SDPatternOperator load>
@@ -723,6 +748,10 @@ def z_replicate_loadi32 : z_replicate_load<i32, load>;
 def z_replicate_loadi64 : z_replicate_load<i64, load>;
 def z_replicate_loadf32 : z_replicate_load<f32, load>;
 def z_replicate_loadf64 : z_replicate_load<f64, load>;
+// Byte-swapped replicated vector element loads.
+def z_replicate_loadbswapi16 : z_replicate_load<i32, z_loadbswap16>;
+def z_replicate_loadbswapi32 : z_replicate_load<i32, z_loadbswap32>;
+def z_replicate_loadbswapi64 : z_replicate_load<i64, z_loadbswap64>;
 
 // Load a scalar and insert it into a single element of a vector.
 class z_vle<ValueType scalartype, SDPatternOperator load>
@@ -735,18 +764,22 @@ def z_vlei32 : z_vle<i32, load>;
 def z_vlei64 : z_vle<i64, load>;
 def z_vlef32 : z_vle<f32, load>;
 def z_vlef64 : z_vle<f64, load>;
+// Byte-swapped vector element loads.
+def z_vlebri16 : z_vle<i32, z_loadbswap16>;
+def z_vlebri32 : z_vle<i32, z_loadbswap32>;
+def z_vlebri64 : z_vle<i64, z_loadbswap64>;
 
 // Load a scalar and insert it into the low element of the high i64 of a
 // zeroed vector.
 class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
   : PatFrag<(ops node:$addr),
-            (z_vector_insert (z_vzero),
+            (z_vector_insert immAllZerosV,
                              (scalartype (load node:$addr)),
                              (i32 index))>;
 def z_vllezi8  : z_vllez<i32, anyextloadi8, 7>;
 def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
 def z_vllezi32 : z_vllez<i32, load, 1>;
 def z_vllezi64 : PatFrags<(ops node:$addr),
-                          [(z_vector_insert (z_vzero),
+                          [(z_vector_insert immAllZerosV,
                                             (i64 (load node:$addr)), (i32 0)),
                            (z_join_dwords (i64 (load node:$addr)), (i64 0))]>;
 // We use high merges to form a v4f32 from four f32s.  Propagating zero
@@ -759,11 +792,12 @@ def z_vllezf32 : PatFrag<(ops node:$addr),
                             (bitconvert
                              (v4f32 (scalar_to_vector
                                      (f32 (load node:$addr)))))))),
-                 (v2i64 (z_vzero)))>;
+                 (v2i64
+                  (bitconvert (v4f32 immAllZerosV))))>;
 def z_vllezf64 : PatFrag<(ops node:$addr),
                          (z_merge_high
                           (v2f64 (scalar_to_vector (f64 (load node:$addr)))),
-                          (z_vzero))>;
+                          immAllZerosV)>;
 
 // Similarly for the high element of a zeroed vector.
def z_vllezli32 : z_vllez<i32, load, 0>;
@@ -774,8 +808,21 @@ def z_vllezlf32 : PatFrag<(ops node:$addr),
                           (z_merge_high
                            (v4f32 (scalar_to_vector
                                    (f32 (load node:$addr)))),
-                           (v4f32 (z_vzero))))),
-                 (v2i64 (z_vzero)))>;
+                           (v4f32 immAllZerosV)))),
+                 (v2i64
+                  (bitconvert (v4f32 immAllZerosV))))>;
+
+// Byte-swapped variants.
+def z_vllebrzi16 : z_vllez<i32, z_loadbswap16, 3>;
+def z_vllebrzi32 : z_vllez<i32, z_loadbswap32, 1>;
+def z_vllebrzli32 : z_vllez<i32, z_loadbswap32, 0>;
+def z_vllebrzi64 : PatFrags<(ops node:$addr),
+                            [(z_vector_insert immAllZerosV,
+                                              (i64 (z_loadbswap64 node:$addr)),
+                                              (i32 0)),
+                             (z_join_dwords (i64 (z_loadbswap64 node:$addr)),
+                                            (i64 0))]>;
+
 
 // Store one element of a vector.
 class z_vste<ValueType scalartype, SDPatternOperator store>
@@ -788,18 +835,22 @@ def z_vstei32 : z_vste<i32, store>;
 def z_vstei64 : z_vste<i64, store>;
 def z_vstef32 : z_vste<f32, store>;
 def z_vstef64 : z_vste<f64, store>;
+// Byte-swapped vector element stores.
+def z_vstebri16 : z_vste<i32, z_storebswap16>;
+def z_vstebri32 : z_vste<i32, z_storebswap32>;
+def z_vstebri64 : z_vste<i64, z_storebswap64>;
 
 // Arithmetic negation on vectors.
-def z_vneg : PatFrag<(ops node:$x), (sub (z_vzero), node:$x)>;
+def z_vneg : PatFrag<(ops node:$x), (sub immAllZerosV, node:$x)>;
 
 // Bitwise negation on vectors.
-def z_vnot : PatFrag<(ops node:$x), (xor node:$x, (z_vones))>;
+def z_vnot : PatFrag<(ops node:$x), (xor node:$x, immAllOnesV)>;
 
 // Signed "integer greater than zero" on vectors.
-def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, (z_vzero))>;
+def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, immAllZerosV)>;
 
 // Signed "integer less than zero" on vectors.
-def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph (z_vzero), node:$x)>;
+def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph immAllZerosV, node:$x)>;
 
 // Integer absolute on vectors.
 class z_viabs<int shift>
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
index 152521fb66a8..beaf4de285a3 100644
--- a/lib/Target/SystemZ/SystemZPatterns.td
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -1,9 +1,8 @@
 //===-- SystemZPatterns.td - SystemZ-specific pattern rules ---*- tblgen-*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/SystemZ/SystemZPostRewrite.cpp b/lib/Target/SystemZ/SystemZPostRewrite.cpp
new file mode 100644
index 000000000000..8e4060eac74c
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZPostRewrite.cpp
@@ -0,0 +1,124 @@
+//==---- SystemZPostRewrite.cpp - Select pseudos after RegAlloc ---*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that is run immediately after VirtRegRewriter
+// but before MachineCopyPropagation. The purpose is to lower pseudos to
+// target instructions before any later pass might substitute a register for
+// another.
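+//
+// A minimal sketch (not taken from this patch) of how such a pass is
+// typically wired into the codegen pipeline, assuming the
+// TargetPassConfig::addPostRewrite() hook is overridden by the target's
+// SystemZPassConfig:
+//
+//   bool SystemZPassConfig::addPostRewrite() {
+//     // Runs after VirtRegRewriter, before MachineCopyPropagation.
+//     addPass(createSystemZPostRewritePass(getTM<SystemZTargetMachine>()));
+//     return true;
+//   }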
+// +//===----------------------------------------------------------------------===// + +#include "SystemZ.h" +#include "SystemZInstrInfo.h" +#include "SystemZSubtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +using namespace llvm; + +#define SYSTEMZ_POSTREWRITE_NAME "SystemZ Post Rewrite pass" + +#define DEBUG_TYPE "systemz-postrewrite" +STATISTIC(MemFoldCopies, "Number of copies inserted before folded mem ops."); + +namespace llvm { + void initializeSystemZPostRewritePass(PassRegistry&); +} + +namespace { + +class SystemZPostRewrite : public MachineFunctionPass { +public: + static char ID; + SystemZPostRewrite() : MachineFunctionPass(ID) { + initializeSystemZPostRewritePass(*PassRegistry::getPassRegistry()); + } + + const SystemZInstrInfo *TII; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + StringRef getPassName() const override { return SYSTEMZ_POSTREWRITE_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool selectMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool selectMBB(MachineBasicBlock &MBB); +}; + +char SystemZPostRewrite::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS(SystemZPostRewrite, "systemz-post-rewrite", + SYSTEMZ_POSTREWRITE_NAME, false, false) + +/// Returns an instance of the Post Rewrite pass. +FunctionPass *llvm::createSystemZPostRewritePass(SystemZTargetMachine &TM) { + return new SystemZPostRewrite(); +} + +/// If MBBI references a pseudo instruction that should be selected here, +/// do it and return true. Otherwise return false. +bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + + // Note: If this could be done during regalloc in foldMemoryOperandImpl() + // while also updating the LiveIntervals, there would be no need for the + // MemFoldPseudo to begin with. + int TargetMemOpcode = SystemZ::getTargetMemOpcode(Opcode); + if (TargetMemOpcode != -1) { + MI.setDesc(TII->get(TargetMemOpcode)); + MI.tieOperands(0, 1); + unsigned DstReg = MI.getOperand(0).getReg(); + MachineOperand &SrcMO = MI.getOperand(1); + if (DstReg != SrcMO.getReg()) { + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(SystemZ::COPY), DstReg) + .addReg(SrcMO.getReg()); + SrcMO.setReg(DstReg); + MemFoldCopies++; + } + return true; + } + + return false; +} + +/// Iterate over the instructions in basic block MBB and select any +/// pseudo instructions. Return true if anything was modified. 
+bool SystemZPostRewrite::selectMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= selectMI(MBB, MBBI, NMBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool SystemZPostRewrite::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  bool Modified = false;
+  for (auto &MBB : MF)
+    Modified |= selectMBB(MBB);
+
+  return Modified;
+}
+
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index 0dca4582dc0d..b27c25beb58c 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -1,9 +1,8 @@
 //===-- SystemZ.td - SystemZ processors and features ---------*- tblgen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -36,3 +35,5 @@ def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>;
 
 def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>;
 def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>;
+def : ProcessorModel<"arch13", Arch13Model, Arch13SupportedFeatures.List>;
+
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index e9f9188048da..e7cd6871dbb4 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -1,9 +1,8 @@
 //===-- SystemZRegisterInfo.cpp - SystemZ register information ------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -54,6 +53,26 @@ static const TargetRegisterClass *getRC32(MachineOperand &MO,
   return RC;
 }
 
+// Pass the registers of RC as hints while making sure that if any of these
+// registers are copy hints (and therefore already in Hints), hint them
+// first.
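+// For example, with Order = [R2L, R3L, R2H, R3H] and R2H already present
+// as a copy hint, the resulting hint order is R2H, R2L, R3L, R3H (register
+// names are illustrative; registers outside RC and reserved registers are
+// dropped along the way).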
+static void addHints(ArrayRef<MCPhysReg> Order,
+                     SmallVectorImpl<MCPhysReg> &Hints,
+                     const TargetRegisterClass *RC,
+                     const MachineRegisterInfo *MRI) {
+  SmallSet<unsigned, 4> CopyHints;
+  CopyHints.insert(Hints.begin(), Hints.end());
+  Hints.clear();
+  for (MCPhysReg Reg : Order)
+    if (CopyHints.count(Reg) &&
+        RC->contains(Reg) && !MRI->isReserved(Reg))
+      Hints.push_back(Reg);
+  for (MCPhysReg Reg : Order)
+    if (!CopyHints.count(Reg) &&
+        RC->contains(Reg) && !MRI->isReserved(Reg))
+      Hints.push_back(Reg);
+}
+
 bool
 SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
                                            ArrayRef<MCPhysReg> Order,
@@ -62,7 +81,8 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
                                            const VirtRegMap *VRM,
                                            const LiveRegMatrix *Matrix) const {
   const MachineRegisterInfo *MRI = &MF.getRegInfo();
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
 
   bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
       VirtReg, Order, Hints, MF, VRM, Matrix);
@@ -76,31 +96,23 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
     if (!DoneRegs.insert(Reg).second)
       continue;
 
-    for (auto &Use : MRI->use_instructions(Reg))
+    for (auto &Use : MRI->reg_instructions(Reg)) {
       // For LOCRMux, see if the other operand is already a high or low
-      // register, and in that case give the correpsonding hints for
+      // register, and in that case give the corresponding hints for
       // VirtReg. LOCR instructions need both operands in either high or
-      // low parts.
-      if (Use.getOpcode() == SystemZ::LOCRMux) {
+      // low parts. Same handling for SELRMux.
+      if (Use.getOpcode() == SystemZ::LOCRMux ||
+          Use.getOpcode() == SystemZ::SELRMux) {
         MachineOperand &TrueMO = Use.getOperand(1);
         MachineOperand &FalseMO = Use.getOperand(2);
         const TargetRegisterClass *RC =
           TRI->getCommonSubClass(getRC32(FalseMO, VRM, MRI),
                                  getRC32(TrueMO, VRM, MRI));
+        if (Use.getOpcode() == SystemZ::SELRMux)
+          RC = TRI->getCommonSubClass(RC,
+                                      getRC32(Use.getOperand(0), VRM, MRI));
         if (RC && RC != &SystemZ::GRX32BitRegClass) {
-          // Pass the registers of RC as hints while making sure that if
-          // any of these registers are copy hints, hint them first.
-          SmallSet<unsigned, 4> CopyHints;
-          CopyHints.insert(Hints.begin(), Hints.end());
-          Hints.clear();
-          for (MCPhysReg Reg : Order)
-            if (CopyHints.count(Reg) &&
-                RC->contains(Reg) && !MRI->isReserved(Reg))
-              Hints.push_back(Reg);
-          for (MCPhysReg Reg : Order)
-            if (!CopyHints.count(Reg) &&
-                RC->contains(Reg) && !MRI->isReserved(Reg))
-              Hints.push_back(Reg);
+          addHints(Order, Hints, RC, MRI);
           // Return true to make these hints the only regs available to
           // RA. This may mean extra spilling but since the alternative is
           // a jump sequence expansion of the LOCRMux, it is preferred.
@@ -112,10 +124,70 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
           (TrueMO.getReg() == Reg ? FalseMO.getReg() : TrueMO.getReg());
         if (MRI->getRegClass(OtherReg) == &SystemZ::GRX32BitRegClass)
           Worklist.push_back(OtherReg);
-      }
+      } // end LOCRMux
+      else if (Use.getOpcode() == SystemZ::CHIMux ||
+               Use.getOpcode() == SystemZ::CFIMux) {
+        if (Use.getOperand(1).getImm() == 0) {
+          bool OnlyLMuxes = true;
+          for (MachineInstr &DefMI : MRI->def_instructions(VirtReg))
+            if (DefMI.getOpcode() != SystemZ::LMux)
+              OnlyLMuxes = false;
+          if (OnlyLMuxes) {
+            addHints(Order, Hints, &SystemZ::GR32BitRegClass, MRI);
+            // Return false to make these hints preferred but not obligatory.
+            return false;
+          }
+        }
+      } // end CHIMux / CFIMux
+    }
     }
   }
 
+  if (VRM == nullptr)
+    return BaseImplRetVal;
+
+  // Add any two address hints after any copy hints.
+  SmallSet<unsigned, 4> TwoAddrHints;
+  for (auto &Use : MRI->reg_nodbg_instructions(VirtReg))
+    if (SystemZ::getTwoOperandOpcode(Use.getOpcode()) != -1) {
+      const MachineOperand *VRRegMO = nullptr;
+      const MachineOperand *OtherMO = nullptr;
+      const MachineOperand *CommuMO = nullptr;
+      if (VirtReg == Use.getOperand(0).getReg()) {
+        VRRegMO = &Use.getOperand(0);
+        OtherMO = &Use.getOperand(1);
+        if (Use.isCommutable())
+          CommuMO = &Use.getOperand(2);
+      } else if (VirtReg == Use.getOperand(1).getReg()) {
+        VRRegMO = &Use.getOperand(1);
+        OtherMO = &Use.getOperand(0);
+      } else if (VirtReg == Use.getOperand(2).getReg() && Use.isCommutable()) {
+        VRRegMO = &Use.getOperand(2);
+        OtherMO = &Use.getOperand(0);
+      } else
+        continue;
+
+      auto tryAddHint = [&](const MachineOperand *MO) -> void {
+        Register Reg = MO->getReg();
+        Register PhysReg = isPhysicalRegister(Reg) ? Reg : VRM->getPhys(Reg);
+        if (PhysReg) {
+          if (MO->getSubReg())
+            PhysReg = getSubReg(PhysReg, MO->getSubReg());
+          if (VRRegMO->getSubReg())
+            PhysReg = getMatchingSuperReg(PhysReg, VRRegMO->getSubReg(),
+                                          MRI->getRegClass(VirtReg));
+          if (!MRI->isReserved(PhysReg) && !is_contained(Hints, PhysReg))
+            TwoAddrHints.insert(PhysReg);
+        }
+      };
+      tryAddHint(OtherMO);
+      if (CommuMO)
+        tryAddHint(CommuMO);
+    }
+  for (MCPhysReg OrderReg : Order)
+    if (TwoAddrHints.count(OrderReg))
+      Hints.push_back(OrderReg);
+
   return BaseImplRetVal;
 }
 
@@ -169,6 +241,9 @@ SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(SystemZ::A0);
   Reserved.set(SystemZ::A1);
 
+  // FPC is the floating-point control register.
+  Reserved.set(SystemZ::FPC);
+
   return Reserved;
 }
 
@@ -328,7 +403,7 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI,
   return true;
 }
 
-unsigned
+Register
 SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const SystemZFrameLowering *TFI = getFrameLowering(MF);
   return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 9fd2e4ae4f00..4f721ec23e53 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -1,9 +1,8 @@
 //===-- SystemZRegisterInfo.h - SystemZ register information ----*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -84,7 +83,7 @@ public:
                       const TargetRegisterClass *NewRC,
                       LiveIntervals &LIS) const override;
 
-  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  Register getFrameRegister(const MachineFunction &MF) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index cea88c088b86..3567b0f3acf8 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -1,9 +1,8 @@
 //==- SystemZRegisterInfo.td - SystemZ register definitions -*- tablegen -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -296,6 +295,13 @@ def CC : SystemZReg<"cc">;
 let isAllocatable = 0, CopyCost = -1 in
   def CCR : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
 
+// The floating-point control register.
+// Note: We only model the current rounding modes and the IEEE masks.
+// IEEE flags and DXC are not modeled here.
+def FPC : SystemZReg<"fpc">;
+let isAllocatable = 0 in
+  def FPCRegs : RegisterClass<"SystemZ", [i32], 32, (add FPC)>;
+
 // Access registers.
 class ACR32<bits<16> num, string n> : SystemZReg<n> {
   let HWEncoding = num;
diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td
index 83bf97e6841a..98eca2802242 100644
--- a/lib/Target/SystemZ/SystemZSchedule.td
+++ b/lib/Target/SystemZ/SystemZSchedule.td
@@ -1,9 +1,8 @@
 //==-- SystemZSchedule.td - SystemZ Scheduling Definitions ----*- tblgen -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -60,6 +59,7 @@ def VBU : SchedWrite;  // Virtual branching unit
 
 def MCD : SchedWrite; // Millicode
 
+include "SystemZScheduleArch13.td"
 include "SystemZScheduleZ14.td"
 include "SystemZScheduleZ13.td"
 include "SystemZScheduleZEC12.td"
diff --git a/lib/Target/SystemZ/SystemZScheduleArch13.td b/lib/Target/SystemZ/SystemZScheduleArch13.td
new file mode 100644
index 000000000000..9f82f24d0e8f
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZScheduleArch13.td
@@ -0,0 +1,1695 @@
+//-- SystemZScheduleArch13.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Arch13 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+// Pseudos expanded right after isel do not need to be modelled here.
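+//
+// The Arch13Model defined below is selected through the "arch13"
+// ProcessorModel entry added to SystemZProcessors.td, so it can be
+// exercised with, for example:
+//
+//   llc -mtriple=s390x-linux-gnu -mcpu=arch13 file.ll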
+//
+//===----------------------------------------------------------------------===//

+
+def Arch13Model : SchedMachineModel {
+
+  let UnsupportedFeatures = Arch13UnsupportedFeatures.List;
+
+  let IssueWidth = 6;              // Number of instructions decoded per cycle.
+  let MicroOpBufferSize = 60;      // Issue queues
+  let LoadLatency = 1;             // Optimistic load latency.
+
+  let PostRAScheduler = 1;
+
+  // Extra cycles for a mispredicted branch.
+  let MispredictPenalty = 20;
+}
+
+let SchedModel = Arch13Model in {
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+  def : WriteRes<NormalGr, []>;
+  def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+  def : WriteRes<EndGroup, []> { let EndGroup = 1; }
+}
+def : WriteRes<Cracked, []> {
+  let NumMicroOps = 2;
+  let BeginGroup = 1;
+}
+def : WriteRes<GroupAlone, []> {
+  let NumMicroOps = 3;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+def : WriteRes<GroupAlone2, []> {
+  let NumMicroOps = 6;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+def : WriteRes<GroupAlone3, []> {
+  let NumMicroOps = 9;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
+
+let NumMicroOps = 0 in {
+  foreach L = 1-30 in
+    def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+}
+
+// Execution units.
+def Arch13_FXaUnit : ProcResource<2>;
+def Arch13_FXbUnit : ProcResource<2>;
+def Arch13_LSUnit : ProcResource<2>;
+def Arch13_VecUnit : ProcResource<2>;
+def Arch13_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
+def Arch13_VBUnit : ProcResource<2>;
+def Arch13_MCD : ProcResource<1>;
+
+// Subtarget specific definitions of scheduling resources.
+let NumMicroOps = 0 in {
+  def : WriteRes<FXa, [Arch13_FXaUnit]>;
+  def : WriteRes<FXb, [Arch13_FXbUnit]>;
+  def : WriteRes<LSU, [Arch13_LSUnit]>;
+  def : WriteRes<VecBF, [Arch13_VecUnit]>;
+  def : WriteRes<VecDF, [Arch13_VecUnit]>;
+  def : WriteRes<VecDFX, [Arch13_VecUnit]>;
+  def : WriteRes<VecMul, [Arch13_VecUnit]>;
+  def : WriteRes<VecStr, [Arch13_VecUnit]>;
+  def : WriteRes<VecXsPm, [Arch13_VecUnit]>;
+  foreach Num = 2-5 in { let ResourceCycles = [Num] in {
+    def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Arch13_FXaUnit]>;
+    def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Arch13_FXbUnit]>;
+    def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Arch13_LSUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Arch13_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Arch13_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Arch13_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Arch13_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Arch13_VecUnit]>;
+    def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Arch13_VecUnit]>;
+  }}
+
+  def : WriteRes<VecFPd, [Arch13_VecFPdUnit]> { let ResourceCycles = [30]; }
+
+  def : WriteRes<VBU, [Arch13_VBUnit]>; // Virtual Branching Unit
+}
+
+def : WriteRes<MCD, [Arch13_MCD]> { let NumMicroOps = 3;
+                                    let BeginGroup = 1;
+                                    let EndGroup = 1; }
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
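+//
+// The model parameters above surface in C++ through MCSchedModel; a rough
+// sketch of how a consumer would read them (STI is assumed to be an
+// MCSubtargetInfo configured for -mcpu=arch13; field names are from
+// llvm/MC/MCSchedule.h):
+//
+//   const MCSchedModel &SM = STI.getSchedModel();
+//   unsigned DecodeWidth = SM.IssueWidth;        // 6 on this model
+//   unsigned LoadLat     = SM.LoadLatency;       // optimistic: 1
+//   unsigned MissCycles  = SM.MispredictPenalty; // 20 extra cycles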
+ +//===----------------------------------------------------------------------===// +// Stack allocation +//===----------------------------------------------------------------------===// + +// Pseudo -> LA / LAY +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>; + +//===----------------------------------------------------------------------===// +// Branch instructions +//===----------------------------------------------------------------------===// + +// Branch +def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>; +def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "BI(C)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>; +def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>; +def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>; +def : InstRW<[WLat1, FXa2, FXb2, GroupAlone2], + (instregex "B(R)?X(H|L).*$")>; + +// Compare and branch +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>; +def : InstRW<[WLat1, FXb2, GroupAlone], + (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Trap instructions +//===----------------------------------------------------------------------===// + +// Trap +def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>; + +// Compare and trap +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Call and return instructions +//===----------------------------------------------------------------------===// + +// Call +def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + +// Return +def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>; + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +// Moves +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>; +def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "MVCRL$")>; + +// Pseudo -> reg move +def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>; + +// Loads +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>; 
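+//
+// The instregex operands are ordinary regular expressions matched against
+// instruction names, with the leading literal part binding at the start of
+// the name. As a rough standalone illustration (plain C++, not LLVM API),
+// the pattern above accepts exactly L, LY, LFH, LRL and LMux:
+//
+//   #include <regex>
+//   #include <string>
+//   bool IsSimpleLoadName(const std::string &Name) {
+//     static const std::regex RE("^L(Y|FH|RL|Mux)?$");
+//     return std::regex_match(Name, RE);  // "L", "LY", "LMux" -> true
+//   }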
+def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>; + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>; + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>; + +// Load and zero rightmost byte +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>; + +// Load and trap +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>; + +// Load and test +def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>; + +// Stores +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>; + +// String moves. +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>; + +//===----------------------------------------------------------------------===// +// Conditional move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>; +def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>; +def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; +def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], + (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; + +def : InstRW<[WLat2, FXa, NormalGr], (instregex "SELRMux$")>; +def : InstRW<[WLat2, FXa, NormalGr], (instregex "SEL(G|FH)?R(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Sign extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>; + +def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>; + +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>; + +//===----------------------------------------------------------------------===// +// Zero extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>; + +// Load and 
zero rightmost byte +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>; + +// Load and trap +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>; + +//===----------------------------------------------------------------------===// +// Truncations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Multi-register moves +//===----------------------------------------------------------------------===// + +// Load multiple (estimated average of 5 ops) +def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>; + +// Load multiple disjoint +def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>; + +// Store multiple +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Byte swaps +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>; +def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>; + +//===----------------------------------------------------------------------===// +// Load address instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>; + +// Load the Global Offset Table address ( -> larl ) +def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>; + +//===----------------------------------------------------------------------===// +// Absolute and Negation +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LP(G)?R$")>; +def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "L(N|P)GFR$")>; +def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LN(R|GR)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>; +def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>; + +//===----------------------------------------------------------------------===// +// Insertion +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "IC32(Y)?$")>; +def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr], + (instregex "ICM(H|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>; + +//===----------------------------------------------------------------------===// +// Addition +//===----------------------------------------------------------------------===// + +def : 
InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "A(Y)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AH(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AG$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AL(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "ALG(F)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGHSIK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>; + +// Logical addition with carry +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], + (instregex "ALC(G)?$")>; +def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>; + +// Add with sign extension (16/32 -> 64) +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AG(F|H)$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>; + +//===----------------------------------------------------------------------===// +// Subtraction +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "S(G|Y)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SH(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SL(G|GF|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>; + +// Subtraction with borrow +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], + (instregex "SLB(G)?$")>; +def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>; + +// Subtraction with sign extension (16/32 -> 64) +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SG(F|H)$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>; + +//===----------------------------------------------------------------------===// +// AND +//===----------------------------------------------------------------------===// + +def : 
InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "N(G|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>; + +//===----------------------------------------------------------------------===// +// OR +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "O(G|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>; + +//===----------------------------------------------------------------------===// +// XOR +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "X(G|Y)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>; + +//===----------------------------------------------------------------------===// +// Combined logical operations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NC(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OC(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NN(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NO(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NX(G)?RK$")>; + +//===----------------------------------------------------------------------===// +// Multiplication +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat5LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MS(GF|Y)?$")>; +def : InstRW<[WLat5, FXa, NormalGr], (instregex "MS(R|FI)$")>; +def : InstRW<[WLat7LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>; +def : InstRW<[WLat7, FXa, NormalGr], (instregex "MSGR$")>; +def : InstRW<[WLat5, 
FXa, NormalGr], (instregex "MSGF(I|R)$")>; +def : InstRW<[WLat8LSU, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MLG$")>; +def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MGHI$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MHI$")>; +def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>; +def : InstRW<[WLat6, FXa2, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[WLat6LSU, RegReadAdv, FXa2, LSU, GroupAlone], + (instregex "M(FY|L)?$")>; +def : InstRW<[WLat8, RegReadAdv, FXa, LSU, NormalGr], (instregex "MGH$")>; +def : InstRW<[WLat12, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MG$")>; +def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MGRK$")>; +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MSC$")>; +def : InstRW<[WLat8LSU, WLat8LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MSGC$")>; +def : InstRW<[WLat6, WLat6, FXa, NormalGr], (instregex "MSRKC$")>; +def : InstRW<[WLat8, WLat8, FXa, NormalGr], (instregex "MSGRKC$")>; + +//===----------------------------------------------------------------------===// +// Division and remainder +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>; +def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], (instregex "D$")>; +def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>; +def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone2], + (instregex "DSG(F)?$")>; +def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>; +def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>; +def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], + (instregex "DL(G)?$")>; + +//===----------------------------------------------------------------------===// +// Shifts +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>; +def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2], + (instregex "S(L|R)D(A|L)$")>; + +// Rotate +def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>; + +// Rotate and insert +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>; + +// Rotate and Select +def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "R(N|O|X)SBG$")>; + +//===----------------------------------------------------------------------===// +// Comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "C(G|Y|Mux)?$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>; +def : 
InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "CL(Y|Mux)?$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLR$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>; +def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>; + +// Compare halfword +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>; +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>; +def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>; +def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>; + +// Compare logical character +def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>; + +// Test under mask +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>; + +// Compare logical characters under mask +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "CLM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Prefetch and execution hint +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>; +def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>; + +//===----------------------------------------------------------------------===// +// Atomic operations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>; + +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>; +def : 
InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>; + +// Test and set +def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>; + +// Compare and swap +def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone], + (instregex "CS(G|Y)?$")>; + +// Compare double and swap +def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2], + (instregex "CDS(Y)?$")>; +def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, + GroupAlone3], (instregex "CDSG$")>; + +// Compare and swap and store +def : InstRW<[WLat30, MCD], (instregex "CSST$")>; + +// Perform locked operation +def : InstRW<[WLat30, MCD], (instregex "PLO$")>; + +// Load/store pair from/to quadword +def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>; +def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>; + +// Load pair disjoint +def : InstRW<[WLat1LSU, WLat1LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>; + +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>; +def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone2], + (instregex "TRT$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], + (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], + (instregex "KM(C|F|O|CTR|A)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], + (instregex "(KIMD|KLMD|KMAC|KDSA)$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], + (instregex "(PCC|PPNO|PRNO)$")>; + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LGG$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLGFSG$")>; +def : InstRW<[WLat30, MCD], (instregex "(L|ST)GSC$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat20, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone2], + (instregex "CVBG$")>; +def : InstRW<[WLat20, RegReadAdv, FXb, VecDF, LSU, GroupAlone2], + (instregex "CVB(Y)?$")>; +def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone3], (instregex "CVDG$")>; +def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone2], (instregex "CVD(Y)?$")>; +def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>; +def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>; +def : 
InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>; + +def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[WLat1, FXb, VecDFX2, LSU3, GroupAlone2], (instregex "MP$")>; +def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "DP$")>; +def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>; +def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>; +def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>; +def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>; + +//===----------------------------------------------------------------------===// +// Access registers +//===----------------------------------------------------------------------===// + +// Extract/set/copy access register +def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>; + +// Load address extended +def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>; + +// Load/store access multiple (not modeled precisely) +def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>; +def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>; + +//===----------------------------------------------------------------------===// +// Program mask and addressing mode +//===----------------------------------------------------------------------===// + +// Insert Program Mask +def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>; + +// Set Program Mask +def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>; + +// Branch and link +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>; + +// Test addressing mode +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>; + +// Set addressing mode +def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>; + +// Branch (and save) and set mode. +def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>; + +//===----------------------------------------------------------------------===// +// Transactional execution +//===----------------------------------------------------------------------===// + +// Transaction begin +def : InstRW<[WLat9, LSU2, FXb5, GroupAlone2], (instregex "TBEGIN(C)?$")>; + +// Transaction end +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>; + +// Transaction abort +def : InstRW<[WLat30, MCD], (instregex "TABORT$")>; + +// Extract Transaction Nesting Depth +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>; + +// Nontransactional store +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>; + +//===----------------------------------------------------------------------===// +// Processor assist +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "PPA$")>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. 
+//===----------------------------------------------------------------------===// + +// Find leftmost one +def : InstRW<[WLat5, WLat5, FXa2, GroupAlone], (instregex "FLOGR$")>; + +// Population count +def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT(Opt)?$")>; + +// String instructions +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD], + (instregex "UPT$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "SORTL$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "DFLTCC$")>; + +// Execute +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>; + +//===----------------------------------------------------------------------===// +// .insn directive instructions +//===----------------------------------------------------------------------===// + +// An "empty" sched-class will be assigned instead of the "invalid sched-class". +// getNumDecoderSlots() will then return 1 instead of 0. +def : InstRW<[], (instregex "Insn.*")>; + + +// ----------------------------- Floating point ----------------------------- // + +//===----------------------------------------------------------------------===// +// FP: Move instructions +//===----------------------------------------------------------------------===// + +// Load zero +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; + +// Load +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; +def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; + +// Load and Test +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BRCompare$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], + (instregex "LTXBR(Compare)?$")>; + +// Copy sign +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; + +//===----------------------------------------------------------------------===// +// FP: Load instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; + +//===----------------------------------------------------------------------===// +// FP: Store instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; + +//===----------------------------------------------------------------------===// +// FP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEDBR(A)?$")>; +def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>; + +// Load lengthened +def : 
InstRW<[WLat6LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LDEBR$")>; +def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)BR$")>; + +// Convert from fixed / logical +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)BR(A)?$")>; +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], + (instregex "C(F|G)(E|D)BR(A)?$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], + (instregex "C(F|G)XBR(A)?$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLFDBR$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; + +//===----------------------------------------------------------------------===// +// FP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; + +// Square root +def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)BR$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>; + +// Load FP integer +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>; + +//===----------------------------------------------------------------------===// +// FP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "A(E|D)B$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D)BR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>; + +// Subtraction +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "S(E|D)B$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D)BR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>; + +// Multiply +def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "M(D|DE|EE)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>; +def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], + (instregex "MXDB$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDBR$")>; +def : InstRW<[WLat15, VecDF4, GroupAlone], (instregex "MXBR$")>; + +// Multiply and add / subtract +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "M(A|S)EB$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>; +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(A|S)DBR$")>; + +// Division +def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, 
NormalGr], + (instregex "D(E|D)B$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)BR$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>; + +// Divide to integer +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>; + +//===----------------------------------------------------------------------===// +// FP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr], + (instregex "(K|C)(E|D)B$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>; +def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>; + +// Test Data Class +def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>; +def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>; + +//===----------------------------------------------------------------------===// +// FP: Floating-point control register instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>; +def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>; +def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>; +def : InstRW<[WLat30, MCD], (instregex "SFASR$")>; +def : InstRW<[WLat30, MCD], (instregex "LFAS$")>; +def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>; + + +// --------------------- Hexadecimal floating point ------------------------- // + +//===----------------------------------------------------------------------===// +// HFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEXR$")>; +def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>; + +// Load lengthened +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>; +def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>; + +// Convert from fixed +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)R$")>; + +// Convert to fixed +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>; + +// Convert BFP to HFP / HFP to BFP. 
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "THD(E)?R$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "TB(E)?DR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>; + +// Halve +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "H(E|D)R$")>; + +// Square root +def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)R$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>; + +// Load FP integer +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)R$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "A(E|D|U|W)$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>; + +// Subtraction +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "S(E|D|U|W)$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>; + +// Multiply +def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "M(D|DE|E|EE)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>; +def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], + (instregex "MXD$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDR$")>; +def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXR$")>; +def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], (instregex "MY$")>; +def : InstRW<[WLat6LSU, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "MY(H|L)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MYR$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MY(H|L)R$")>; + +// Multiply and add / subtract +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "M(A|S)(E|D)$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>; +def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone], + (instregex "MAY$")>; +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "MAY(H|L)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>; + +// Division +def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "D(E|D)$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)R$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "C(E|D)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "C(E|D)R$")>; +def 
: InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>; + + +// ------------------------ Decimal floating point -------------------------- // + +//===----------------------------------------------------------------------===// +// DFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>; +def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>; + +// Load lengthened +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>; + +// Convert from fixed / logical +def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDFTR(A)?$")>; +def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDGTR(A)?$")>; +def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXFTR(A)?$")>; +def : InstRW<[WLat30, FXb, VecDF4, GroupAlone2], (instregex "CXGTR(A)?$")>; +def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDLFTR$")>; +def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDLGTR$")>; +def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXLFTR$")>; +def : InstRW<[WLat30, FXb, VecDF4, GroupAlone2], (instregex "CXLGTR$")>; + +// Convert to fixed / logical +def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], + (instregex "C(F|G)DTR(A)?$")>; +def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], + (instregex "C(F|G)XTR(A)?$")>; +def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>; +def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>; + +// Convert from / to signed / unsigned packed +def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>; +def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone2], (instregex "CX(S|U)TR$")>; +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>; +def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone2], (instregex "C(S|U)XTR$")>; + +// Convert from / to zoned +def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>; +def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXZT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>; + +// Convert from / to packed +def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>; +def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXPT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>; + +// Perform floating-point operation +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>; + +//===----------------------------------------------------------------------===// +// DFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load FP integer +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>; + +// Extract biased exponent +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>; +def : InstRW<[WLat11, 
FXb, VecDF, Cracked], (instregex "EEXTR$")>; + +// Extract significance +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>; +def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>; + +// Subtraction +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>; + +// Multiply +def : InstRW<[WLat30, VecDF, NormalGr], (instregex "MDTR(A)?$")>; +def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>; + +// Division +def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>; +def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>; + +// Quantize +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>; + +// Reround +def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>; +def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone2], (instregex "RRXTR$")>; + +// Shift significand left/right +def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>; +def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>; + +// Insert biased exponent +def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "IEXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>; +def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>; + +// Compare biased exponent +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>; +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>; + +// Test Data Class/Group +def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>; +def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>; + + +// --------------------------------- Vector --------------------------------- // + +//===----------------------------------------------------------------------===// +// Vector: Move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Immediate instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], 
(instregex "VLEI(B|F|G|H)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Loads +//===----------------------------------------------------------------------===// + +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; +def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], + (instregex "VLE(B|F|G|H)$")>; +def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked], + (instregex "VGE(F|G)$")>; +def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], + (instregex "VLM(Align)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Stores +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; +def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; +def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Byte swaps +//===----------------------------------------------------------------------===// + +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBR(H|F|G|Q)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLER(H|F|G)?$")>; +def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], + (instregex "VLEBR(H|F|G)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEBRZ(H|F|G|E)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBRREP(H|F|G)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTBR(H|F|G|Q)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTER(H|F|G)?$")>; +def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTEBRH$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTEBR(F|G)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Selects and permutes +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VBPERM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>; + +//===----------------------------------------------------------------------===// +// Vector: Widening and narrowing +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>; +def 
: InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|W)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O|N|X)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO(C)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMX(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|W)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|W)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H)?$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VMSL(G)?$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT(B|F|G|H)?$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, 
NormalGr], (instregex "VESLV(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)B$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLD$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSRD$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>; + +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H)?$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)S$")>; +def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point arithmetic +//===----------------------------------------------------------------------===// + +// Conversion and rounding +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCFP(S|L)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?G$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?GB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCD(L)?GB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCE(L)?FB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCE(L)?FB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(S|L)FP$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GD$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GDB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?GDB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?FEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?FEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(L|R)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(LS|RD)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFL(LS|RD)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex 
"WFLLD$")>; +def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFLRX$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFI(DB)?$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFIDB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFISB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFISB$")>; +def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFIXB$")>; + +// Sign operations +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSOSB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFPSOXB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFL(C|N|P)XB$")>; + +// Minimum / maximum +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)SB$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WF(MAX|MIN)XB$")>; + +// Test data class +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCISB$")>; +def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>; + +// Add / subtract +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>; +def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>; + +// Multiply / multiply-and-add/subtract +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>; +def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>; + +// Divide / square root +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDSB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFDXB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQSB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFSQXB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point comparison 
+//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SB$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XB$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XB$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFK(E|H|HE)DBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], + (instregex "WF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], + (instregex "VF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SBS$")>; +def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XBS$")>; +def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XBS$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)SB$")>; +def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point insertion and extraction +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>; + +//===----------------------------------------------------------------------===// +// Vector: String instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], + (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], + (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRS(B|F|H)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRSZ(B|F|H)$")>; + 
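// A reading note for the entries above, stated as an assumption about the
// convention these scheduler files follow rather than as upstream
// documentation: an InstRW list appears to name one WLat* latency per result
// the matched instruction defines (register results first, then the
// condition code), followed by the processor resources it occupies and its
// decoder-group class. That is why the CC-setting string forms carry two
// latency entries; for example,
//
//   def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEBS$")>;
//
// reads as: the vector result and the CC result both become available after
// four cycles, using the vector string unit in a normal decoder group.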
+//===----------------------------------------------------------------------===// +// Vector: Packed-decimal instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "VLIP$")>; +def : InstRW<[WLat6, VecDFX, LSU, GroupAlone2], (instregex "VPKZ$")>; +def : InstRW<[WLat1, VecDFX, FXb, LSU2, GroupAlone2], (instregex "VUPKZ$")>; +def : InstRW<[WLat20, WLat20, VecDF2, FXb, GroupAlone], + (instregex "VCVB(G)?(Opt)?$")>; +def : InstRW<[WLat15, WLat15, VecDF2, FXb, GroupAlone], + (instregex "VCVD(G)?$")>; +def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "V(A|S)P$")>; +def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VM(S)?P$")>; +def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "V(D|R)P$")>; +def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VSDP$")>; +def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRP$")>; +def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "VPSOP$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>; + + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>; +def : InstRW<[WLat20, GroupAlone3], (instregex "LPSW(E)?$")>; +def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; +def : InstRW<[WLat3, FXa, NormalGr], (instregex "IAC$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>; +def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>; +def : InstRW<[WLat30, MCD], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "ISKE$")>; +def : InstRW<[WLat30, MCD], (instregex "IVSK$")>; +def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>; +def : InstRW<[WLat30, MCD], (instregex "IRBM$")>; +def : InstRW<[WLat30, MCD], (instregex "PFMF$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>; +def : InstRW<[WLat30, MCD], (instregex "PGIN$")>; +def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "PTLB$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>; +def : InstRW<[WLat30, MCD], (instregex "STRAG$")>; +def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>; +def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>; +def : InstRW<[WLat30, MCD], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>; +def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>; +def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>; +def : InstRW<[WLat30, MCD], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "LASP$")>; +def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>; +def : InstRW<[WLat30, MCD], (instregex "PC$")>; +def : InstRW<[WLat30, MCD], (instregex "PR$")>; +def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>; +def : InstRW<[WLat30, MCD], (instregex "RP$")>; +def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>; +def : InstRW<[WLat30, MCD], (instregex "TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "BAKR$")>; +def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "PTFF$")>; +def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>; +def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>; +def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone2], (instregex "STCK(F)?$")>; +def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone3], (instregex "STCKE$")>; +def : InstRW<[WLat30, MCD], (instregex "STCKC$")>; +def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "STAP$")>; +def : InstRW<[WLat30, MCD], (instregex "STIDP$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>; +def : InstRW<[WLat30, MCD], (instregex "ECAG$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>; +def : InstRW<[WLat30, MCD], (instregex "PTF$")>; +def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>; + 
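// A note on the WLat30/MCD pattern that dominates these System sections,
// offered as an editorial reading: MCD appears to be the catch-all class for
// millicoded or otherwise unmodeled instructions, which receive a fixed,
// pessimistic latency instead of precise timing. A hypothetical new
// privileged instruction of this kind would be scheduled the same way:
//
//   def : InstRW<[WLat30, MCD], (instregex "NEWSYSOP$")>;
//
// where NEWSYSOP is a placeholder mnemonic, not a real SystemZ instruction.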
+//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "SVC$")>; +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>; +def : InstRW<[WLat30, MCD], (instregex "DIAG$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRAC(E|G)$")>; +def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>; +def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>; +def : InstRW<[WLat30, MCD], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>; +def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>; +def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>; +def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>; +def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[WLat30, MCD], (instregex "RCHP$")>; +def : InstRW<[WLat30, MCD], (instregex "SCHM$")>; +def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; +def : InstRW<[WLat30, MCD], (instregex "TPI$")>; +def : InstRW<[WLat30, MCD], (instregex "SAL$")>; + +} + diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td index 74e1dad87908..b3266051da4e 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -1,9 +1,8 @@ //-- SystemZScheduleZ13.td - SystemZ Scheduling Definitions ----*- tblgen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -1192,8 +1191,8 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; // Vector: Loads //===----------------------------------------------------------------------===// -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(BB)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLL$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; @@ -1201,16 +1200,17 @@ def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], (instregex "VLE(B|F|G|H)$")>; def : InstRW<[WLat6LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked], (instregex "VGE(F|G)$")>; -def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>; +def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], + (instregex "VLM(Align)?$")>; //===----------------------------------------------------------------------===// // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; -def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM$")>; +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td index 1962fdf3a1d1..df7282a2961b 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ14.td +++ b/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -1,9 +1,8 @@ //-- SystemZScheduleZ14.td - SystemZ Scheduling Definitions ----*- tblgen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -1210,8 +1209,8 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; // Vector: Loads //===----------------------------------------------------------------------===// -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(BB)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLL$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; @@ -1219,17 +1218,18 @@ def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], (instregex "VLE(B|F|G|H)$")>; def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked], (instregex "VGE(F|G)$")>; -def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>; +def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], + (instregex "VLM(Align)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; //===----------------------------------------------------------------------===// // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; -def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM$")>; +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index 7535739f813a..ca714ef1a702 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -1,9 +1,8 @@ //=- SystemZScheduleZ196.td - SystemZ Scheduling Definitions ---*- tblgen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index a21d2c4cef70..fb226be678da 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -1,9 +1,8 @@ //=- SystemZScheduleZEC12.td - SystemZ Scheduling Definitions --*- tblgen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index e0d7bca9a94b..a50e6aa59711 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
 //===-- SystemZSelectionDAGInfo.cpp - SystemZ SelectionDAG Info -----------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -164,17 +163,17 @@ static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
 }
 
 // Convert the current CC value into an integer that is 0 if CC == 0,
-// less than zero if CC == 1 and greater than zero if CC >= 2.
+// greater than zero if CC == 1 and less than zero if CC >= 2.
 // The sequence starts with IPM, which puts CC into bits 29 and 28
 // of an integer and clears bits 30 and 31.
 static SDValue addIPMSequence(const SDLoc &DL, SDValue CCReg,
                               SelectionDAG &DAG) {
   SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
-  SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
-                            DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
-  SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
-                             DAG.getConstant(31, DL, MVT::i32));
-  return ROTL;
+  SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, IPM,
+                            DAG.getConstant(30 - SystemZ::IPM_CC, DL, MVT::i32));
+  SDValue SRA = DAG.getNode(ISD::SRA, DL, MVT::i32, SHL,
+                            DAG.getConstant(30, DL, MVT::i32));
+  return SRA;
 }
 
 std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
@@ -184,7 +183,8 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
   if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
     uint64_t Bytes = CSize->getZExtValue();
     assert(Bytes > 0 && "Caller should have handled 0-size case");
-    SDValue CCReg = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+    // Swap operands to invert CC == 1 vs. CC == 2 cases.
+    SDValue CCReg = emitCLC(DAG, DL, Chain, Src2, Src1, Bytes);
     Chain = CCReg.getValue(1);
     return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain);
   }
@@ -232,7 +232,8 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcmp(
     SDValue Src2, MachinePointerInfo Op1PtrInfo,
     MachinePointerInfo Op2PtrInfo) const {
   SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::i32, MVT::Other);
-  SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
+  // Swap operands to invert CC == 1 vs. CC == 2 cases.
+  SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src2, Src1,
                                DAG.getConstant(0, DL, MVT::i32));
   SDValue CCReg = Unused.getValue(1);
   Chain = Unused.getValue(2);
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index 93cd970c30c6..7d63bae83cf3 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -1,9 +1,8 @@
 //===-- SystemZSelectionDAGInfo.h - SystemZ SelectionDAG Info ---*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
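The new comment and node sequence above are easiest to verify with concrete bit patterns. A standalone sketch of the arithmetic (assuming SystemZ::IPM_CC == 28, so IPM leaves CC in bits 29:28; the signed right shift stands in for SRA and is arithmetic on all mainstream compilers):

#include <cassert>
#include <cstdint>

// Models the SHL/SRA pair from addIPMSequence: move CC up into bits
// 31:30, then shift back down with sign extension so the result is
// 0 for CC == 0, positive for CC == 1, and negative for CC >= 2.
static int32_t ipmToOrdering(uint32_t CC) {
  const unsigned IPM_CC = 28;           // assumed value of SystemZ::IPM_CC
  uint32_t IPM = CC << IPM_CC;          // what IPM leaves in the register
  int32_t Shifted = int32_t(IPM << (30 - IPM_CC));
  return Shifted >> 30;                 // SRA by 30
}

int main() {
  assert(ipmToOrdering(0) == 0); // memcmp: equal
  assert(ipmToOrdering(1) > 0);  // with swapped CLC operands: Src1 > Src2
  assert(ipmToOrdering(2) < 0);  // Src1 < Src2
  assert(ipmToOrdering(3) < 0);  // CC 3 also maps below zero
}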
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index 195fa20a2c90..e79dfc5b4b9e 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -1,9 +1,8 @@ //===-- SystemZShortenInst.cpp - Instruction-shortening pass --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -47,6 +46,7 @@ private: bool shortenOn001(MachineInstr &MI, unsigned Opcode); bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode); bool shortenFPConv(MachineInstr &MI, unsigned Opcode); + bool shortenSelect(MachineInstr &MI, unsigned Opcode); const SystemZInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -176,6 +176,23 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) { return false; } +// MI is a three-operand select instruction. If one of the sources match +// the destination, convert to the equivalent load-on-condition. +bool SystemZShortenInst::shortenSelect(MachineInstr &MI, unsigned Opcode) { + if (MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) { + MI.setDesc(TII->get(Opcode)); + MI.tieOperands(0, 1); + return true; + } + if (MI.getOperand(0).getReg() == MI.getOperand(2).getReg()) { + TII->commuteInstruction(MI, false, 1, 2); + MI.setDesc(TII->get(Opcode)); + MI.tieOperands(0, 1); + return true; + } + return false; +} + // Process all instructions in MBB. Return true if something changed. bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { bool Changed = false; @@ -196,6 +213,18 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenIIF(MI, SystemZ::LLIHL, SystemZ::LLIHH); break; + case SystemZ::SELR: + Changed |= shortenSelect(MI, SystemZ::LOCR); + break; + + case SystemZ::SELFHR: + Changed |= shortenSelect(MI, SystemZ::LOCFHR); + break; + + case SystemZ::SELGR: + Changed |= shortenSelect(MI, SystemZ::LOCGR); + break; + case SystemZ::WFADB: Changed |= shortenOn001AddCC(MI, SystemZ::ADBR); break; @@ -300,6 +329,31 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { case SystemZ::VST64: Changed |= shortenOn0(MI, SystemZ::STD); break; + + default: { + int TwoOperandOpcode = SystemZ::getTwoOperandOpcode(MI.getOpcode()); + if (TwoOperandOpcode == -1) + break; + + if ((MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) && + (!MI.isCommutable() || + MI.getOperand(0).getReg() != MI.getOperand(2).getReg() || + !TII->commuteInstruction(MI, false, 1, 2))) + break; + + MI.setDesc(TII->get(TwoOperandOpcode)); + MI.tieOperands(0, 1); + if (TwoOperandOpcode == SystemZ::SLL || + TwoOperandOpcode == SystemZ::SLA || + TwoOperandOpcode == SystemZ::SRL || + TwoOperandOpcode == SystemZ::SRA) { + // These shifts only use the low 6 bits of the shift count. 
+ MachineOperand &ImmMO = MI.getOperand(3); + ImmMO.setImm(ImmMO.getImm() & 0xfff); + } + Changed = true; + break; + } } LiveRegs.stepBackward(MI); diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index fb030a207bc7..5e8af81842c4 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -1,9 +1,8 @@ //===-- SystemZSubtarget.cpp - SystemZ subtarget information --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -56,6 +55,9 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, HasMessageSecurityAssist7(false), HasMessageSecurityAssist8(false), HasVectorEnhancements1(false), HasVectorPackedDecimal(false), HasInsertReferenceBitsMultiple(false), + HasMiscellaneousExtensions3(false), HasMessageSecurityAssist9(false), + HasVectorEnhancements2(false), HasVectorPackedDecimalEnhancement(false), + HasEnhancedSort(false), HasDeflateConversion(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(), FrameLowering() {} diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index cb6b21a1d465..fa3f65d93c91 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -1,9 +1,8 @@ //===-- SystemZSubtarget.h - SystemZ subtarget information -----*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -63,6 +62,12 @@ protected: bool HasVectorEnhancements1; bool HasVectorPackedDecimal; bool HasInsertReferenceBitsMultiple; + bool HasMiscellaneousExtensions3; + bool HasMessageSecurityAssist9; + bool HasVectorEnhancements2; + bool HasVectorPackedDecimalEnhancement; + bool HasEnhancedSort; + bool HasDeflateConversion; private: Triple TargetTriple; @@ -210,6 +215,30 @@ public: return HasInsertReferenceBitsMultiple; } + // Return true if the target has the miscellaneous-extensions facility 3. + bool hasMiscellaneousExtensions3() const { + return HasMiscellaneousExtensions3; + } + + // Return true if the target has the message-security-assist + // extension facility 9. + bool hasMessageSecurityAssist9() const { return HasMessageSecurityAssist9; } + + // Return true if the target has the vector-enhancements facility 2. + bool hasVectorEnhancements2() const { return HasVectorEnhancements2; } + + // Return true if the target has the vector-packed-decimal + // enhancement facility. + bool hasVectorPackedDecimalEnhancement() const { + return HasVectorPackedDecimalEnhancement; + } + + // Return true if the target has the enhanced-sort facility. 
+ bool hasEnhancedSort() const { return HasEnhancedSort; } + + // Return true if the target has the deflate-conversion facility. + bool hasDeflateConversion() const { return HasDeflateConversion; } + // Return true if GV can be accessed using LARL for reloc model RM // and code model CM. bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const; diff --git a/lib/Target/SystemZ/SystemZTDC.cpp b/lib/Target/SystemZ/SystemZTDC.cpp index 5dbd23d420a3..478848c30701 100644 --- a/lib/Target/SystemZ/SystemZTDC.cpp +++ b/lib/Target/SystemZ/SystemZTDC.cpp @@ -1,9 +1,8 @@ //===-- SystemZTDC.cpp - Utilize Test Data Class instruction --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -356,8 +355,8 @@ bool SystemZTDCPass::runOnFunction(Function &F) { if (!Worthy) continue; // Call the intrinsic, compare result with 0. - Value *TDCFunc = Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc, - V->getType()); + Function *TDCFunc = + Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc, V->getType()); IRBuilder<> IRB(I); Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask); Instruction *TDC = IRB.CreateCall(TDCFunc, {V, MaskVal}); diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 9596a2b6388d..5c49e6eff0bf 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- SystemZTargetMachine.cpp - Define TargetMachine for SystemZ -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
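Returning to the SystemZShortenInst additions above: a three-operand SELR/SELFHR/SELGR is only shortenable to a two-operand load-on-condition when its destination already overlaps one of the sources. A minimal model of that test (hypothetical Sel struct; the real pass works on MachineInstr operands and relies on TII->commuteInstruction to keep the select's semantics when the sources swap):

#include <cassert>
#include <utility>

struct Sel { unsigned Dst, Src1, Src2; }; // stand-in for a SELR-like MI

// Mirrors shortenSelect: shortenable iff Dst aliases a source. When it
// aliases Src2, the sources are commuted first so that operand 1 can be
// tied to the destination, as the two-operand LOC* form requires.
static bool canShortenToLoadOnCondition(Sel &S) {
  if (S.Dst == S.Src1)
    return true;
  if (S.Dst == S.Src2) {
    std::swap(S.Src1, S.Src2); // models commuteInstruction(MI, false, 1, 2)
    return true;
  }
  return false;
}

int main() {
  Sel A{1, 1, 2}, B{2, 1, 2}, C{3, 1, 2};
  assert(canShortenToLoadOnCondition(A));                // dst == src1
  assert(canShortenToLoadOnCondition(B) && B.Src1 == 2); // commuted
  assert(!canShortenToLoadOnCondition(C));               // must stay a select
}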
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -12,6 +11,7 @@ #include "SystemZ.h" #include "SystemZMachineScheduler.h" #include "SystemZTargetTransformInfo.h" +#include "TargetInfo/SystemZTargetInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -133,9 +133,9 @@ getEffectiveSystemZCodeModel(Optional CM, Reloc::Model RM, bool JIT) { if (CM) { if (*CM == CodeModel::Tiny) - report_fatal_error("Target does not support the tiny CodeModel"); + report_fatal_error("Target does not support the tiny CodeModel", false); if (*CM == CodeModel::Kernel) - report_fatal_error("Target does not support the kernel CodeModel"); + report_fatal_error("Target does not support the kernel CodeModel", false); return *CM; } if (JIT) @@ -183,6 +183,7 @@ public: void addIRPasses() override; bool addInstSelector() override; bool addILPOpts() override; + void addPostRewrite() override; void addPreSched2() override; void addPreEmitPass() override; }; @@ -212,7 +213,16 @@ bool SystemZPassConfig::addILPOpts() { return true; } +void SystemZPassConfig::addPostRewrite() { + addPass(createSystemZPostRewritePass(getSystemZTargetMachine())); +} + void SystemZPassConfig::addPreSched2() { + // PostRewrite needs to be run at -O0 also (in which case addPostRewrite() + // is not called). + if (getOptLevel() == CodeGenOpt::None) + addPass(createSystemZPostRewritePass(getSystemZTargetMachine())); + addPass(createSystemZExpandPseudoPass(getSystemZTargetMachine())); if (getOptLevel() != CodeGenOpt::None) diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index 52bf8bba55de..ac04a080f580 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -1,9 +1,8 @@ //=- SystemZTargetMachine.h - Define TargetMachine for SystemZ ----*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 129610fe095b..145cf87ef9f5 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -1,9 +1,8 @@ //===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -467,6 +466,27 @@ int SystemZTTIImpl::getArithmeticInstrCost( if (Opcode == Instruction::FRem) return LIBCALL_COST; + // Give discount for some combined logical operations if supported. 
+  if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
+    if (Opcode == Instruction::Xor) {
+      for (const Value *A : Args) {
+        if (const Instruction *I = dyn_cast<Instruction>(A))
+          if (I->hasOneUse() &&
+              (I->getOpcode() == Instruction::And ||
+               I->getOpcode() == Instruction::Or ||
+               I->getOpcode() == Instruction::Xor))
+            return 0;
+      }
+    }
+    else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
+      for (const Value *A : Args) {
+        if (const Instruction *I = dyn_cast<Instruction>(A))
+          if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
+            return 0;
+      }
+    }
+  }
+
   // Or requires one instruction, although it has custom handling for i64.
   if (Opcode == Instruction::Or)
     return 1;
@@ -687,9 +707,9 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     // TODO: Fix base implementation which could simplify things a bit here
     // (seems to miss on differentiating on scalar/vector types).
 
-    // Only 64 bit vector conversions are natively supported.
-    if (DstScalarBits == 64) {
-      if (SrcScalarBits == 64)
+    // Only 64 bit vector conversions are natively supported before arch13.
+    if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
+      if (SrcScalarBits == DstScalarBits)
         return NumDstVectors;
 
       if (SrcScalarBits == 1)
@@ -857,7 +877,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     case Instruction::Select:
       if (ValTy->isFloatingPointTy())
         return 4; // No load on condition for FP - costs a conditional jump.
-      return 1; // Load On Condition.
+      return 1; // Load On Condition / Select Register.
     }
   }
 
@@ -1010,7 +1030,8 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
 
   // Store/Load reversed saves one instruction.
-  if (!Src->isVectorTy() && NumOps == 1 && I != nullptr) {
+  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
+      I != nullptr) {
     if (Opcode == Instruction::Load && I->hasOneUse()) {
       const Instruction *LdUser = cast<Instruction>(*I->user_begin());
       // In case of load -> bswap -> store, return normal cost for the load.
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index e79bee1ea3a8..16ce2ef1d7a0 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -1,9 +1,8 @@
 //===-- SystemZTargetTransformInfo.h - SystemZ-specific TTI ---------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
index e2b9efd35d3e..713a55ee8400 100644
--- a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
+++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -1,13 +1,12 @@
 //===-- SystemZTargetInfo.cpp - SystemZ target implementation -------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
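The zero-cost cases added to getArithmeticInstrCost above anticipate that a single-use And/Or/Xor feeding a Xor, or a single-use Xor feeding an And/Or, will fold with its user into one of the combined logical instructions of the miscellaneous-extensions facility 3. Source shapes that exercise the discount (the mnemonics in the comments are the 64-bit register forms and are given for orientation only):

#include <cstdint>

// In LLVM IR, ~X is xor(X, -1), so each body below is a two-node
// logical pair that the cost model expects to become one instruction.
uint64_t nand_(uint64_t A, uint64_t B) { return ~(A & B); } // and->xor (NNGRK)
uint64_t nor_(uint64_t A, uint64_t B)  { return ~(A | B); } // or->xor  (NOGRK)
uint64_t nxor_(uint64_t A, uint64_t B) { return ~(A ^ B); } // xor->xor (NXGRK)
uint64_t orc_(uint64_t A, uint64_t B)  { return A | ~B; }   // xor->or  (OCGRK)
uint64_t andc_(uint64_t A, uint64_t B) { return A & ~B; }   // xor->and (NCGRK)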
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "SystemZ.h" +#include "TargetInfo/SystemZTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h new file mode 100644 index 000000000000..cad141c81e6b --- /dev/null +++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h @@ -0,0 +1,20 @@ +//===-- SystemZTargetInfo.h - SystemZ target implementation -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SYSTEMZ_TARGETINFO_SYSTEMZTARGETINFO_H +#define LLVM_LIB_TARGET_SYSTEMZ_TARGETINFO_SYSTEMZTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheSystemZTarget(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_SYSTEMZ_TARGETINFO_SYSTEMZTARGETINFO_H diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp index f23ea72eb513..8a46c77492c5 100644 --- a/lib/Target/Target.cpp +++ b/lib/Target/Target.cpp @@ -1,9 +1,8 @@ //===-- Target.cpp --------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/TargetIntrinsicInfo.cpp b/lib/Target/TargetIntrinsicInfo.cpp index e8b71924e0d9..256514c8c22d 100644 --- a/lib/Target/TargetIntrinsicInfo.cpp +++ b/lib/Target/TargetIntrinsicInfo.cpp @@ -1,9 +1,8 @@ //===-- TargetIntrinsicInfo.cpp - Target Instruction Information ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index bb937923b47e..17274e1c2c6e 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -1,9 +1,8 @@ //===-- llvm/Target/TargetLoweringObjectFile.cpp - Object File Info -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
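The new header above only forward-declares Target and exposes getTheSystemZTarget(), which keeps TargetInfo users off the target's main headers. For context, the matching definition follows the stock TargetRegistry pattern; a sketch under the assumption that the registration strings mirror the in-tree ones:

#include "TargetInfo/SystemZTargetInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;

Target &llvm::getTheSystemZTarget() {
  static Target TheSystemZTarget; // one registry entry per target
  return TheSystemZTarget;
}

extern "C" void LLVMInitializeSystemZTargetInfo() {
  // Keys the target by triple arch plus its name/description strings.
  RegisterTarget<Triple::systemz, /*HasJIT=*/true> X(
      getTheSystemZTarget(), "systemz", "SystemZ", "SystemZ");
}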
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -48,6 +47,7 @@ void TargetLoweringObjectFile::Initialize(MCContext &ctx, // Reset various EH DWARF encodings. PersonalityEncoding = LSDAEncoding = TTypeEncoding = dwarf::DW_EH_PE_absptr; + CallSiteEncoding = dwarf::DW_EH_PE_uleb128; } TargetLoweringObjectFile::~TargetLoweringObjectFile() { diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 39d5705b2a53..634866d93570 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -1,9 +1,8 @@ //===-- TargetMachine.cpp - General Target Information ---------------------==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -145,6 +144,12 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M, isa(GV)) return false; + // On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols + // remain unresolved in the link, they can be resolved to zero, which is + // outside the current DSO. + if (TT.isOSBinFormatCOFF() && GV && GV->hasExternalWeakLinkage()) + return false; + // Every other GV is local on COFF. // Make an exception for windows OS in the triple: Some firmware builds use // *-win32-macho triples. This (accidentally?) produced windows relocations @@ -168,7 +173,12 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M, return GV && GV->isStrongDefinitionForLinker(); } - assert(TT.isOSBinFormatELF()); + // Due to the AIX linkage model, any global with default visibility is + // considered non-local. + if (TT.isOSBinFormatXCOFF()) + return false; + + assert(TT.isOSBinFormatELF() || TT.isOSBinFormatWasm()); assert(RM != Reloc::DynamicNoPIC); bool IsExecutable = @@ -196,7 +206,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M, return true; } - // ELF supports preemption of other symbols. + // ELF & wasm support preemption of other symbols. return false; } diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp index bae45ae28c45..5d9029682fdd 100644 --- a/lib/Target/TargetMachineC.cpp +++ b/lib/Target/TargetMachineC.cpp @@ -1,9 +1,8 @@ //===-- TargetMachine.cpp -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
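The COFF carve-out in shouldAssumeDSOLocal above exists because an unresolved extern_weak symbol legitimately links to address zero, an address no DSO-local access sequence can produce. A small illustration of the pattern that must therefore stay non-local (MinGW-style attribute syntax, shown only to make the linkage concrete):

#include <cstdio>

// May be absent at link time; the linker can resolve &maybe_impl to
// nullptr, i.e. to something outside the current image.
extern "C" __attribute__((weak)) void maybe_impl();

int main() {
  if (&maybe_impl) // must be a real runtime test, not folded to true
    maybe_impl();
  else
    std::puts("maybe_impl absent");
}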
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 0a5908f43790..09628e872dd5 100644 --- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -1,9 +1,8 @@ //==- WebAssemblyAsmParser.cpp - Assembler for WebAssembly -*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -16,12 +15,15 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "MCTargetDesc/WebAssemblyTargetStreamer.h" +#include "TargetInfo/WebAssemblyTargetInfo.h" #include "WebAssembly.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -87,9 +89,8 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { } bool isToken() const override { return Kind == Token; } - bool isImm() const override { - return Kind == Integer || Kind == Float || Kind == Symbol; - } + bool isImm() const override { return Kind == Integer || Kind == Symbol; } + bool isFPImm() const { return Kind == Float; } bool isMem() const override { return false; } bool isReg() const override { return false; } bool isBrList() const { return Kind == BrList; } @@ -116,12 +117,18 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { assert(N == 1 && "Invalid number of operands!"); if (Kind == Integer) Inst.addOperand(MCOperand::createImm(Int.Val)); - else if (Kind == Float) - Inst.addOperand(MCOperand::createFPImm(Flt.Val)); else if (Kind == Symbol) Inst.addOperand(MCOperand::createExpr(Sym.Exp)); else - llvm_unreachable("Should be immediate or symbol!"); + llvm_unreachable("Should be integer immediate or symbol!"); + } + + void addFPImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + if (Kind == Float) + Inst.addOperand(MCOperand::createFPImm(Flt.Val)); + else + llvm_unreachable("Should be float immediate!"); } void addBrListOperands(MCInst &Inst, unsigned N) const { @@ -170,6 +177,8 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { FunctionStart, FunctionLocals, Instructions, + EndFunction, + DataSection, } CurrentState = FileStart; // For ensuring blocks are properly nested. @@ -187,6 +196,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { // We track this to see if a .functype following a label is the same, // as this is how we recognize the start of a function. 
   MCSymbol *LastLabel = nullptr;
+  MCSymbol *LastFunctionLabel = nullptr;
 
 public:
   WebAssemblyAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
@@ -250,13 +260,13 @@ public:
   }
 
   bool ensureEmptyNestingStack() {
-    auto err = !NestingStack.empty();
+    auto Err = !NestingStack.empty();
     while (!NestingStack.empty()) {
       error(Twine("Unmatched block construct(s) at function end: ") +
             nestingString(NestingStack.back()).first);
       NestingStack.pop_back();
     }
-    return err;
+    return Err;
   }
 
   bool isNext(AsmToken::TokenKind Kind) {
@@ -298,6 +308,8 @@ public:
         Type == "i32x4" || Type == "i64x2" || Type == "f32x4" ||
         Type == "f64x2")
       return wasm::ValType::V128;
+    if (Type == "exnref")
+      return wasm::ValType::EXNREF;
     return Optional<wasm::ValType>();
   }
 
@@ -308,7 +320,7 @@ public:
         .Case("f32", WebAssembly::ExprType::F32)
         .Case("f64", WebAssembly::ExprType::F64)
         .Case("v128", WebAssembly::ExprType::V128)
-        .Case("except_ref", WebAssembly::ExprType::ExceptRef)
+        .Case("exnref", WebAssembly::ExprType::Exnref)
         .Case("void", WebAssembly::ExprType::Void)
         .Default(WebAssembly::ExprType::Invalid);
   }
@@ -317,7 +329,7 @@ public:
     while (Lexer.is(AsmToken::Identifier)) {
       auto Type = parseType(Lexer.getTok().getString());
       if (!Type)
-        return true;
+        return error("unknown type: ", Lexer.getTok());
       Types.push_back(Type.getValue());
       Parser.Lex();
       if (!isNext(AsmToken::Comma))
@@ -337,27 +349,67 @@ public:
     Parser.Lex();
   }
 
-  bool parseOperandStartingWithInteger(bool IsNegative, OperandVector &Operands,
-                                       StringRef InstName) {
-    parseSingleInteger(IsNegative, Operands);
+  bool parseSingleFloat(bool IsNegative, OperandVector &Operands) {
+    auto &Flt = Lexer.getTok();
+    double Val;
+    if (Flt.getString().getAsDouble(Val, false))
+      return error("Cannot parse real: ", Flt);
+    if (IsNegative)
+      Val = -Val;
+    Operands.push_back(make_unique<WebAssemblyOperand>(
+        WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(),
+        WebAssemblyOperand::FltOp{Val}));
+    Parser.Lex();
+    return false;
+  }
+
+  bool parseSpecialFloatMaybe(bool IsNegative, OperandVector &Operands) {
+    if (Lexer.isNot(AsmToken::Identifier))
+      return true;
+    auto &Flt = Lexer.getTok();
+    auto S = Flt.getString();
+    double Val;
+    if (S.compare_lower("infinity") == 0) {
+      Val = std::numeric_limits<double>::infinity();
+    } else if (S.compare_lower("nan") == 0) {
+      Val = std::numeric_limits<double>::quiet_NaN();
+    } else {
+      return true;
+    }
+    if (IsNegative)
+      Val = -Val;
+    Operands.push_back(make_unique<WebAssemblyOperand>(
+        WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(),
+        WebAssemblyOperand::FltOp{Val}));
+    Parser.Lex();
+    return false;
+  }
+
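checkForP2AlignIfLoadStore below accepts the offset:p2align=N suffix on loads and stores; when the suffix is absent it parks a -1 placeholder that MatchAndEmitInstruction later replaces via WebAssembly::GetDefaultP2AlignAny (see further down). The default is the log2 of the access width; a sketch of that relationship with a hypothetical helper:

#include <cassert>

// Natural alignment of a wasm memory access as a power of two:
// 4-byte i32.load -> 2, 8-byte i64.load -> 3, and so on.
static unsigned defaultP2Align(unsigned AccessBytes) {
  unsigned P2 = 0;
  while ((2u << P2) <= AccessBytes) // next power of two still fits
    ++P2;
  return P2;
}

int main() {
  assert(defaultP2Align(1) == 0);
  assert(defaultP2Align(4) == 2);
  assert(defaultP2Align(8) == 3);
}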
+  bool checkForP2AlignIfLoadStore(OperandVector &Operands, StringRef InstName) {
     // FIXME: there is probably a cleaner way to do this.
-    auto IsLoadStore = InstName.startswith("load") ||
-                       InstName.startswith("store") ||
-                       InstName.startswith("atomic_load") ||
-                       InstName.startswith("atomic_store");
-    if (IsLoadStore) {
-      // Parse load/store operands of the form: offset align
-      auto &Offset = Lexer.getTok();
-      if (Offset.is(AsmToken::Integer)) {
+    auto IsLoadStore = InstName.find(".load") != StringRef::npos ||
+                       InstName.find(".store") != StringRef::npos;
+    auto IsAtomic = InstName.find("atomic.") != StringRef::npos;
+    if (IsLoadStore || IsAtomic) {
+      // Parse load/store operands of the form: offset:p2align=align
+      if (IsLoadStore && isNext(AsmToken::Colon)) {
+        auto Id = expectIdent();
+        if (Id != "p2align")
+          return error("Expected p2align, instead got: " + Id);
+        if (expect(AsmToken::Equal, "="))
+          return true;
+        if (!Lexer.is(AsmToken::Integer))
+          return error("Expected integer constant");
         parseSingleInteger(false, Operands);
       } else {
-        // Alignment not specified.
-        // FIXME: correctly derive a default from the instruction.
+        // Alignment not specified (or atomics, must use default alignment).
         // We can't just call WebAssembly::GetDefaultP2Align since we don't have
-        // an opcode until after the assembly matcher.
+        // an opcode until after the assembly matcher, so set a default to fix
+        // up later.
+        auto Tok = Lexer.getTok();
         Operands.push_back(make_unique<WebAssemblyOperand>(
-            WebAssemblyOperand::Integer, Offset.getLoc(), Offset.getEndLoc(),
-            WebAssemblyOperand::IntOp{0}));
+            WebAssemblyOperand::Integer, Tok.getLoc(), Tok.getEndLoc(),
+            WebAssemblyOperand::IntOp{-1}));
       }
     }
     return false;
@@ -400,51 +452,45 @@ public:
     Operands.push_back(make_unique<WebAssemblyOperand>(
         WebAssemblyOperand::Token, NameLoc, SMLoc::getFromPointer(Name.end()),
         WebAssemblyOperand::TokOp{Name}));
-    auto NamePair = Name.split('.');
-    // If no '.', there is no type prefix.
-    auto BaseName = NamePair.second.empty() ? NamePair.first : NamePair.second;
 
     // If this instruction is part of a control flow structure, ensure
     // proper nesting.
bool ExpectBlockType = false; - if (BaseName == "block") { + if (Name == "block") { push(Block); ExpectBlockType = true; - } else if (BaseName == "loop") { + } else if (Name == "loop") { push(Loop); ExpectBlockType = true; - } else if (BaseName == "try") { + } else if (Name == "try") { push(Try); ExpectBlockType = true; - } else if (BaseName == "if") { + } else if (Name == "if") { push(If); ExpectBlockType = true; - } else if (BaseName == "else") { - if (pop(BaseName, If)) + } else if (Name == "else") { + if (pop(Name, If)) return true; push(Else); - } else if (BaseName == "catch") { - if (pop(BaseName, Try)) - return true; - push(Try); - } else if (BaseName == "catch_all") { - if (pop(BaseName, Try)) + } else if (Name == "catch") { + if (pop(Name, Try)) return true; push(Try); - } else if (BaseName == "end_if") { - if (pop(BaseName, If, Else)) + } else if (Name == "end_if") { + if (pop(Name, If, Else)) return true; - } else if (BaseName == "end_try") { - if (pop(BaseName, Try)) + } else if (Name == "end_try") { + if (pop(Name, Try)) return true; - } else if (BaseName == "end_loop") { - if (pop(BaseName, Loop)) + } else if (Name == "end_loop") { + if (pop(Name, Loop)) return true; - } else if (BaseName == "end_block") { - if (pop(BaseName, Block)) + } else if (Name == "end_block") { + if (pop(Name, Block)) return true; - } else if (BaseName == "end_function") { - if (pop(BaseName, Function) || ensureEmptyNestingStack()) + } else if (Name == "end_function") { + CurrentState = EndFunction; + if (pop(Name, Function) || ensureEmptyNestingStack()) return true; } @@ -452,6 +498,8 @@ public: auto &Tok = Lexer.getTok(); switch (Tok.getKind()) { case AsmToken::Identifier: { + if (!parseSpecialFloatMaybe(false, Operands)) + break; auto &Id = Lexer.getTok(); if (ExpectBlockType) { // Assume this identifier is a block_type. @@ -464,33 +512,39 @@ public: // Assume this identifier is a label. 
         const MCExpr *Val;
         SMLoc End;
-        if (Parser.parsePrimaryExpr(Val, End))
+        if (Parser.parseExpression(Val, End))
           return error("Cannot parse symbol: ", Lexer.getTok());
         Operands.push_back(make_unique<WebAssemblyOperand>(
             WebAssemblyOperand::Symbol, Id.getLoc(), Id.getEndLoc(),
             WebAssemblyOperand::SymOp{Val}));
+        if (checkForP2AlignIfLoadStore(Operands, Name))
+          return true;
       }
       break;
     }
     case AsmToken::Minus:
       Parser.Lex();
-      if (Lexer.isNot(AsmToken::Integer))
-        return error("Expected integer instead got: ", Lexer.getTok());
-      if (parseOperandStartingWithInteger(true, Operands, BaseName))
-        return true;
+      if (Lexer.is(AsmToken::Integer)) {
+        parseSingleInteger(true, Operands);
+        if (checkForP2AlignIfLoadStore(Operands, Name))
+          return true;
+      } else if (Lexer.is(AsmToken::Real)) {
+        if (parseSingleFloat(true, Operands))
+          return true;
+      } else if (!parseSpecialFloatMaybe(true, Operands)) {
+      } else {
+        return error("Expected numeric constant instead got: ",
+                     Lexer.getTok());
+      }
       break;
     case AsmToken::Integer:
-      if (parseOperandStartingWithInteger(false, Operands, BaseName))
+      parseSingleInteger(false, Operands);
+      if (checkForP2AlignIfLoadStore(Operands, Name))
        return true;
      break;
    case AsmToken::Real: {
-      double Val;
-      if (Tok.getString().getAsDouble(Val, false))
-        return error("Cannot parse real: ", Tok);
-      Operands.push_back(make_unique<WebAssemblyOperand>(
-          WebAssemblyOperand::Float, Tok.getLoc(), Tok.getEndLoc(),
-          WebAssemblyOperand::FltOp{Val}));
-      Parser.Lex();
+      if (parseSingleFloat(false, Operands))
+        return true;
      break;
    }
    case AsmToken::LCurly: {
@@ -547,6 +601,17 @@ public:
     return false;
   }
 
+  bool CheckDataSection() {
+    if (CurrentState != DataSection) {
+      auto WS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first);
+      if (WS && WS->getKind().isText())
+        return error("data directive must occur in a data segment: ",
+                     Lexer.getTok());
+    }
+    CurrentState = DataSection;
+    return false;
+  }
+
   // This function processes wasm-specific directives streamed to
   // WebAssemblyTargetStreamer, all others go to the generic parser
   // (see WasmAsmParser).
@@ -561,6 +626,7 @@ public:
     auto &Out = getStreamer();
     auto &TOut =
         reinterpret_cast<WebAssemblyTargetStreamer &>(*Out.getTargetStreamer());
+    auto &Ctx = Out.getContext();
 
     // TODO: any time we return an error, at least one token must have been
     // consumed, otherwise this will not signal an error to the caller.
@@ -578,8 +644,7 @@ public:
       if (!Type)
         return error("Unknown type in .globaltype directive: ", TypeTok);
       // Now set this symbol with the correct type.
-      auto WasmSym = cast<MCSymbolWasm>(
-          TOut.getStreamer().getContext().getOrCreateSymbol(SymName));
+      auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
       WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
       WasmSym->setGlobalType(
           wasm::WasmGlobalType{uint8_t(Type.getValue()), true});
@@ -597,13 +662,13 @@ public:
       auto SymName = expectIdent();
       if (SymName.empty())
         return true;
-      auto WasmSym = cast<MCSymbolWasm>(
-          TOut.getStreamer().getContext().getOrCreateSymbol(SymName));
+      auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
       if (CurrentState == Label && WasmSym == LastLabel) {
         // This .functype indicates a start of a function.
         if (ensureEmptyNestingStack())
           return true;
         CurrentState = FunctionStart;
+        LastFunctionLabel = LastLabel;
         push(Function);
       }
       auto Signature = make_unique<wasm::WasmSignature>();
@@ -621,8 +686,7 @@ public:
       auto SymName = expectIdent();
       if (SymName.empty())
         return true;
-      auto WasmSym = cast<MCSymbolWasm>(
-          TOut.getStreamer().getContext().getOrCreateSymbol(SymName));
+      auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
       auto Signature = make_unique<wasm::WasmSignature>();
       if (parseRegTypeList(Signature->Params))
         return true;
@@ -646,6 +710,30 @@ public:
       return expect(AsmToken::EndOfStatement, "EOL");
     }
 
+    if (DirectiveID.getString() == ".int8" ||
+        DirectiveID.getString() == ".int16" ||
+        DirectiveID.getString() == ".int32" ||
+        DirectiveID.getString() == ".int64") {
+      if (CheckDataSection()) return true;
+      const MCExpr *Val;
+      SMLoc End;
+      if (Parser.parseExpression(Val, End))
+        return error("Cannot parse .int expression: ", Lexer.getTok());
+      size_t NumBits = 0;
+      DirectiveID.getString().drop_front(4).getAsInteger(10, NumBits);
+      Out.EmitValue(Val, NumBits / 8, End);
+      return expect(AsmToken::EndOfStatement, "EOL");
+    }
+
+    if (DirectiveID.getString() == ".asciz") {
+      if (CheckDataSection()) return true;
+      std::string S;
+      if (Parser.parseEscapedString(S))
+        return error("Cannot parse string constant: ", Lexer.getTok());
+      Out.EmitBytes(StringRef(S.c_str(), S.length() + 1));
+      return expect(AsmToken::EndOfStatement, "EOL");
+    }
+
     return true; // We didn't process this directive.
   }
 
@@ -667,8 +755,19 @@ public:
                  *Out.getTargetStreamer());
         TOut.emitLocal(SmallVector<wasm::ValType, 0>());
       }
-      CurrentState = Instructions;
+      // Fix unknown p2align operands.
+      auto Align = WebAssembly::GetDefaultP2AlignAny(Inst.getOpcode());
+      if (Align != -1U) {
+        auto &Op0 = Inst.getOperand(0);
+        if (Op0.getImm() == -1)
+          Op0.setImm(Align);
+      }
       Out.EmitInstruction(Inst, getSTI());
+      if (CurrentState == EndFunction) {
+        onEndOfFunction();
+      } else {
+        CurrentState = Instructions;
+      }
       return false;
     }
     case Match_MissingFeature:
@@ -694,6 +793,35 @@ public:
     llvm_unreachable("Implement any new match types added!");
   }
 
+  void doBeforeLabelEmit(MCSymbol *Symbol) override {
+    // Start a new section for the next function automatically, since our
+    // object writer expects each function to have its own section. This way
+    // the user can't forget this "convention".
+    auto SymName = Symbol->getName();
+    if (SymName.startswith(".L"))
+      return; // Local Symbol.
+    // Only create a new text section if we're already in one.
+    auto CWS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first);
+    if (!CWS || !CWS->getKind().isText())
+      return;
+    auto SecName = ".text." + SymName;
+    auto WS = getContext().getWasmSection(SecName, SectionKind::getText());
+    getStreamer().SwitchSection(WS);
+  }
+
+  void onEndOfFunction() {
+    // Automatically output a .size directive, so it becomes optional for the
+    // user.
+    if (!LastFunctionLabel) return;
+    auto TempSym = getContext().createLinkerPrivateTempSymbol();
+    getStreamer().EmitLabel(TempSym);
+    auto Start = MCSymbolRefExpr::create(LastFunctionLabel, getContext());
+    auto End = MCSymbolRefExpr::create(TempSym, getContext());
+    auto Expr =
+        MCBinaryExpr::create(MCBinaryExpr::Sub, End, Start, getContext());
+    getStreamer().emitELFSize(LastFunctionLabel, Expr);
+  }
+
   void onEndOfFile() override { ensureEmptyNestingStack(); }
 };
 } // end anonymous namespace
 
diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 6acc9b20eed2..f9bf3f85d30f 100644
--- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -1,9 +1,8 @@
 //==- WebAssemblyDisassembler.cpp - Disassembler for WebAssembly -*- C++ -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -15,7 +14,9 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/WebAssemblyInstPrinter.h"
 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "TargetInfo/WebAssemblyTargetInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCFixedLenDisassembler.h"
@@ -45,6 +46,10 @@ class WebAssemblyDisassembler final : public MCDisassembler {
                               ArrayRef<uint8_t> Bytes, uint64_t Address,
                               raw_ostream &VStream,
                               raw_ostream &CStream) const override;
+  DecodeStatus onSymbolStart(StringRef Name, uint64_t &Size,
+                             ArrayRef<uint8_t> Bytes, uint64_t Address,
+                             raw_ostream &VStream,
+                             raw_ostream &CStream) const override;
 
 public:
   WebAssemblyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
@@ -77,7 +82,7 @@ static int nextByte(ArrayRef<uint8_t> Bytes, uint64_t &Size) {
 }
 
 static bool nextLEB(int64_t &Val, ArrayRef<uint8_t> Bytes, uint64_t &Size,
-                    bool Signed = false) {
+                    bool Signed) {
   unsigned N = 0;
   const char *Error = nullptr;
   Val = Signed ? decodeSLEB128(Bytes.data() + Size, &N,
@@ -104,9 +109,8 @@ template <typename T>
 bool parseImmediate(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes) {
   if (Size + sizeof(T) > Bytes.size())
     return false;
-  T Val;
-  memcpy(&Val, Bytes.data() + Size, sizeof(T));
-  support::endian::byte_swap<T, support::little>(Val);
+  T Val = support::endian::read<T, support::little, 1>(
+      Bytes.data() + Size);
   Size += sizeof(T);
   if (std::is_floating_point<T>::value) {
     MI.addOperand(MCOperand::createFPImm(static_cast<double>(Val)));
@@ -116,6 +120,41 @@ bool parseImmediate(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes) {
   return true;
 }
 
+MCDisassembler::DecodeStatus WebAssemblyDisassembler::onSymbolStart(
+    StringRef Name, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream &VStream, raw_ostream &CStream) const {
+  Size = 0;
+  if (Address == 0) {
+    // Start of a code section: we're parsing only the function count.
+    int64_t FunctionCount;
+    if (!nextLEB(FunctionCount, Bytes, Size, false))
+      return MCDisassembler::Fail;
+    outs() << " # " << FunctionCount << " functions in section.";
+  } else {
+    // Parse the start of a single function.
+ int64_t BodySize, LocalEntryCount; + if (!nextLEB(BodySize, Bytes, Size, false) || + !nextLEB(LocalEntryCount, Bytes, Size, false)) + return MCDisassembler::Fail; + if (LocalEntryCount) { + outs() << " .local "; + for (int64_t I = 0; I < LocalEntryCount; I++) { + int64_t Count, Type; + if (!nextLEB(Count, Bytes, Size, false) || + !nextLEB(Type, Bytes, Size, false)) + return MCDisassembler::Fail; + for (int64_t J = 0; J < Count; J++) { + if (I || J) + outs() << ", "; + outs() << WebAssembly::anyTypeToString(Type); + } + } + } + } + outs() << "\n"; + return MCDisassembler::Success; +} + MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( MCInst &MI, uint64_t &Size, ArrayRef Bytes, uint64_t /*Address*/, raw_ostream & /*OS*/, raw_ostream &CS) const { @@ -138,7 +177,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( if (!WasmInst) return MCDisassembler::Fail; int64_t PrefixedOpc; - if (!nextLEB(PrefixedOpc, Bytes, Size)) + if (!nextLEB(PrefixedOpc, Bytes, Size, false)) return MCDisassembler::Fail; if (PrefixedOpc < 0 || PrefixedOpc >= WebAssemblyInstructionTableSize) return MCDisassembler::Fail; @@ -161,6 +200,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( case WebAssembly::OPERAND_OFFSET32: case WebAssembly::OPERAND_P2ALIGN: case WebAssembly::OPERAND_TYPEINDEX: + case WebAssembly::OPERAND_EVENT: case MCOI::OPERAND_IMMEDIATE: { if (!parseLEBImmediate(MI, Size, Bytes, false)) return MCDisassembler::Fail; diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp deleted file mode 100644 index 15532d7ff1a6..000000000000 --- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp +++ /dev/null @@ -1,310 +0,0 @@ -//=- WebAssemblyInstPrinter.cpp - WebAssembly assembly instruction printing -=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// Print MCInst instructions to wasm format. -/// -//===----------------------------------------------------------------------===// - -#include "InstPrinter/WebAssemblyInstPrinter.h" -#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" -#include "WebAssembly.h" -#include "WebAssemblyMachineFunctionInfo.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#include "WebAssemblyGenAsmWriter.inc" - -WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - -void WebAssemblyInstPrinter::printRegName(raw_ostream &OS, - unsigned RegNo) const { - assert(RegNo != WebAssemblyFunctionInfo::UnusedReg); - // Note that there's an implicit local.get/local.set here! - OS << "$" << RegNo; -} - -void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, - const MCSubtargetInfo &STI) { - // Print the instruction (this uses the AsmStrings from the .td files). 
- printInstruction(MI, OS); - - // Print any additional variadic operands. - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - if (Desc.isVariadic()) - for (auto i = Desc.getNumOperands(), e = MI->getNumOperands(); i < e; ++i) { - // FIXME: For CALL_INDIRECT_VOID, don't print a leading comma, because - // we have an extra flags operand which is not currently printed, for - // compatiblity reasons. - if (i != 0 && ((MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID && - MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID_S) || - i != Desc.getNumOperands())) - OS << ", "; - printOperand(MI, i, OS); - } - - // Print any added annotation. - printAnnotation(OS, Annot); - - if (CommentStream) { - // Observe any effects on the control flow stack, for use in annotating - // control flow label references. - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: - break; - - case WebAssembly::LOOP: - case WebAssembly::LOOP_S: - printAnnotation(OS, "label" + utostr(ControlFlowCounter) + ':'); - ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, true)); - break; - - case WebAssembly::BLOCK: - case WebAssembly::BLOCK_S: - ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); - break; - - case WebAssembly::TRY: - case WebAssembly::TRY_S: - ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); - EHPadStack.push_back(EHPadStackCounter++); - LastSeenEHInst = TRY; - break; - - case WebAssembly::END_LOOP: - case WebAssembly::END_LOOP_S: - if (ControlFlowStack.empty()) { - printAnnotation(OS, "End marker mismatch!"); - } else { - ControlFlowStack.pop_back(); - } - break; - - case WebAssembly::END_BLOCK: - case WebAssembly::END_BLOCK_S: - if (ControlFlowStack.empty()) { - printAnnotation(OS, "End marker mismatch!"); - } else { - printAnnotation( - OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); - } - break; - - case WebAssembly::END_TRY: - case WebAssembly::END_TRY_S: - if (ControlFlowStack.empty()) { - printAnnotation(OS, "End marker mismatch!"); - } else { - printAnnotation( - OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); - LastSeenEHInst = END_TRY; - } - break; - - case WebAssembly::CATCH_I32: - case WebAssembly::CATCH_I32_S: - case WebAssembly::CATCH_I64: - case WebAssembly::CATCH_I64_S: - case WebAssembly::CATCH_ALL: - case WebAssembly::CATCH_ALL_S: - // There can be multiple catch instructions for one try instruction, so we - // print a label only for the first 'catch' label. - if (LastSeenEHInst != CATCH) { - if (EHPadStack.empty()) { - printAnnotation(OS, "try-catch mismatch!"); - } else { - printAnnotation(OS, - "catch" + utostr(EHPadStack.pop_back_val()) + ':'); - } - } - LastSeenEHInst = CATCH; - break; - } - - // Annotate any control flow label references. - unsigned NumFixedOperands = Desc.NumOperands; - SmallSet Printed; - for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { - // See if this operand denotes a basic block target. - if (i < NumFixedOperands) { - // A non-variable_ops operand, check its type. - if (Desc.OpInfo[i].OperandType != WebAssembly::OPERAND_BASIC_BLOCK) - continue; - } else { - // A variable_ops operand, which currently can be immediates (used in - // br_table) which are basic block targets, or for call instructions - // when using -wasm-keep-registers (in which case they are registers, - // and should not be processed). 
- if (!MI->getOperand(i).isImm()) - continue; - } - uint64_t Depth = MI->getOperand(i).getImm(); - if (!Printed.insert(Depth).second) - continue; - - if (Opc == WebAssembly::RETHROW || Opc == WebAssembly::RETHROW_S) { - if (Depth > EHPadStack.size()) { - printAnnotation(OS, "Invalid depth argument!"); - } else if (Depth == EHPadStack.size()) { - // This can happen when rethrow instruction breaks out of all nests - // and throws up to the current function's caller. - printAnnotation(OS, utostr(Depth) + ": " + "to caller"); - } else { - uint64_t CatchNo = EHPadStack.rbegin()[Depth]; - printAnnotation(OS, utostr(Depth) + ": " + "down to catch" + - utostr(CatchNo)); - } - - } else { - if (Depth >= ControlFlowStack.size()) { - printAnnotation(OS, "Invalid depth argument!"); - } else { - const auto &Pair = ControlFlowStack.rbegin()[Depth]; - printAnnotation(OS, utostr(Depth) + ": " + - (Pair.second ? "up" : "down") + " to label" + - utostr(Pair.first)); - } - } - } - } -} - -static std::string toString(const APFloat &FP) { - // Print NaNs with custom payloads specially. - if (FP.isNaN() && !FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) && - !FP.bitwiseIsEqual( - APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) { - APInt AI = FP.bitcastToAPInt(); - return std::string(AI.isNegative() ? "-" : "") + "nan:0x" + - utohexstr(AI.getZExtValue() & - (AI.getBitWidth() == 32 ? INT64_C(0x007fffff) - : INT64_C(0x000fffffffffffff)), - /*LowerCase=*/true); - } - - // Use C99's hexadecimal floating-point representation. - static const size_t BufBytes = 128; - char buf[BufBytes]; - auto Written = FP.convertToHexString( - buf, /*hexDigits=*/0, /*upperCase=*/false, APFloat::rmNearestTiesToEven); - (void)Written; - assert(Written != 0); - assert(Written < BufBytes); - return buf; -} - -void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned WAReg = Op.getReg(); - if (int(WAReg) >= 0) - printRegName(O, WAReg); - else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs()) - O << "$pop" << WebAssemblyFunctionInfo::getWARegStackId(WAReg); - else if (WAReg != WebAssemblyFunctionInfo::UnusedReg) - O << "$push" << WebAssemblyFunctionInfo::getWARegStackId(WAReg); - else - O << "$drop"; - // Add a '=' suffix if this is a def. - if (OpNo < MII.get(MI->getOpcode()).getNumDefs()) - O << '='; - } else if (Op.isImm()) { - O << Op.getImm(); - } else if (Op.isFPImm()) { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - const MCOperandInfo &Info = Desc.OpInfo[OpNo]; - if (Info.OperandType == WebAssembly::OPERAND_F32IMM) { - // TODO: MC converts all floating point immediate operands to double. - // This is fine for numeric values, but may cause NaNs to change bits. 
- O << ::toString(APFloat(float(Op.getFPImm()))); - } else { - assert(Info.OperandType == WebAssembly::OPERAND_F64IMM); - O << ::toString(APFloat(Op.getFPImm())); - } - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - Op.getExpr()->print(O, &MAI); - } -} - -void WebAssemblyInstPrinter::printBrList(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << "{"; - for (unsigned I = OpNo, E = MI->getNumOperands(); I != E; ++I) { - if (I != OpNo) - O << ", "; - O << MI->getOperand(I).getImm(); - } - O << "}"; -} - -void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI, - unsigned OpNo, - raw_ostream &O) { - int64_t Imm = MI->getOperand(OpNo).getImm(); - if (Imm == WebAssembly::GetDefaultP2Align(MI->getOpcode())) - return; - O << ":p2align=" << Imm; -} - -void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, - unsigned OpNo, - raw_ostream &O) { - auto Imm = static_cast(MI->getOperand(OpNo).getImm()); - if (Imm != wasm::WASM_TYPE_NORESULT) - O << WebAssembly::anyTypeToString(Imm); -} - -// We have various enums representing a subset of these types, use this -// function to convert any of them to text. -const char *llvm::WebAssembly::anyTypeToString(unsigned Ty) { - switch (Ty) { - case wasm::WASM_TYPE_I32: - return "i32"; - case wasm::WASM_TYPE_I64: - return "i64"; - case wasm::WASM_TYPE_F32: - return "f32"; - case wasm::WASM_TYPE_F64: - return "f64"; - case wasm::WASM_TYPE_V128: - return "v128"; - case wasm::WASM_TYPE_FUNCREF: - return "funcref"; - case wasm::WASM_TYPE_FUNC: - return "func"; - case wasm::WASM_TYPE_EXCEPT_REF: - return "except_ref"; - case wasm::WASM_TYPE_NORESULT: - return "void"; - default: - return "invalid_type"; - } -} - -const char *llvm::WebAssembly::typeToString(wasm::ValType Ty) { - return anyTypeToString(static_cast(Ty)); -} diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h deleted file mode 100644 index 5ad45c7d5c7f..000000000000 --- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ /dev/null @@ -1,66 +0,0 @@ -// WebAssemblyInstPrinter.h - Print wasm MCInst to assembly syntax -*- C++ -*-// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This class prints an WebAssembly MCInst to wasm file syntax. 
-/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H -#define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H - -#include "llvm/ADT/SmallVector.h" -#include "llvm/BinaryFormat/Wasm.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/MachineValueType.h" - -namespace llvm { - -class MCSubtargetInfo; - -class WebAssemblyInstPrinter final : public MCInstPrinter { - uint64_t ControlFlowCounter = 0; - uint64_t EHPadStackCounter = 0; - SmallVector, 4> ControlFlowStack; - SmallVector EHPadStack; - - enum EHInstKind { TRY, CATCH, END_TRY }; - EHInstKind LastSeenEHInst = END_TRY; - -public: - WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI); - - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, - const MCSubtargetInfo &STI) override; - - // Used by tblegen code. - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printBrList(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O); - void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O); - - // Autogenerated by tblgen. - void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); -}; - -namespace WebAssembly { - -const char *typeToString(wasm::ValType Ty); -const char *anyTypeToString(unsigned Ty); - -} // end namespace WebAssembly - -} // end namespace llvm - -#endif diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 0726dd481174..70b409cf4a90 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyAsmBackend.cpp - WebAssembly Assembler Backend ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -36,7 +35,6 @@ class WebAssemblyAsmBackend final : public MCAsmBackend { public: explicit WebAssemblyAsmBackend(bool Is64Bit) : MCAsmBackend(support::little), Is64Bit(Is64Bit) {} - ~WebAssemblyAsmBackend() override {} unsigned getNumFixupKinds() const override { return WebAssembly::NumTargetFixupKinds; @@ -77,9 +75,9 @@ WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { // WebAssemblyFixupKinds.h. 
// // Name Offset (bits) Size (bits) Flags - {"fixup_code_sleb128_i32", 0, 5 * 8, 0}, - {"fixup_code_sleb128_i64", 0, 10 * 8, 0}, - {"fixup_code_uleb128_i32", 0, 5 * 8, 0}, + {"fixup_sleb128_i32", 0, 5 * 8, 0}, + {"fixup_sleb128_i64", 0, 10 * 8, 0}, + {"fixup_uleb128_i32", 0, 5 * 8, 0}, }; if (Kind < FirstTargetFixupKind) @@ -92,7 +90,7 @@ WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { - for (uint64_t i = 0; i < Count; ++i) + for (uint64_t I = 0; I < Count; ++I) OS << char(WebAssembly::Nop); return true; @@ -119,8 +117,8 @@ void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm, // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. - for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + for (unsigned I = 0; I != NumBytes; ++I) + Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff); } std::unique_ptr diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h index c2fac5f93a2f..33e8de282955 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h @@ -1,9 +1,8 @@ //=- WebAssemblyFixupKinds.h - WebAssembly Specific Fixup Entries -*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -15,9 +14,9 @@ namespace llvm { namespace WebAssembly { enum Fixups { - fixup_code_sleb128_i32 = FirstTargetFixupKind, // 32-bit signed - fixup_code_sleb128_i64, // 64-bit signed - fixup_code_uleb128_i32, // 32-bit unsigned + fixup_sleb128_i32 = FirstTargetFixupKind, // 32-bit signed + fixup_sleb128_i64, // 64-bit signed + fixup_uleb128_i32, // 32-bit unsigned // Marker LastTargetFixupKind, diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp new file mode 100644 index 000000000000..b5d4d369b726 --- /dev/null +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -0,0 +1,296 @@ +//=- WebAssemblyInstPrinter.cpp - WebAssembly assembly instruction printing -=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Print MCInst instructions to wasm format. 
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "WebAssemblyGenAsmWriter.inc"
+
+WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
+                                          unsigned RegNo) const {
+  assert(RegNo != WebAssemblyFunctionInfo::UnusedReg);
+  // Note that there's an implicit local.get/local.set here!
+  OS << "$" << RegNo;
+}
+
+void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                       StringRef Annot,
+                                       const MCSubtargetInfo &STI) {
+  // Print the instruction (this uses the AsmStrings from the .td files).
+  printInstruction(MI, OS);
+
+  // Print any additional variadic operands.
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  if (Desc.isVariadic())
+    for (auto I = Desc.getNumOperands(), E = MI->getNumOperands(); I < E; ++I) {
+      // FIXME: For CALL_INDIRECT_VOID, don't print a leading comma, because
+      // we have an extra flags operand which is not currently printed, for
+      // compatibility reasons.
+      if (I != 0 && ((MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID &&
+                      MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID_S) ||
+                     I != Desc.getNumOperands()))
+        OS << ", ";
+      printOperand(MI, I, OS);
+    }
+
+  // Print any added annotation.
+  printAnnotation(OS, Annot);
+
+  if (CommentStream) {
+    // Observe any effects on the control flow stack, for use in annotating
+    // control flow label references.
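+    // For example, in the annotated output (illustrative):
+    //   loop          # label0:
+    //   block
+    //   end_block     # label1:
+    // a loop is labeled where it starts and a block where it ends, since
+    // that is where a branch to each label lands.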
+ unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + break; + + case WebAssembly::LOOP: + case WebAssembly::LOOP_S: + printAnnotation(OS, "label" + utostr(ControlFlowCounter) + ':'); + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, true)); + break; + + case WebAssembly::BLOCK: + case WebAssembly::BLOCK_S: + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + break; + + case WebAssembly::TRY: + case WebAssembly::TRY_S: + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + EHPadStack.push_back(EHPadStackCounter++); + LastSeenEHInst = TRY; + break; + + case WebAssembly::END_LOOP: + case WebAssembly::END_LOOP_S: + if (ControlFlowStack.empty()) { + printAnnotation(OS, "End marker mismatch!"); + } else { + ControlFlowStack.pop_back(); + } + break; + + case WebAssembly::END_BLOCK: + case WebAssembly::END_BLOCK_S: + if (ControlFlowStack.empty()) { + printAnnotation(OS, "End marker mismatch!"); + } else { + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + } + break; + + case WebAssembly::END_TRY: + case WebAssembly::END_TRY_S: + if (ControlFlowStack.empty()) { + printAnnotation(OS, "End marker mismatch!"); + } else { + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + LastSeenEHInst = END_TRY; + } + break; + + case WebAssembly::CATCH: + case WebAssembly::CATCH_S: + if (EHPadStack.empty()) { + printAnnotation(OS, "try-catch mismatch!"); + } else { + printAnnotation(OS, "catch" + utostr(EHPadStack.pop_back_val()) + ':'); + } + break; + } + + // Annotate any control flow label references. + + // rethrow instruction does not take any depth argument and rethrows to the + // nearest enclosing catch scope, if any. If there's no enclosing catch + // scope, it throws up to the caller. + if (Opc == WebAssembly::RETHROW || Opc == WebAssembly::RETHROW_S) { + if (EHPadStack.empty()) { + printAnnotation(OS, "to caller"); + } else { + printAnnotation(OS, "down to catch" + utostr(EHPadStack.back())); + } + + } else { + unsigned NumFixedOperands = Desc.NumOperands; + SmallSet Printed; + for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I) { + // See if this operand denotes a basic block target. + if (I < NumFixedOperands) { + // A non-variable_ops operand, check its type. + if (Desc.OpInfo[I].OperandType != WebAssembly::OPERAND_BASIC_BLOCK) + continue; + } else { + // A variable_ops operand, which currently can be immediates (used in + // br_table) which are basic block targets, or for call instructions + // when using -wasm-keep-registers (in which case they are registers, + // and should not be processed). + if (!MI->getOperand(I).isImm()) + continue; + } + uint64_t Depth = MI->getOperand(I).getImm(); + if (!Printed.insert(Depth).second) + continue; + if (Depth >= ControlFlowStack.size()) { + printAnnotation(OS, "Invalid depth argument!"); + } else { + const auto &Pair = ControlFlowStack.rbegin()[Depth]; + printAnnotation(OS, utostr(Depth) + ": " + + (Pair.second ? "up" : "down") + " to label" + + utostr(Pair.first)); + } + } + } + } +} + +static std::string toString(const APFloat &FP) { + // Print NaNs with custom payloads specially. + if (FP.isNaN() && !FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) && + !FP.bitwiseIsEqual( + APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) { + APInt AI = FP.bitcastToAPInt(); + return std::string(AI.isNegative() ? "-" : "") + "nan:0x" + + utohexstr(AI.getZExtValue() & + (AI.getBitWidth() == 32 ? 
INT64_C(0x007fffff) + : INT64_C(0x000fffffffffffff)), + /*LowerCase=*/true); + } + + // Use C99's hexadecimal floating-point representation. + static const size_t BufBytes = 128; + char Buf[BufBytes]; + auto Written = FP.convertToHexString( + Buf, /*HexDigits=*/0, /*UpperCase=*/false, APFloat::rmNearestTiesToEven); + (void)Written; + assert(Written != 0); + assert(Written < BufBytes); + return Buf; +} + +void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned WAReg = Op.getReg(); + if (int(WAReg) >= 0) + printRegName(O, WAReg); + else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs()) + O << "$pop" << WebAssemblyFunctionInfo::getWARegStackId(WAReg); + else if (WAReg != WebAssemblyFunctionInfo::UnusedReg) + O << "$push" << WebAssemblyFunctionInfo::getWARegStackId(WAReg); + else + O << "$drop"; + // Add a '=' suffix if this is a def. + if (OpNo < MII.get(MI->getOpcode()).getNumDefs()) + O << '='; + } else if (Op.isImm()) { + O << Op.getImm(); + } else if (Op.isFPImm()) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + const MCOperandInfo &Info = Desc.OpInfo[OpNo]; + if (Info.OperandType == WebAssembly::OPERAND_F32IMM) { + // TODO: MC converts all floating point immediate operands to double. + // This is fine for numeric values, but may cause NaNs to change bits. + O << ::toString(APFloat(float(Op.getFPImm()))); + } else { + assert(Info.OperandType == WebAssembly::OPERAND_F64IMM); + O << ::toString(APFloat(Op.getFPImm())); + } + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + Op.getExpr()->print(O, &MAI); + } +} + +void WebAssemblyInstPrinter::printBrList(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "{"; + for (unsigned I = OpNo, E = MI->getNumOperands(); I != E; ++I) { + if (I != OpNo) + O << ", "; + O << MI->getOperand(I).getImm(); + } + O << "}"; +} + +void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI, + unsigned OpNo, + raw_ostream &O) { + int64_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm == WebAssembly::GetDefaultP2Align(MI->getOpcode())) + return; + O << ":p2align=" << Imm; +} + +void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, + unsigned OpNo, + raw_ostream &O) { + auto Imm = static_cast(MI->getOperand(OpNo).getImm()); + if (Imm != wasm::WASM_TYPE_NORESULT) + O << WebAssembly::anyTypeToString(Imm); +} + +// We have various enums representing a subset of these types, use this +// function to convert any of them to text. 
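+// For example, anyTypeToString(wasm::WASM_TYPE_I32) returns "i32", and any
+// value outside the known set maps to "invalid_type" rather than asserting.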
+const char *llvm::WebAssembly::anyTypeToString(unsigned Ty) {
+  switch (Ty) {
+  case wasm::WASM_TYPE_I32:
+    return "i32";
+  case wasm::WASM_TYPE_I64:
+    return "i64";
+  case wasm::WASM_TYPE_F32:
+    return "f32";
+  case wasm::WASM_TYPE_F64:
+    return "f64";
+  case wasm::WASM_TYPE_V128:
+    return "v128";
+  case wasm::WASM_TYPE_FUNCREF:
+    return "funcref";
+  case wasm::WASM_TYPE_FUNC:
+    return "func";
+  case wasm::WASM_TYPE_EXNREF:
+    return "exnref";
+  case wasm::WASM_TYPE_NORESULT:
+    return "void";
+  default:
+    return "invalid_type";
+  }
+}
+
+const char *llvm::WebAssembly::typeToString(wasm::ValType Ty) {
+  return anyTypeToString(static_cast<unsigned>(Ty));
+}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
new file mode 100644
index 000000000000..b979de5028bf
--- /dev/null
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
@@ -0,0 +1,65 @@
+// WebAssemblyInstPrinter.h - Print wasm MCInst to assembly syntax -*- C++ -*-//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This class prints a WebAssembly MCInst to wasm file syntax.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/MachineValueType.h"
+
+namespace llvm {
+
+class MCSubtargetInfo;
+
+class WebAssemblyInstPrinter final : public MCInstPrinter {
+  uint64_t ControlFlowCounter = 0;
+  uint64_t EHPadStackCounter = 0;
+  SmallVector<std::pair<uint64_t, bool>, 4> ControlFlowStack;
+  SmallVector<uint64_t, 4> EHPadStack;
+
+  enum EHInstKind { TRY, CATCH, END_TRY };
+  EHInstKind LastSeenEHInst = END_TRY;
+
+public:
+  WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                         const MCRegisterInfo &MRI);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Used by tblgen code.
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printBrList(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O);
+  void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O);
+
+  // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); +}; + +namespace WebAssembly { + +const char *typeToString(wasm::ValType Ty); +const char *anyTypeToString(unsigned Ty); + +} // end namespace WebAssembly + +} // end namespace llvm + +#endif diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index 44fcc129c39e..8f6531563e1b 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyMCAsmInfo.cpp - WebAssembly asm properties -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -20,7 +19,7 @@ using namespace llvm; #define DEBUG_TYPE "wasm-mc-asm-info" -WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {} +WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() = default; // anchor. WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { CodePointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4; diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h index 8627a6e40c6a..9efbbf881f59 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h @@ -1,9 +1,8 @@ //===-- WebAssemblyMCAsmInfo.h - WebAssembly asm properties -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index 065a4dc94ca6..44b6d6a968a9 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //=- WebAssemblyMCCodeEmitter.cpp - Convert WebAssembly code to machine code -// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -49,7 +48,7 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { const MCSubtargetInfo &STI) const override; public: - WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {} + WebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) : MCII(MCII) {} }; } // end anonymous namespace @@ -82,14 +81,14 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( encodeULEB128(MI.getNumOperands() - 2, OS); const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { - const MCOperand &MO = MI.getOperand(i); + for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) { + const MCOperand &MO = MI.getOperand(I); if (MO.isReg()) { /* nothing to encode */ } else if (MO.isImm()) { - if (i < Desc.getNumOperands()) { - const MCOperandInfo &Info = Desc.OpInfo[i]; + if (I < Desc.getNumOperands()) { + const MCOperandInfo &Info = Desc.OpInfo[I]; LLVM_DEBUG(dbgs() << "Encoding immediate: type=" << int(Info.OperandType) << "\n"); switch (Info.OperandType) { @@ -127,28 +126,28 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( } } else if (MO.isFPImm()) { - const MCOperandInfo &Info = Desc.OpInfo[i]; + const MCOperandInfo &Info = Desc.OpInfo[I]; if (Info.OperandType == WebAssembly::OPERAND_F32IMM) { // TODO: MC converts all floating point immediate operands to double. // This is fine for numeric values, but may cause NaNs to change bits. - float f = float(MO.getFPImm()); - support::endian::write(OS, f, support::little); + auto F = float(MO.getFPImm()); + support::endian::write(OS, F, support::little); } else { assert(Info.OperandType == WebAssembly::OPERAND_F64IMM); - double d = MO.getFPImm(); - support::endian::write(OS, d, support::little); + double D = MO.getFPImm(); + support::endian::write(OS, D, support::little); } } else if (MO.isExpr()) { - const MCOperandInfo &Info = Desc.OpInfo[i]; + const MCOperandInfo &Info = Desc.OpInfo[I]; llvm::MCFixupKind FixupKind; size_t PaddedSize = 5; switch (Info.OperandType) { case WebAssembly::OPERAND_I32IMM: - FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i32); + FixupKind = MCFixupKind(WebAssembly::fixup_sleb128_i32); break; case WebAssembly::OPERAND_I64IMM: - FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i64); + FixupKind = MCFixupKind(WebAssembly::fixup_sleb128_i64); PaddedSize = 10; break; case WebAssembly::OPERAND_FUNCTION32: @@ -156,7 +155,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( case WebAssembly::OPERAND_TYPEINDEX: case WebAssembly::OPERAND_GLOBAL: case WebAssembly::OPERAND_EVENT: - FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32); + FixupKind = MCFixupKind(WebAssembly::fixup_uleb128_i32); break; default: llvm_unreachable("unexpected symbolic operand kind"); diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 390f367c2978..9c8ca1f13b18 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyMCTargetDesc.cpp - WebAssembly Target Descriptions -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -12,10 +11,11 @@ /// //===----------------------------------------------------------------------===// -#include "WebAssemblyMCTargetDesc.h" -#include "InstPrinter/WebAssemblyInstPrinter.h" -#include "WebAssemblyMCAsmInfo.h" -#include "WebAssemblyTargetStreamer.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "MCTargetDesc/WebAssemblyInstPrinter.h" +#include "MCTargetDesc/WebAssemblyMCAsmInfo.h" +#include "MCTargetDesc/WebAssemblyTargetStreamer.h" +#include "TargetInfo/WebAssemblyTargetInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -40,13 +40,13 @@ static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/, } static MCInstrInfo *createMCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); + auto *X = new MCInstrInfo(); InitWebAssemblyMCInstrInfo(X); return X; } static MCRegisterInfo *createMCRegisterInfo(const Triple & /*T*/) { - MCRegisterInfo *X = new MCRegisterInfo(); + auto *X = new MCRegisterInfo(); InitWebAssemblyMCRegisterInfo(X, 0); return X; } @@ -146,8 +146,8 @@ wasm::ValType WebAssembly::toValType(const MVT &Ty) { case MVT::v4f32: case MVT::v2f64: return wasm::ValType::V128; - case MVT::ExceptRef: - return wasm::ValType::EXCEPT_REF; + case MVT::exnref: + return wasm::ValType::EXNREF; default: llvm_unreachable("unexpected type"); } diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index a01517fb90c3..7a9f59b1a4f2 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -1,9 +1,8 @@ //==- WebAssemblyMCTargetDesc.h - WebAssembly Target Descriptions -*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -15,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H +#include "../WebAssemblySubtarget.h" #include "llvm/BinaryFormat/Wasm.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/DataTypes.h" @@ -33,9 +33,6 @@ class Target; class Triple; class raw_pwrite_stream; -Target &getTheWebAssemblyTarget32(); -Target &getTheWebAssemblyTarget64(); - MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII); MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); @@ -90,12 +87,23 @@ namespace WebAssemblyII { enum TOF { MO_NO_FLAG = 0, - // Flags to indicate the type of the symbol being referenced - MO_SYMBOL_FUNCTION = 0x1, - MO_SYMBOL_GLOBAL = 0x2, - MO_SYMBOL_EVENT = 0x4, - MO_SYMBOL_MASK = 0x7, + // On a symbol operand this indicates that the immediate is a wasm global + // index. The value of the wasm global will be set to the symbol address at + // runtime. This adds a level of indirection similar to the GOT on native + // platforms. 
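+  // Illustrative example (symbol name hypothetical): with MO_GOT, taking
+  // the address of an external symbol becomes roughly
+  //   global.get __GOT_sym   # the linker fills this global with sym's address
+  // rather than a constant relocated at static link time.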
+  MO_GOT,
+
+  // On a symbol operand this indicates that the immediate is the symbol
+  // address relative to the __memory_base wasm global.
+  // Only applicable to data symbols.
+  MO_MEMORY_BASE_REL,
+
+  // On a symbol operand this indicates that the immediate is the symbol
+  // address relative to the __table_base wasm global.
+  // Only applicable to function symbols.
+  MO_TABLE_BASE_REL,
 };
+
 } // end namespace WebAssemblyII
 } // end namespace llvm
@@ -111,15 +119,30 @@ enum TOF {
 #define GET_INSTRINFO_ENUM
 #include "WebAssemblyGenInstrInfo.inc"
-#define GET_SUBTARGETINFO_ENUM
-#include "WebAssemblyGenSubtargetInfo.inc"
-
 namespace llvm {
 namespace WebAssembly {
+/// This is used to indicate block signatures.
+enum class ExprType : unsigned {
+  Void = 0x40,
+  I32 = 0x7F,
+  I64 = 0x7E,
+  F32 = 0x7D,
+  F64 = 0x7C,
+  V128 = 0x7B,
+  Exnref = 0x68,
+  Invalid = 0x00
+};
+
+/// Instruction opcodes emitted via means other than CodeGen.
+static const unsigned Nop = 0x01;
+static const unsigned End = 0x0b;
+
+wasm::ValType toValType(const MVT &Ty);
+
 /// Return the default p2align value for a load or store with the given opcode.
-inline unsigned GetDefaultP2Align(unsigned Opcode) {
-  switch (Opcode) {
+inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
+  switch (Opc) {
   case WebAssembly::LOAD8_S_I32:
   case WebAssembly::LOAD8_S_I32_S:
   case WebAssembly::LOAD8_U_I32:
@@ -328,35 +351,238 @@
   case WebAssembly::STORE_v2f64_S:
     return 4;
   default:
+    return -1;
+  }
+}
+
+inline unsigned GetDefaultP2Align(unsigned Opc) {
+  auto Align = GetDefaultP2AlignAny(Opc);
+  if (Align == -1U) {
     llvm_unreachable("Only loads and stores have p2align values");
   }
+  return Align;
 }
-/// The operand number of the load or store address in load/store instructions.
-static const unsigned LoadAddressOperandNo = 3;
-static const unsigned StoreAddressOperandNo = 2;
+inline bool isArgument(unsigned Opc) {
+  switch (Opc) {
+  case WebAssembly::ARGUMENT_i32:
+  case WebAssembly::ARGUMENT_i32_S:
+  case WebAssembly::ARGUMENT_i64:
+  case WebAssembly::ARGUMENT_i64_S:
+  case WebAssembly::ARGUMENT_f32:
+  case WebAssembly::ARGUMENT_f32_S:
+  case WebAssembly::ARGUMENT_f64:
+  case WebAssembly::ARGUMENT_f64_S:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v16i8_S:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v8i16_S:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4i32_S:
+  case WebAssembly::ARGUMENT_v2i64:
+  case WebAssembly::ARGUMENT_v2i64_S:
+  case WebAssembly::ARGUMENT_v4f32:
+  case WebAssembly::ARGUMENT_v4f32_S:
+  case WebAssembly::ARGUMENT_v2f64:
+  case WebAssembly::ARGUMENT_v2f64_S:
+  case WebAssembly::ARGUMENT_exnref:
+  case WebAssembly::ARGUMENT_exnref_S:
+    return true;
+  default:
+    return false;
+  }
+}
-/// The operand number of the load or store p2align in load/store instructions.
-static const unsigned LoadP2AlignOperandNo = 1;
-static const unsigned StoreP2AlignOperandNo = 0;
+inline bool isCopy(unsigned Opc) {
+  switch (Opc) {
+  case WebAssembly::COPY_I32:
+  case WebAssembly::COPY_I32_S:
+  case WebAssembly::COPY_I64:
+  case WebAssembly::COPY_I64_S:
+  case WebAssembly::COPY_F32:
+  case WebAssembly::COPY_F32_S:
+  case WebAssembly::COPY_F64:
+  case WebAssembly::COPY_F64_S:
+  case WebAssembly::COPY_V128:
+  case WebAssembly::COPY_V128_S:
+  case WebAssembly::COPY_EXNREF:
+  case WebAssembly::COPY_EXNREF_S:
+    return true;
+  default:
+    return false;
+  }
+}
-/// This is used to indicate block signatures.
-enum class ExprType : unsigned { - Void = 0x40, - I32 = 0x7F, - I64 = 0x7E, - F32 = 0x7D, - F64 = 0x7C, - V128 = 0x7B, - ExceptRef = 0x68, - Invalid = 0x00 -}; +inline bool isTee(unsigned Opc) { + switch (Opc) { + case WebAssembly::TEE_I32: + case WebAssembly::TEE_I32_S: + case WebAssembly::TEE_I64: + case WebAssembly::TEE_I64_S: + case WebAssembly::TEE_F32: + case WebAssembly::TEE_F32_S: + case WebAssembly::TEE_F64: + case WebAssembly::TEE_F64_S: + case WebAssembly::TEE_V128: + case WebAssembly::TEE_V128_S: + case WebAssembly::TEE_EXNREF: + case WebAssembly::TEE_EXNREF_S: + return true; + default: + return false; + } +} -/// Instruction opcodes emitted via means other than CodeGen. -static const unsigned Nop = 0x01; -static const unsigned End = 0x0b; +inline bool isCallDirect(unsigned Opc) { + switch (Opc) { + case WebAssembly::CALL_VOID: + case WebAssembly::CALL_VOID_S: + case WebAssembly::CALL_i32: + case WebAssembly::CALL_i32_S: + case WebAssembly::CALL_i64: + case WebAssembly::CALL_i64_S: + case WebAssembly::CALL_f32: + case WebAssembly::CALL_f32_S: + case WebAssembly::CALL_f64: + case WebAssembly::CALL_f64_S: + case WebAssembly::CALL_v16i8: + case WebAssembly::CALL_v16i8_S: + case WebAssembly::CALL_v8i16: + case WebAssembly::CALL_v8i16_S: + case WebAssembly::CALL_v4i32: + case WebAssembly::CALL_v4i32_S: + case WebAssembly::CALL_v2i64: + case WebAssembly::CALL_v2i64_S: + case WebAssembly::CALL_v4f32: + case WebAssembly::CALL_v4f32_S: + case WebAssembly::CALL_v2f64: + case WebAssembly::CALL_v2f64_S: + case WebAssembly::CALL_exnref: + case WebAssembly::CALL_exnref_S: + case WebAssembly::RET_CALL: + case WebAssembly::RET_CALL_S: + return true; + default: + return false; + } +} -wasm::ValType toValType(const MVT &Ty); +inline bool isCallIndirect(unsigned Opc) { + switch (Opc) { + case WebAssembly::CALL_INDIRECT_VOID: + case WebAssembly::CALL_INDIRECT_VOID_S: + case WebAssembly::CALL_INDIRECT_i32: + case WebAssembly::CALL_INDIRECT_i32_S: + case WebAssembly::CALL_INDIRECT_i64: + case WebAssembly::CALL_INDIRECT_i64_S: + case WebAssembly::CALL_INDIRECT_f32: + case WebAssembly::CALL_INDIRECT_f32_S: + case WebAssembly::CALL_INDIRECT_f64: + case WebAssembly::CALL_INDIRECT_f64_S: + case WebAssembly::CALL_INDIRECT_v16i8: + case WebAssembly::CALL_INDIRECT_v16i8_S: + case WebAssembly::CALL_INDIRECT_v8i16: + case WebAssembly::CALL_INDIRECT_v8i16_S: + case WebAssembly::CALL_INDIRECT_v4i32: + case WebAssembly::CALL_INDIRECT_v4i32_S: + case WebAssembly::CALL_INDIRECT_v2i64: + case WebAssembly::CALL_INDIRECT_v2i64_S: + case WebAssembly::CALL_INDIRECT_v4f32: + case WebAssembly::CALL_INDIRECT_v4f32_S: + case WebAssembly::CALL_INDIRECT_v2f64: + case WebAssembly::CALL_INDIRECT_v2f64_S: + case WebAssembly::CALL_INDIRECT_exnref: + case WebAssembly::CALL_INDIRECT_exnref_S: + case WebAssembly::RET_CALL_INDIRECT: + case WebAssembly::RET_CALL_INDIRECT_S: + return true; + default: + return false; + } +} + +/// Returns the operand number of a callee, assuming the argument is a call +/// instruction. 
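+/// For example, CALL_VOID produces no value, so its callee is operand 0,
+/// whereas CALL_i32 defines its result as operand 0 and its callee is
+/// therefore operand 1; the switch below encodes exactly this.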
+inline unsigned getCalleeOpNo(unsigned Opc) { + switch (Opc) { + case WebAssembly::CALL_VOID: + case WebAssembly::CALL_VOID_S: + case WebAssembly::CALL_INDIRECT_VOID: + case WebAssembly::CALL_INDIRECT_VOID_S: + case WebAssembly::RET_CALL: + case WebAssembly::RET_CALL_S: + case WebAssembly::RET_CALL_INDIRECT: + case WebAssembly::RET_CALL_INDIRECT_S: + return 0; + case WebAssembly::CALL_i32: + case WebAssembly::CALL_i32_S: + case WebAssembly::CALL_i64: + case WebAssembly::CALL_i64_S: + case WebAssembly::CALL_f32: + case WebAssembly::CALL_f32_S: + case WebAssembly::CALL_f64: + case WebAssembly::CALL_f64_S: + case WebAssembly::CALL_v16i8: + case WebAssembly::CALL_v16i8_S: + case WebAssembly::CALL_v8i16: + case WebAssembly::CALL_v8i16_S: + case WebAssembly::CALL_v4i32: + case WebAssembly::CALL_v4i32_S: + case WebAssembly::CALL_v2i64: + case WebAssembly::CALL_v2i64_S: + case WebAssembly::CALL_v4f32: + case WebAssembly::CALL_v4f32_S: + case WebAssembly::CALL_v2f64: + case WebAssembly::CALL_v2f64_S: + case WebAssembly::CALL_exnref: + case WebAssembly::CALL_exnref_S: + case WebAssembly::CALL_INDIRECT_i32: + case WebAssembly::CALL_INDIRECT_i32_S: + case WebAssembly::CALL_INDIRECT_i64: + case WebAssembly::CALL_INDIRECT_i64_S: + case WebAssembly::CALL_INDIRECT_f32: + case WebAssembly::CALL_INDIRECT_f32_S: + case WebAssembly::CALL_INDIRECT_f64: + case WebAssembly::CALL_INDIRECT_f64_S: + case WebAssembly::CALL_INDIRECT_v16i8: + case WebAssembly::CALL_INDIRECT_v16i8_S: + case WebAssembly::CALL_INDIRECT_v8i16: + case WebAssembly::CALL_INDIRECT_v8i16_S: + case WebAssembly::CALL_INDIRECT_v4i32: + case WebAssembly::CALL_INDIRECT_v4i32_S: + case WebAssembly::CALL_INDIRECT_v2i64: + case WebAssembly::CALL_INDIRECT_v2i64_S: + case WebAssembly::CALL_INDIRECT_v4f32: + case WebAssembly::CALL_INDIRECT_v4f32_S: + case WebAssembly::CALL_INDIRECT_v2f64: + case WebAssembly::CALL_INDIRECT_v2f64_S: + case WebAssembly::CALL_INDIRECT_exnref: + case WebAssembly::CALL_INDIRECT_exnref_S: + return 1; + default: + llvm_unreachable("Not a call instruction"); + } +} + +inline bool isMarker(unsigned Opc) { + switch (Opc) { + case WebAssembly::BLOCK: + case WebAssembly::BLOCK_S: + case WebAssembly::END_BLOCK: + case WebAssembly::END_BLOCK_S: + case WebAssembly::LOOP: + case WebAssembly::LOOP_S: + case WebAssembly::END_LOOP: + case WebAssembly::END_LOOP_S: + case WebAssembly::TRY: + case WebAssembly::TRY_S: + case WebAssembly::END_TRY: + case WebAssembly::END_TRY_S: + return true; + default: + return false; + } +} } // end namespace WebAssembly } // end namespace llvm diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp index 50143fb0ece3..e05efef7201b 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -1,9 +1,8 @@ //==-- WebAssemblyTargetStreamer.cpp - WebAssembly Target Streamer Methods --=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -13,9 +12,9 @@ /// //===----------------------------------------------------------------------===// -#include "WebAssemblyTargetStreamer.h" -#include "InstPrinter/WebAssemblyInstPrinter.h" -#include "WebAssemblyMCTargetDesc.h" +#include "MCTargetDesc/WebAssemblyTargetStreamer.h" +#include "MCTargetDesc/WebAssemblyInstPrinter.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -113,8 +112,15 @@ void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) { } void WebAssemblyTargetAsmStreamer::emitImportModule(const MCSymbolWasm *Sym, - StringRef ModuleName) { - OS << "\t.import_module\t" << Sym->getName() << ", " << ModuleName << '\n'; + StringRef ImportModule) { + OS << "\t.import_module\t" << Sym->getName() << ", " + << ImportModule << '\n'; +} + +void WebAssemblyTargetAsmStreamer::emitImportName(const MCSymbolWasm *Sym, + StringRef ImportName) { + OS << "\t.import_name\t" << Sym->getName() << ", " + << ImportName << '\n'; } void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) { diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h index 3073938118b4..5ea62b179d22 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -1,9 +1,8 @@ //==-- WebAssemblyTargetStreamer.h - WebAssembly Target Streamer -*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -45,7 +44,10 @@ public: virtual void emitEventType(const MCSymbolWasm *Sym) = 0; /// .import_module virtual void emitImportModule(const MCSymbolWasm *Sym, - StringRef ModuleName) = 0; + StringRef ImportModule) = 0; + /// .import_name + virtual void emitImportName(const MCSymbolWasm *Sym, + StringRef ImportName) = 0; protected: void emitValueType(wasm::ValType Type); @@ -67,7 +69,8 @@ public: void emitIndIdx(const MCExpr *Value) override; void emitGlobalType(const MCSymbolWasm *Sym) override; void emitEventType(const MCSymbolWasm *Sym) override; - void emitImportModule(const MCSymbolWasm *Sym, StringRef ModuleName) override; + void emitImportModule(const MCSymbolWasm *Sym, StringRef ImportModule) override; + void emitImportName(const MCSymbolWasm *Sym, StringRef ImportName) override; }; /// This part is for Wasm object output @@ -82,7 +85,9 @@ public: void emitGlobalType(const MCSymbolWasm *Sym) override {} void emitEventType(const MCSymbolWasm *Sym) override {} void emitImportModule(const MCSymbolWasm *Sym, - StringRef ModuleName) override {} + StringRef ImportModule) override {} + void emitImportName(const MCSymbolWasm *Sym, + StringRef ImportName) override {} }; /// This part is for null output @@ -98,6 +103,7 @@ public: void emitGlobalType(const MCSymbolWasm *) override {} void emitEventType(const MCSymbolWasm *) override {} void emitImportModule(const MCSymbolWasm *, StringRef) override {} + void emitImportName(const MCSymbolWasm *, StringRef) override {} }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 763e30be8e02..a1cc3e268e8f 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyWasmObjectWriter.cpp - WebAssembly Wasm Writer ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -43,26 +42,7 @@ private: WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit) : MCWasmObjectTargetWriter(Is64Bit) {} -// Test whether the given expression computes a function address. 
-static bool IsFunctionExpr(const MCExpr *Expr) { - if (auto SyExp = dyn_cast(Expr)) - return cast(SyExp->getSymbol()).isFunction(); - - if (auto BinOp = dyn_cast(Expr)) - return IsFunctionExpr(BinOp->getLHS()) != IsFunctionExpr(BinOp->getRHS()); - - if (auto UnOp = dyn_cast(Expr)) - return IsFunctionExpr(UnOp->getSubExpr()); - - return false; -} - -static bool IsFunctionType(const MCValue &Target) { - const MCSymbolRefExpr *RefA = Target.getSymA(); - return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX; -} - -static const MCSection *GetFixupSection(const MCExpr *Expr) { +static const MCSection *getFixupSection(const MCExpr *Expr) { if (auto SyExp = dyn_cast(Expr)) { if (SyExp->getSymbol().isInSection()) return &SyExp->getSymbol().getSection(); @@ -70,63 +50,66 @@ static const MCSection *GetFixupSection(const MCExpr *Expr) { } if (auto BinOp = dyn_cast(Expr)) { - auto SectionLHS = GetFixupSection(BinOp->getLHS()); - auto SectionRHS = GetFixupSection(BinOp->getRHS()); + auto SectionLHS = getFixupSection(BinOp->getLHS()); + auto SectionRHS = getFixupSection(BinOp->getRHS()); return SectionLHS == SectionRHS ? nullptr : SectionLHS; } if (auto UnOp = dyn_cast(Expr)) - return GetFixupSection(UnOp->getSubExpr()); + return getFixupSection(UnOp->getSubExpr()); return nullptr; } -static bool IsGlobalType(const MCValue &Target) { - const MCSymbolRefExpr *RefA = Target.getSymA(); - return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_GLOBAL; -} - -static bool IsEventType(const MCValue &Target) { - const MCSymbolRefExpr *RefA = Target.getSymA(); - return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_EVENT; -} - unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target, const MCFixup &Fixup) const { - // WebAssembly functions are not allocated in the data address space. To - // resolve a pointer to a function, we must use a special relocation type. 
- bool IsFunction = IsFunctionExpr(Fixup.getValue()); + const MCSymbolRefExpr *RefA = Target.getSymA(); + assert(RefA); + auto& SymA = cast(RefA->getSymbol()); + + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); + + switch (Modifier) { + case MCSymbolRefExpr::VK_GOT: + return wasm::R_WASM_GLOBAL_INDEX_LEB; + case MCSymbolRefExpr::VK_WASM_TBREL: + assert(SymA.isFunction()); + return wasm::R_WASM_TABLE_INDEX_REL_SLEB; + case MCSymbolRefExpr::VK_WASM_MBREL: + assert(SymA.isData()); + return wasm::R_WASM_MEMORY_ADDR_REL_SLEB; + case MCSymbolRefExpr::VK_WASM_TYPEINDEX: + return wasm::R_WASM_TYPE_INDEX_LEB; + default: + break; + } switch (unsigned(Fixup.getKind())) { - case WebAssembly::fixup_code_sleb128_i32: - if (IsFunction) - return wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB; - return wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB; - case WebAssembly::fixup_code_sleb128_i64: + case WebAssembly::fixup_sleb128_i32: + if (SymA.isFunction()) + return wasm::R_WASM_TABLE_INDEX_SLEB; + return wasm::R_WASM_MEMORY_ADDR_SLEB; + case WebAssembly::fixup_sleb128_i64: llvm_unreachable("fixup_sleb128_i64 not implemented yet"); - case WebAssembly::fixup_code_uleb128_i32: - if (IsGlobalType(Target)) - return wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB; - if (IsFunctionType(Target)) - return wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB; - if (IsFunction) - return wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB; - if (IsEventType(Target)) - return wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB; - return wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB; + case WebAssembly::fixup_uleb128_i32: + if (SymA.isGlobal()) + return wasm::R_WASM_GLOBAL_INDEX_LEB; + if (SymA.isFunction()) + return wasm::R_WASM_FUNCTION_INDEX_LEB; + if (SymA.isEvent()) + return wasm::R_WASM_EVENT_INDEX_LEB; + return wasm::R_WASM_MEMORY_ADDR_LEB; case FK_Data_4: - if (IsFunction) - return wasm::R_WEBASSEMBLY_TABLE_INDEX_I32; + if (SymA.isFunction()) + return wasm::R_WASM_TABLE_INDEX_I32; if (auto Section = static_cast( - GetFixupSection(Fixup.getValue()))) { + getFixupSection(Fixup.getValue()))) { if (Section->getKind().isText()) - return wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32; + return wasm::R_WASM_FUNCTION_OFFSET_I32; else if (!Section->isWasmData()) - return wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32; + return wasm::R_WASM_SECTION_OFFSET_I32; } - return wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32; - case FK_Data_8: - llvm_unreachable("FK_Data_8 not implemented yet"); + return wasm::R_WASM_MEMORY_ADDR_I32; default: llvm_unreachable("unimplemented fixup kind"); } diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt index a154b4bf7ea8..ef3f5aaf7d33 100644 --- a/lib/Target/WebAssembly/README.txt +++ b/lib/Target/WebAssembly/README.txt @@ -14,7 +14,7 @@ can run in browsers and other environments. For more information, see the Emscripten documentation in general, and this page in particular: * https://github.com/kripken/emscripten/wiki/New-WebAssembly-Backend - + Rust provides WebAssembly support integrated into Cargo. 
There are two main options: - wasm32-unknown-unknown, which provides a relatively minimal environment diff --git a/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp index f7a417c0ed49..e4afe2bb2830 100644 --- a/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp +++ b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyTargetInfo.cpp - WebAssembly Target Implementation -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -12,8 +11,7 @@ /// //===----------------------------------------------------------------------===// -#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" -#include "llvm/ADT/Triple.h" +#include "TargetInfo/WebAssemblyTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h new file mode 100644 index 000000000000..a7427f78c72c --- /dev/null +++ b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h @@ -0,0 +1,26 @@ +//===-- WebAssemblyTargetInfo.h - WebAssembly Target Impl -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file registers the WebAssembly target. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_TARGETINFO_WEBASSEMBLYTARGETINFO_H +#define LLVM_LIB_TARGET_WEBASSEMBLY_TARGETINFO_WEBASSEMBLYTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheWebAssemblyTarget32(); +Target &getTheWebAssemblyTarget64(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_WEBASSEMBLY_TARGETINFO_WEBASSEMBLYTARGETINFO_H diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h index 45145c0a6527..fcbd0a5082ff 100644 --- a/lib/Target/WebAssembly/WebAssembly.h +++ b/lib/Target/WebAssembly/WebAssembly.h @@ -1,9 +1,8 @@ //===-- WebAssembly.h - Top-level interface for WebAssembly ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -39,18 +38,17 @@ FunctionPass *createWebAssemblyArgumentMove(); FunctionPass *createWebAssemblySetP2AlignOperands(); // Late passes. 
-FunctionPass *createWebAssemblyEHRestoreStackPointer(); FunctionPass *createWebAssemblyReplacePhysRegs(); FunctionPass *createWebAssemblyPrepareForLiveIntervals(); FunctionPass *createWebAssemblyOptimizeLiveIntervals(); FunctionPass *createWebAssemblyMemIntrinsicResults(); FunctionPass *createWebAssemblyRegStackify(); FunctionPass *createWebAssemblyRegColoring(); -FunctionPass *createWebAssemblyExplicitLocals(); FunctionPass *createWebAssemblyFixIrreducibleControlFlow(); FunctionPass *createWebAssemblyLateEHPrepare(); FunctionPass *createWebAssemblyCFGSort(); FunctionPass *createWebAssemblyCFGStackify(); +FunctionPass *createWebAssemblyExplicitLocals(); FunctionPass *createWebAssemblyLowerBrUnless(); FunctionPass *createWebAssemblyRegNumbering(); FunctionPass *createWebAssemblyPeephole(); @@ -64,19 +62,18 @@ void initializeFixFunctionBitcastsPass(PassRegistry &); void initializeOptimizeReturnedPass(PassRegistry &); void initializeWebAssemblyArgumentMovePass(PassRegistry &); void initializeWebAssemblySetP2AlignOperandsPass(PassRegistry &); -void initializeWebAssemblyEHRestoreStackPointerPass(PassRegistry &); void initializeWebAssemblyReplacePhysRegsPass(PassRegistry &); void initializeWebAssemblyPrepareForLiveIntervalsPass(PassRegistry &); void initializeWebAssemblyOptimizeLiveIntervalsPass(PassRegistry &); void initializeWebAssemblyMemIntrinsicResultsPass(PassRegistry &); void initializeWebAssemblyRegStackifyPass(PassRegistry &); void initializeWebAssemblyRegColoringPass(PassRegistry &); -void initializeWebAssemblyExplicitLocalsPass(PassRegistry &); void initializeWebAssemblyFixIrreducibleControlFlowPass(PassRegistry &); void initializeWebAssemblyLateEHPreparePass(PassRegistry &); void initializeWebAssemblyExceptionInfoPass(PassRegistry &); void initializeWebAssemblyCFGSortPass(PassRegistry &); void initializeWebAssemblyCFGStackifyPass(PassRegistry &); +void initializeWebAssemblyExplicitLocalsPass(PassRegistry &); void initializeWebAssemblyLowerBrUnlessPass(PassRegistry &); void initializeWebAssemblyRegNumberingPass(PassRegistry &); void initializeWebAssemblyPeepholePass(PassRegistry &); diff --git a/lib/Target/WebAssembly/WebAssembly.td b/lib/Target/WebAssembly/WebAssembly.td index 6b218f8aa880..b0b8a9b996a3 100644 --- a/lib/Target/WebAssembly/WebAssembly.td +++ b/lib/Target/WebAssembly/WebAssembly.td @@ -1,9 +1,8 @@ //- WebAssembly.td - Describe the WebAssembly Target Machine --*- tablegen -*-// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -34,6 +33,7 @@ def FeatureUnimplementedSIMD128 : def FeatureAtomics : SubtargetFeature<"atomics", "HasAtomics", "true", "Enable Atomics">; + def FeatureNontrappingFPToInt : SubtargetFeature<"nontrapping-fptoint", "HasNontrappingFPToInt", "true", @@ -44,10 +44,28 @@ def FeatureSignExt : "HasSignExt", "true", "Enable sign extension operators">; +def FeatureTailCall : + SubtargetFeature<"tail-call", + "HasTailCall", "true", + "Enable tail call instructions">; + def FeatureExceptionHandling : SubtargetFeature<"exception-handling", "HasExceptionHandling", "true", "Enable Wasm exception handling">; +def FeatureBulkMemory : + SubtargetFeature<"bulk-memory", "HasBulkMemory", "true", + "Enable bulk memory operations">; + +def FeatureMultivalue : + SubtargetFeature<"multivalue", + "HasMultivalue", "true", + "Enable multivalue blocks, instructions, and functions">; + +def FeatureMutableGlobals : + SubtargetFeature<"mutable-globals", "HasMutableGlobals", "true", + "Enable mutable globals">; + //===----------------------------------------------------------------------===// // Architectures. //===----------------------------------------------------------------------===// @@ -79,7 +97,8 @@ def : ProcessorModel<"generic", NoSchedModel, []>; // Latest and greatest experimental version of WebAssembly. Bugs included! def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureSIMD128, FeatureAtomics, - FeatureNontrappingFPToInt, FeatureSignExt]>; + FeatureNontrappingFPToInt, FeatureSignExt, + FeatureMutableGlobals]>; //===----------------------------------------------------------------------===// // Target Declaration diff --git a/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp index e49e2b67f435..b7a701f15782 100644 --- a/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyAddMissingPrototypes.cpp - Fix prototypeless functions -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -79,32 +78,33 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) { report_fatal_error( "Functions with 'no-prototype' attribute must take varargs: " + F.getName()); - if (F.getFunctionType()->getNumParams() != 0) - report_fatal_error( - "Functions with 'no-prototype' attribute should not have params: " + - F.getName()); + unsigned NumParams = F.getFunctionType()->getNumParams(); + if (NumParams != 0) { + if (!(NumParams == 1 && F.arg_begin()->hasStructRetAttr())) + report_fatal_error("Functions with 'no-prototype' attribute should " + "not have params: " + + F.getName()); + } // Create a function prototype based on the first call site (first bitcast) // that we find. 
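    // For example (illustrative C input), a K&R-style declaration such as
    //   int foo();
    // reaches this pass as a varargs declaration carrying the "no-prototype"
    // attribute, and its real signature is recovered from the type of the
    // first bitcast use seen below.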
    FunctionType *NewType = nullptr;
-    Function *NewF = nullptr;
     for (Use &U : F.uses()) {
       LLVM_DEBUG(dbgs() << "prototype-less use: " << F.getName() << "\n");
+      LLVM_DEBUG(dbgs() << *U.getUser() << "\n");
       if (auto *BC = dyn_cast<BitCastOperator>(U.getUser())) {
         if (auto *DestType = dyn_cast<FunctionType>(
                 BC->getDestTy()->getPointerElementType())) {
           if (!NewType) {
             // Create a new function with the correct type
             NewType = DestType;
-            NewF = Function::Create(NewType, F.getLinkage(), F.getName());
-            NewF->setAttributes(F.getAttributes());
-            NewF->removeFnAttr("no-prototype");
-          } else {
-            if (NewType != DestType) {
-              report_fatal_error("Prototypeless function used with "
-                                 "conflicting signatures: " +
-                                 F.getName());
-            }
+            LLVM_DEBUG(dbgs() << "found function type: " << *NewType << "\n");
+          } else if (NewType != DestType) {
+            errs() << "warning: prototype-less function used with "
+                      "conflicting signatures: "
+                   << F.getName() << "\n";
+            LLVM_DEBUG(dbgs() << "  " << *DestType << "\n");
+            LLVM_DEBUG(dbgs() << "  " << *NewType << "\n");
           }
         }
       }
@@ -114,47 +114,30 @@
       LLVM_DEBUG(
           dbgs() << "could not derive a function prototype from usage: " +
                         F.getName() + "\n");
-      continue;
+      // We could not derive a type for this function. In this case strip
+      // the isVarArg and make it a simple zero-arg function. This has a better
+      // chance of being correct. The current signature of (...) is illegal in
+      // C since it doesn't have any arguments before the "...", so this at
+      // least makes it possible for this symbol to be resolved by the linker.
+      NewType = FunctionType::get(F.getFunctionType()->getReturnType(), false);
     }
-    SmallVector<Instruction *, 4> DeadInsts;
-
-    for (Use &US : F.uses()) {
-      User *U = US.getUser();
-      if (auto *BC = dyn_cast<BitCastOperator>(U)) {
-        if (auto *Inst = dyn_cast<Instruction>(U)) {
-          // Replace with a new bitcast
-          IRBuilder<> Builder(Inst);
-          Value *NewCast = Builder.CreatePointerCast(NewF, BC->getDestTy());
-          Inst->replaceAllUsesWith(NewCast);
-          DeadInsts.push_back(Inst);
-        } else if (auto *Const = dyn_cast<ConstantExpr>(U)) {
-          Constant *NewConst =
-              ConstantExpr::getPointerCast(NewF, BC->getDestTy());
-          Const->replaceAllUsesWith(NewConst);
-        } else {
-          dbgs() << *U->getType() << "\n";
-#ifndef NDEBUG
-          U->dump();
-#endif
-          report_fatal_error("unexpected use of prototypeless function: " +
-                             F.getName() + "\n");
-        }
-      }
-    }
-
-    for (auto I : DeadInsts)
-      I->eraseFromParent();
+    Function *NewF =
+        Function::Create(NewType, F.getLinkage(), F.getName() + ".fixed_sig");
+    NewF->setAttributes(F.getAttributes());
+    NewF->removeFnAttr("no-prototype");
     Replacements.emplace_back(&F, NewF);
   }
-
-  // Finally replace the old function declarations with the new ones
   for (auto &Pair : Replacements) {
-    Function *Old = Pair.first;
-    Function *New = Pair.second;
-    Old->eraseFromParent();
-    M.getFunctionList().push_back(New);
+    Function *OldF = Pair.first;
+    Function *NewF = Pair.second;
+    std::string Name = OldF->getName();
+    M.getFunctionList().push_back(NewF);
+    OldF->replaceAllUsesWith(
+        ConstantExpr::getPointerBitCastOrAddrSpaceCast(NewF, OldF->getType()));
+    OldF->eraseFromParent();
+    NewF->setName(Name);
   }
 
   return !Replacements.empty();
diff --git a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
index 7c8a631cde8a..02f5cc6da77c 100644
--- a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -1,9 +1,8 @@
 //===-- WebAssemblyArgumentMove.cpp - Argument instruction moving ---------===//
 //
-// The LLVM Compiler
Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -79,7 +78,7 @@ bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) { // Look for the first NonArg instruction. for (MachineInstr &MI : EntryMBB) { - if (!WebAssembly::isArgument(MI)) { + if (!WebAssembly::isArgument(MI.getOpcode())) { InsertPt = MI; break; } @@ -88,7 +87,7 @@ bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) { // Now move any argument instructions later in the block // to before our first NonArg instruction. for (MachineInstr &MI : llvm::make_range(InsertPt, EntryMBB.end())) { - if (WebAssembly::isArgument(MI)) { + if (WebAssembly::isArgument(MI.getOpcode())) { EntryMBB.insert(InsertPt, MI.removeFromParent()); Changed = true; } diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index c4f03dfa7f9e..7f9d41da3978 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyAsmPrinter.cpp - WebAssembly LLVM assembly writer ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -15,21 +14,27 @@ //===----------------------------------------------------------------------===// #include "WebAssemblyAsmPrinter.h" -#include "InstPrinter/WebAssemblyInstPrinter.h" +#include "MCTargetDesc/WebAssemblyInstPrinter.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "MCTargetDesc/WebAssemblyTargetStreamer.h" +#include "TargetInfo/WebAssemblyTargetInfo.h" #include "WebAssembly.h" #include "WebAssemblyMCInstLower.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblyRegisterInfo.h" +#include "WebAssemblyTargetMachine.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/BinaryFormat/Wasm.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Metadata.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCStreamer.h" @@ -38,10 +43,13 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" + using namespace llvm; #define DEBUG_TYPE "asm-printer" +extern cl::opt WasmKeepRegisters; + //===----------------------------------------------------------------------===// // Helpers. 
//===----------------------------------------------------------------------===// @@ -92,11 +100,11 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { if (F.isDeclarationForLinker() && !F.isIntrinsic()) { SmallVector Results; SmallVector Params; - ComputeSignatureVTs(F.getFunctionType(), F, TM, Params, Results); + computeSignatureVTs(F.getFunctionType(), F, TM, Params, Results); auto *Sym = cast(getSymbol(&F)); Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); if (!Sym->getSignature()) { - auto Signature = SignatureFromMVTs(Results, Params); + auto Signature = signatureFromMVTs(Results, Params); Sym->setSignature(Signature.get()); addSignature(std::move(Signature)); } @@ -111,9 +119,16 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { F.hasFnAttribute("wasm-import-module")) { StringRef Name = F.getFnAttribute("wasm-import-module").getValueAsString(); - Sym->setModuleName(Name); + Sym->setImportModule(Name); getTargetStreamer()->emitImportModule(Sym, Name); } + if (TM.getTargetTriple().isOSBinFormatWasm() && + F.hasFnAttribute("wasm-import-name")) { + StringRef Name = + F.getFnAttribute("wasm-import-name").getValueAsString(); + Sym->setImportName(Name); + getTargetStreamer()->emitImportName(Sym, Name); + } } } @@ -129,7 +144,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { if (const NamedMDNode *Named = M.getNamedMetadata("wasm.custom_sections")) { for (const Metadata *MD : Named->operands()) { - const MDTuple *Tuple = dyn_cast(MD); + const auto *Tuple = dyn_cast(MD); if (!Tuple || Tuple->getNumOperands() != 2) continue; const MDString *Name = dyn_cast(Tuple->getOperand(0)); @@ -139,13 +154,117 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer->PushSection(); std::string SectionName = (".custom_section." 
+ Name->getString()).str(); - MCSectionWasm *mySection = + MCSectionWasm *MySection = OutContext.getWasmSection(SectionName, SectionKind::getMetadata()); - OutStreamer->SwitchSection(mySection); + OutStreamer->SwitchSection(MySection); OutStreamer->EmitBytes(Contents->getString()); OutStreamer->PopSection(); } } + + EmitProducerInfo(M); + EmitTargetFeatures(M); +} + +void WebAssemblyAsmPrinter::EmitProducerInfo(Module &M) { + llvm::SmallVector, 4> Languages; + if (const NamedMDNode *Debug = M.getNamedMetadata("llvm.dbg.cu")) { + llvm::SmallSet SeenLanguages; + for (size_t I = 0, E = Debug->getNumOperands(); I < E; ++I) { + const auto *CU = cast(Debug->getOperand(I)); + StringRef Language = dwarf::LanguageString(CU->getSourceLanguage()); + Language.consume_front("DW_LANG_"); + if (SeenLanguages.insert(Language).second) + Languages.emplace_back(Language.str(), ""); + } + } + + llvm::SmallVector, 4> Tools; + if (const NamedMDNode *Ident = M.getNamedMetadata("llvm.ident")) { + llvm::SmallSet SeenTools; + for (size_t I = 0, E = Ident->getNumOperands(); I < E; ++I) { + const auto *S = cast(Ident->getOperand(I)->getOperand(0)); + std::pair Field = S->getString().split("version"); + StringRef Name = Field.first.trim(); + StringRef Version = Field.second.trim(); + if (SeenTools.insert(Name).second) + Tools.emplace_back(Name.str(), Version.str()); + } + } + + int FieldCount = int(!Languages.empty()) + int(!Tools.empty()); + if (FieldCount != 0) { + MCSectionWasm *Producers = OutContext.getWasmSection( + ".custom_section.producers", SectionKind::getMetadata()); + OutStreamer->PushSection(); + OutStreamer->SwitchSection(Producers); + OutStreamer->EmitULEB128IntValue(FieldCount); + for (auto &Producers : {std::make_pair("language", &Languages), + std::make_pair("processed-by", &Tools)}) { + if (Producers.second->empty()) + continue; + OutStreamer->EmitULEB128IntValue(strlen(Producers.first)); + OutStreamer->EmitBytes(Producers.first); + OutStreamer->EmitULEB128IntValue(Producers.second->size()); + for (auto &Producer : *Producers.second) { + OutStreamer->EmitULEB128IntValue(Producer.first.size()); + OutStreamer->EmitBytes(Producer.first); + OutStreamer->EmitULEB128IntValue(Producer.second.size()); + OutStreamer->EmitBytes(Producer.second); + } + } + OutStreamer->PopSection(); + } +} + +void WebAssemblyAsmPrinter::EmitTargetFeatures(Module &M) { + struct FeatureEntry { + uint8_t Prefix; + StringRef Name; + }; + + // Read target features and linkage policies from module metadata + SmallVector EmittedFeatures; + for (const SubtargetFeatureKV &KV : WebAssemblyFeatureKV) { + std::string MDKey = (StringRef("wasm-feature-") + KV.Key).str(); + Metadata *Policy = M.getModuleFlag(MDKey); + if (Policy == nullptr) + continue; + + FeatureEntry Entry; + Entry.Prefix = 0; + Entry.Name = KV.Key; + + if (auto *MD = cast(Policy)) + if (auto *I = cast(MD->getValue())) + Entry.Prefix = I->getZExtValue(); + + // Silently ignore invalid metadata + if (Entry.Prefix != wasm::WASM_FEATURE_PREFIX_USED && + Entry.Prefix != wasm::WASM_FEATURE_PREFIX_REQUIRED && + Entry.Prefix != wasm::WASM_FEATURE_PREFIX_DISALLOWED) + continue; + + EmittedFeatures.push_back(Entry); + } + + if (EmittedFeatures.size() == 0) + return; + + // Emit features and linkage policies into the "target_features" section + MCSectionWasm *FeaturesSection = OutContext.getWasmSection( + ".custom_section.target_features", SectionKind::getMetadata()); + OutStreamer->PushSection(); + OutStreamer->SwitchSection(FeaturesSection); + + 
OutStreamer->EmitULEB128IntValue(EmittedFeatures.size()); + for (auto &F : EmittedFeatures) { + OutStreamer->EmitIntValue(F.Prefix, 1); + OutStreamer->EmitULEB128IntValue(F.Name.size()); + OutStreamer->EmitBytes(F.Name); + } + + OutStreamer->PopSection(); } void WebAssemblyAsmPrinter::EmitConstantPool() { @@ -161,8 +280,8 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { const Function &F = MF->getFunction(); SmallVector ResultVTs; SmallVector ParamVTs; - ComputeSignatureVTs(F.getFunctionType(), F, TM, ParamVTs, ResultVTs); - auto Signature = SignatureFromMVTs(ResultVTs, ParamVTs); + computeSignatureVTs(F.getFunctionType(), F, TM, ParamVTs, ResultVTs); + auto Signature = signatureFromMVTs(ResultVTs, ParamVTs); auto *WasmSym = cast<MCSymbolWasm>(CurrentFnSym); WasmSym->setSignature(Signature.get()); addSignature(std::move(Signature)); @@ -180,7 +299,7 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { } SmallVector Locals; - ValTypesFromMVTs(MFI->getLocals(), Locals); + valTypesFromMVTs(MFI->getLocals(), Locals); getTargetStreamer()->emitLocal(Locals); AsmPrinter::EmitFunctionBodyStart(); @@ -250,34 +369,34 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutStreamer->AddBlankLine(); } break; + case WebAssembly::COMPILER_FENCE: + // This is a compiler barrier that prevents instruction reordering during + // backend compilation, and should not be emitted. + break; + case WebAssembly::EXTRACT_EXCEPTION_I32: + case WebAssembly::EXTRACT_EXCEPTION_I32_S: + // These are pseudo instructions that simulate popping values from the + // stack. We print these only when we have -wasm-keep-registers on for + // assembly readability. + if (!WasmKeepRegisters) + break; + LLVM_FALLTHROUGH; default: { WebAssemblyMCInstLower MCInstLowering(OutContext, *this); MCInst TmpInst; - MCInstLowering.Lower(MI, TmpInst); + MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); break; } } } -const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) { - if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) - if (GV->getValueType()->isFunctionTy()) { - return MCSymbolRefExpr::create( - getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext); - } - return AsmPrinter::lowerConstant(CV); -} - bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI, - unsigned OpNo, unsigned AsmVariant, + unsigned OpNo, const char *ExtraCode, raw_ostream &OS) { - if (AsmVariant != 0) - report_fatal_error("There are no defined alternate asm variants"); - // First try the generic code, which knows about modifiers like 'c' and 'n'. 
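Both custom sections written above ("producers" and "target_features") are nothing but ULEB128-length-prefixed byte strings inside a .custom_section.* wasm section. A standalone sketch of the byte layout, with no LLVM dependency; the tool, version, and feature names are illustrative only, and the one-byte prefixes match the WASM_FEATURE_PREFIX_* values used above:

    #include <cstdint>
    #include <string>
    #include <vector>

    // Minimal ULEB128 encoder, producing the same bytes as
    // MCStreamer::EmitULEB128IntValue in the code above.
    static void encodeULEB128(uint64_t V, std::vector<uint8_t> &Out) {
      do {
        uint8_t B = V & 0x7f;
        V >>= 7;
        if (V)
          B |= 0x80; // more bytes follow
        Out.push_back(B);
      } while (V);
    }

    static void emitString(const std::string &S, std::vector<uint8_t> &Out) {
      encodeULEB128(S.size(), Out);          // length prefix
      Out.insert(Out.end(), S.begin(), S.end());
    }

    // Body of a "producers" section: field count, then per field a name,
    // an entry count, and (name, version) string pairs.
    std::vector<uint8_t> makeProducersBody() {
      std::vector<uint8_t> Out;
      encodeULEB128(1, Out);            // one field
      emitString("processed-by", Out);  // field name
      encodeULEB128(1, Out);            // one (name, version) pair
      emitString("clang", Out);         // producer name (illustrative)
      emitString("9.0.0", Out);         // producer version (illustrative)
      return Out;
    }

    // Body of a "target_features" section: entry count, then per feature a
    // one-byte prefix ('+' used, '=' required, '-' disallowed) and a name.
    std::vector<uint8_t> makeTargetFeaturesBody() {
      std::vector<uint8_t> Out;
      encodeULEB128(2, Out);
      Out.push_back('+');               // WASM_FEATURE_PREFIX_USED
      emitString("atomics", Out);
      Out.push_back('+');
      emitString("sign-ext", Out);
      return Out;
    }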
- if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS)) + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS)) return false; if (!ExtraCode) { @@ -293,8 +412,7 @@ bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI, OS << regToString(MO); return false; case MachineOperand::MO_GlobalAddress: - getSymbol(MO.getGlobal())->print(OS, MAI); - printOffset(MO.getOffset(), OS); + PrintSymbolOperand(MO, OS); return false; case MachineOperand::MO_ExternalSymbol: GetExternalSymbolSymbol(MO.getSymbolName())->print(OS, MAI); @@ -313,19 +431,15 @@ bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI, bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) { - if (AsmVariant != 0) - report_fatal_error("There are no defined alternate asm variants"); - // The current approach to inline asm is that "r" constraints are expressed // as local indices, rather than values on the operand stack. This simplifies // using "r" as it eliminates the need to push and pop the values in a // particular order, however it also makes it impossible to have an "m" // constraint. So we don't support it. - return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS); + return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, ExtraCode, OS); } // Force static initialization. diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h index f6cb5610bad3..4e55c81dec38 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h @@ -1,9 +1,8 @@ // WebAssemblyAsmPrinter.h - WebAssembly implementation of AsmPrinter-*- C++ -*- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -59,17 +58,16 @@ public: //===------------------------------------------------------------------===// void EmitEndOfAsmFile(Module &M) override; + void EmitProducerInfo(Module &M); + void EmitTargetFeatures(Module &M); void EmitJumpTableInfo() override; void EmitConstantPool() override; void EmitFunctionBodyStart() override; void EmitInstruction(const MachineInstr *MI) override; - const MCExpr *lowerConstant(const Constant *CV) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; MVT getRegType(unsigned RegNo) const; std::string regToString(const MachineOperand &MO); diff --git a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp index fc827e9d5780..4c5d0192fc28 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyCFGSort.cpp - CFG Sorting ------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -35,6 +34,14 @@ using namespace llvm; #define DEBUG_TYPE "wasm-cfg-sort" +// Option to disable EH pad first sorting. Only for testing unwind destination +// mismatches in CFGStackify. +static cl::opt WasmDisableEHPadSort( + "wasm-disable-ehpad-sort", cl::ReallyHidden, + cl::desc( + "WebAssembly: Disable EH pad-first sort order. 
Testing purpose only."), + cl::init(false)); + namespace { // Wrapper for loops and exceptions @@ -133,7 +140,7 @@ FunctionPass *llvm::createWebAssemblyCFGSort() { return new WebAssemblyCFGSort(); } -static void MaybeUpdateTerminator(MachineBasicBlock *MBB) { +static void maybeUpdateTerminator(MachineBasicBlock *MBB) { #ifndef NDEBUG bool AnyBarrier = false; #endif @@ -188,10 +195,12 @@ namespace { struct CompareBlockNumbers { bool operator()(const MachineBasicBlock *A, const MachineBasicBlock *B) const { - if (A->isEHPad() && !B->isEHPad()) - return false; - if (!A->isEHPad() && B->isEHPad()) - return true; + if (!WasmDisableEHPadSort) { + if (A->isEHPad() && !B->isEHPad()) + return false; + if (!A->isEHPad() && B->isEHPad()) + return true; + } return A->getNumber() > B->getNumber(); } @@ -200,11 +209,12 @@ struct CompareBlockNumbers { struct CompareBlockNumbersBackwards { bool operator()(const MachineBasicBlock *A, const MachineBasicBlock *B) const { - // We give a higher priority to an EH pad - if (A->isEHPad() && !B->isEHPad()) - return false; - if (!A->isEHPad() && B->isEHPad()) - return true; + if (!WasmDisableEHPadSort) { + if (A->isEHPad() && !B->isEHPad()) + return false; + if (!A->isEHPad() && B->isEHPad()) + return true; + } return A->getNumber() < B->getNumber(); } @@ -228,7 +238,7 @@ struct Entry { /// interrupted by blocks not dominated by their header. /// TODO: There are many opportunities for improving the heuristics here. /// Explore them. -static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, +static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, const WebAssemblyExceptionInfo &WEI, const MachineDominatorTree &MDT) { // Prepare for a topological sort: Record the number of predecessors each @@ -260,10 +270,10 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, CompareBlockNumbersBackwards> Ready; - RegionInfo SUI(MLI, WEI); + RegionInfo RI(MLI, WEI); SmallVector Entries; for (MachineBasicBlock *MBB = &MF.front();;) { - const Region *R = SUI.getRegionFor(MBB); + const Region *R = RI.getRegionFor(MBB); if (R) { // If MBB is a region header, add it to the active region list. We can't // put any blocks that it doesn't dominate until we see the end of the @@ -320,7 +330,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, if (!Next) { // If there are no more blocks to process, we're done. if (Ready.empty()) { - MaybeUpdateTerminator(MBB); + maybeUpdateTerminator(MBB); break; } for (;;) { @@ -338,7 +348,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, } // Move the next block into place and iterate. 
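The comparators above drive the ready lists inside sortBlocks: among blocks whose predecessors have all been placed, EH pads are preferred (unless -wasm-disable-ehpad-sort is given), and ties fall back to the original block numbering. A standalone model of that ordering on a plain DAG; it deliberately ignores the loop/exception-region nesting and back-edge handling the real pass layers on top via MachineLoopInfo and WebAssemblyExceptionInfo:

    #include <cstddef>
    #include <queue>
    #include <vector>

    struct Block { int Number; bool IsEHPad; };

    struct CompareBlocks {
      bool operator()(const Block &A, const Block &B) const {
        if (A.IsEHPad != B.IsEHPad)
          return !A.IsEHPad; // non-EH-pad sorts "less", so EH pads surface first
        return A.Number > B.Number; // otherwise prefer lower original numbers
      }
    };

    // Kahn's algorithm over an adjacency list of successors; returns block
    // numbers in emission order (assumes an acyclic graph).
    std::vector<int> topoSort(const std::vector<std::vector<int>> &Succs,
                              const std::vector<bool> &IsEHPad) {
      size_t N = Succs.size();
      std::vector<int> NumPreds(N, 0);
      for (auto &S : Succs)
        for (int T : S)
          ++NumPreds[T];
      std::priority_queue<Block, std::vector<Block>, CompareBlocks> Ready;
      for (size_t I = 0; I < N; ++I)
        if (NumPreds[I] == 0)
          Ready.push({(int)I, IsEHPad[I]});
      std::vector<int> Order;
      while (!Ready.empty()) {
        Block B = Ready.top();
        Ready.pop();
        Order.push_back(B.Number);
        for (int T : Succs[B.Number])
          if (--NumPreds[T] == 0) // all predecessors placed: block is ready
            Ready.push({T, IsEHPad[T]});
      }
      return Order;
    }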
Next->moveAfter(MBB); - MaybeUpdateTerminator(MBB); + maybeUpdateTerminator(MBB); MBB = Next; } assert(Entries.empty() && "Active sort region list not finished"); @@ -354,7 +364,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, for (auto &MBB : MF) { assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative."); - const Region *Region = SUI.getRegionFor(&MBB); + const Region *Region = RI.getRegionFor(&MBB); if (Region && &MBB == Region->getHeader()) { if (Region->isLoop()) { @@ -379,7 +389,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, for (auto Pred : MBB.predecessors()) assert(Pred->getNumber() < MBB.getNumber() && "Non-loop-header predecessors should be topologically sorted"); - assert(OnStack.count(SUI.getRegionFor(&MBB)) && + assert(OnStack.count(RI.getRegionFor(&MBB)) && "Blocks must be nested in their regions"); } while (OnStack.size() > 1 && &MBB == WebAssembly::getBottom(OnStack.back())) @@ -404,7 +414,7 @@ bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) { MF.getRegInfo().invalidateLiveness(); // Sort the blocks, with contiguous sort regions. - SortBlocks(MF, MLI, WEI, MDT); + sortBlocks(MF, MLI, WEI, MDT); return true; } diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index f8f5f4040c86..e6bfc5226e2e 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyCFGStackify.cpp - CFG Stackification -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -22,26 +21,21 @@ /// //===----------------------------------------------------------------------===// -#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" #include "WebAssemblyExceptionInfo.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblySubtarget.h" #include "WebAssemblyUtilities.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/WasmEHFuncInfo.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "wasm-cfg-stackify" +STATISTIC(NumUnwindMismatches, "Number of EH pad unwind mismatches found"); + namespace { class WebAssemblyCFGStackify final : public MachineFunctionPass { StringRef getPassName() const override { return "WebAssembly CFG Stackify"; } @@ -60,10 +54,13 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { // over scoped regions when walking blocks. SmallVector ScopeTops; + // Placing markers. 
void placeMarkers(MachineFunction &MF); void placeBlockMarker(MachineBasicBlock &MBB); void placeLoopMarker(MachineBasicBlock &MBB); void placeTryMarker(MachineBasicBlock &MBB); + void removeUnnecessaryInstrs(MachineFunction &MF); + bool fixUnwindMismatches(MachineFunction &MF); void rewriteDepthImmediates(MachineFunction &MF); void fixEndsAtEndOfFunction(MachineFunction &MF); @@ -75,16 +72,28 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { DenseMap TryToEHPad; // map DenseMap EHPadToTry; - // map - DenseMap BeginToBottom; - // Helper functions to register scope information created by marker - // instructions. + // There can be an appendix block at the end of each function, shared for: + // - creating a correct signature for fallthrough returns + // - target for rethrows that need to unwind to the caller, but are trapped + // inside another try/catch + MachineBasicBlock *AppendixBB = nullptr; + MachineBasicBlock *getAppendixBlock(MachineFunction &MF) { + if (!AppendixBB) { + AppendixBB = MF.CreateMachineBasicBlock(); + // Give it a fake predecessor so that AsmPrinter prints its label. + AppendixBB->addSuccessor(AppendixBB); + MF.push_back(AppendixBB); + } + return AppendixBB; + } + + // Helper functions to register / unregister scope information created by + // marker instructions. void registerScope(MachineInstr *Begin, MachineInstr *End); void registerTryScope(MachineInstr *Begin, MachineInstr *End, MachineBasicBlock *EHPad); - - MachineBasicBlock *getBottom(const MachineInstr *Begin); + void unregisterScope(MachineInstr *Begin); public: static char ID; // Pass identification, replacement for typeid @@ -96,7 +105,7 @@ public: char WebAssemblyCFGStackify::ID = 0; INITIALIZE_PASS(WebAssemblyCFGStackify, DEBUG_TYPE, - "Insert BLOCK and LOOP markers for WebAssembly scopes", false, + "Insert BLOCK/LOOP/TRY markers for WebAssembly scopes", false, false) FunctionPass *llvm::createWebAssemblyCFGStackify() { @@ -108,14 +117,12 @@ FunctionPass *llvm::createWebAssemblyCFGStackify() { /// code) for a branch instruction to both branch to a block and fallthrough /// to it, so we check the actual branch operands to see if there are any /// explicit mentions. -static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, +static bool explicitlyBranchesTo(MachineBasicBlock *Pred, MachineBasicBlock *MBB) { for (MachineInstr &MI : Pred->terminators()) - // Even if a rethrow takes a BB argument, it is not a branch - if (!WebAssembly::isRethrow(MI)) - for (MachineOperand &MO : MI.explicit_operands()) - if (MO.isMBB() && MO.getMBB() == MBB) - return true; + for (MachineOperand &MO : MI.explicit_operands()) + if (MO.isMBB() && MO.getMBB() == MBB) + return true; return false; } @@ -125,7 +132,7 @@ static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, // ones that should go after the marker. In this function, AfterSet is only // used for sanity checking. static MachineBasicBlock::iterator -GetEarliestInsertPos(MachineBasicBlock *MBB, +getEarliestInsertPos(MachineBasicBlock *MBB, const SmallPtrSet &BeforeSet, const SmallPtrSet &AfterSet) { auto InsertPos = MBB->end(); @@ -149,7 +156,7 @@ GetEarliestInsertPos(MachineBasicBlock *MBB, // ones that should go after the marker. In this function, BeforeSet is only // used for sanity checking. 
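The two position helpers introduced here pick an insertion point inside a block given one set of instructions that must stay before the marker and one that must stay after it. A standalone model with instructions reduced to ints: the "latest" variant (used when adding a BLOCK or TRY in the header) stops just before the first must-follow instruction, while the "earliest" variant (used for the END_* markers) stops just after the last must-precede instruction:

    #include <iterator>
    #include <set>
    #include <vector>

    using Iter = std::vector<int>::iterator;

    // Earliest legal point: walk backward from the end and stop just after
    // the last instruction that is required to precede the marker.
    Iter earliestInsertPos(std::vector<int> &BB, const std::set<int> &Before) {
      auto Pos = BB.end();
      while (Pos != BB.begin() && !Before.count(*std::prev(Pos)))
        --Pos;
      return Pos;
    }

    // Latest legal point: walk forward from the beginning and stop just
    // before the first instruction that is required to follow the marker.
    Iter latestInsertPos(std::vector<int> &BB, const std::set<int> &After) {
      auto Pos = BB.begin();
      while (Pos != BB.end() && !After.count(*Pos))
        ++Pos;
      return Pos;
    }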
static MachineBasicBlock::iterator -GetLatestInsertPos(MachineBasicBlock *MBB, +getLatestInsertPos(MachineBasicBlock *MBB, const SmallPtrSet &BeforeSet, const SmallPtrSet &AfterSet) { auto InsertPos = MBB->begin(); @@ -181,33 +188,25 @@ void WebAssemblyCFGStackify::registerTryScope(MachineInstr *Begin, EHPadToTry[EHPad] = Begin; } -// Given a LOOP/TRY marker, returns its bottom BB. Use cached information if any -// to prevent recomputation. -MachineBasicBlock * -WebAssemblyCFGStackify::getBottom(const MachineInstr *Begin) { - const auto &MLI = getAnalysis(); - const auto &WEI = getAnalysis(); - if (BeginToBottom.count(Begin)) - return BeginToBottom[Begin]; - if (Begin->getOpcode() == WebAssembly::LOOP) { - MachineLoop *L = MLI.getLoopFor(Begin->getParent()); - assert(L); - BeginToBottom[Begin] = WebAssembly::getBottom(L); - } else if (Begin->getOpcode() == WebAssembly::TRY) { - WebAssemblyException *WE = WEI.getExceptionFor(TryToEHPad[Begin]); - assert(WE); - BeginToBottom[Begin] = WebAssembly::getBottom(WE); - } else - assert(false); - return BeginToBottom[Begin]; +void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) { + assert(BeginToEnd.count(Begin)); + MachineInstr *End = BeginToEnd[Begin]; + assert(EndToBegin.count(End)); + BeginToEnd.erase(Begin); + EndToBegin.erase(End); + MachineBasicBlock *EHPad = TryToEHPad.lookup(Begin); + if (EHPad) { + assert(EHPadToTry.count(EHPad)); + TryToEHPad.erase(Begin); + EHPadToTry.erase(EHPad); + } } /// Insert a BLOCK marker for branches to MBB (if needed). +// TODO Consider a more generalized way of handling block (and also loop and +// try) signatures when we implement the multi-value proposal later. void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { - // This should have been handled in placeTryMarker. - if (MBB.isEHPad()) - return; - + assert(!MBB.isEHPad()); MachineFunction &MF = *MBB.getParent(); auto &MDT = getAnalysis(); const auto &TII = *MF.getSubtarget().getInstrInfo(); @@ -218,12 +217,20 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { // which reduces overall stack height. MachineBasicBlock *Header = nullptr; bool IsBranchedTo = false; + bool IsBrOnExn = false; + MachineInstr *BrOnExn = nullptr; int MBBNumber = MBB.getNumber(); for (MachineBasicBlock *Pred : MBB.predecessors()) { if (Pred->getNumber() < MBBNumber) { Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred; - if (ExplicitlyBranchesTo(Pred, &MBB)) + if (explicitlyBranchesTo(Pred, &MBB)) { IsBranchedTo = true; + if (Pred->getFirstTerminator()->getOpcode() == WebAssembly::BR_ON_EXN) { + IsBrOnExn = true; + assert(!BrOnExn && "There should be only one br_on_exn per block"); + BrOnExn = &*Pred->getFirstTerminator(); + } + } } } if (!Header) @@ -232,7 +239,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { return; assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors"); - MachineBasicBlock *LayoutPred = &*std::prev(MachineFunction::iterator(&MBB)); + MachineBasicBlock *LayoutPred = MBB.getPrevNode(); // If the nearest common dominator is inside a more deeply nested context, // walk out to the nearest scope which isn't more deeply nested. @@ -240,7 +247,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) { if (ScopeTop->getNumber() > Header->getNumber()) { // Skip over an intervening scope. 
- I = std::next(MachineFunction::iterator(ScopeTop)); + I = std::next(ScopeTop->getIterator()); } else { // We found a scope level at an appropriate depth. Header = ScopeTop; @@ -256,13 +263,12 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { // Instructions that should go after the BLOCK. SmallPtrSet<const MachineInstr *, 4> AfterSet; for (const auto &MI : *Header) { - // If there is a previously placed LOOP/TRY marker and the bottom block of - // the loop/exception is above MBB, it should be after the BLOCK, because - // the loop/exception is nested in this block. Otherwise it should be before - // the BLOCK. - if (MI.getOpcode() == WebAssembly::LOOP || - MI.getOpcode() == WebAssembly::TRY) { - if (MBB.getNumber() > getBottom(&MI)->getNumber()) + // If there is a previously placed LOOP marker and the bottom block of the + // loop is above MBB, it should be after the BLOCK, because the loop is + // nested in this BLOCK. Otherwise it should be before the BLOCK. + if (MI.getOpcode() == WebAssembly::LOOP) { + auto *LoopBottom = BeginToEnd[&MI]->getParent()->getPrevNode(); + if (MBB.getNumber() > LoopBottom->getNumber()) AfterSet.insert(&MI); #ifndef NDEBUG else @@ -270,9 +276,10 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { #endif } - // All previously inserted BLOCK markers should be after the BLOCK because - // they are all nested blocks. - if (MI.getOpcode() == WebAssembly::BLOCK) + // All previously inserted BLOCK/TRY markers should be after the BLOCK + // because they are all nested blocks. + if (MI.getOpcode() == WebAssembly::BLOCK || + MI.getOpcode() == WebAssembly::TRY) AfterSet.insert(&MI); #ifndef NDEBUG @@ -300,11 +307,27 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { } // Add the BLOCK. + + // 'br_on_exn' extracts an exnref object and pushes a variable number of + // values depending on its tag. For a C++ exception, it's a single i32 + // value, and the generated code will be in the form of: + // block i32 + // br_on_exn 0, $__cpp_exception + // rethrow + // end_block + WebAssembly::ExprType ReturnType = WebAssembly::ExprType::Void; + if (IsBrOnExn) { + const char *TagName = BrOnExn->getOperand(1).getSymbolName(); + if (std::strcmp(TagName, "__cpp_exception") != 0) + llvm_unreachable("Only C++ exception is supported"); + ReturnType = WebAssembly::ExprType::I32; + } + + auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet); MachineInstr *Begin = BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos), TII.get(WebAssembly::BLOCK)) - .addImm(int64_t(WebAssembly::ExprType::Void)); + .addImm(int64_t(ReturnType)); // Decide where in Header to put the END_BLOCK. BeforeSet.clear(); @@ -333,7 +356,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { } // Mark the end of the block. - InsertPos = GetEarliestInsertPos(&MBB, BeforeSet, AfterSet); + InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet); MachineInstr *End = BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos), TII.get(WebAssembly::END_BLOCK)); registerScope(Begin, End); @@ -358,13 +381,10 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { // The operand of a LOOP is the first block after the loop. If the loop is the // bottom of the function, insert a dummy block at the end. 
MachineBasicBlock *Bottom = WebAssembly::getBottom(Loop); - auto Iter = std::next(MachineFunction::iterator(Bottom)); + auto Iter = std::next(Bottom->getIterator()); if (Iter == MF.end()) { - MachineBasicBlock *Label = MF.CreateMachineBasicBlock(); - // Give it a fake predecessor so that AsmPrinter prints its label. - Label->addSuccessor(Label); - MF.push_back(Label); - Iter = std::next(MachineFunction::iterator(Bottom)); + getAppendixBlock(MF); + Iter = std::next(Bottom->getIterator()); } MachineBasicBlock *AfterLoop = &*Iter; @@ -383,7 +403,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { } // Mark the beginning of the loop. - auto InsertPos = GetEarliestInsertPos(&MBB, BeforeSet, AfterSet); + auto InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet); MachineInstr *Begin = BuildMI(MBB, InsertPos, MBB.findDebugLoc(InsertPos), TII.get(WebAssembly::LOOP)) .addImm(int64_t(WebAssembly::ExprType::Void)); @@ -400,8 +420,10 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { // Mark the end of the loop (using arbitrary debug location that branched to // the loop end as its location). - InsertPos = GetEarliestInsertPos(AfterLoop, BeforeSet, AfterSet); - DebugLoc EndDL = (*AfterLoop->pred_rbegin())->findBranchDebugLoc(); + InsertPos = getEarliestInsertPos(AfterLoop, BeforeSet, AfterSet); + DebugLoc EndDL = AfterLoop->pred_empty() + ? DebugLoc() + : (*AfterLoop->pred_rbegin())->findBranchDebugLoc(); MachineInstr *End = BuildMI(*AfterLoop, InsertPos, EndDL, TII.get(WebAssembly::END_LOOP)); registerScope(Begin, End); @@ -414,14 +436,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { } void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { - if (!MBB.isEHPad()) - return; - - // catch_all terminate pad is grouped together with catch terminate pad and - // does not need a separate TRY and END_TRY marker. - if (WebAssembly::isCatchAllTerminatePad(MBB)) - return; - + assert(MBB.isEHPad()); MachineFunction &MF = *MBB.getParent(); auto &MDT = getAnalysis(); const auto &TII = *MF.getSubtarget().getInstrInfo(); @@ -434,7 +449,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { for (auto *Pred : MBB.predecessors()) { if (Pred->getNumber() < MBBNumber) { Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred; - assert(!ExplicitlyBranchesTo(Pred, &MBB) && + assert(!explicitlyBranchesTo(Pred, &MBB) && "Explicit branch to an EH pad!"); } } @@ -447,19 +462,15 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { assert(WE); MachineBasicBlock *Bottom = WebAssembly::getBottom(WE); - auto Iter = std::next(MachineFunction::iterator(Bottom)); + auto Iter = std::next(Bottom->getIterator()); if (Iter == MF.end()) { - MachineBasicBlock *Label = MF.CreateMachineBasicBlock(); - // Give it a fake predecessor so that AsmPrinter prints its label. - Label->addSuccessor(Label); - MF.push_back(Label); - Iter = std::next(MachineFunction::iterator(Bottom)); + getAppendixBlock(MF); + Iter = std::next(Bottom->getIterator()); } - MachineBasicBlock *AfterTry = &*Iter; + MachineBasicBlock *Cont = &*Iter; - assert(AfterTry != &MF.front()); - MachineBasicBlock *LayoutPred = - &*std::prev(MachineFunction::iterator(AfterTry)); + assert(Cont != &MF.front()); + MachineBasicBlock *LayoutPred = Cont->getPrevNode(); // If the nearest common dominator is inside a more deeply nested context, // walk out to the nearest scope which isn't more deeply nested. 
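Both placeLoopMarker above and placeTryMarker here need a block after the region bottom to host the END_LOOP/END_TRY marker, so when the bottom is the layout-last block they lazily create one shared trailer via getAppendixBlock. A standalone model of that lazy, create-once behavior, with blocks reduced to numbers:

    #include <vector>

    struct Fn {
      std::vector<int> Blocks;   // block numbers in layout order
      int Appendix = -1;         // -1 = not created yet
      int getAppendixBlock() {
        if (Appendix < 0) {
          Appendix = Blocks.empty() ? 0 : Blocks.back() + 1;
          Blocks.push_back(Appendix); // created once, shared by all callers
        }
        return Appendix;
      }
    };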
@@ -467,7 +478,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) { if (ScopeTop->getNumber() > Header->getNumber()) { // Skip over an intervening scope. - I = std::next(MachineFunction::iterator(ScopeTop)); + I = std::next(ScopeTop->getIterator()); } else { // We found a scope level at an appropriate depth. Header = ScopeTop; @@ -478,16 +489,17 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { // Decide where in Header to put the TRY. - // Instructions that should go before the BLOCK. + // Instructions that should go before the TRY. SmallPtrSet BeforeSet; - // Instructions that should go after the BLOCK. + // Instructions that should go after the TRY. SmallPtrSet AfterSet; for (const auto &MI : *Header) { - // If there is a previously placed LOOP marker and the bottom block of - // the loop is above MBB, the LOOP should be after the TRY, because the - // loop is nested in this try. Otherwise it should be before the TRY. + // If there is a previously placed LOOP marker and the bottom block of the + // loop is above MBB, it should be after the TRY, because the loop is nested + // in this TRY. Otherwise it should be before the TRY. if (MI.getOpcode() == WebAssembly::LOOP) { - if (MBB.getNumber() > Bottom->getNumber()) + auto *LoopBottom = BeginToEnd[&MI]->getParent()->getPrevNode(); + if (MBB.getNumber() > LoopBottom->getNumber()) AfterSet.insert(&MI); #ifndef NDEBUG else @@ -495,14 +507,16 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { #endif } - // All previously inserted TRY markers should be after the TRY because they - // are all nested trys. - if (MI.getOpcode() == WebAssembly::TRY) + // All previously inserted BLOCK/TRY markers should be after the TRY because + // they are all nested trys. + if (MI.getOpcode() == WebAssembly::BLOCK || + MI.getOpcode() == WebAssembly::TRY) AfterSet.insert(&MI); #ifndef NDEBUG - // All END_(LOOP/TRY) markers should be before the TRY. - if (MI.getOpcode() == WebAssembly::END_LOOP || + // All END_(BLOCK/LOOP/TRY) markers should be before the TRY. + if (MI.getOpcode() == WebAssembly::END_BLOCK || + MI.getOpcode() == WebAssembly::END_LOOP || MI.getOpcode() == WebAssembly::END_TRY) BeforeSet.insert(&MI); #endif @@ -530,10 +544,16 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { // throw. if (MBB.isPredecessor(Header)) { auto TermPos = Header->getFirstTerminator(); - if (TermPos == Header->end() || !WebAssembly::isRethrow(*TermPos)) { + if (TermPos == Header->end() || + TermPos->getOpcode() != WebAssembly::RETHROW) { for (const auto &MI : reverse(*Header)) { if (MI.isCall()) { AfterSet.insert(&MI); + // Possibly throwing calls are usually wrapped by EH_LABEL + // instructions. We don't want to split them and the call. + if (MI.getIterator() != Header->begin() && + std::prev(MI.getIterator())->isEHLabel()) + AfterSet.insert(&*std::prev(MI.getIterator())); break; } } @@ -541,7 +561,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { } // Add the TRY. - auto InsertPos = GetLatestInsertPos(Header, BeforeSet, AfterSet); + auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet); MachineInstr *Begin = BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos), TII.get(WebAssembly::TRY)) @@ -550,10 +570,11 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { // Decide where in Header to put the END_TRY. 
BeforeSet.clear(); AfterSet.clear(); - for (const auto &MI : *AfterTry) { + for (const auto &MI : *Cont) { #ifndef NDEBUG - // END_TRY should precede existing LOOP markers. - if (MI.getOpcode() == WebAssembly::LOOP) + // END_TRY should precede existing LOOP and BLOCK markers. + if (MI.getOpcode() == WebAssembly::LOOP || + MI.getOpcode() == WebAssembly::BLOCK) AfterSet.insert(&MI); // All END_TRY markers placed earlier belong to exceptions that contain @@ -567,31 +588,595 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { // the END_TRY marker should go after that. Otherwise, the whole try-catch // is contained within this loop, so the END_TRY should go before that. if (MI.getOpcode() == WebAssembly::END_LOOP) { - if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber()) + // For a LOOP to be after TRY, LOOP's BB should be after TRY's BB; if they + // are in the same BB, LOOP is always before TRY. + if (EndToBegin[&MI]->getParent()->getNumber() > Header->getNumber()) BeforeSet.insert(&MI); #ifndef NDEBUG else AfterSet.insert(&MI); #endif } + + // It is not possible for an END_BLOCK to be already in this block. } // Mark the end of the TRY. - InsertPos = GetEarliestInsertPos(AfterTry, BeforeSet, AfterSet); + InsertPos = getEarliestInsertPos(Cont, BeforeSet, AfterSet); MachineInstr *End = - BuildMI(*AfterTry, InsertPos, Bottom->findBranchDebugLoc(), + BuildMI(*Cont, InsertPos, Bottom->findBranchDebugLoc(), TII.get(WebAssembly::END_TRY)); registerTryScope(Begin, End, &MBB); - // Track the farthest-spanning scope that ends at this point. - int Number = AfterTry->getNumber(); - if (!ScopeTops[Number] || - ScopeTops[Number]->getNumber() > Header->getNumber()) - ScopeTops[Number] = Header; + // Track the farthest-spanning scope that ends at this point. We create two + // mappings: (BB with 'end_try' -> BB with 'try') and (BB with 'catch' -> BB + // with 'try'). We need to create 'catch' -> 'try' mapping here too because + // markers should not span across 'catch'. For example, this should not + // happen: + // + // try + // block --| (X) + // catch | + // end_block --| + // end_try + for (int Number : {Cont->getNumber(), MBB.getNumber()}) { + if (!ScopeTops[Number] || + ScopeTops[Number]->getNumber() > Header->getNumber()) + ScopeTops[Number] = Header; + } +} + +void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) { + const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + + // When there is an unconditional branch right before a catch instruction and + // it branches to the end of the end_try marker, we don't need the branch, + // because if there is no exception, the control flow transfers to that point + // anyway. + // bb0: + // try + // ... + // br bb2 <- Not necessary + // bb1: + // catch + // ... 
+ // bb2: + // end + for (auto &MBB : MF) { + if (!MBB.isEHPad()) + continue; + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector<MachineOperand, 4> Cond; + MachineBasicBlock *EHPadLayoutPred = MBB.getPrevNode(); + MachineBasicBlock *Cont = BeginToEnd[EHPadToTry[&MBB]]->getParent(); + bool Analyzable = !TII.analyzeBranch(*EHPadLayoutPred, TBB, FBB, Cond); + if (Analyzable && ((Cond.empty() && TBB && TBB == Cont) || + (!Cond.empty() && FBB && FBB == Cont))) + TII.removeBranch(*EHPadLayoutPred); + } + + // When there are block / end_block markers that overlap with try / end_try + // markers, and the block and try markers' return types are the same, the + // block / end_block markers are not necessary, because try / end_try markers + // can also serve as boundaries for branches. + // block <- Not necessary + // try + // ... + // catch + // ... + // end + // end <- Not necessary + SmallVector ToDelete; + for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (MI.getOpcode() != WebAssembly::TRY) + continue; + + MachineInstr *Try = &MI, *EndTry = BeginToEnd[Try]; + MachineBasicBlock *TryBB = Try->getParent(); + MachineBasicBlock *Cont = EndTry->getParent(); + int64_t RetType = Try->getOperand(0).getImm(); + for (auto B = Try->getIterator(), E = std::next(EndTry->getIterator()); + B != TryBB->begin() && E != Cont->end() && + std::prev(B)->getOpcode() == WebAssembly::BLOCK && + E->getOpcode() == WebAssembly::END_BLOCK && + std::prev(B)->getOperand(0).getImm() == RetType; + --B, ++E) { + ToDelete.push_back(&*std::prev(B)); + ToDelete.push_back(&*E); + } + } + } + for (auto *MI : ToDelete) { + if (MI->getOpcode() == WebAssembly::BLOCK) + unregisterScope(MI); + MI->eraseFromParent(); + } +} + +bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { + const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Linearizing the control flow by placing TRY / END_TRY markers can create + // mismatches in unwind destinations. There are two kinds of mismatches we + // try to solve here. + + // 1. When an instruction may throw, but the EH pad it will unwind to can be + // different from the original CFG. + // + // Example: we have the following CFG: + // bb0: + // call @foo (if it throws, unwind to bb2) + // bb1: + // call @bar (if it throws, unwind to bb3) + // bb2 (ehpad): + // catch + // ... + // bb3 (ehpad) + // catch + // handler body + // + // And the CFG is sorted in this order. Then after placing TRY markers, it + // will look like: (BB markers are omitted) + // try $label1 + // try + // call @foo + // call @bar (if it throws, unwind to bb3) + // catch <- ehpad (bb2) + // ... + // end_try + // catch <- ehpad (bb3) + // handler body + // end_try + // + // Now if bar() throws, it is going to end up in bb2, not bb3, where it + // is supposed to end up. We solve this problem by + // a. Split the target unwind EH pad (here bb3) so that the handler body is + // right after 'end_try', which means we extract the handler body out of + // the catch block. We do this because this handler body should be + // somewhere branch-able from the inner scope. + // b. Wrap the call that has an incorrect unwind destination ('call @bar' + // here) with a nested try/catch/end_try scope, and within the new catch + // block, branch to the handler body. + // c. Place a branch after the newly inserted nested end_try so it can bypass + // the handler body, which is now outside of a catch block. + // + // The result will look like the following. 
(new: a) means this instruction is newly + // created in the process of doing 'a' above. + // + // block $label0 (new: placeBlockMarker) + // try $label1 + // try + // call @foo + // try (new: b) + // call @bar + // catch (new: b) + // local.set n / drop (new: b) + // br $label1 (new: b) + // end_try (new: b) + // catch <- ehpad (bb2) + // end_try + // br $label0 (new: c) + // catch <- ehpad (bb3) + // end_try (hoisted: a) + // handler body + // end_block (new: placeBlockMarker) + // + // Note that the new wrapping block/end_block will be generated later in + // placeBlockMarker. + // + // TODO Currently local.sets and local.gets are generated to move the exnref + // value created by catches. That's because we don't support yielding values + // from a block in LLVM machine IR yet, even though it is supported by wasm. + // Delete unnecessary local.get/local.sets once yielding values from a block + // is supported. The full EH spec requires multi-value support to do this, + // but for C++ we don't yet need it because we only throw a single i32. + // + // --- + // 2. The same as 1, but in this case an instruction unwinds to a caller + // function and not another EH pad. + // + // Example: we have the following CFG: + // bb0: + // call @foo (if it throws, unwind to bb2) + // bb1: + // call @bar (if it throws, unwind to caller) + // bb2 (ehpad): + // catch + // ... + // + // And the CFG is sorted in this order. Then after placing TRY markers, it + // will look like: + // try + // call @foo + // call @bar (if it throws, unwind to caller) + // catch <- ehpad (bb2) + // ... + // end_try + // + // Now if bar() throws, it is going to end up in bb2, when it is supposed + // to throw up to the caller. + // We solve this problem by + // a. Create a new 'appendix' BB at the end of the function and put a single + // 'rethrow' instruction (+ local.get) in there. + // b. Wrap the call that has an incorrect unwind destination ('call @bar' + // here) with a nested try/catch/end_try scope, and within the new catch + // block, branch to the new appendix block. + // + // block $label0 (new: placeBlockMarker) + // try + // call @foo + // try (new: b) + // call @bar + // catch (new: b) + // local.set n (new: b) + // br $label0 (new: b) + // end_try (new: b) + // catch <- ehpad (bb2) + // ... + // end_try + // ... + // end_block (new: placeBlockMarker) + // local.get n (new: a) <- appendix block + // rethrow (new: a) + // + // In case there are multiple calls in a BB that may throw to the caller, they + // can be wrapped together in one nested try scope. (In 1, this couldn't + // happen, because a may-throwing instruction there had an unwind destination, + // i.e., it was an invoke before, and there could be only one invoke within a + // BB.) + + SmallVector<const MachineBasicBlock *, 8> EHPadStack; + // Range of instructions to be wrapped in a new nested try/catch + using TryRange = std::pair<MachineInstr *, MachineInstr *>; + // In original CFG, <unwind destination BB, a vector of try ranges> + DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> UnwindDestToTryRanges; + // In new CFG, <destination to branch to, a vector of try ranges> + DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> BrDestToTryRanges; + // In new CFG, <destination to branch to, register containing exnref> + DenseMap<MachineBasicBlock *, unsigned> BrDestToExnReg; + + // Gather possibly throwing calls (i.e., previously invokes) whose current + // unwind destination is not the same as the original CFG. + for (auto &MBB : reverse(MF)) { + bool SeenThrowableInstInBB = false; + for (auto &MI : reverse(MBB)) { + if (MI.getOpcode() == WebAssembly::TRY) + EHPadStack.pop_back(); + else if (MI.getOpcode() == WebAssembly::CATCH) + EHPadStack.push_back(MI.getParent()); + + // In this loop we only gather calls that have an EH pad to unwind. 
So + // there will be at most 1 such call (= invoke) in a BB, so after we've + // seen one, we can skip the rest of the BB. Also if MBB has no EH pad + // successor or MI does not throw, this is not an invoke. + if (SeenThrowableInstInBB || !MBB.hasEHPadSuccessor() || + !WebAssembly::mayThrow(MI)) + continue; + SeenThrowableInstInBB = true; + + // If the EH pad on the stack top is where this instruction should unwind + // next, we're good. + MachineBasicBlock *UnwindDest = nullptr; + for (auto *Succ : MBB.successors()) { + if (Succ->isEHPad()) { + UnwindDest = Succ; + break; + } + } + if (EHPadStack.back() == UnwindDest) + continue; + + // If not, record the range. + UnwindDestToTryRanges[UnwindDest].push_back(TryRange(&MI, &MI)); + } + } + + assert(EHPadStack.empty()); + + // Gather possibly throwing calls that are supposed to unwind up to the caller + // if they throw, but currently unwind to an incorrect destination. Unlike the + // loop above, there can be multiple calls within a BB that unwind to the + // caller, which we should group together in a range. + bool NeedAppendixBlock = false; + for (auto &MBB : reverse(MF)) { + MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr; // inclusive + for (auto &MI : reverse(MBB)) { + if (MI.getOpcode() == WebAssembly::TRY) + EHPadStack.pop_back(); + else if (MI.getOpcode() == WebAssembly::CATCH) + EHPadStack.push_back(MI.getParent()); + + // If MBB has an EH pad successor, this inst does not unwind to the caller. + if (MBB.hasEHPadSuccessor()) + continue; + + // We wrap up the current range when we see a marker even if we haven't + // finished a BB. + if (RangeEnd && WebAssembly::isMarker(MI.getOpcode())) { + NeedAppendixBlock = true; + // Record the range. nullptr here means the unwind destination is the + // caller. + UnwindDestToTryRanges[nullptr].push_back( + TryRange(RangeBegin, RangeEnd)); + RangeBegin = RangeEnd = nullptr; // Reset range pointers + } + + // If EHPadStack is empty, that means it correctly unwinds to the caller + // if it throws, so we're good. If MI does not throw, we're good too. + if (EHPadStack.empty() || !WebAssembly::mayThrow(MI)) + continue; + + // We found an instruction that unwinds to the caller but currently has an + // incorrect unwind destination. Create a new range or extend the + // currently existing one. + if (!RangeEnd) + RangeBegin = RangeEnd = &MI; + else + RangeBegin = &MI; + } + + if (RangeEnd) { + NeedAppendixBlock = true; + // Record the range. nullptr here means the unwind destination is the + // caller. + UnwindDestToTryRanges[nullptr].push_back(TryRange(RangeBegin, RangeEnd)); + RangeBegin = RangeEnd = nullptr; // Reset range pointers + } + } + + assert(EHPadStack.empty()); + // We don't have any unwind destination mismatches to resolve. + if (UnwindDestToTryRanges.empty()) + return false; + + // If we found instructions that should unwind to the caller but currently + // have an incorrect unwind destination, we create an appendix block at the + // end of the function with a local.get and a rethrow instruction. + if (NeedAppendixBlock) { + auto *AppendixBB = getAppendixBlock(MF); + unsigned ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); + BuildMI(AppendixBB, DebugLoc(), TII.get(WebAssembly::RETHROW)) + .addReg(ExnReg); + // These instruction ranges should branch to this appendix BB. 
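The two gathering loops above share one idea: scan the linearized instruction stream in reverse with a stack, where CATCH opens the region it guards and TRY closes it, so the stack top at any call is the EH pad that call would actually unwind to. A standalone model that flags calls whose actual destination differs from the intended one, with -1 standing for "unwind to the caller":

    #include <vector>

    enum class Op { Try, Catch, Call };
    struct Inst {
      Op Kind;
      int EHPad; // for Catch: its block; for Call: intended unwind dest
    };           // (-1 when a call should unwind to the caller)

    std::vector<int> findMismatchedCalls(const std::vector<Inst> &Insts) {
      std::vector<int> EHPadStack;
      std::vector<int> Mismatched; // indices of calls needing a nested try
      for (int I = (int)Insts.size() - 1; I >= 0; --I) {
        const Inst &MI = Insts[I];
        if (MI.Kind == Op::Try)
          EHPadStack.pop_back();   // in reverse, TRY closes its region
        else if (MI.Kind == Op::Catch)
          EHPadStack.push_back(MI.EHPad); // in reverse, CATCH opens it
        else { // Call: compare actual vs. intended unwind destination
          int Actual = EHPadStack.empty() ? -1 : EHPadStack.back();
          if (Actual != MI.EHPad)
            Mismatched.push_back(I);
        }
      }
      return Mismatched;
    }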
+ for (auto Range : UnwindDestToTryRanges[nullptr]) + BrDestToTryRanges[AppendixBB].push_back(Range); + BrDestToExnReg[AppendixBB] = ExnReg; + } + + // We loop through unwind destination EH pads that are targeted from some + // inner scopes. Because these EH pads are the destination of more than one + // scope now, we split them so that the handler body is after 'end_try'. + // - Before + // ehpad: + // catch + // local.set n / drop + // handler body + // ... + // cont: + // end_try + // + // - After + // ehpad: + // catch + // local.set n / drop + // brdest: (new) + // end_try (hoisted from 'cont' BB) + // handler body (taken from 'ehpad') + // ... + // cont: + for (auto &P : UnwindDestToTryRanges) { + NumUnwindMismatches++; + + // This means the destination is the appendix BB, which was separately + // handled above. + if (!P.first) + continue; + + MachineBasicBlock *EHPad = P.first; + + // Find the 'catch' instruction and the 'local.set' or 'drop' that follows + // it. If -wasm-disable-explicit-locals is not set, 'catch' should always + // be followed by either 'local.set' or a 'drop', because 'br_on_exn' is + // generated after 'catch' in LateEHPrepare and we don't support blocks + // taking values yet. + MachineInstr *Catch = nullptr; + unsigned ExnReg = 0; + for (auto &MI : *EHPad) { + switch (MI.getOpcode()) { + case WebAssembly::CATCH: + Catch = &MI; + ExnReg = Catch->getOperand(0).getReg(); + break; + } + } + assert(Catch && "EH pad does not have a catch"); + assert(ExnReg != 0 && "Invalid register"); + + auto SplitPos = std::next(Catch->getIterator()); + + // Create a new BB that's going to be the destination for branches from the + // inner mismatched scope. + MachineInstr *BeginTry = EHPadToTry[EHPad]; + MachineInstr *EndTry = BeginToEnd[BeginTry]; + MachineBasicBlock *Cont = EndTry->getParent(); + auto *BrDest = MF.CreateMachineBasicBlock(); + MF.insert(std::next(EHPad->getIterator()), BrDest); + // Hoist up the existing 'end_try'. + BrDest->insert(BrDest->end(), EndTry->removeFromParent()); + // Take out the handler body from EH pad to the new branch destination BB. + BrDest->splice(BrDest->end(), EHPad, SplitPos, EHPad->end()); + // Fix predecessor-successor relationship. + BrDest->transferSuccessors(EHPad); + EHPad->addSuccessor(BrDest); + + // All try ranges that were supposed to unwind to this EH pad now have to + // branch to this new branch dest BB. + for (auto Range : UnwindDestToTryRanges[EHPad]) + BrDestToTryRanges[BrDest].push_back(Range); + BrDestToExnReg[BrDest] = ExnReg; + + // In case we fall through to the continuation BB after the catch block, we + // now have to add a branch to it. + // - Before + // try + // ... + // (falls through to 'cont') + // catch + // handler body + // end + // <-- cont + // + // - After + // try + // ... + // br %cont (new) + // catch + // end + // handler body + // <-- cont + MachineBasicBlock *EHPadLayoutPred = &*std::prev(EHPad->getIterator()); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector<MachineOperand, 4> Cond; + bool Analyzable = !TII.analyzeBranch(*EHPadLayoutPred, TBB, FBB, Cond); + if (Analyzable && !TBB && !FBB) { + DebugLoc DL = EHPadLayoutPred->empty() + ? 
DebugLoc() + : EHPadLayoutPred->rbegin()->getDebugLoc(); + BuildMI(EHPadLayoutPred, DL, TII.get(WebAssembly::BR)).addMBB(Cont); + } + } + + // For possibly throwing calls whose unwind destinations are currently + // incorrect because of CFG linearization, we wrap them with a nested + // try/catch/end_try, and within the new catch block, we branch to the correct + // handler. + // - Before + // mbb: + // call @foo <- Unwind destination mismatch! + // ehpad: + // ... + // + // - After + // mbb: + // try (new) + // call @foo + // nested-ehpad: (new) + // catch (new) + // local.set n / drop (new) + // br %brdest (new) + // nested-end: (new) + // end_try (new) + // ehpad: + // ... + for (auto &P : BrDestToTryRanges) { + MachineBasicBlock *BrDest = P.first; + auto &TryRanges = P.second; + unsigned ExnReg = BrDestToExnReg[BrDest]; + + for (auto Range : TryRanges) { + MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr; + std::tie(RangeBegin, RangeEnd) = Range; + auto *MBB = RangeBegin->getParent(); + + // Include possible EH_LABELs in the range + if (RangeBegin->getIterator() != MBB->begin() && + std::prev(RangeBegin->getIterator())->isEHLabel()) + RangeBegin = &*std::prev(RangeBegin->getIterator()); + if (std::next(RangeEnd->getIterator()) != MBB->end() && + std::next(RangeEnd->getIterator())->isEHLabel()) + RangeEnd = &*std::next(RangeEnd->getIterator()); + + MachineBasicBlock *EHPad = nullptr; + for (auto *Succ : MBB->successors()) { + if (Succ->isEHPad()) { + EHPad = Succ; + break; + } + } + + // Create the nested try instruction. + MachineInstr *NestedTry = + BuildMI(*MBB, *RangeBegin, RangeBegin->getDebugLoc(), + TII.get(WebAssembly::TRY)) + .addImm(int64_t(WebAssembly::ExprType::Void)); + + // Create the nested EH pad and fill instructions in. + MachineBasicBlock *NestedEHPad = MF.CreateMachineBasicBlock(); + MF.insert(std::next(MBB->getIterator()), NestedEHPad); + NestedEHPad->setIsEHPad(); + NestedEHPad->setIsEHScopeEntry(); + BuildMI(NestedEHPad, RangeEnd->getDebugLoc(), TII.get(WebAssembly::CATCH), + ExnReg); + BuildMI(NestedEHPad, RangeEnd->getDebugLoc(), TII.get(WebAssembly::BR)) + .addMBB(BrDest); + + // Create the nested continuation BB and end_try instruction. + MachineBasicBlock *NestedCont = MF.CreateMachineBasicBlock(); + MF.insert(std::next(NestedEHPad->getIterator()), NestedCont); + MachineInstr *NestedEndTry = + BuildMI(*NestedCont, NestedCont->begin(), RangeEnd->getDebugLoc(), + TII.get(WebAssembly::END_TRY)); + // In case MBB has more instructions after the try range, move them to the + // new nested continuation BB. + NestedCont->splice(NestedCont->end(), MBB, + std::next(RangeEnd->getIterator()), MBB->end()); + registerTryScope(NestedTry, NestedEndTry, NestedEHPad); + + // Fix predecessor-successor relationship. + NestedCont->transferSuccessors(MBB); + if (EHPad) + NestedCont->removeSuccessor(EHPad); + MBB->addSuccessor(NestedEHPad); + MBB->addSuccessor(NestedCont); + NestedEHPad->addSuccessor(BrDest); + } + } + + // Renumber BBs and recalculate ScopeTop info because new BBs might have been + // created and inserted above. 
+
+  // Renumber BBs and recalculate ScopeTop info because new BBs might have
+  // been created and inserted above.
+  MF.RenumberBlocks();
+  ScopeTops.clear();
+  ScopeTops.resize(MF.getNumBlockIDs());
+  for (auto &MBB : reverse(MF)) {
+    for (auto &MI : reverse(MBB)) {
+      if (ScopeTops[MBB.getNumber()])
+        break;
+      switch (MI.getOpcode()) {
+      case WebAssembly::END_BLOCK:
+      case WebAssembly::END_LOOP:
+      case WebAssembly::END_TRY:
+        ScopeTops[MBB.getNumber()] = EndToBegin[&MI]->getParent();
+        break;
+      case WebAssembly::CATCH:
+        ScopeTops[MBB.getNumber()] = EHPadToTry[&MBB]->getParent();
+        break;
+      }
+    }
+  }
+
+  // Recompute the dominator tree.
+  getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
+
+  // Place block markers for newly added branches.
+  SmallVector<MachineBasicBlock *, 4> BrDests;
+  for (auto &P : BrDestToTryRanges)
+    BrDests.push_back(P.first);
+  llvm::sort(BrDests,
+             [&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
+               auto ANum = A->getNumber();
+               auto BNum = B->getNumber();
+               return ANum < BNum;
+             });
+  for (auto *Dest : BrDests)
+    placeBlockMarker(*Dest);
+
+  return true;
 }
 
 static unsigned
-GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
+getDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
          const MachineBasicBlock *MBB) {
   unsigned Depth = 0;
   for (auto X : reverse(Stack)) {
@@ -617,19 +1202,19 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
   if (MFI.getResults().empty())
     return;
 
-  WebAssembly::ExprType retType;
+  WebAssembly::ExprType RetType;
   switch (MFI.getResults().front().SimpleTy) {
   case MVT::i32:
-    retType = WebAssembly::ExprType::I32;
+    RetType = WebAssembly::ExprType::I32;
     break;
   case MVT::i64:
-    retType = WebAssembly::ExprType::I64;
+    RetType = WebAssembly::ExprType::I64;
     break;
   case MVT::f32:
-    retType = WebAssembly::ExprType::F32;
+    RetType = WebAssembly::ExprType::F32;
     break;
   case MVT::f64:
-    retType = WebAssembly::ExprType::F64;
+    RetType = WebAssembly::ExprType::F64;
     break;
   case MVT::v16i8:
   case MVT::v8i16:
@@ -637,10 +1222,10 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
   case MVT::v2i64:
   case MVT::v4f32:
   case MVT::v2f64:
-    retType = WebAssembly::ExprType::V128;
+    RetType = WebAssembly::ExprType::V128;
     break;
-  case MVT::ExceptRef:
-    retType = WebAssembly::ExprType::ExceptRef;
+  case MVT::exnref:
+    RetType = WebAssembly::ExprType::Exnref;
     break;
   default:
     llvm_unreachable("unexpected return type");
@@ -651,11 +1236,11 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
     if (MI.isPosition() || MI.isDebugInstr())
       continue;
     if (MI.getOpcode() == WebAssembly::END_BLOCK) {
-      EndToBegin[&MI]->getOperand(0).setImm(int32_t(retType));
+      EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
       continue;
     }
     if (MI.getOpcode() == WebAssembly::END_LOOP) {
-      EndToBegin[&MI]->getOperand(0).setImm(int32_t(retType));
+      EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
       continue;
     }
     // Something other than an `end`. We're done.
@@ -666,7 +1251,7 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
 
 // WebAssembly functions end with an end instruction, as if the function body
 // were a block.
-static void AppendEndToFunction(MachineFunction &MF,
+static void appendEndToFunction(MachineFunction &MF,
                                 const WebAssemblyInstrInfo &TII) {
   BuildMI(MF.back(), MF.back().end(),
           MF.back().findPrevDebugLoc(MF.back().end()),
@@ -675,66 +1260,42 @@ static void AppendEndToFunction(MachineFunction &MF,
 /// Insert LOOP/TRY/BLOCK markers at appropriate places.
 void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) {
-  const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo();
   // We allocate one more than the number of blocks in the function to
   // accommodate for the possible fake block we may insert at the end.
   ScopeTops.resize(MF.getNumBlockIDs() + 1);
   // Place the LOOP for MBB if MBB is the header of a loop.
   for (auto &MBB : MF)
     placeLoopMarker(MBB);
-  // Place the TRY for MBB if MBB is the EH pad of an exception.
-  if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
-      MF.getFunction().hasPersonalityFn())
-    for (auto &MBB : MF)
-      placeTryMarker(MBB);
-  // Place the BLOCK for MBB if MBB is branched to from above.
-  for (auto &MBB : MF)
-    placeBlockMarker(MBB);
+
+  const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo();
+  for (auto &MBB : MF) {
+    if (MBB.isEHPad()) {
+      // Place the TRY for MBB if MBB is the EH pad of an exception.
+      if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
+          MF.getFunction().hasPersonalityFn())
+        placeTryMarker(MBB);
+    } else {
+      // Place the BLOCK for MBB if MBB is branched to from above.
+      placeBlockMarker(MBB);
+    }
+  }
+
+  // Fix mismatches in unwind destinations induced by linearizing the code.
+  fixUnwindMismatches(MF);
 }
 
 void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
-  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   // Now rewrite references to basic blocks to be depth immediates.
-  // We need two stacks: one for normal scopes and the other for EH pad scopes.
-  // EH pad stack is used to rewrite depths in rethrow instructions.
   SmallVector<const MachineBasicBlock *, 8> Stack;
-  SmallVector<const MachineBasicBlock *, 8> EHPadStack;
   for (auto &MBB : reverse(MF)) {
     for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) {
       MachineInstr &MI = *I;
       switch (MI.getOpcode()) {
       case WebAssembly::BLOCK:
-        assert(ScopeTops[Stack.back()->getNumber()]->getNumber() <=
-                   MBB.getNumber() &&
-               "Block/try should be balanced");
-        Stack.pop_back();
-        break;
-
       case WebAssembly::TRY:
         assert(ScopeTops[Stack.back()->getNumber()]->getNumber() <=
                    MBB.getNumber() &&
                "Block/try marker should be balanced");
         Stack.pop_back();
-        EHPadStack.pop_back();
-        break;
-
-      case WebAssembly::CATCH_I32:
-      case WebAssembly::CATCH_I64:
-      case WebAssembly::CATCH_ALL:
-        // Currently the only case there are more than one catch for a try is
-        // for catch terminate pad, in the form of
-        //   try
-        //   catch
-        //     call @__clang_call_terminate
-        //     unreachable
-        //   catch_all
-        //     call @std::terminate
-        //     unreachable
-        //   end
-        // So we shouldn't push the current BB for the second catch_all block
-        // here.
-        if (!WebAssembly::isCatchAllTerminatePad(MBB))
-          EHPadStack.push_back(&MBB);
         break;
 
       case WebAssembly::LOOP:
@@ -751,23 +1312,6 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
         Stack.push_back(EndToBegin[&MI]->getParent());
         break;
 
-      case WebAssembly::RETHROW: {
-        // Rewrite MBB operands to be depth immediates.
-        unsigned EHPadDepth = GetDepth(EHPadStack, MI.getOperand(0).getMBB());
-        MI.RemoveOperand(0);
-        MI.addOperand(MF, MachineOperand::CreateImm(EHPadDepth));
-        break;
-      }
-
-      case WebAssembly::RETHROW_TO_CALLER: {
-        MachineInstr *Rethrow =
-            BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(WebAssembly::RETHROW))
-                .addImm(EHPadStack.size());
-        MI.eraseFromParent();
-        I = MachineBasicBlock::reverse_iterator(Rethrow);
-        break;
-      }
-
       default:
         if (MI.isTerminator()) {
           // Rewrite MBB operands to be depth immediates.
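To make the depth rewriting concrete: a wasm branch names its target as the number of enclosing block/loop/try scopes to skip, counted from the innermost scope outward, which is exactly what getDepth computes from the pass's scope stack. Here is a small self-contained model of that computation (an illustrative sketch using plain strings for scopes; the names are invented, not code from this patch):

    #include <cassert>
    #include <string>
    #include <vector>

    // Walk the stack of open scopes from the top; the number of scopes
    // skipped before reaching the target is the branch's depth immediate.
    unsigned getDepthModel(const std::vector<std::string> &Stack,
                           const std::string &Target) {
      unsigned Depth = 0;
      for (auto I = Stack.rbegin(), E = Stack.rend(); I != E; ++I, ++Depth)
        if (*I == Target)
          return Depth;
      assert(false && "branch destination should be in scope");
      return 0;
    }

    // With open scopes {A, B, C} (C innermost), a branch to C has depth 0,
    // to B depth 1, and to A depth 2.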
@@ -776,7 +1320,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
           MI.RemoveOperand(MI.getNumOperands() - 1);
           for (auto MO : Ops) {
             if (MO.isMBB())
-              MO = MachineOperand::CreateImm(GetDepth(Stack, MO.getMBB()));
+              MO = MachineOperand::CreateImm(getDepth(Stack, MO.getMBB()));
             MI.addOperand(MF, MO);
           }
         }
@@ -793,13 +1337,14 @@ void WebAssemblyCFGStackify::releaseMemory() {
   EndToBegin.clear();
   TryToEHPad.clear();
   EHPadToTry.clear();
-  BeginToBottom.clear();
+  AppendixBB = nullptr;
 }
 
 bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "********** CFG Stackifying **********\n"
                        "********** Function: "
                     << MF.getName() << '\n');
+  const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo();
 
   releaseMemory();
@@ -809,6 +1354,11 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
   // Place the BLOCK/LOOP/TRY markers to indicate the beginnings of scopes.
   placeMarkers(MF);
 
+  // Remove unnecessary instructions possibly introduced by try/end_trys.
+  if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
+      MF.getFunction().hasPersonalityFn())
+    removeUnnecessaryInstrs(MF);
+
   // Convert MBB operands in terminators to relative depth immediates.
   rewriteDepthImmediates(MF);
 
@@ -821,7 +1371,8 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
   if (!MF.getSubtarget<WebAssemblySubtarget>()
            .getTargetTriple()
            .isOSBinFormatELF())
-    AppendEndToFunction(MF, TII);
+    appendEndToFunction(MF, TII);
+
   MF.getInfo<WebAssemblyFunctionInfo>()->setCFGStackified();
   return true;
 }
diff --git a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
index aaa6d286598f..2537e6042b1e 100644
--- a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
@@ -1,9 +1,8 @@
 //===-- WebAssemblyCallIndirectFixup.cpp - Fix call_indirects -------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -61,19 +60,19 @@ FunctionPass *llvm::createWebAssemblyCallIndirectFixup() { return new WebAssemblyCallIndirectFixup(); } -static unsigned GetNonPseudoCallIndirectOpcode(const MachineInstr &MI) { +static unsigned getNonPseudoCallIndirectOpcode(const MachineInstr &MI) { switch (MI.getOpcode()) { using namespace WebAssembly; case PCALL_INDIRECT_VOID: return CALL_INDIRECT_VOID; - case PCALL_INDIRECT_I32: - return CALL_INDIRECT_I32; - case PCALL_INDIRECT_I64: - return CALL_INDIRECT_I64; - case PCALL_INDIRECT_F32: - return CALL_INDIRECT_F32; - case PCALL_INDIRECT_F64: - return CALL_INDIRECT_F64; + case PCALL_INDIRECT_i32: + return CALL_INDIRECT_i32; + case PCALL_INDIRECT_i64: + return CALL_INDIRECT_i64; + case PCALL_INDIRECT_f32: + return CALL_INDIRECT_f32; + case PCALL_INDIRECT_f64: + return CALL_INDIRECT_f64; case PCALL_INDIRECT_v16i8: return CALL_INDIRECT_v16i8; case PCALL_INDIRECT_v8i16: @@ -86,13 +85,17 @@ static unsigned GetNonPseudoCallIndirectOpcode(const MachineInstr &MI) { return CALL_INDIRECT_v4f32; case PCALL_INDIRECT_v2f64: return CALL_INDIRECT_v2f64; + case PCALL_INDIRECT_exnref: + return CALL_INDIRECT_exnref; + case PRET_CALL_INDIRECT: + return RET_CALL_INDIRECT; default: return INSTRUCTION_LIST_END; } } -static bool IsPseudoCallIndirect(const MachineInstr &MI) { - return GetNonPseudoCallIndirectOpcode(MI) != +static bool isPseudoCallIndirect(const MachineInstr &MI) { + return getNonPseudoCallIndirectOpcode(MI) != WebAssembly::INSTRUCTION_LIST_END; } @@ -106,11 +109,11 @@ bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (IsPseudoCallIndirect(MI)) { + if (isPseudoCallIndirect(MI)) { LLVM_DEBUG(dbgs() << "Found call_indirect: " << MI << '\n'); // Rewrite pseudo to non-pseudo - const MCInstrDesc &Desc = TII->get(GetNonPseudoCallIndirectOpcode(MI)); + const MCInstrDesc &Desc = TII->get(getNonPseudoCallIndirectOpcode(MI)); MI.setDesc(Desc); // Rewrite argument order diff --git a/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp b/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp index 8ecc159951ad..579377c9a5d7 100644 --- a/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp +++ b/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyDebugValueManager.cpp - WebAssembly DebugValue Manager -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h b/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h index 73f317214058..06e8805b5ad0 100644 --- a/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h +++ b/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h @@ -1,9 +1,8 @@ // WebAssemblyDebugValueManager.h - WebAssembly DebugValue Manager -*- C++ -*-// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyEHRestoreStackPointer.cpp b/lib/Target/WebAssembly/WebAssemblyEHRestoreStackPointer.cpp deleted file mode 100644 index c86260ba408c..000000000000 --- a/lib/Target/WebAssembly/WebAssemblyEHRestoreStackPointer.cpp +++ /dev/null @@ -1,87 +0,0 @@ -//===-- WebAssemblyEHRestoreStackPointer.cpp - __stack_pointer restoration ===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// After the stack is unwound due to a thrown exception, the __stack_pointer -/// global can point to an invalid address. This inserts instructions that -/// restore __stack_pointer global. -/// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" -#include "WebAssembly.h" -#include "WebAssemblySubtarget.h" -#include "WebAssemblyUtilities.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/MC/MCAsmInfo.h" -using namespace llvm; - -#define DEBUG_TYPE "wasm-eh-restore-stack-pointer" - -namespace { -class WebAssemblyEHRestoreStackPointer final : public MachineFunctionPass { -public: - static char ID; // Pass identification, replacement for typeid - WebAssemblyEHRestoreStackPointer() : MachineFunctionPass(ID) {} - - StringRef getPassName() const override { - return "WebAssembly Restore Stack Pointer for Exception Handling"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; -}; -} // end anonymous namespace - -char WebAssemblyEHRestoreStackPointer::ID = 0; -INITIALIZE_PASS(WebAssemblyEHRestoreStackPointer, DEBUG_TYPE, - "Restore Stack Pointer for Exception Handling", true, false) - -FunctionPass *llvm::createWebAssemblyEHRestoreStackPointer() { - return new WebAssemblyEHRestoreStackPointer(); -} - -bool WebAssemblyEHRestoreStackPointer::runOnMachineFunction( - MachineFunction &MF) { - LLVM_DEBUG(dbgs() << "********** EH Restore Stack Pointer **********\n" - "********** Function: " - << MF.getName() << '\n'); - - const auto *FrameLowering = static_cast( - MF.getSubtarget().getFrameLowering()); - if (!FrameLowering->needsPrologForEH(MF)) - return false; - bool Changed = false; - - for (auto &MBB : MF) { - if (!MBB.isEHPad()) - continue; - Changed = true; - - // Insert __stack_pointer restoring instructions at the beginning of each EH - // pad, after the catch instruction. (Catch instructions may have been - // reordered, and catch_all instructions have not been inserted yet, but - // those cases are handled in LateEHPrepare). - // - // Here it is safe to assume that SP32 holds the latest value of - // __stack_pointer, because the only exception for this case is when a - // function uses the red zone, but that only happens with leaf functions, - // and we don't restore __stack_pointer in leaf functions anyway. 
- auto InsertPos = MBB.begin(); - if (WebAssembly::isCatch(*MBB.begin())) - InsertPos++; - FrameLowering->writeSPToGlobal(WebAssembly::SP32, MF, MBB, InsertPos, - MBB.begin()->getDebugLoc()); - } - return Changed; -} diff --git a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp index 6b3a3e765786..0387957b14c2 100644 --- a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp @@ -1,9 +1,8 @@ //===--- WebAssemblyExceptionInfo.cpp - Exception Infomation --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -51,10 +50,6 @@ void WebAssemblyExceptionInfo::recalculate( MachineBasicBlock *EHPad = DomNode->getBlock(); if (!EHPad->isEHPad()) continue; - // We group catch & catch-all terminate pads together, so skip the second - // one - if (WebAssembly::isCatchAllTerminatePad(*EHPad)) - continue; auto *WE = new WebAssemblyException(EHPad); discoverAndMapException(WE, MDT, MDF); Exceptions.push_back(WE); @@ -105,16 +100,6 @@ void WebAssemblyExceptionInfo::discoverAndMapException( // Map blocks that belong to a catchpad / cleanuppad MachineBasicBlock *EHPad = WE->getEHPad(); - - // We group catch & catch-all terminate pads together within an exception - if (WebAssembly::isCatchTerminatePad(*EHPad)) { - assert(EHPad->succ_size() == 1 && - "Catch terminate pad has more than one successors"); - changeExceptionFor(EHPad, WE); - changeExceptionFor(*(EHPad->succ_begin()), WE); - return; - } - SmallVector WL; WL.push_back(EHPad); while (!WL.empty()) { diff --git a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h index fcd7e2366e03..9a90d7df7d47 100644 --- a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h @@ -1,9 +1,8 @@ //===-- WebAssemblyExceptionInfo.h - WebAssembly Exception Info -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp index 27aabe6ba0bd..dbd62179f055 100644 --- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp +++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyExplicitLocals.cpp - Make Locals Explicit --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -91,13 +90,13 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) { return WebAssembly::DROP_F64; if (RC == &WebAssembly::V128RegClass) return WebAssembly::DROP_V128; - if (RC == &WebAssembly::EXCEPT_REFRegClass) - return WebAssembly::DROP_EXCEPT_REF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::DROP_EXNREF; llvm_unreachable("Unexpected register class"); } /// Get the appropriate local.get opcode for the given register class. -static unsigned getGetLocalOpcode(const TargetRegisterClass *RC) { +static unsigned getLocalGetOpcode(const TargetRegisterClass *RC) { if (RC == &WebAssembly::I32RegClass) return WebAssembly::LOCAL_GET_I32; if (RC == &WebAssembly::I64RegClass) @@ -108,13 +107,13 @@ static unsigned getGetLocalOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_GET_F64; if (RC == &WebAssembly::V128RegClass) return WebAssembly::LOCAL_GET_V128; - if (RC == &WebAssembly::EXCEPT_REFRegClass) - return WebAssembly::LOCAL_GET_EXCEPT_REF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_GET_EXNREF; llvm_unreachable("Unexpected register class"); } /// Get the appropriate local.set opcode for the given register class. -static unsigned getSetLocalOpcode(const TargetRegisterClass *RC) { +static unsigned getLocalSetOpcode(const TargetRegisterClass *RC) { if (RC == &WebAssembly::I32RegClass) return WebAssembly::LOCAL_SET_I32; if (RC == &WebAssembly::I64RegClass) @@ -125,13 +124,13 @@ static unsigned getSetLocalOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_SET_F64; if (RC == &WebAssembly::V128RegClass) return WebAssembly::LOCAL_SET_V128; - if (RC == &WebAssembly::EXCEPT_REFRegClass) - return WebAssembly::LOCAL_SET_EXCEPT_REF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_SET_EXNREF; llvm_unreachable("Unexpected register class"); } /// Get the appropriate local.tee opcode for the given register class. 
-static unsigned getTeeLocalOpcode(const TargetRegisterClass *RC) { +static unsigned getLocalTeeOpcode(const TargetRegisterClass *RC) { if (RC == &WebAssembly::I32RegClass) return WebAssembly::LOCAL_TEE_I32; if (RC == &WebAssembly::I64RegClass) @@ -142,8 +141,8 @@ static unsigned getTeeLocalOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_TEE_F64; if (RC == &WebAssembly::V128RegClass) return WebAssembly::LOCAL_TEE_V128; - if (RC == &WebAssembly::EXCEPT_REFRegClass) - return WebAssembly::LOCAL_TEE_EXCEPT_REF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_TEE_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -159,8 +158,8 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) { return MVT::f64; if (RC == &WebAssembly::V128RegClass) return MVT::v16i8; - if (RC == &WebAssembly::EXCEPT_REFRegClass) - return MVT::ExceptRef; + if (RC == &WebAssembly::EXNREFRegClass) + return MVT::exnref; llvm_unreachable("unrecognized register class"); } @@ -206,7 +205,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { E = MF.begin()->end(); I != E;) { MachineInstr &MI = *I++; - if (!WebAssembly::isArgument(MI)) + if (!WebAssembly::isArgument(MI.getOpcode())) break; unsigned Reg = MI.getOperand(0).getReg(); assert(!MFI.isVRegStackified(Reg)); @@ -228,7 +227,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { MachineInstr &MI = *I++; - assert(!WebAssembly::isArgument(MI)); + assert(!WebAssembly::isArgument(MI.getOpcode())); if (MI.isDebugInstr() || MI.isLabel()) continue; @@ -236,7 +235,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // Replace tee instructions with local.tee. The difference is that tee // instructions have two defs, while local.tee instructions have one def // and an index of a local to write to. - if (WebAssembly::isTee(MI)) { + if (WebAssembly::isTee(MI.getOpcode())) { assert(MFI.isVRegStackified(MI.getOperand(0).getReg())); assert(!MFI.isVRegStackified(MI.getOperand(1).getReg())); unsigned OldReg = MI.getOperand(2).getReg(); @@ -246,7 +245,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { if (!MFI.isVRegStackified(OldReg)) { unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg); unsigned NewReg = MRI.createVirtualRegister(RC); - unsigned Opc = getGetLocalOpcode(RC); + unsigned Opc = getLocalGetOpcode(RC); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc), NewReg) .addImm(LocalId); MI.getOperand(2).setReg(NewReg); @@ -256,7 +255,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // Replace the TEE with a LOCAL_TEE. 
unsigned LocalId = getLocalId(Reg2Local, CurLocal, MI.getOperand(1).getReg()); - unsigned Opc = getTeeLocalOpcode(RC); + unsigned Opc = getLocalTeeOpcode(RC); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc), MI.getOperand(0).getReg()) .addImm(LocalId) @@ -275,7 +274,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { if (!MFI.isVRegStackified(OldReg)) { const TargetRegisterClass *RC = MRI.getRegClass(OldReg); unsigned NewReg = MRI.createVirtualRegister(RC); - auto InsertPt = std::next(MachineBasicBlock::iterator(&MI)); + auto InsertPt = std::next(MI.getIterator()); if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) { MI.eraseFromParent(); Changed = true; @@ -290,7 +289,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { Drop->getOperand(0).setIsKill(); } else { unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg); - unsigned Opc = getSetLocalOpcode(RC); + unsigned Opc = getLocalSetOpcode(RC); BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc)) .addImm(LocalId) .addReg(NewReg); @@ -317,7 +316,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // with inline asm register operands is to provide local indices as // immediates. if (MO.isDef()) { - assert(MI.getOpcode() == TargetOpcode::INLINEASM); + assert(MI.isInlineAsm()); unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg); // If this register operand is tied to another operand, we can't // change it to an immediate. Untie it first. @@ -335,7 +334,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // Our contract with inline asm register operands is to provide local // indices as immediates. - if (MI.getOpcode() == TargetOpcode::INLINEASM) { + if (MI.isInlineAsm()) { unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg); // Untie it first if this reg operand is tied to another operand. MI.untieRegOperand(MI.getOperandNo(&MO)); @@ -347,7 +346,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg); const TargetRegisterClass *RC = MRI.getRegClass(OldReg); unsigned NewReg = MRI.createVirtualRegister(RC); - unsigned Opc = getGetLocalOpcode(RC); + unsigned Opc = getLocalGetOpcode(RC); InsertPt = BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc), NewReg) .addImm(LocalId); @@ -357,7 +356,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { } // Coalesce and eliminate COPY instructions. - if (WebAssembly::isCopy(MI)) { + if (WebAssembly::isCopy(MI.getOpcode())) { MRI.replaceRegWith(MI.getOperand(1).getReg(), MI.getOperand(0).getReg()); MI.eraseFromParent(); diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 3856700cca94..2552e9150833 100644 --- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyFastISel.cpp - WebAssembly FastISel implementation -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -50,22 +49,22 @@ class WebAssemblyFastISel final : public FastISel { // All possible address modes. class Address { public: - typedef enum { RegBase, FrameIndexBase } BaseKind; + using BaseKind = enum { RegBase, FrameIndexBase }; private: - BaseKind Kind; + BaseKind Kind = RegBase; union { unsigned Reg; int FI; } Base; - int64_t Offset; + int64_t Offset = 0; - const GlobalValue *GV; + const GlobalValue *GV = nullptr; public: // Innocuous defaults for our address. - Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; } + Address() { Base.Reg = 0; } void setKind(BaseKind K) { assert(!isSet() && "Can't change kind with non-zero base"); Kind = K; @@ -92,9 +91,9 @@ class WebAssemblyFastISel final : public FastISel { return Base.FI; } - void setOffset(int64_t Offset_) { - assert(Offset_ >= 0 && "Offsets must be non-negative"); - Offset = Offset_; + void setOffset(int64_t NewOffset) { + assert(NewOffset >= 0 && "Offsets must be non-negative"); + Offset = NewOffset; } int64_t getOffset() const { return Offset; } void setGlobalValue(const GlobalValue *G) { GV = G; } @@ -116,7 +115,7 @@ class WebAssemblyFastISel final : public FastISel { private: // Utility helper routines MVT::SimpleValueType getSimpleType(Type *Ty) { - EVT VT = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true); + EVT VT = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true); return VT.isSimple() ? VT.getSimpleVT().SimpleTy : MVT::INVALID_SIMPLE_VALUE_TYPE; } @@ -130,7 +129,7 @@ private: case MVT::i64: case MVT::f32: case MVT::f64: - case MVT::ExceptRef: + case MVT::exnref: return VT; case MVT::f16: return MVT::f32; @@ -208,10 +207,9 @@ public: } // end anonymous namespace bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { - const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; - if (const Instruction *I = dyn_cast(Obj)) { + if (const auto *I = dyn_cast(Obj)) { // Don't walk into other basic blocks unless the object is an alloca from // another block, otherwise it may not have a virtual register assigned. if (FuncInfo.StaticAllocaMap.count(static_cast(Obj)) || @@ -219,7 +217,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { Opcode = I->getOpcode(); U = I; } - } else if (const ConstantExpr *C = dyn_cast(Obj)) { + } else if (const auto *C = dyn_cast(Obj)) { Opcode = C->getOpcode(); U = C; } @@ -230,9 +228,13 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { // address spaces. return false; - if (const GlobalValue *GV = dyn_cast(Obj)) { + if (const auto *GV = dyn_cast(Obj)) { + if (TLI.isPositionIndependent()) + return false; if (Addr.getGlobalValue()) return false; + if (GV->isThreadLocal()) + return false; Addr.setGlobalValue(GV); return true; } @@ -275,7 +277,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { } else { uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); for (;;) { - if (const ConstantInt *CI = dyn_cast(Op)) { + if (const auto *CI = dyn_cast(Op)) { // Constant-offset addressing. TmpOffset += CI->getSExtValue() * S; break; @@ -290,8 +292,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { } if (canFoldAddIntoGEP(U, Op)) { // A compatible add with a constant operand. Fold the constant. 
- ConstantInt *CI = - cast(cast(Op)->getOperand(1)); + auto *CI = cast(cast(Op)->getOperand(1)); TmpOffset += CI->getSExtValue() * S; // Iterate on the other operand. Op = cast(Op)->getOperand(0); @@ -315,7 +316,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { break; } case Instruction::Alloca: { - const AllocaInst *AI = cast(Obj); + const auto *AI = cast(Obj); DenseMap::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { @@ -336,7 +337,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { if (isa(LHS)) std::swap(LHS, RHS); - if (const ConstantInt *CI = dyn_cast(RHS)) { + if (const auto *CI = dyn_cast(RHS)) { uint64_t TmpOffset = Addr.getOffset() + CI->getSExtValue(); if (int64_t(TmpOffset) >= 0) { Addr.setOffset(TmpOffset); @@ -356,7 +357,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { const Value *LHS = U->getOperand(0); const Value *RHS = U->getOperand(1); - if (const ConstantInt *CI = dyn_cast(RHS)) { + if (const auto *CI = dyn_cast(RHS)) { int64_t TmpOffset = Addr.getOffset() - CI->getSExtValue(); if (TmpOffset >= 0) { Addr.setOffset(TmpOffset); @@ -416,7 +417,7 @@ unsigned WebAssemblyFastISel::maskI1Value(unsigned Reg, const Value *V) { } unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) { - if (const ICmpInst *ICmp = dyn_cast(V)) + if (const auto *ICmp = dyn_cast(V)) if (const ConstantInt *C = dyn_cast(ICmp->getOperand(1))) if (ICmp->isEquality() && C->isZero() && C->getType()->isIntegerTy(32)) { Not = ICmp->isTrueWhenEqual(); @@ -524,7 +525,10 @@ unsigned WebAssemblyFastISel::zeroExtend(unsigned Reg, const Value *V, return Result; } - return zeroExtendToI32(Reg, V, From); + if (To == MVT::i32) + return zeroExtendToI32(Reg, V, From); + + return 0; } unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V, @@ -543,7 +547,10 @@ unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V, return Result; } - return signExtendToI32(Reg, V, From); + if (To == MVT::i32) + return signExtendToI32(Reg, V, From); + + return 0; } unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) { @@ -607,6 +614,10 @@ unsigned WebAssemblyFastISel::fastMaterializeAlloca(const AllocaInst *AI) { unsigned WebAssemblyFastISel::fastMaterializeConstant(const Constant *C) { if (const GlobalValue *GV = dyn_cast(C)) { + if (TLI.isPositionIndependent()) + return 0; + if (GV->isThreadLocal()) + return 0; unsigned ResultReg = createResultReg(Subtarget->hasAddr64() ? 
&WebAssembly::I64RegClass : &WebAssembly::I32RegClass); @@ -629,14 +640,14 @@ bool WebAssemblyFastISel::fastLowerArguments() { if (F->isVarArg()) return false; - unsigned i = 0; + unsigned I = 0; for (auto const &Arg : F->args()) { const AttributeList &Attrs = F->getAttributes(); - if (Attrs.hasParamAttribute(i, Attribute::ByVal) || - Attrs.hasParamAttribute(i, Attribute::SwiftSelf) || - Attrs.hasParamAttribute(i, Attribute::SwiftError) || - Attrs.hasParamAttribute(i, Attribute::InAlloca) || - Attrs.hasParamAttribute(i, Attribute::Nest)) + if (Attrs.hasParamAttribute(I, Attribute::ByVal) || + Attrs.hasParamAttribute(I, Attribute::SwiftSelf) || + Attrs.hasParamAttribute(I, Attribute::SwiftError) || + Attrs.hasParamAttribute(I, Attribute::InAlloca) || + Attrs.hasParamAttribute(I, Attribute::Nest)) return false; Type *ArgTy = Arg.getType(); @@ -691,19 +702,19 @@ bool WebAssemblyFastISel::fastLowerArguments() { Opc = WebAssembly::ARGUMENT_v2f64; RC = &WebAssembly::V128RegClass; break; - case MVT::ExceptRef: - Opc = WebAssembly::ARGUMENT_ExceptRef; - RC = &WebAssembly::EXCEPT_REFRegClass; + case MVT::exnref: + Opc = WebAssembly::ARGUMENT_exnref; + RC = &WebAssembly::EXNREFRegClass; break; default: return false; } unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addImm(i); + .addImm(I); updateValueMap(&Arg, ResultReg); - ++i; + ++I; } MRI.addLiveIn(WebAssembly::ARGUMENTS); @@ -732,8 +743,9 @@ bool WebAssemblyFastISel::fastLowerArguments() { } bool WebAssemblyFastISel::selectCall(const Instruction *I) { - const CallInst *Call = cast(I); + const auto *Call = cast(I); + // TODO: Support tail calls in FastISel if (Call->isMustTailCall() || Call->isInlineAsm() || Call->getFunctionType()->isVarArg()) return false; @@ -762,19 +774,19 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { case MVT::i8: case MVT::i16: case MVT::i32: - Opc = IsDirect ? WebAssembly::CALL_I32 : WebAssembly::PCALL_INDIRECT_I32; + Opc = IsDirect ? WebAssembly::CALL_i32 : WebAssembly::PCALL_INDIRECT_i32; ResultReg = createResultReg(&WebAssembly::I32RegClass); break; case MVT::i64: - Opc = IsDirect ? WebAssembly::CALL_I64 : WebAssembly::PCALL_INDIRECT_I64; + Opc = IsDirect ? WebAssembly::CALL_i64 : WebAssembly::PCALL_INDIRECT_i64; ResultReg = createResultReg(&WebAssembly::I64RegClass); break; case MVT::f32: - Opc = IsDirect ? WebAssembly::CALL_F32 : WebAssembly::PCALL_INDIRECT_F32; + Opc = IsDirect ? WebAssembly::CALL_f32 : WebAssembly::PCALL_INDIRECT_f32; ResultReg = createResultReg(&WebAssembly::F32RegClass); break; case MVT::f64: - Opc = IsDirect ? WebAssembly::CALL_F64 : WebAssembly::PCALL_INDIRECT_F64; + Opc = IsDirect ? WebAssembly::CALL_f64 : WebAssembly::PCALL_INDIRECT_f64; ResultReg = createResultReg(&WebAssembly::F64RegClass); break; case MVT::v16i8: @@ -807,10 +819,10 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { : WebAssembly::PCALL_INDIRECT_v2f64; ResultReg = createResultReg(&WebAssembly::V128RegClass); break; - case MVT::ExceptRef: - Opc = IsDirect ? WebAssembly::CALL_EXCEPT_REF - : WebAssembly::PCALL_INDIRECT_EXCEPT_REF; - ResultReg = createResultReg(&WebAssembly::EXCEPT_REFRegClass); + case MVT::exnref: + Opc = IsDirect ? 
WebAssembly::CALL_exnref + : WebAssembly::PCALL_INDIRECT_exnref; + ResultReg = createResultReg(&WebAssembly::EXNREFRegClass); break; default: return false; @@ -818,25 +830,25 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { } SmallVector Args; - for (unsigned i = 0, e = Call->getNumArgOperands(); i < e; ++i) { - Value *V = Call->getArgOperand(i); + for (unsigned I = 0, E = Call->getNumArgOperands(); I < E; ++I) { + Value *V = Call->getArgOperand(I); MVT::SimpleValueType ArgTy = getSimpleType(V->getType()); if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE) return false; const AttributeList &Attrs = Call->getAttributes(); - if (Attrs.hasParamAttribute(i, Attribute::ByVal) || - Attrs.hasParamAttribute(i, Attribute::SwiftSelf) || - Attrs.hasParamAttribute(i, Attribute::SwiftError) || - Attrs.hasParamAttribute(i, Attribute::InAlloca) || - Attrs.hasParamAttribute(i, Attribute::Nest)) + if (Attrs.hasParamAttribute(I, Attribute::ByVal) || + Attrs.hasParamAttribute(I, Attribute::SwiftSelf) || + Attrs.hasParamAttribute(I, Attribute::SwiftError) || + Attrs.hasParamAttribute(I, Attribute::InAlloca) || + Attrs.hasParamAttribute(I, Attribute::Nest)) return false; unsigned Reg; - if (Attrs.hasParamAttribute(i, Attribute::SExt)) + if (Attrs.hasParamAttribute(I, Attribute::SExt)) Reg = getRegForSignedValue(V); - else if (Attrs.hasParamAttribute(i, Attribute::ZExt)) + else if (Attrs.hasParamAttribute(I, Attribute::ZExt)) Reg = getRegForUnsignedValue(V); else Reg = getRegForValue(V); @@ -847,6 +859,13 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { Args.push_back(Reg); } + unsigned CalleeReg = 0; + if (!IsDirect) { + CalleeReg = getRegForValue(Call->getCalledValue()); + if (!CalleeReg) + return false; + } + auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); if (!IsVoid) @@ -854,12 +873,8 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { if (IsDirect) MIB.addGlobalAddress(Func); - else { - unsigned Reg = getRegForValue(Call->getCalledValue()); - if (Reg == 0) - return false; - MIB.addReg(Reg); - } + else + MIB.addReg(CalleeReg); for (unsigned ArgReg : Args) MIB.addReg(ArgReg); @@ -870,7 +885,7 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { } bool WebAssemblyFastISel::selectSelect(const Instruction *I) { - const SelectInst *Select = cast(I); + const auto *Select = cast(I); bool Not; unsigned CondReg = getRegForI1Value(Select->getCondition(), Not); @@ -910,9 +925,9 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { Opc = WebAssembly::SELECT_F64; RC = &WebAssembly::F64RegClass; break; - case MVT::ExceptRef: - Opc = WebAssembly::SELECT_EXCEPT_REF; - RC = &WebAssembly::EXCEPT_REFRegClass; + case MVT::exnref: + Opc = WebAssembly::SELECT_EXNREF; + RC = &WebAssembly::EXNREFRegClass; break; default: return false; @@ -929,7 +944,7 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { } bool WebAssemblyFastISel::selectTrunc(const Instruction *I) { - const TruncInst *Trunc = cast(I); + const auto *Trunc = cast(I); unsigned Reg = getRegForValue(Trunc->getOperand(0)); if (Reg == 0) @@ -948,7 +963,7 @@ bool WebAssemblyFastISel::selectTrunc(const Instruction *I) { } bool WebAssemblyFastISel::selectZExt(const Instruction *I) { - const ZExtInst *ZExt = cast(I); + const auto *ZExt = cast(I); const Value *Op = ZExt->getOperand(0); MVT::SimpleValueType From = getSimpleType(Op->getType()); @@ -965,7 +980,7 @@ bool WebAssemblyFastISel::selectZExt(const Instruction *I) { } bool WebAssemblyFastISel::selectSExt(const 
Instruction *I) { - const SExtInst *SExt = cast(I); + const auto *SExt = cast(I); const Value *Op = SExt->getOperand(0); MVT::SimpleValueType From = getSimpleType(Op->getType()); @@ -982,11 +997,11 @@ bool WebAssemblyFastISel::selectSExt(const Instruction *I) { } bool WebAssemblyFastISel::selectICmp(const Instruction *I) { - const ICmpInst *ICmp = cast(I); + const auto *ICmp = cast(I); bool I32 = getSimpleType(ICmp->getOperand(0)->getType()) != MVT::i64; unsigned Opc; - bool isSigned = false; + bool IsSigned = false; switch (ICmp->getPredicate()) { case ICmpInst::ICMP_EQ: Opc = I32 ? WebAssembly::EQ_I32 : WebAssembly::EQ_I64; @@ -1008,29 +1023,29 @@ bool WebAssemblyFastISel::selectICmp(const Instruction *I) { break; case ICmpInst::ICMP_SGT: Opc = I32 ? WebAssembly::GT_S_I32 : WebAssembly::GT_S_I64; - isSigned = true; + IsSigned = true; break; case ICmpInst::ICMP_SGE: Opc = I32 ? WebAssembly::GE_S_I32 : WebAssembly::GE_S_I64; - isSigned = true; + IsSigned = true; break; case ICmpInst::ICMP_SLT: Opc = I32 ? WebAssembly::LT_S_I32 : WebAssembly::LT_S_I64; - isSigned = true; + IsSigned = true; break; case ICmpInst::ICMP_SLE: Opc = I32 ? WebAssembly::LE_S_I32 : WebAssembly::LE_S_I64; - isSigned = true; + IsSigned = true; break; default: return false; } - unsigned LHS = getRegForPromotedValue(ICmp->getOperand(0), isSigned); + unsigned LHS = getRegForPromotedValue(ICmp->getOperand(0), IsSigned); if (LHS == 0) return false; - unsigned RHS = getRegForPromotedValue(ICmp->getOperand(1), isSigned); + unsigned RHS = getRegForPromotedValue(ICmp->getOperand(1), IsSigned); if (RHS == 0) return false; @@ -1043,7 +1058,7 @@ bool WebAssemblyFastISel::selectICmp(const Instruction *I) { } bool WebAssemblyFastISel::selectFCmp(const Instruction *I) { - const FCmpInst *FCmp = cast(I); + const auto *FCmp = cast(I); unsigned LHS = getRegForValue(FCmp->getOperand(0)); if (LHS == 0) @@ -1139,7 +1154,7 @@ bool WebAssemblyFastISel::selectBitCast(const Instruction *I) { } bool WebAssemblyFastISel::selectLoad(const Instruction *I) { - const LoadInst *Load = cast(I); + const auto *Load = cast(I); if (Load->isAtomic()) return false; if (!Subtarget->hasSIMD128() && Load->getType()->isVectorTy()) @@ -1196,7 +1211,7 @@ bool WebAssemblyFastISel::selectLoad(const Instruction *I) { } bool WebAssemblyFastISel::selectStore(const Instruction *I) { - const StoreInst *Store = cast(I); + const auto *Store = cast(I); if (Store->isAtomic()) return false; if (!Subtarget->hasSIMD128() && @@ -1252,7 +1267,7 @@ bool WebAssemblyFastISel::selectStore(const Instruction *I) { } bool WebAssemblyFastISel::selectBr(const Instruction *I) { - const BranchInst *Br = cast(I); + const auto *Br = cast(I); if (Br->isUnconditional()) { MachineBasicBlock *MSucc = FuncInfo.MBBMap[Br->getSuccessor(0)]; fastEmitBranch(MSucc, Br->getDebugLoc()); @@ -1283,7 +1298,7 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; - const ReturnInst *Ret = cast(I); + const auto *Ret = cast(I); if (Ret->getNumOperands() == 0) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -1330,8 +1345,8 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { case MVT::v2f64: Opc = WebAssembly::RETURN_v2f64; break; - case MVT::ExceptRef: - Opc = WebAssembly::RETURN_EXCEPT_REF; + case MVT::exnref: + Opc = WebAssembly::RETURN_EXNREF; break; default: return false; diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index 1a416520f97d..b7fc65401fc4 
100644 --- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyFixFunctionBitcasts.cpp - Fix function bitcasts --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -36,11 +35,6 @@ using namespace llvm; #define DEBUG_TYPE "wasm-fix-function-bitcasts" -static cl::opt - TemporaryWorkarounds("wasm-temporary-workarounds", - cl::desc("Apply certain temporary workarounds"), - cl::init(true), cl::Hidden); - namespace { class FixFunctionBitcasts final : public ModulePass { StringRef getPassName() const override { @@ -70,12 +64,12 @@ ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() { // Recursively descend the def-use lists from V to find non-bitcast users of // bitcasts of V. -static void FindUses(Value *V, Function &F, +static void findUses(Value *V, Function &F, SmallVectorImpl> &Uses, SmallPtrSetImpl &ConstantBCs) { for (Use &U : V->uses()) { - if (BitCastOperator *BC = dyn_cast(U.getUser())) - FindUses(BC, F, Uses, ConstantBCs); + if (auto *BC = dyn_cast(U.getUser())) + findUses(BC, F, Uses, ConstantBCs); else if (U.get()->getType() != F.getType()) { CallSite CS(U.getUser()); if (!CS) @@ -87,8 +81,8 @@ static void FindUses(Value *V, Function &F, continue; if (isa(U.get())) { // Only add constant bitcasts to the list once; they get RAUW'd - auto c = ConstantBCs.insert(cast(U.get())); - if (!c.second) + auto C = ConstantBCs.insert(cast(U.get())); + if (!C.second) continue; } Uses.push_back(std::make_pair(&U, &F)); @@ -119,7 +113,7 @@ static void FindUses(Value *V, Function &F, // For bitcasts that involve struct types we don't know at this stage if they // would be equivalent at the wasm level and so we can't know if we need to // generate a wrapper. 
-static Function *CreateWrapper(Function *F, FunctionType *Ty) { +static Function *createWrapper(Function *F, FunctionType *Ty) { Module *M = F->getParent(); Function *Wrapper = Function::Create(Ty, Function::PrivateLinkage, @@ -157,11 +151,11 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) { BB->getInstList().push_back(PtrCast); Args.push_back(PtrCast); } else if (ArgType->isStructTy() || ParamType->isStructTy()) { - LLVM_DEBUG(dbgs() << "CreateWrapper: struct param type in bitcast: " + LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: " << F->getName() << "\n"); WrapperNeeded = false; } else { - LLVM_DEBUG(dbgs() << "CreateWrapper: arg type mismatch calling: " + LLVM_DEBUG(dbgs() << "createWrapper: arg type mismatch calling: " << F->getName() << "\n"); LLVM_DEBUG(dbgs() << "Arg[" << Args.size() << "] Expected: " << *ParamType << " Got: " << *ArgType << "\n"); @@ -197,11 +191,11 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) { BB->getInstList().push_back(Cast); ReturnInst::Create(M->getContext(), Cast, BB); } else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) { - LLVM_DEBUG(dbgs() << "CreateWrapper: struct return type in bitcast: " + LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: " << F->getName() << "\n"); WrapperNeeded = false; } else { - LLVM_DEBUG(dbgs() << "CreateWrapper: return type mismatch calling: " + LLVM_DEBUG(dbgs() << "createWrapper: return type mismatch calling: " << F->getName() << "\n"); LLVM_DEBUG(dbgs() << "Expected: " << *ExpectedRtnType << " Got: " << *RtnType << "\n"); @@ -218,15 +212,26 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) { new UnreachableInst(M->getContext(), BB); Wrapper->setName(F->getName() + "_bitcast_invalid"); } else if (!WrapperNeeded) { - LLVM_DEBUG(dbgs() << "CreateWrapper: no wrapper needed: " << F->getName() + LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName() << "\n"); Wrapper->eraseFromParent(); return nullptr; } - LLVM_DEBUG(dbgs() << "CreateWrapper: " << F->getName() << "\n"); + LLVM_DEBUG(dbgs() << "createWrapper: " << F->getName() << "\n"); return Wrapper; } +// Test whether a main function with type FuncTy should be rewritten to have +// type MainTy. +static bool shouldFixMainFunction(FunctionType *FuncTy, FunctionType *MainTy) { + // Only fix the main function if it's the standard zero-arg form. That way, + // the standard cases will work as expected, and users will see signature + // mismatches from the linker for non-standard cases. + return FuncTy->getReturnType() == MainTy->getReturnType() && + FuncTy->getNumParams() == 0 && + !FuncTy->isVarArg(); +} + bool FixFunctionBitcasts::runOnModule(Module &M) { LLVM_DEBUG(dbgs() << "********** Fix Function Bitcasts **********\n"); @@ -237,27 +242,27 @@ bool FixFunctionBitcasts::runOnModule(Module &M) { // Collect all the places that need wrappers. for (Function &F : M) { - FindUses(&F, F, Uses, ConstantBCs); + findUses(&F, F, Uses, ConstantBCs); // If we have a "main" function, and its type isn't // "int main(int argc, char *argv[])", create an artificial call with it // bitcasted to that type so that we generate a wrapper for it, so that // the C runtime can call it. 
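The `main` handling that follows rewrites a non-standard `main` so that the C runtime can still call it with the conventional signature. At the source level the effect is roughly the following (an illustrative sketch only; the pass performs this on IR, renaming the original function and synthesizing the wrapper rather than emitting source):

    // What the user wrote; the pass renames it to __original_main:
    int __original_main(void) { return 0; }

    // The synthesized wrapper takes over the name "main" with the standard
    // signature expected by the C runtime; the extra arguments are unused.
    int main(int argc, char **argv) { return __original_main(); }

Per shouldFixMainFunction above, this is only done for the standard zero-argument form; other mismatched signatures are left for the linker to diagnose.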
- if (!TemporaryWorkarounds && !F.isDeclaration() && F.getName() == "main") { + if (F.getName() == "main") { Main = &F; LLVMContext &C = M.getContext(); Type *MainArgTys[] = {Type::getInt32Ty(C), PointerType::get(Type::getInt8PtrTy(C), 0)}; FunctionType *MainTy = FunctionType::get(Type::getInt32Ty(C), MainArgTys, /*isVarArg=*/false); - if (F.getFunctionType() != MainTy) { + if (shouldFixMainFunction(F.getFunctionType(), MainTy)) { LLVM_DEBUG(dbgs() << "Found `main` function with incorrect type: " << *F.getFunctionType() << "\n"); Value *Args[] = {UndefValue::get(MainArgTys[0]), UndefValue::get(MainArgTys[1])}; Value *Casted = ConstantExpr::getBitCast(Main, PointerType::get(MainTy, 0)); - CallMain = CallInst::Create(Casted, Args, "call_main"); + CallMain = CallInst::Create(MainTy, Casted, Args, "call_main"); Use *UseMain = &CallMain->getOperandUse(2); Uses.push_back(std::make_pair(UseMain, &F)); } @@ -269,8 +274,8 @@ bool FixFunctionBitcasts::runOnModule(Module &M) { for (auto &UseFunc : Uses) { Use *U = UseFunc.first; Function *F = UseFunc.second; - PointerType *PTy = cast(U->get()->getType()); - FunctionType *Ty = dyn_cast(PTy->getElementType()); + auto *PTy = cast(U->get()->getType()); + auto *Ty = dyn_cast(PTy->getElementType()); // If the function is casted to something like i8* as a "generic pointer" // to be later casted to something else, we can't generate a wrapper for it. @@ -280,7 +285,7 @@ bool FixFunctionBitcasts::runOnModule(Module &M) { auto Pair = Wrappers.insert(std::make_pair(std::make_pair(F, Ty), nullptr)); if (Pair.second) - Pair.first->second = CreateWrapper(F, Ty); + Pair.first->second = createWrapper(F, Ty); Function *Wrapper = Pair.first->second; if (!Wrapper) @@ -296,14 +301,20 @@ bool FixFunctionBitcasts::runOnModule(Module &M) { // one that gets called from startup. if (CallMain) { Main->setName("__original_main"); - Function *MainWrapper = + auto *MainWrapper = cast(CallMain->getCalledValue()->stripPointerCasts()); - MainWrapper->setName("main"); - MainWrapper->setLinkage(Main->getLinkage()); - MainWrapper->setVisibility(Main->getVisibility()); - Main->setLinkage(Function::PrivateLinkage); - Main->setVisibility(Function::DefaultVisibility); delete CallMain; + if (Main->isDeclaration()) { + // The wrapper is not needed in this case as we don't need to export + // it to anyone else. + MainWrapper->eraseFromParent(); + } else { + // Otherwise give the wrapper the same linkage as the original main + // function, so that it can be called from the same places. + MainWrapper->setName("main"); + MainWrapper->setLinkage(Main->getLinkage()); + MainWrapper->setVisibility(Main->getVisibility()); + } } return true; diff --git a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp index 108f2879a071..7d8e86d9b2c0 100644 --- a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp @@ -1,46 +1,48 @@ //=- WebAssemblyFixIrreducibleControlFlow.cpp - Fix irreducible control flow -// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file -/// This file implements a pass that transforms irreducible control flow into -/// reducible control flow. Irreducible control flow means multiple-entry -/// loops; they appear as CFG cycles that are not recorded in MachineLoopInfo -/// due to being unnatural. +/// This file implements a pass that removes irreducible control flow. +/// Irreducible control flow means multiple-entry loops, which this pass +/// transforms to have a single entry. /// /// Note that LLVM has a generic pass that lowers irreducible control flow, but /// it linearizes control flow, turning diamonds into two triangles, which is /// both unnecessary and undesirable for WebAssembly. /// -/// The big picture: Ignoring natural loops (seeing them monolithically), we -/// find all the blocks which can return to themselves ("loopers"). Loopers -/// reachable from the non-loopers are loop entries: if there are 2 or more, -/// then we have irreducible control flow. We fix that as follows: a new block -/// is created that can dispatch to each of the loop entries, based on the -/// value of a label "helper" variable, and we replace direct branches to the -/// entries with assignments to the label variable and a branch to the dispatch -/// block. Then the dispatch block is the single entry in a new natural loop. +/// The big picture: We recursively process each "region", defined as a group +/// of blocks with a single entry and no branches back to that entry. A region +/// may be the entire function body, or the inner part of a loop, i.e., the +/// loop's body without branches back to the loop entry. In each region we fix +/// up multi-entry loops by adding a new block that can dispatch to each of the +/// loop entries, based on the value of a label "helper" variable, and we +/// replace direct branches to the entries with assignments to the label +/// variable and a branch to the dispatch block. Then the dispatch block is the +/// single entry in the loop containing the previous multiple entries. After +/// ensuring all the loops in a region are reducible, we recurse into them. The +/// total time complexity of this pass is: +/// +/// O(NumBlocks * NumNestedLoops * NumIrreducibleLoops + +/// NumLoops * NumLoops) /// -/// This is similar to what the Relooper [1] does, both identify looping code -/// that requires multiple entries, and resolve it in a similar way. In -/// Relooper terminology, we implement a Multiple shape in a Loop shape. Note +/// This pass is similar to what the Relooper [1] does. Both identify looping +/// code that requires multiple entries, and resolve it in a similar way (in +/// Relooper terminology, we implement a Multiple shape in a Loop shape). Note /// also that like the Relooper, we implement a "minimal" intervention: we only /// use the "label" helper for the blocks we absolutely must and no others. We -/// also prioritize code size and do not perform node splitting (i.e. we don't -/// duplicate code in order to resolve irreducibility). +/// also prioritize code size and do not duplicate code in order to resolve +/// irreducibility. The graph algorithms for finding loops and entries and so +/// forth are also similar to the Relooper. 
The main differences between this +/// pass and the Relooper are: /// -/// The difference between this code and the Relooper is that the Relooper also -/// generates ifs and loops and works in a recursive manner, knowing at each -/// point what the entries are, and recursively breaks down the problem. Here -/// we just want to resolve irreducible control flow, and we also want to use -/// as much LLVM infrastructure as possible. So we use the MachineLoopInfo to -/// identify natural loops, etc., and we start with the whole CFG and must -/// identify both the looping code and its entries. +/// * We just care about irreducibility, so we just look at loops. +/// * The Relooper emits structured control flow (with ifs etc.), while we +/// emit a CFG. /// /// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In /// Proceedings of the ACM international conference companion on Object oriented @@ -52,200 +54,277 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" -#include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblySubtarget.h" -#include "llvm/ADT/PriorityQueue.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "wasm-fix-irreducible-control-flow" namespace { -class LoopFixer { +using BlockVector = SmallVector; +using BlockSet = SmallPtrSet; + +// Calculates reachability in a region. Ignores branches to blocks outside of +// the region, and ignores branches to the region entry (for the case where +// the region is the inner part of a loop). +class ReachabilityGraph { public: - LoopFixer(MachineFunction &MF, MachineLoopInfo &MLI, MachineLoop *Loop) - : MF(MF), MLI(MLI), Loop(Loop) {} + ReachabilityGraph(MachineBasicBlock *Entry, const BlockSet &Blocks) + : Entry(Entry), Blocks(Blocks) { +#ifndef NDEBUG + // The region must have a single entry. + for (auto *MBB : Blocks) { + if (MBB != Entry) { + for (auto *Pred : MBB->predecessors()) { + assert(inRegion(Pred)); + } + } + } +#endif + calculate(); + } + + bool canReach(MachineBasicBlock *From, MachineBasicBlock *To) const { + assert(inRegion(From) && inRegion(To)); + auto I = Reachable.find(From); + if (I == Reachable.end()) + return false; + return I->second.count(To); + } + + // "Loopers" are blocks that are in a loop. We detect these by finding blocks + // that can reach themselves. + const BlockSet &getLoopers() const { return Loopers; } + + // Get all blocks that are loop entries. + const BlockSet &getLoopEntries() const { return LoopEntries; } - // Run the fixer on the given inputs. Returns whether changes were made. - bool run(); + // Get all blocks that enter a particular loop from outside. 
+ const BlockSet &getLoopEnterers(MachineBasicBlock *LoopEntry) const {
+   assert(inRegion(LoopEntry));
+   auto I = LoopEnterers.find(LoopEntry);
+   assert(I != LoopEnterers.end());
+   return I->second;
+ }

private:
- MachineFunction &MF;
- MachineLoopInfo &MLI;
- MachineLoop *Loop;
+ MachineBasicBlock *Entry;
+ const BlockSet &Blocks;
+
+ BlockSet Loopers, LoopEntries;
+ DenseMap<MachineBasicBlock *, BlockSet> LoopEnterers;

- MachineBasicBlock *Header;
- SmallPtrSet<MachineBasicBlock *, 4> LoopBlocks;
+ bool inRegion(MachineBasicBlock *MBB) const { return Blocks.count(MBB); }

- using BlockSet = SmallPtrSet<MachineBasicBlock *, 4>;
+ // Maps a block to all the other blocks it can reach.
  DenseMap<MachineBasicBlock *, BlockSet> Reachable;

- // The worklist contains pairs of recent additions, (a, b), where we just
- // added a link a => b.
- using BlockPair = std::pair<MachineBasicBlock *, MachineBasicBlock *>;
- SmallVector<BlockPair, 4> WorkList;
-
- // Get a canonical block to represent a block or a loop: the block, or if in
- // an inner loop, the loop header, or if in an outer loop scope, we can
- // ignore it. We need to call this on all blocks we work on.
- MachineBasicBlock *canonicalize(MachineBasicBlock *MBB) {
-   MachineLoop *InnerLoop = MLI.getLoopFor(MBB);
-   if (InnerLoop == Loop) {
-     return MBB;
-   } else {
-     // This is either in an outer or an inner loop, and not in ours.
-     if (!LoopBlocks.count(MBB)) {
-       // It's in outer code, ignore it.
-       return nullptr;
+ void calculate() {
+   // Reachability computation work list. Contains pairs of recent additions
+   // (A, B) where we just added a link A => B.
+   using BlockPair = std::pair<MachineBasicBlock *, MachineBasicBlock *>;
+   SmallVector<BlockPair, 4> WorkList;
+
+   // Add all relevant direct branches.
+   for (auto *MBB : Blocks) {
+     for (auto *Succ : MBB->successors()) {
+       if (Succ != Entry && inRegion(Succ)) {
+         Reachable[MBB].insert(Succ);
+         WorkList.emplace_back(MBB, Succ);
+       }
      }
-     assert(InnerLoop);
-     // It's in an inner loop, canonicalize it to the header of that loop.
-     return InnerLoop->getHeader();
    }
- }

- // For a successor we can additionally ignore it if it's a branch back to a
- // natural loop top, as when we are in the scope of a loop, we just care
- // about internal irreducibility, and can ignore the loop we are in. We need
- // to call this on all blocks in a context where they are a successor.
- MachineBasicBlock *canonicalizeSuccessor(MachineBasicBlock *MBB) {
-   if (Loop && MBB == Loop->getHeader()) {
-     // Ignore branches going to the loop's natural header.
-     return nullptr;
+   while (!WorkList.empty()) {
+     MachineBasicBlock *MBB, *Succ;
+     std::tie(MBB, Succ) = WorkList.pop_back_val();
+     assert(inRegion(MBB) && Succ != Entry && inRegion(Succ));
+     if (MBB != Entry) {
+       // We recently added MBB => Succ, and that means we may have enabled
+       // Pred => MBB => Succ.
+       for (auto *Pred : MBB->predecessors()) {
+         if (Reachable[Pred].insert(Succ).second) {
+           WorkList.emplace_back(Pred, Succ);
+         }
+       }
+     }
    }
-   return canonicalize(MBB);
- }

- // Potentially insert a new reachable edge, and if so, note it as further
- // work.
- void maybeInsert(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
-   assert(MBB == canonicalize(MBB));
-   assert(Succ);
-   // Succ may not be interesting as a successor.
-   Succ = canonicalizeSuccessor(Succ);
-   if (!Succ)
-     return;
-   if (Reachable[MBB].insert(Succ).second) {
-     // For there to be further work, it means that we have
-     //   X => MBB => Succ
-     // for some other X, and in that case X => Succ would be a new edge for
-     // us to discover later. However, if we don't care about MBB as a
-     // successor, then we don't care about that anyhow.
- if (canonicalizeSuccessor(MBB)) { - WorkList.emplace_back(MBB, Succ); + // Blocks that can return to themselves are in a loop. + for (auto *MBB : Blocks) { + if (canReach(MBB, MBB)) { + Loopers.insert(MBB); + } + } + assert(!Loopers.count(Entry)); + + // Find the loop entries - loopers reachable from blocks not in that loop - + // and those outside blocks that reach them, the "loop enterers". + for (auto *Looper : Loopers) { + for (auto *Pred : Looper->predecessors()) { + // Pred can reach Looper. If Looper can reach Pred, it is in the loop; + // otherwise, it is a block that enters into the loop. + if (!canReach(Looper, Pred)) { + LoopEntries.insert(Looper); + LoopEnterers[Looper].insert(Pred); + } } } } }; -bool LoopFixer::run() { - Header = Loop ? Loop->getHeader() : &*MF.begin(); - - // Identify all the blocks in this loop scope. - if (Loop) { - for (auto *MBB : Loop->getBlocks()) { - LoopBlocks.insert(MBB); - } - } else { - for (auto &MBB : MF) { - LoopBlocks.insert(&MBB); - } +// Finds the blocks in a single-entry loop, given the loop entry and the +// list of blocks that enter the loop. +class LoopBlocks { +public: + LoopBlocks(MachineBasicBlock *Entry, const BlockSet &Enterers) + : Entry(Entry), Enterers(Enterers) { + calculate(); } - // Compute which (canonicalized) blocks each block can reach. - - // Add all the initial work. - for (auto *MBB : LoopBlocks) { - MachineLoop *InnerLoop = MLI.getLoopFor(MBB); + BlockSet &getBlocks() { return Blocks; } - if (InnerLoop == Loop) { - for (auto *Succ : MBB->successors()) { - maybeInsert(MBB, Succ); - } - } else { - // It can't be in an outer loop - we loop on LoopBlocks - and so it must - // be an inner loop. - assert(InnerLoop); - // Check if we are the canonical block for this loop. - if (canonicalize(MBB) != MBB) { - continue; - } - // The successors are those of the loop. - SmallVector ExitBlocks; - InnerLoop->getExitBlocks(ExitBlocks); - for (auto *Succ : ExitBlocks) { - maybeInsert(MBB, Succ); +private: + MachineBasicBlock *Entry; + const BlockSet &Enterers; + + BlockSet Blocks; + + void calculate() { + // Going backwards from the loop entry, if we ignore the blocks entering + // from outside, we will traverse all the blocks in the loop. + BlockVector WorkList; + BlockSet AddedToWorkList; + Blocks.insert(Entry); + for (auto *Pred : Entry->predecessors()) { + if (!Enterers.count(Pred)) { + WorkList.push_back(Pred); + AddedToWorkList.insert(Pred); } } - } - // Do work until we are all done. - while (!WorkList.empty()) { - MachineBasicBlock *MBB; - MachineBasicBlock *Succ; - std::tie(MBB, Succ) = WorkList.pop_back_val(); - // The worklist item is an edge we just added, so it must have valid blocks - // (and not something canonicalized to nullptr). - assert(MBB); - assert(Succ); - // The successor in that pair must also be a valid successor. - assert(MBB == canonicalizeSuccessor(MBB)); - // We recently added MBB => Succ, and that means we may have enabled - // Pred => MBB => Succ. Check all the predecessors. Note that our loop here - // is correct for both a block and a block representing a loop, as the loop - // is natural and so the predecessors are all predecessors of the loop - // header, which is the block we have here. - for (auto *Pred : MBB->predecessors()) { - // Canonicalize, make sure it's relevant, and check it's not the same - // block (an update to the block itself doesn't help compute that same - // block). 
- Pred = canonicalize(Pred);
- if (Pred && Pred != MBB) {
-   maybeInsert(Pred, Succ);
+   while (!WorkList.empty()) {
+     auto *MBB = WorkList.pop_back_val();
+     assert(!Enterers.count(MBB));
+     if (Blocks.insert(MBB).second) {
+       for (auto *Pred : MBB->predecessors()) {
+         if (!AddedToWorkList.count(Pred)) {
+           WorkList.push_back(Pred);
+           AddedToWorkList.insert(Pred);
+         }
+       }
+     }
    }
  }
+};

- // It's now trivial to identify the loopers.
- SmallPtrSet<MachineBasicBlock *, 4> Loopers;
- for (auto MBB : LoopBlocks) {
-   if (Reachable[MBB].count(MBB)) {
-     Loopers.insert(MBB);
-   }
+class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass {
+ StringRef getPassName() const override {
+   return "WebAssembly Fix Irreducible Control Flow";
  }
- // The header cannot be a looper. At the toplevel, LLVM does not allow the
- // entry to be in a loop, and in a natural loop we should ignore the header.
- assert(Loopers.count(Header) == 0);
-
- // Find the entries, loopers reachable from non-loopers.
- SmallPtrSet<MachineBasicBlock *, 4> Entries;
- SmallVector<MachineBasicBlock *, 4> SortedEntries;
- for (auto *Looper : Loopers) {
-   for (auto *Pred : Looper->predecessors()) {
-     Pred = canonicalize(Pred);
-     if (Pred && !Loopers.count(Pred)) {
-       Entries.insert(Looper);
-       SortedEntries.push_back(Looper);
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool processRegion(MachineBasicBlock *Entry, BlockSet &Blocks,
+                    MachineFunction &MF);
+
+ void makeSingleEntryLoop(BlockSet &Entries, BlockSet &Blocks,
+                          MachineFunction &MF, const ReachabilityGraph &Graph);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {}
+};
+
+bool WebAssemblyFixIrreducibleControlFlow::processRegion(
+   MachineBasicBlock *Entry, BlockSet &Blocks, MachineFunction &MF) {
+ bool Changed = false;
+
+ // Remove irreducibility before processing child loops, which may take
+ // multiple iterations.
+ while (true) {
+   ReachabilityGraph Graph(Entry, Blocks);
+
+   bool FoundIrreducibility = false;
+
+   for (auto *LoopEntry : Graph.getLoopEntries()) {
+     // Find mutual entries - all entries which can reach this one, and
+     // are reached by it (that always includes LoopEntry itself). All mutual
+     // entries must be in the same loop, so if we have more than one, then we
+     // have irreducible control flow.
+     //
+     // Note that irreducibility may involve inner loops, e.g. imagine A
+     // starts one loop, and it has B inside it which starts an inner loop.
+     // If we add a branch from all the way on the outside to B, then in a
+     // sense B is no longer an "inner" loop, semantically speaking. We will
+     // fix that irreducibility by adding a block that dispatches to
+     // either A or B, so B will no longer be an inner loop in our output.
+     // (A fancier approach might try to keep it as such.)
+     //
+     // Note that we still need to recurse into inner loops later, to handle
+     // the case where the irreducibility is entirely nested - we would not
+     // be able to identify that at this point, since the enclosing loop is
+     // a group of blocks all of which can reach each other. (We'll see the
+     // irreducibility after removing branches to the top of that enclosing
+     // loop.)
+ BlockSet MutualLoopEntries; + MutualLoopEntries.insert(LoopEntry); + for (auto *OtherLoopEntry : Graph.getLoopEntries()) { + if (OtherLoopEntry != LoopEntry && + Graph.canReach(LoopEntry, OtherLoopEntry) && + Graph.canReach(OtherLoopEntry, LoopEntry)) { + MutualLoopEntries.insert(OtherLoopEntry); + } + } + + if (MutualLoopEntries.size() > 1) { + makeSingleEntryLoop(MutualLoopEntries, Blocks, MF, Graph); + FoundIrreducibility = true; + Changed = true; break; } } + // Only go on to actually process the inner loops when we are done + // removing irreducible control flow and changing the graph. Modifying + // the graph as we go is possible, and that might let us avoid looking at + // the already-fixed loops again if we are careful, but all that is + // complex and bug-prone. Since irreducible loops are rare, just starting + // another iteration is best. + if (FoundIrreducibility) { + continue; + } + + for (auto *LoopEntry : Graph.getLoopEntries()) { + LoopBlocks InnerBlocks(LoopEntry, Graph.getLoopEnterers(LoopEntry)); + // Each of these calls to processRegion may change the graph, but are + // guaranteed not to interfere with each other. The only changes we make + // to the graph are to add blocks on the way to a loop entry. As the + // loops are disjoint, that means we may only alter branches that exit + // another loop, which are ignored when recursing into that other loop + // anyhow. + if (processRegion(LoopEntry, InnerBlocks.getBlocks(), MF)) { + Changed = true; + } + } + + return Changed; } +} - // Check if we found irreducible control flow. - if (LLVM_LIKELY(Entries.size() <= 1)) - return false; +// Given a set of entries to a single loop, create a single entry for that +// loop by creating a dispatch block for them, routing control flow using +// a helper variable. Also updates Blocks with any new blocks created, so +// that we properly track all the blocks in the region. But this does not update +// ReachabilityGraph; this will be updated in the caller of this function as +// needed. +void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop( + BlockSet &Entries, BlockSet &Blocks, MachineFunction &MF, + const ReachabilityGraph &Graph) { + assert(Entries.size() >= 2); // Sort the entries to ensure a deterministic build. + BlockVector SortedEntries(Entries.begin(), Entries.end()); llvm::sort(SortedEntries, [&](const MachineBasicBlock *A, const MachineBasicBlock *B) { auto ANum = A->getNumber(); @@ -257,8 +336,8 @@ bool LoopFixer::run() { for (auto Block : SortedEntries) assert(Block->getNumber() != -1); if (SortedEntries.size() > 1) { - for (auto I = SortedEntries.begin(), E = SortedEntries.end() - 1; - I != E; ++I) { + for (auto I = SortedEntries.begin(), E = SortedEntries.end() - 1; I != E; + ++I) { auto ANum = (*I)->getNumber(); auto BNum = (*(std::next(I)))->getNumber(); assert(ANum != BNum); @@ -269,12 +348,12 @@ bool LoopFixer::run() { // Create a dispatch block which will contain a jump table to the entries. MachineBasicBlock *Dispatch = MF.CreateMachineBasicBlock(); MF.insert(MF.end(), Dispatch); - MLI.changeLoopFor(Dispatch, Loop); + Blocks.insert(Dispatch); // Add the jump table. const auto &TII = *MF.getSubtarget().getInstrInfo(); - MachineInstrBuilder MIB = BuildMI(*Dispatch, Dispatch->end(), DebugLoc(), - TII.get(WebAssembly::BR_TABLE_I32)); + MachineInstrBuilder MIB = + BuildMI(Dispatch, DebugLoc(), TII.get(WebAssembly::BR_TABLE_I32)); // Add the register which will be used to tell the jump table which block to // jump to. 
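To make the reachability-and-entries algorithm above concrete, here is a minimal standalone sketch of the same worklist fixed point on a toy CFG. It is an illustration only, not part of the patch: the Block and Graph aliases, the block names, and main() are invented for the example, and the region-entry special cases of ReachabilityGraph are omitted.

```cpp
// Sketch of the reachability fixed point and loop-entry detection.
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

using Block = std::string;
using Graph = std::map<Block, std::vector<Block>>;

int main() {
  // A two-entry cycle: E branches to both A and B, which branch to each
  // other, so {A, B} are mutual loop entries (irreducible control flow).
  Graph G = {{"E", {"A", "B"}}, {"A", {"B"}}, {"B", {"A"}}};

  std::map<Block, std::vector<Block>> Preds;
  std::map<Block, std::set<Block>> Reachable;
  std::vector<std::pair<Block, Block>> WorkList;

  // Seed with direct edges; the worklist holds recently added links.
  for (auto &[MBB, Succs] : G)
    for (auto &Succ : Succs) {
      Preds[Succ].push_back(MBB);
      if (Reachable[MBB].insert(Succ).second)
        WorkList.emplace_back(MBB, Succ);
    }

  // Propagate: adding MBB => Succ may enable Pred => MBB => Succ, the same
  // step ReachabilityGraph::calculate performs.
  while (!WorkList.empty()) {
    auto [MBB, Succ] = WorkList.back();
    WorkList.pop_back();
    for (auto &Pred : Preds[MBB])
      if (Reachable[Pred].insert(Succ).second)
        WorkList.emplace_back(Pred, Succ);
  }

  // A block that reaches itself is a looper; a looper with a predecessor it
  // cannot reach back is a loop entry, and that predecessor a loop enterer.
  for (auto &[MBB, Succs] : G) {
    (void)Succs;
    if (!Reachable[MBB].count(MBB))
      continue;
    bool IsEntry = false;
    for (auto &Pred : Preds[MBB])
      if (!Reachable[MBB].count(Pred))
        IsEntry = true;
    std::printf("%s: looper%s\n", MBB.c_str(), IsEntry ? ", loop entry" : "");
  }
}
```

Running this reports both A and B as loopers and loop entries; two such mutual entries are exactly the shape makeSingleEntryLoop collapses behind a single dispatch block.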
@@ -285,112 +364,110 @@ bool LoopFixer::run() {
  // Compute the indices in the superheader, one for each bad block, and
  // add them as successors.
  DenseMap<MachineBasicBlock *, unsigned> Indices;
- for (auto *MBB : SortedEntries) {
-   auto Pair = Indices.insert(std::make_pair(MBB, 0));
-   if (!Pair.second) {
-     continue;
-   }
+ for (auto *Entry : SortedEntries) {
+   auto Pair = Indices.insert(std::make_pair(Entry, 0));
+   assert(Pair.second);
    unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
    Pair.first->second = Index;
-   MIB.addMBB(MBB);
-   Dispatch->addSuccessor(MBB);
+   MIB.addMBB(Entry);
+   Dispatch->addSuccessor(Entry);
  }

- // Rewrite the problematic successors for every block that wants to reach the
- // bad blocks. For simplicity, we just introduce a new block for every edge
- // we need to rewrite. (Fancier things are possible.)
+ // Rewrite the problematic successors for every block that wants to reach
+ // the bad blocks. For simplicity, we just introduce a new block for every
+ // edge we need to rewrite. (Fancier things are possible.)

- SmallVector<MachineBasicBlock *, 4> AllPreds;
- for (auto *MBB : SortedEntries) {
-   for (auto *Pred : MBB->predecessors()) {
+ BlockVector AllPreds;
+ for (auto *Entry : SortedEntries) {
+   for (auto *Pred : Entry->predecessors()) {
      if (Pred != Dispatch) {
        AllPreds.push_back(Pred);
      }
    }
  }

- for (MachineBasicBlock *MBB : AllPreds) {
-   DenseMap<MachineBasicBlock *, MachineBasicBlock *> Map;
-   for (auto *Succ : MBB->successors()) {
-     if (!Entries.count(Succ)) {
+ // This set stores predecessors within this loop.
+ DenseSet<MachineBasicBlock *> InLoop;
+ for (auto *Pred : AllPreds) {
+   for (auto *Entry : Pred->successors()) {
+     if (!Entries.count(Entry))
        continue;
+     if (Graph.canReach(Entry, Pred)) {
+       InLoop.insert(Pred);
+       break;
      }
+   }
+ }
+
+ // Record if each entry has a layout predecessor. This map stores
+ // <<Predecessor is within the loop?, loop entry>, layout predecessor>
+ std::map<std::pair<bool, MachineBasicBlock *>, MachineBasicBlock *>
+     EntryToLayoutPred;
+ for (auto *Pred : AllPreds)
+   for (auto *Entry : Pred->successors())
+     if (Entries.count(Entry) && Pred->isLayoutSuccessor(Entry))
+       EntryToLayoutPred[std::make_pair(InLoop.count(Pred), Entry)] = Pred;
+
+ // We need to create at most two routing blocks per entry: one for
+ // predecessors outside the loop and one for predecessors inside the loop.
+ // This map stores
+ // <<Predecessor is within the loop?, loop entry>, routing block>
+ std::map<std::pair<bool, MachineBasicBlock *>, MachineBasicBlock *> Map;
+ for (auto *Pred : AllPreds) {
+   bool PredInLoop = InLoop.count(Pred);
+   for (auto *Entry : Pred->successors()) {
+     if (!Entries.count(Entry) ||
+         Map.count(std::make_pair(InLoop.count(Pred), Entry)))
+       continue;
+     // If this entry has a layout predecessor and that predecessor is not
+     // Pred, we instead create the routing block after that layout
+     // predecessor, to save a branch.
+     if (EntryToLayoutPred.count(std::make_pair(PredInLoop, Entry)) &&
+         EntryToLayoutPred[std::make_pair(PredInLoop, Entry)] != Pred)
+       continue;
      // This is a successor we need to rewrite.
-     MachineBasicBlock *Split = MF.CreateMachineBasicBlock();
-     MF.insert(MBB->isLayoutSuccessor(Succ) ? MachineFunction::iterator(Succ)
-                                            : MF.end(),
-               Split);
-     MLI.changeLoopFor(Split, Loop);
+     MachineBasicBlock *Routing = MF.CreateMachineBasicBlock();
+     MF.insert(Pred->isLayoutSuccessor(Entry)
+                   ? MachineFunction::iterator(Entry)
+                   : MF.end(),
+               Routing);
+     Blocks.insert(Routing);
      // Set the jump table's register to the index of the block we wish to
      // jump to, and jump to the jump table.
- BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::CONST_I32), - Reg) - .addImm(Indices[Succ]); - BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::BR)) - .addMBB(Dispatch); - Split->addSuccessor(Dispatch); - Map[Succ] = Split; + BuildMI(Routing, DebugLoc(), TII.get(WebAssembly::CONST_I32), Reg) + .addImm(Indices[Entry]); + BuildMI(Routing, DebugLoc(), TII.get(WebAssembly::BR)).addMBB(Dispatch); + Routing->addSuccessor(Dispatch); + Map[std::make_pair(PredInLoop, Entry)] = Routing; } + } + + for (auto *Pred : AllPreds) { + bool PredInLoop = InLoop.count(Pred); // Remap the terminator operands and the successor list. - for (MachineInstr &Term : MBB->terminators()) + for (MachineInstr &Term : Pred->terminators()) for (auto &Op : Term.explicit_uses()) if (Op.isMBB() && Indices.count(Op.getMBB())) - Op.setMBB(Map[Op.getMBB()]); - for (auto Rewrite : Map) - MBB->replaceSuccessor(Rewrite.first, Rewrite.second); + Op.setMBB(Map[std::make_pair(PredInLoop, Op.getMBB())]); + + for (auto *Succ : Pred->successors()) { + if (!Entries.count(Succ)) + continue; + auto *Routing = Map[std::make_pair(PredInLoop, Succ)]; + Pred->replaceSuccessor(Succ, Routing); + } } // Create a fake default label, because br_table requires one. MIB.addMBB(MIB.getInstr() ->getOperand(MIB.getInstr()->getNumExplicitOperands() - 1) .getMBB()); - - return true; } -class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass { - StringRef getPassName() const override { - return "WebAssembly Fix Irreducible Control Flow"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - bool runIteration(MachineFunction &MF, MachineLoopInfo &MLI) { - // Visit the function body, which is identified as a null loop. - if (LoopFixer(MF, MLI, nullptr).run()) { - return true; - } - - // Visit all the loops. - SmallVector Worklist(MLI.begin(), MLI.end()); - while (!Worklist.empty()) { - MachineLoop *Loop = Worklist.pop_back_val(); - Worklist.append(Loop->begin(), Loop->end()); - if (LoopFixer(MF, MLI, Loop).run()) { - return true; - } - } - - return false; - } - -public: - static char ID; // Pass identification, replacement for typeid - WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {} -}; } // end anonymous namespace char WebAssemblyFixIrreducibleControlFlow::ID = 0; @@ -407,23 +484,18 @@ bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction( "********** Function: " << MF.getName() << '\n'); - bool Changed = false; - auto &MLI = getAnalysis(); - - // When we modify something, bail out and recompute MLI, then start again, as - // we create a new natural loop when we resolve irreducible control flow, and - // other loops may become nested in it, etc. In practice this is not an issue - // because irreducible control flow is rare, only very few cycles are needed - // here. - while (LLVM_UNLIKELY(runIteration(MF, MLI))) { - // We rewrote part of the function; recompute MLI and start again. - LLVM_DEBUG(dbgs() << "Recomputing loops.\n"); + // Start the recursive process on the entire function body. + BlockSet AllBlocks; + for (auto &MBB : MF) { + AllBlocks.insert(&MBB); + } + + if (LLVM_UNLIKELY(processRegion(&*MF.begin(), AllBlocks, MF))) { + // We rewrote part of the function; recompute relevant things. 
MF.getRegInfo().invalidateLiveness(); MF.RenumberBlocks(); - getAnalysis().runOnMachineFunction(MF); - MLI.runOnMachineFunction(MF); - Changed = true; + return true; } - return Changed; + return false; } diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 2d5aff28d27b..5299068efdd4 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyFrameLowering.cpp - WebAssembly Frame Lowering ----------==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -131,7 +130,7 @@ void WebAssemblyFrameLowering::writeSPToGlobal( const char *ES = "__stack_pointer"; auto *SPSymbol = MF.createExternalSymbolName(ES); BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::GLOBAL_SET_I32)) - .addExternalSymbol(SPSymbol, WebAssemblyII::MO_SYMBOL_GLOBAL) + .addExternalSymbol(SPSymbol) .addReg(SrcReg); } @@ -165,7 +164,8 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, auto &MRI = MF.getRegInfo(); auto InsertPt = MBB.begin(); - while (InsertPt != MBB.end() && WebAssembly::isArgument(*InsertPt)) + while (InsertPt != MBB.end() && + WebAssembly::isArgument(InsertPt->getOpcode())) ++InsertPt; DebugLoc DL; @@ -178,7 +178,7 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, const char *ES = "__stack_pointer"; auto *SPSymbol = MF.createExternalSymbolName(ES); BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GLOBAL_GET_I32), SPReg) - .addExternalSymbol(SPSymbol, WebAssemblyII::MO_SYMBOL_GLOBAL); + .addExternalSymbol(SPSymbol); bool HasBP = hasBP(MF); if (HasBP) { diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index c6fa8261b03f..daddd4ca16ff 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -1,9 +1,8 @@ // WebAssemblyFrameLowering.h - TargetFrameLowering for WebAssembly -*- C++ -*-/ // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def index e987d7f7f43a..77217f16a727 100644 --- a/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/lib/Target/WebAssembly/WebAssemblyISD.def @@ -1,9 +1,8 @@ //- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -16,9 +15,14 @@
HANDLE_NODETYPE(CALL1)
HANDLE_NODETYPE(CALL0)
+HANDLE_NODETYPE(RET_CALL)
HANDLE_NODETYPE(RETURN)
HANDLE_NODETYPE(ARGUMENT)
+// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol
HANDLE_NODETYPE(Wrapper)
+// A special wrapper used in PIC code for __memory_base/__table_base relative
+// access.
+HANDLE_NODETYPE(WrapperPIC)
HANDLE_NODETYPE(BR_IF)
HANDLE_NODETYPE(BR_TABLE)
HANDLE_NODETYPE(SHUFFLE)
@@ -26,5 +30,7 @@ HANDLE_NODETYPE(VEC_SHL)
HANDLE_NODETYPE(VEC_SHR_S)
HANDLE_NODETYPE(VEC_SHR_U)
HANDLE_NODETYPE(THROW)
+HANDLE_NODETYPE(MEMORY_COPY)
+HANDLE_NODETYPE(MEMORY_FILL)

// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 0a7464cedc90..26339eaef37d 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//- WebAssemblyISelDAGToDAG.cpp - A dag to dag inst selector for WebAssembly -//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -16,6 +15,7 @@
#include "WebAssembly.h"
#include "WebAssemblyTargetMachine.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
@@ -38,9 +38,9 @@ class WebAssemblyDAGToDAGISel final : public SelectionDAGISel {
  bool ForCodeSize;

public:
- WebAssemblyDAGToDAGISel(WebAssemblyTargetMachine &tm,
+ WebAssemblyDAGToDAGISel(WebAssemblyTargetMachine &TM,
                          CodeGenOpt::Level OptLevel)
-     : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), ForCodeSize(false) {
+     : SelectionDAGISel(TM, OptLevel), Subtarget(nullptr), ForCodeSize(false) {
  }

  StringRef getPassName() const override {
@@ -52,8 +52,7 @@ public:
                     "********** Function: "
                  << MF.getName() << '\n');
-   ForCodeSize = MF.getFunction().hasFnAttribute(Attribute::OptimizeForSize) ||
-                 MF.getFunction().hasFnAttribute(Attribute::MinSize);
+   ForCodeSize = MF.getFunction().hasOptSize();
    Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
    return SelectionDAGISel::runOnMachineFunction(MF);
  }
@@ -79,14 +78,159 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
    return;
  }

- // Few custom selection stuff. If we need WebAssembly-specific selection,
- // uncomment this block and add corresponding case statements.
- /*
+ // A few custom selection cases.
+ SDLoc DL(Node);
+ MachineFunction &MF = CurDAG->getMachineFunction();
  switch (Node->getOpcode()) {
+ case ISD::ATOMIC_FENCE: {
+   if (!MF.getSubtarget<WebAssemblySubtarget>().hasAtomics())
+     break;
+
+   uint64_t SyncScopeID =
+       cast<ConstantSDNode>(Node->getOperand(2).getNode())->getZExtValue();
+   switch (SyncScopeID) {
+   case SyncScope::SingleThread: {
+     // We lower a single-thread fence to a pseudo compiler barrier instruction
+     // preventing instruction reordering. This will not be emitted in final
+     // binary.
+     MachineSDNode *Fence =
+         CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE,
+                                DL,                 // debug loc
+                                MVT::Other,         // outchain type
+                                Node->getOperand(0) // inchain
+         );
+     ReplaceNode(Node, Fence);
+     CurDAG->RemoveDeadNode(Node);
+     return;
+   }
+
+   case SyncScope::System: {
+     // For non-emscripten systems, we have not decided on what we should
+     // translate fences to yet.
+     if (!Subtarget->getTargetTriple().isOSEmscripten())
+       report_fatal_error(
+           "ATOMIC_FENCE is not yet supported in non-emscripten OSes");
+
+     // Wasm does not have a fence instruction, but because all atomic
+     // instructions in wasm are sequentially consistent, we translate a
+     // fence to an idempotent atomic RMW instruction to a linear memory
+     // address. The RMW is redundant for ordering the atomics themselves;
+     // it is there to ensure a fence also prevents reordering of non-atomic
+     // instructions in the VM. Even though LLVM IR's fence instruction does
+     // not say anything about its relationship with non-atomic instructions,
+     // we think this is more user-friendly.
+     //
+     // While any address can work, here we use a value stored in the
+     // __stack_pointer wasm global because there's a high chance that area
+     // is in cache.
+     //
+     // So the selected instructions will be in the form of:
+     //   %addr = get_global $__stack_pointer
+     //   %0 = i32.const 0
+     //   i32.atomic.rmw.or %addr, %0
+     SDValue StackPtrSym = CurDAG->getTargetExternalSymbol(
+         "__stack_pointer", TLI->getPointerTy(CurDAG->getDataLayout()));
+     MachineSDNode *GetGlobal =
+         CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32, // opcode
+                                DL,                          // debug loc
+                                MVT::i32,                    // result type
+                                StackPtrSym // __stack_pointer symbol
+         );
+
+     SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+     auto *MMO = MF.getMachineMemOperand(
+         MachinePointerInfo::getUnknownStack(MF),
+         // FIXME Volatile isn't really correct, but currently all LLVM
+         // atomic instructions are treated as volatiles in the backend, so
+         // we should be consistent.
+         MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
+             MachineMemOperand::MOStore,
+         4, 4, AAMDNodes(), nullptr, SyncScope::System,
+         AtomicOrdering::SequentiallyConsistent);
+     MachineSDNode *Const0 =
+         CurDAG->getMachineNode(WebAssembly::CONST_I32, DL, MVT::i32, Zero);
+     MachineSDNode *AtomicRMW = CurDAG->getMachineNode(
+         WebAssembly::ATOMIC_RMW_OR_I32, // opcode
+         DL,                             // debug loc
+         MVT::i32,                       // result type
+         MVT::Other,                     // outchain type
+         {
+             Zero,                  // alignment
+             Zero,                  // offset
+             SDValue(GetGlobal, 0), // __stack_pointer
+             SDValue(Const0, 0),    // OR with 0 to make it idempotent
+             Node->getOperand(0)    // inchain
+         });
+
+     CurDAG->setNodeMemRefs(AtomicRMW, {MMO});
+     ReplaceUses(SDValue(Node, 0), SDValue(AtomicRMW, 1));
+     CurDAG->RemoveDeadNode(Node);
+     return;
+   }
+   default:
+     llvm_unreachable("Unknown scope!");
+   }
+ }
+
+ case ISD::GlobalTLSAddress: {
+   const auto *GA = cast<GlobalAddressSDNode>(Node);
+
+   if (!MF.getSubtarget<WebAssemblySubtarget>().hasBulkMemory())
+     report_fatal_error("cannot use thread-local storage without bulk memory",
+                        false);
+
+   // Currently Emscripten does not support dynamic linking with threads.
+   // Therefore, if we have thread-local storage, only the local-exec model
+   // is possible.
+   // TODO: remove this and implement proper TLS models once Emscripten
+   // supports dynamic linking with threads.
+ if (GA->getGlobal()->getThreadLocalMode() != + GlobalValue::LocalExecTLSModel && + !Subtarget->getTargetTriple().isOSEmscripten()) { + report_fatal_error("only -ftls-model=local-exec is supported for now on " + "non-Emscripten OSes: variable " + + GA->getGlobal()->getName(), + false); + } + + MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout()); + assert(PtrVT == MVT::i32 && "only wasm32 is supported for now"); + + SDValue TLSBaseSym = CurDAG->getTargetExternalSymbol("__tls_base", PtrVT); + SDValue TLSOffsetSym = CurDAG->getTargetGlobalAddress( + GA->getGlobal(), DL, PtrVT, GA->getOffset(), 0); + + MachineSDNode *TLSBase = CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32, + DL, MVT::i32, TLSBaseSym); + MachineSDNode *TLSOffset = CurDAG->getMachineNode( + WebAssembly::CONST_I32, DL, MVT::i32, TLSOffsetSym); + MachineSDNode *TLSAddress = + CurDAG->getMachineNode(WebAssembly::ADD_I32, DL, MVT::i32, + SDValue(TLSBase, 0), SDValue(TLSOffset, 0)); + ReplaceNode(Node, TLSAddress); + return; + } + + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); + switch (IntNo) { + case Intrinsic::wasm_tls_size: { + MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout()); + assert(PtrVT == MVT::i32 && "only wasm32 is supported for now"); + + MachineSDNode *TLSSize = CurDAG->getMachineNode( + WebAssembly::GLOBAL_GET_I32, DL, PtrVT, + CurDAG->getTargetExternalSymbol("__tls_size", MVT::i32)); + ReplaceNode(Node, TLSSize); + return; + } + } + break; + } + default: break; } - */ // Select the default instruction. SelectCode(Node); diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 003848e34227..4064a983099c 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1,9 +1,8 @@ //=- WebAssemblyISelLowering.cpp - WebAssembly DAG Lowering Implementation -==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -46,9 +45,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setBooleanContents(ZeroOrOneBooleanContent); // Except in SIMD vectors setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - // WebAssembly does not produce floating-point exceptions on normal floating - // point operations. - setHasFloatingPointExceptions(false); // We don't know the microarchitecture here, so just reduce register pressure. setSchedulingPreference(Sched::RegPressure); // Tell ISel that we have a stack pointer. 
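The GlobalTLSAddress selection earlier in this patch boils down to global.get __tls_base; i32.const <variable offset>; i32.add. Below is a small hedged model of that local-exec address computation, an illustration only: tls_base stands in for the __tls_base wasm global the runtime sets per thread, and kTlsOffsetOfX for the linker-assigned offset of one hypothetical TLS variable.

```cpp
// Hedged model of local-exec TLS addressing; not LLVM or runtime code.
#include <cstdint>
#include <cstdio>

static uint32_t tls_base;             // models: global.get __tls_base
constexpr uint32_t kTlsOffsetOfX = 8; // models: i32.const <offset of x>

uint32_t addressOfTlsX() {
  return tls_base + kTlsOffsetOfX;    // models: i32.add
}

int main() {
  tls_base = 0x10000; // e.g. assigned at thread startup
  std::printf("&x on this thread: 0x%x\n", addressOfTlsX());
}
```

Each thread gets a distinct tls_base, so the same link-time constant offset yields a distinct address per thread; that is why only the local-exec model works without dynamic linking.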
@@ -64,10 +60,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( addRegisterClass(MVT::v8i16, &WebAssembly::V128RegClass); addRegisterClass(MVT::v4i32, &WebAssembly::V128RegClass); addRegisterClass(MVT::v4f32, &WebAssembly::V128RegClass); - if (Subtarget->hasUnimplementedSIMD128()) { - addRegisterClass(MVT::v2i64, &WebAssembly::V128RegClass); - addRegisterClass(MVT::v2f64, &WebAssembly::V128RegClass); - } + } + if (Subtarget->hasUnimplementedSIMD128()) { + addRegisterClass(MVT::v2i64, &WebAssembly::V128RegClass); + addRegisterClass(MVT::v2f64, &WebAssembly::V128RegClass); } // Compute derived properties from the register classes. computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -111,56 +107,62 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setTruncStoreAction(T, MVT::f16, Expand); } - // Support saturating add for i8x16 and i16x8 - if (Subtarget->hasSIMD128()) - for (auto T : {MVT::v16i8, MVT::v8i16}) - for (auto Op : {ISD::SADDSAT, ISD::UADDSAT}) - setOperationAction(Op, T, Legal); - // Expand unavailable integer operations. for (auto Op : {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU, ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC, ISD::SUBE}) { - for (auto T : {MVT::i32, MVT::i64}) { + for (auto T : {MVT::i32, MVT::i64}) setOperationAction(Op, T, Expand); - } - if (Subtarget->hasSIMD128()) { - for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) { + if (Subtarget->hasSIMD128()) + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) setOperationAction(Op, T, Expand); - } - if (Subtarget->hasUnimplementedSIMD128()) { - setOperationAction(Op, MVT::v2i64, Expand); - } - } + if (Subtarget->hasUnimplementedSIMD128()) + setOperationAction(Op, MVT::v2i64, Expand); } - // There is no i64x2.mul instruction - setOperationAction(ISD::MUL, MVT::v2i64, Expand); - - // We have custom shuffle lowering to expose the shuffle mask + // SIMD-specific configuration if (Subtarget->hasSIMD128()) { - for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) { + // Support saturating add for i8x16 and i16x8 + for (auto Op : {ISD::SADDSAT, ISD::UADDSAT}) + for (auto T : {MVT::v16i8, MVT::v8i16}) + setOperationAction(Op, T, Legal); + + // Custom lower BUILD_VECTORs to minimize number of replace_lanes + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) + setOperationAction(ISD::BUILD_VECTOR, T, Custom); + if (Subtarget->hasUnimplementedSIMD128()) + for (auto T : {MVT::v2i64, MVT::v2f64}) + setOperationAction(ISD::BUILD_VECTOR, T, Custom); + + // We have custom shuffle lowering to expose the shuffle mask + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom); - } - if (Subtarget->hasUnimplementedSIMD128()) { - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); - } - } + if (Subtarget->hasUnimplementedSIMD128()) + for (auto T: {MVT::v2i64, MVT::v2f64}) + setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom); - // Custom lowering since wasm shifts must have a scalar shift amount - if (Subtarget->hasSIMD128()) { - for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) - for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL}) + // Custom lowering since wasm shifts must have a scalar shift amount + for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL}) { + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) setOperationAction(Op, T, Custom); - if 
(Subtarget->hasUnimplementedSIMD128()) - for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL}) + if (Subtarget->hasUnimplementedSIMD128()) setOperationAction(Op, MVT::v2i64, Custom); - } + } - // There are no select instructions for vectors - if (Subtarget->hasSIMD128()) + // Custom lower lane accesses to expand out variable indices + for (auto Op : {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}) { + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) + setOperationAction(Op, T, Custom); + if (Subtarget->hasUnimplementedSIMD128()) + for (auto T : {MVT::v2i64, MVT::v2f64}) + setOperationAction(Op, T, Custom); + } + + // There is no i64x2.mul instruction + setOperationAction(ISD::MUL, MVT::v2i64, Expand); + + // There are no vector select instructions for (auto Op : {ISD::VSELECT, ISD::SELECT_CC, ISD::SELECT}) { for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) setOperationAction(Op, T, Expand); @@ -169,6 +171,31 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(Op, T, Expand); } + // Expand integer operations supported for scalars but not SIMD + for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP, ISD::SDIV, ISD::UDIV, + ISD::SREM, ISD::UREM, ISD::ROTL, ISD::ROTR}) { + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) + setOperationAction(Op, T, Expand); + if (Subtarget->hasUnimplementedSIMD128()) + setOperationAction(Op, MVT::v2i64, Expand); + } + + // Expand float operations supported for scalars but not SIMD + for (auto Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT, + ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, + ISD::FEXP, ISD::FEXP2, ISD::FRINT}) { + setOperationAction(Op, MVT::v4f32, Expand); + if (Subtarget->hasUnimplementedSIMD128()) + setOperationAction(Op, MVT::v2f64, Expand); + } + + // Expand additional SIMD ops that V8 hasn't implemented yet + if (!Subtarget->hasUnimplementedSIMD128()) { + setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); + setOperationAction(ISD::FDIV, MVT::v4f32, Expand); + } + } + // As a special case, these operators use the type to mean the type to // sign-extend from. 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -220,25 +247,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( } } - // Expand additional SIMD ops that V8 hasn't implemented yet - if (Subtarget->hasSIMD128() && !Subtarget->hasUnimplementedSIMD128()) { - setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); - setOperationAction(ISD::FDIV, MVT::v4f32, Expand); - } - - // Custom lower lane accesses to expand out variable indices - if (Subtarget->hasSIMD128()) { - for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) { - setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom); - } - if (Subtarget->hasUnimplementedSIMD128()) { - for (auto T : {MVT::v2i64, MVT::v2f64}) { - setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom); - } - } - } + // Don't do anything clever with build_pairs + setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); // Trap lowers to wasm unreachable setOperationAction(ISD::TRAP, MVT::Other, Legal); @@ -248,6 +258,31 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setMaxAtomicSizeInBitsSupported(64); + + if (Subtarget->hasBulkMemory()) { + // Use memory.copy and friends over multiple loads and stores + MaxStoresPerMemcpy = 1; + MaxStoresPerMemcpyOptSize = 1; + MaxStoresPerMemmove = 1; + MaxStoresPerMemmoveOptSize = 1; + MaxStoresPerMemset = 1; + MaxStoresPerMemsetOptSize = 1; + } + + // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is + // consistent with the f64 and f128 names. + setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); + setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); + + // Define the emscripten name for return address helper. + // TODO: when implementing other WASM backends, make this generic or only do + // this on emscripten depending on what they end up doing. + setLibcallName(RTLIB::RETURN_ADDRESS, "emscripten_return_address"); + + // Always convert switches to br_tables unless there is only one case, which + // is equivalent to a simple branch. This reduces code size for wasm, and we + // defer possible jump table optimizations to the VM. + setMinimumJumpTableEntries(2); } TargetLowering::AtomicExpansionKind @@ -272,12 +307,6 @@ FastISel *WebAssemblyTargetLowering::createFastISel( return WebAssembly::createFastISel(FuncInfo, LibInfo); } -bool WebAssemblyTargetLowering::isOffsetFoldingLegal( - const GlobalAddressSDNode * /*GA*/) const { - // All offsets can be folded. - return true; -} - MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/, EVT VT) const { unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1); @@ -324,11 +353,11 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL, auto &Context = BB->getParent()->getFunction().getContext(); Type *Ty = Float64 ? 
Type::getDoubleTy(Context) : Type::getFloatTy(Context); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); + const BasicBlock *LLVMBB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); - MachineBasicBlock *TrueMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *DoneMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *TrueMBB = F->CreateMachineBasicBlock(LLVMBB); + MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVMBB); + MachineBasicBlock *DoneMBB = F->CreateMachineBasicBlock(LLVMBB); MachineFunction::iterator It = ++BB->getIterator(); F->insert(It, FalseMBB); @@ -336,8 +365,7 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL, F->insert(It, DoneMBB); // Transfer the remainder of BB and its successor edges to DoneMBB. - DoneMBB->splice(DoneMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); + DoneMBB->splice(DoneMBB->begin(), BB, std::next(MI.getIterator()), BB->end()); DoneMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(TrueMBB); @@ -502,7 +530,8 @@ bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL, } bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses( - EVT /*VT*/, unsigned /*AddrSpace*/, unsigned /*Align*/, bool *Fast) const { + EVT /*VT*/, unsigned /*AddrSpace*/, unsigned /*Align*/, + MachineMemOperand::Flags /*Flags*/, bool *Fast) const { // WebAssembly supports unaligned accesses, though it should be declared // with the p2align attribute on loads and stores which do so, and there // may be a performance impact. We tell LLVM they're "fast" because @@ -578,14 +607,14 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // Lowering Code //===----------------------------------------------------------------------===// -static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *msg) { +static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg) { MachineFunction &MF = DAG.getMachineFunction(); DAG.getContext()->diagnose( - DiagnosticInfoUnsupported(MF.getFunction(), msg, DL.getDebugLoc())); + DiagnosticInfoUnsupported(MF.getFunction(), Msg, DL.getDebugLoc())); } // Test whether the given calling convention is supported. -static bool CallingConvSupported(CallingConv::ID CallConv) { +static bool callingConvSupported(CallingConv::ID CallConv) { // We currently support the language-independent target-independent // conventions. We don't yet have a way to annotate calls with properties like // "cold", and we don't have any call-clobbered registers, so these are mostly @@ -608,20 +637,21 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, auto Layout = MF.getDataLayout(); CallingConv::ID CallConv = CLI.CallConv; - if (!CallingConvSupported(CallConv)) + if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support language-specific or target-specific " "calling conventions yet"); if (CLI.IsPatchPoint) fail(DL, DAG, "WebAssembly doesn't support patch point yet"); - // WebAssembly doesn't currently support explicit tail calls. If they are - // required, fail. Otherwise, just disable them. 
- if ((CallConv == CallingConv::Fast && CLI.IsTailCall &&
-      MF.getTarget().Options.GuaranteedTailCallOpt) ||
-     (CLI.CS && CLI.CS.isMustTailCall()))
-   fail(DL, DAG, "WebAssembly doesn't support tail call yet");
- CLI.IsTailCall = false;
+ // Fail if tail calls are required but not enabled
+ if (!Subtarget->hasTailCall()) {
+   if ((CallConv == CallingConv::Fast && CLI.IsTailCall &&
+        MF.getTarget().Options.GuaranteedTailCallOpt) ||
+       (CLI.CS && CLI.CS.isMustTailCall()))
+     fail(DL, DAG, "WebAssembly 'tail-call' feature not enabled");
+   CLI.IsTailCall = false;
+ }

  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  if (Ins.size() > 1)
@@ -630,9 +660,9 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  unsigned NumFixedArgs = 0;
- for (unsigned i = 0; i < Outs.size(); ++i) {
-   const ISD::OutputArg &Out = Outs[i];
-   SDValue &OutVal = OutVals[i];
+ for (unsigned I = 0; I < Outs.size(); ++I) {
+   const ISD::OutputArg &Out = Outs[I];
+   SDValue &OutVal = OutVals[I];
    if (Out.Flags.isNest())
      fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
    if (Out.Flags.isInAlloca())
@@ -669,13 +699,16 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
  if (IsVarArg) {
    // Outgoing non-fixed arguments are placed in a buffer. First
    // compute their offsets and the total amount of buffer space needed.
-   for (SDValue Arg :
-        make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+   for (unsigned I = NumFixedArgs; I < Outs.size(); ++I) {
+     const ISD::OutputArg &Out = Outs[I];
+     SDValue &Arg = OutVals[I];
      EVT VT = Arg.getValueType();
      assert(VT != MVT::iPTR && "Legalized args should be concrete");
      Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+     unsigned Align = std::max(Out.Flags.getOrigAlign(),
+                               Layout.getABITypeAlignment(Ty));
      unsigned Offset = CCInfo.AllocateStack(Layout.getTypeAllocSize(Ty),
-                                            Layout.getABITypeAlignment(Ty));
+                                            Align);
      CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(),
                                        Offset, VT.getSimpleVT(),
                                        CCValAssign::Full));
@@ -711,6 +744,18 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
    FINode = DAG.getIntPtrConstant(0, DL);
  }

+ if (Callee->getOpcode() == ISD::GlobalAddress) {
+   // If the callee is a GlobalAddress node (quite common, every direct call
+   // is) turn it into a TargetGlobalAddress node so that LowerGlobalAddress
+   // doesn't add MO_GOT, which is not needed for direct calls.
+   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Callee);
+   Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+                                       getPointerTy(DAG.getDataLayout()),
+                                       GA->getOffset());
+   Callee = DAG.getNode(WebAssemblyISD::Wrapper, DL,
+                        getPointerTy(DAG.getDataLayout()), Callee);
+ }
+
  // Compute the operands for the CALLn node.
  SmallVector<SDValue, 16> Ops;
  Ops.push_back(Chain);
@@ -739,6 +784,13 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
  // registers.
InTys.push_back(In.VT); } + + if (CLI.IsTailCall) { + // ret_calls do not return values to the current frame + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + return DAG.getNode(WebAssemblyISD::RET_CALL, DL, NodeTys, Ops); + } + InTys.push_back(MVT::Other); SDVTList InTyList = DAG.getVTList(InTys); SDValue Res = @@ -768,7 +820,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { assert(Outs.size() <= 1 && "WebAssembly can only return up to one value"); - if (!CallingConvSupported(CallConv)) + if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); SmallVector RetOps(1, Chain); @@ -795,7 +847,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { - if (!CallingConvSupported(CallConv)) + if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); MachineFunction &MF = DAG.getMachineFunction(); @@ -842,7 +894,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( // Record the number and types of arguments and results. SmallVector Params; SmallVector Results; - ComputeSignatureVTs(MF.getFunction().getFunctionType(), MF.getFunction(), + computeSignatureVTs(MF.getFunction().getFunctionType(), MF.getFunction(), DAG.getTarget(), Params, Results); for (MVT VT : Results) MFI->addResult(VT); @@ -855,6 +907,21 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( return Chain; } +void WebAssemblyTargetLowering::ReplaceNodeResults( + SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + switch (N->getOpcode()) { + case ISD::SIGN_EXTEND_INREG: + // Do not add any results, signifying that N should not be custom lowered + // after all. This happens because simd128 turns on custom lowering for + // SIGN_EXTEND_INREG, but for non-vector sign extends the result might be an + // illegal type. + break; + default: + llvm_unreachable( + "ReplaceNodeResults not implemented for this op for WebAssembly!"); + } +} + //===----------------------------------------------------------------------===// // Custom lowering hooks. //===----------------------------------------------------------------------===// @@ -882,22 +949,23 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, case ISD::BRIND: fail(DL, DAG, "WebAssembly hasn't implemented computed gotos"); return SDValue(); - case ISD::RETURNADDR: // Probably nothing meaningful can be returned here. 
- fail(DL, DAG, "WebAssembly hasn't implemented __builtin_return_address"); - return SDValue(); + case ISD::RETURNADDR: + return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::CopyToReg: return LowerCopyToReg(Op, DAG); - case ISD::INTRINSIC_WO_CHAIN: - return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: return LowerAccessVectorElement(Op, DAG); case ISD::INTRINSIC_VOID: - return LowerINTRINSIC_VOID(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: + case ISD::INTRINSIC_W_CHAIN: + return LowerIntrinsic(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::SHL: @@ -939,6 +1007,26 @@ SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op, return DAG.getTargetFrameIndex(FI, Op.getValueType()); } +SDValue WebAssemblyTargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + + if (!Subtarget->getTargetTriple().isOSEmscripten()) { + fail(DL, DAG, + "Non-Emscripten WebAssembly hasn't implemented " + "__builtin_return_address"); + return SDValue(); + } + + if (verifyReturnAddressArgumentIsConstant(Op, DAG)) + return SDValue(); + + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + return makeLibCall(DAG, RTLIB::RETURN_ADDRESS, Op.getValueType(), + {DAG.getConstant(Depth, DL, MVT::i32)}, false, DL) + .first; +} + SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // Non-zero depths are not supported by WebAssembly currently. Use the @@ -963,9 +1051,40 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op, "Unexpected target flags on generic GlobalAddressSDNode"); if (GA->getAddressSpace() != 0) fail(DL, DAG, "WebAssembly only expects the 0 address space"); - return DAG.getNode( - WebAssemblyISD::Wrapper, DL, VT, - DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset())); + + unsigned OperandFlags = 0; + if (isPositionIndependent()) { + const GlobalValue *GV = GA->getGlobal(); + if (getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) { + MachineFunction &MF = DAG.getMachineFunction(); + MVT PtrVT = getPointerTy(MF.getDataLayout()); + const char *BaseName; + if (GV->getValueType()->isFunctionTy()) { + BaseName = MF.createExternalSymbolName("__table_base"); + OperandFlags = WebAssemblyII::MO_TABLE_BASE_REL; + } + else { + BaseName = MF.createExternalSymbolName("__memory_base"); + OperandFlags = WebAssemblyII::MO_MEMORY_BASE_REL; + } + SDValue BaseAddr = + DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT, + DAG.getTargetExternalSymbol(BaseName, PtrVT)); + + SDValue SymAddr = DAG.getNode( + WebAssemblyISD::WrapperPIC, DL, VT, + DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset(), + OperandFlags)); + + return DAG.getNode(ISD::ADD, DL, VT, BaseAddr, SymAddr); + } else { + OperandFlags = WebAssemblyII::MO_GOT; + } + } + + return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, + DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, + GA->getOffset(), OperandFlags)); } SDValue @@ -976,15 +1095,8 @@ WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op, EVT VT = Op.getValueType(); assert(ES->getTargetFlags() == 0 && "Unexpected target flags on generic ExternalSymbolSDNode"); - // Set the TargetFlags to 0x1 which indicates that this is a "function" - // symbol rather than a data symbol. 
We do this unconditionally even though - // we don't know anything about the symbol other than its name, because all - // external symbols used in target-independent SelectionDAG code are for - // functions. - return DAG.getNode( - WebAssemblyISD::Wrapper, DL, VT, - DAG.getTargetExternalSymbol(ES->getSymbol(), VT, - WebAssemblyII::MO_SYMBOL_FUNCTION)); + return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, + DAG.getTargetExternalSymbol(ES->getSymbol(), VT)); } SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op, @@ -1038,17 +1150,28 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, MachinePointerInfo(SV), 0); } -SDValue -WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, - SelectionDAG &DAG) const { - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); +SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + unsigned IntNo; + switch (Op.getOpcode()) { + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + IntNo = cast(Op.getOperand(1))->getZExtValue(); + break; + case ISD::INTRINSIC_WO_CHAIN: + IntNo = cast(Op.getOperand(0))->getZExtValue(); + break; + default: + llvm_unreachable("Invalid intrinsic"); + } SDLoc DL(Op); + switch (IntNo) { default: - return {}; // Don't custom lower most intrinsics. + return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::wasm_lsda: { - MachineFunction &MF = DAG.getMachineFunction(); EVT VT = Op.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); @@ -1058,43 +1181,24 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, DAG.getMCSymbol(S, PtrVT)); } - } -} - -SDValue -WebAssemblyTargetLowering::LowerINTRINSIC_VOID(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); - SDLoc DL(Op); - - switch (IntNo) { - default: - return {}; // Don't custom lower most intrinsics. 
case Intrinsic::wasm_throw: { + // We only support C++ exceptions for now int Tag = cast(Op.getOperand(2).getNode())->getZExtValue(); - switch (Tag) { - case CPP_EXCEPTION: { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); - const char *SymName = MF.createExternalSymbolName("__cpp_exception"); - SDValue SymNode = - DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT, - DAG.getTargetExternalSymbol( - SymName, PtrVT, WebAssemblyII::MO_SYMBOL_EVENT)); - return DAG.getNode(WebAssemblyISD::THROW, DL, - MVT::Other, // outchain type - { - Op.getOperand(0), // inchain - SymNode, // exception symbol - Op.getOperand(3) // thrown value - }); - } - default: + if (Tag != CPP_EXCEPTION) llvm_unreachable("Invalid tag!"); - } - break; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + const char *SymName = MF.createExternalSymbolName("__cpp_exception"); + SDValue SymNode = DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT, + DAG.getTargetExternalSymbol(SymName, PtrVT)); + return DAG.getNode(WebAssemblyISD::THROW, DL, + MVT::Other, // outchain type + { + Op.getOperand(0), // inchain + SymNode, // exception symbol + Op.getOperand(3) // thrown value + }); } } } @@ -1102,6 +1206,7 @@ WebAssemblyTargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); // If sign extension operations are disabled, allow sext_inreg only if operand // is a vector extract. SIMD does not depend on sign extension operations, but // allowing sext_inreg in this context lets us have simple patterns to select @@ -1109,12 +1214,136 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, // simpler in this file, but would necessitate large and brittle patterns to // undo the expansion and select extract_lane_s instructions. 
 assert(!Subtarget->hasSignExt() && Subtarget->hasSIMD128());
-  if (Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT)
-    return Op;
+  if (Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    const SDValue &Extract = Op.getOperand(0);
+    MVT VecT = Extract.getOperand(0).getSimpleValueType();
+    MVT ExtractedLaneT = static_cast<VTSDNode *>(Op.getOperand(1).getNode())
+                             ->getVT()
+                             .getSimpleVT();
+    MVT ExtractedVecT =
+        MVT::getVectorVT(ExtractedLaneT, 128 / ExtractedLaneT.getSizeInBits());
+    if (ExtractedVecT == VecT)
+      return Op;
+    // Bitcast vector to appropriate type to ensure ISel pattern coverage
+    const SDValue &Index = Extract.getOperand(1);
+    unsigned IndexVal =
+        static_cast<ConstantSDNode *>(Index.getNode())->getZExtValue();
+    unsigned Scale =
+        ExtractedVecT.getVectorNumElements() / VecT.getVectorNumElements();
+    assert(Scale > 1);
+    SDValue NewIndex =
+        DAG.getConstant(IndexVal * Scale, DL, Index.getValueType());
+    SDValue NewExtract = DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, DL, Extract.getValueType(),
+        DAG.getBitcast(ExtractedVecT, Extract.getOperand(0)), NewIndex);
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(),
+                       NewExtract, Op.getOperand(1));
+  }
   // Otherwise expand
   return SDValue();
 }
 
+SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const EVT VecT = Op.getValueType();
+  const EVT LaneT = Op.getOperand(0).getValueType();
+  const size_t Lanes = Op.getNumOperands();
+  auto IsConstant = [](const SDValue &V) {
+    return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP;
+  };
+
+  // Find the most common operand, which is approximately the best to splat
+  using Entry = std::pair<SDValue, size_t>;
+  SmallVector<Entry, 16> ValueCounts;
+  size_t NumConst = 0, NumDynamic = 0;
+  for (const SDValue &Lane : Op->op_values()) {
+    if (Lane.isUndef()) {
+      continue;
+    } else if (IsConstant(Lane)) {
+      NumConst++;
+    } else {
+      NumDynamic++;
+    }
+    auto CountIt = std::find_if(ValueCounts.begin(), ValueCounts.end(),
+                                [&Lane](Entry A) { return A.first == Lane; });
+    if (CountIt == ValueCounts.end()) {
+      ValueCounts.emplace_back(Lane, 1);
+    } else {
+      CountIt->second++;
+    }
+  }
+  auto CommonIt =
+      std::max_element(ValueCounts.begin(), ValueCounts.end(),
+                       [](Entry A, Entry B) { return A.second < B.second; });
+  assert(CommonIt != ValueCounts.end() && "Unexpected all-undef build_vector");
+  SDValue SplatValue = CommonIt->first;
+  size_t NumCommon = CommonIt->second;
+
+  // If v128.const is available, consider using it instead of a splat
+  if (Subtarget->hasUnimplementedSIMD128()) {
+    // {i32,i64,f32,f64}.const opcode, and value
+    const size_t ConstBytes = 1 + std::max(size_t(4), 16 / Lanes);
+    // SIMD prefix and opcode
+    const size_t SplatBytes = 2;
+    const size_t SplatConstBytes = SplatBytes + ConstBytes;
+    // SIMD prefix, opcode, and lane index
+    const size_t ReplaceBytes = 3;
+    const size_t ReplaceConstBytes = ReplaceBytes + ConstBytes;
+    // SIMD prefix, v128.const opcode, and 128-bit value
+    const size_t VecConstBytes = 18;
+    // Initial v128.const and a replace_lane for each non-const operand
+    const size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes;
+    // Initial splat and all necessary replace_lanes
+    const size_t SplatInitBytes =
+        IsConstant(SplatValue)
+            // Initial constant splat
+            ? (SplatConstBytes +
+               // Constant replace_lanes
+               (NumConst - NumCommon) * ReplaceConstBytes +
+               // Dynamic replace_lanes
+               (NumDynamic * ReplaceBytes))
+            // Initial dynamic splat
+            : (SplatBytes +
+               // Constant replace_lanes
+               (NumConst * ReplaceConstBytes) +
+               // Dynamic replace_lanes
+               (NumDynamic - NumCommon) * ReplaceBytes);
+    if (ConstInitBytes < SplatInitBytes) {
+      // Create build_vector that will lower to initial v128.const
+      SmallVector<SDValue, 16> ConstLanes;
+      for (const SDValue &Lane : Op->op_values()) {
+        if (IsConstant(Lane)) {
+          ConstLanes.push_back(Lane);
+        } else if (LaneT.isFloatingPoint()) {
+          ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
+        } else {
+          ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
+        }
+      }
+      SDValue Result = DAG.getBuildVector(VecT, DL, ConstLanes);
+      // Add replace_lane instructions for non-const lanes
+      for (size_t I = 0; I < Lanes; ++I) {
+        const SDValue &Lane = Op->getOperand(I);
+        if (!Lane.isUndef() && !IsConstant(Lane))
+          Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
+                               DAG.getConstant(I, DL, MVT::i32));
+      }
+      return Result;
+    }
+  }
+  // Use a splat for the initial vector
+  SDValue Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
+  // Add replace_lane instructions for other values
+  for (size_t I = 0; I < Lanes; ++I) {
+    const SDValue &Lane = Op->getOperand(I);
+    if (Lane != SplatValue)
+      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
+                           DAG.getConstant(I, DL, MVT::i32));
+  }
+  return Result;
+}
+
 SDValue
 WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                SelectionDAG &DAG) const {
@@ -1131,11 +1360,10 @@ WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   Ops[OpIdx++] = Op.getOperand(1);
 
   // Expand mask indices to byte indices and materialize them as operands
-  for (size_t I = 0, Lanes = Mask.size(); I < Lanes; ++I) {
+  for (int M : Mask) {
     for (size_t J = 0; J < LaneBytes; ++J) {
       // Lower undefs (represented by -1 in mask) to zero
-      uint64_t ByteIndex =
-          Mask[I] == -1 ? 0 : (uint64_t)Mask[I] * LaneBytes + J;
+      uint64_t ByteIndex = M == -1 ? 0 : (uint64_t)M * LaneBytes + J;
       Ops[OpIdx++] = DAG.getConstant(ByteIndex, DL, MVT::i32);
     }
   }
@@ -1155,7 +1383,7 @@ WebAssemblyTargetLowering::LowerAccessVectorElement(SDValue Op,
   return SDValue();
 }
 
-static SDValue UnrollVectorShift(SDValue Op, SelectionDAG &DAG) {
+static SDValue unrollVectorShift(SDValue Op, SelectionDAG &DAG) {
   EVT LaneT = Op.getSimpleValueType().getVectorElementType();
   // 32-bit and 64-bit unrolled shifts will have proper semantics
   if (LaneT.bitsGE(MVT::i32))
@@ -1190,17 +1418,17 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
   // Expand all vector shifts until V8 fixes its implementation
   // TODO: remove this once V8 is fixed
   if (!Subtarget->hasUnimplementedSIMD128())
-    return UnrollVectorShift(Op, DAG);
+    return unrollVectorShift(Op, DAG);
 
   // Unroll non-splat vector shifts
   BuildVectorSDNode *ShiftVec;
   SDValue SplatVal;
   if (!(ShiftVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) ||
       !(SplatVal = ShiftVec->getSplatValue()))
-    return UnrollVectorShift(Op, DAG);
+    return unrollVectorShift(Op, DAG);
 
   // All splats except i64x2 const splats are handled by patterns
-  ConstantSDNode *SplatConst = dyn_cast<ConstantSDNode>(SplatVal);
+  auto *SplatConst = dyn_cast<ConstantSDNode>(SplatVal);
   if (!SplatConst || Op.getSimpleValueType() != MVT::v2i64)
     return Op;
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 59f4230ed889..b3c7f3defd5f 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -1,9 +1,8 @@
 //- WebAssemblyISelLowering.h - WebAssembly DAG Lowering Interface -*- C++ -*-//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -47,7 +46,6 @@ private:
   AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
   FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
                            const TargetLibraryInfo *LibInfo) const override;
-  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
   MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
@@ -62,6 +60,7 @@ private:
                              unsigned AS,
                              Instruction *I = nullptr) const override;
   bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
+                                      MachineMemOperand::Flags Flags,
                                       bool *Fast) const override;
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
@@ -87,9 +86,17 @@ private:
                       const SDLoc &DL, SelectionDAG &DAG,
                       SmallVectorImpl<SDValue> &InVals) const override;
 
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+
+  const char *getClearCacheBuiltinName() const override {
+    report_fatal_error("llvm.clear_cache is not supported on wasm");
+  }
+
   // Custom lowering hooks.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; @@ -97,9 +104,9 @@ private: SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsic(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td index 5fb8ef90bc43..e85aa57efc42 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -1,9 +1,8 @@ // WebAssemblyInstrAtomics.td-WebAssembly Atomic codegen support-*- tablegen -*- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -12,20 +11,132 @@ /// //===----------------------------------------------------------------------===// +let UseNamedOperandTable = 1 in +multiclass ATOMIC_I pattern_r, string asmstr_r = "", + string asmstr_s = "", bits<32> atomic_op = -1> { + defm "" : I, + Requires<[HasAtomics]>; +} + +multiclass ATOMIC_NRI pattern, string asmstr = "", + bits<32> atomic_op = -1> { + defm "" : NRI, + Requires<[HasAtomics]>; +} + +//===----------------------------------------------------------------------===// +// Atomic wait / notify +//===----------------------------------------------------------------------===// + +let hasSideEffects = 1 in { +defm ATOMIC_NOTIFY : + ATOMIC_I<(outs I32:$dst), + (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$count), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + "atomic.notify \t$dst, ${off}(${addr})${p2align}, $count", + "atomic.notify \t${off}${p2align}", 0x00>; +let mayLoad = 1 in { +defm ATOMIC_WAIT_I32 : + ATOMIC_I<(outs I32:$dst), + (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$exp, + I64:$timeout), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + "i32.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout", + "i32.atomic.wait \t${off}${p2align}", 0x01>; +defm ATOMIC_WAIT_I64 : + ATOMIC_I<(outs I32:$dst), + (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I64:$exp, + I64:$timeout), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + "i64.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout", + "i64.atomic.wait \t${off}${p2align}", 0x02>; +} // mayLoad = 1 +} // hasSideEffects = 1 + +let Predicates = [HasAtomics] in { +// Select notifys with no constant offset. +def NotifyPatNoOffset : + Pat<(i32 (int_wasm_atomic_notify I32:$addr, I32:$count)), + (ATOMIC_NOTIFY 0, 0, I32:$addr, I32:$count)>; + +// Select notifys with a constant offset. + +// Pattern with address + immediate offset +class NotifyPatImmOff : + Pat<(i32 (int_wasm_atomic_notify (operand I32:$addr, imm:$off), I32:$count)), + (ATOMIC_NOTIFY 0, imm:$off, I32:$addr, I32:$count)>; +def : NotifyPatImmOff; +def : NotifyPatImmOff; + +def NotifyPatGlobalAddr : + Pat<(i32 (int_wasm_atomic_notify (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)), + I32:$count)), + (ATOMIC_NOTIFY 0, tglobaladdr:$off, I32:$addr, I32:$count)>; + +// Select notifys with just a constant offset. +def NotifyPatOffsetOnly : + Pat<(i32 (int_wasm_atomic_notify imm:$off, I32:$count)), + (ATOMIC_NOTIFY 0, imm:$off, (CONST_I32 0), I32:$count)>; + +def NotifyPatGlobalAddrOffOnly : + Pat<(i32 (int_wasm_atomic_notify (WebAssemblywrapper tglobaladdr:$off), + I32:$count)), + (ATOMIC_NOTIFY 0, tglobaladdr:$off, (CONST_I32 0), I32:$count)>; + +// Select waits with no constant offset. +class WaitPatNoOffset : + Pat<(i32 (kind I32:$addr, ty:$exp, I64:$timeout)), + (inst 0, 0, I32:$addr, ty:$exp, I64:$timeout)>; +def : WaitPatNoOffset; +def : WaitPatNoOffset; + +// Select waits with a constant offset. 
+ +// Pattern with address + immediate offset +class WaitPatImmOff : + Pat<(i32 (kind (operand I32:$addr, imm:$off), ty:$exp, I64:$timeout)), + (inst 0, imm:$off, I32:$addr, ty:$exp, I64:$timeout)>; +def : WaitPatImmOff; +def : WaitPatImmOff; +def : WaitPatImmOff; +def : WaitPatImmOff; + +class WaitPatGlobalAddr : + Pat<(i32 (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), + ty:$exp, I64:$timeout)), + (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, I64:$timeout)>; +def : WaitPatGlobalAddr; +def : WaitPatGlobalAddr; + +// Select wait_i32, ATOMIC_WAIT_I32s with just a constant offset. +class WaitPatOffsetOnly : + Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)), + (inst 0, imm:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>; +def : WaitPatOffsetOnly; +def : WaitPatOffsetOnly; + +class WaitPatGlobalAddrOffOnly : + Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, I64:$timeout)), + (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>; +def : WaitPatGlobalAddrOffOnly; +def : WaitPatGlobalAddrOffOnly; +} // Predicates = [HasAtomics] + //===----------------------------------------------------------------------===// // Atomic loads //===----------------------------------------------------------------------===// -multiclass ATOMIC_I pattern_r, string asmstr_r = "", - string asmstr_s = "", bits<32> inst = -1> { - defm "" : I, +multiclass AtomicLoad { + defm "" : WebAssemblyLoad, Requires<[HasAtomics]>; } -defm ATOMIC_LOAD_I32 : WebAssemblyLoad; -defm ATOMIC_LOAD_I64 : WebAssemblyLoad; +defm ATOMIC_LOAD_I32 : AtomicLoad; +defm ATOMIC_LOAD_I64 : AtomicLoad; // Select loads with no constant offset. let Predicates = [HasAtomics] in { @@ -43,9 +154,6 @@ def : LoadPatImmOff; def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; -def : LoadPatExternalSym; -def : LoadPatExternalSym; - // Select loads with just a constant offset. def : LoadPatOffsetOnly; def : LoadPatOffsetOnly; @@ -53,18 +161,15 @@ def : LoadPatOffsetOnly; def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; - } // Predicates = [HasAtomics] // Extending loads. Note that there are only zero-extending atomic loads, no // sign-extending loads. -defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad; -defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad; -defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad; -defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad; -defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad; +defm ATOMIC_LOAD8_U_I32 : AtomicLoad; +defm ATOMIC_LOAD16_U_I32 : AtomicLoad; +defm ATOMIC_LOAD8_U_I64 : AtomicLoad; +defm ATOMIC_LOAD16_U_I64 : AtomicLoad; +defm ATOMIC_LOAD32_U_I64 : AtomicLoad; // Fragments for extending loads. 
These are different from regular loads because // the SDNodes are derived from AtomicSDNode rather than LoadSDNode and @@ -149,16 +254,6 @@ def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; - // Extending loads with just a constant offset def : LoadPatOffsetOnly; def : LoadPatOffsetOnly; @@ -180,24 +275,19 @@ def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; - } // Predicates = [HasAtomics] //===----------------------------------------------------------------------===// // Atomic stores //===----------------------------------------------------------------------===// -defm ATOMIC_STORE_I32 : WebAssemblyStore; -defm ATOMIC_STORE_I64 : WebAssemblyStore; +multiclass AtomicStore { + defm "" : WebAssemblyStore, + Requires<[HasAtomics]>; +} + +defm ATOMIC_STORE_I32 : AtomicStore; +defm ATOMIC_STORE_I64 : AtomicStore; // We need an 'atomic' version of store patterns because store and atomic_store // nodes have different operand orders: @@ -230,12 +320,6 @@ class AStorePatGlobalAddr : def : AStorePatGlobalAddr; def : AStorePatGlobalAddr; -class AStorePatExternalSym : - Pat<(kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)), ty:$val), - (inst 0, texternalsym:$off, I32:$addr, ty:$val)>; -def : AStorePatExternalSym; -def : AStorePatExternalSym; - // Select stores with just a constant offset. class AStorePatOffsetOnly : Pat<(kind imm:$off, ty:$val), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>; @@ -248,20 +332,14 @@ class AStorePatGlobalAddrOffOnly : def : AStorePatGlobalAddrOffOnly; def : AStorePatGlobalAddrOffOnly; -class AStorePatExternSymOffOnly : - Pat<(kind (WebAssemblywrapper texternalsym:$off), ty:$val), - (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>; -def : AStorePatExternSymOffOnly; -def : AStorePatExternSymOffOnly; - } // Predicates = [HasAtomics] // Truncating stores. -defm ATOMIC_STORE8_I32 : WebAssemblyStore; -defm ATOMIC_STORE16_I32 : WebAssemblyStore; -defm ATOMIC_STORE8_I64 : WebAssemblyStore; -defm ATOMIC_STORE16_I64 : WebAssemblyStore; -defm ATOMIC_STORE32_I64 : WebAssemblyStore; +defm ATOMIC_STORE8_I32 : AtomicStore; +defm ATOMIC_STORE16_I32 : AtomicStore; +defm ATOMIC_STORE8_I64 : AtomicStore; +defm ATOMIC_STORE16_I64 : AtomicStore; +defm ATOMIC_STORE32_I64 : AtomicStore; // Fragments for truncating stores. 
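
The operand-order remark above is the whole reason the AStorePat* classes exist, and the difference is easy to see in a toy model. A minimal sketch, assuming the DAG shapes store(val, ptr) versus atomic_store(ptr, val) exactly as the surrounding comment describes; the struct and function names are illustrative only:

#include <cstdint>
#include <cstdio>

// Toy DAG nodes: a regular store is (store val, ptr) while an atomic store
// is (atomic_store ptr, val). The AStorePat* classes above exist only to put
// the two operands back into the instruction's (addr, val) order.
struct StoreNode { uint64_t Val; uint32_t Ptr; };
struct AtomicStoreNode { uint32_t Ptr; uint64_t Val; };

struct WasmStoreInst { uint32_t Addr; uint64_t Val; };

WasmStoreInst selectStore(const StoreNode &N) {
  return {N.Ptr, N.Val}; // covered by the generic store patterns
}

WasmStoreInst selectAtomicStore(const AtomicStoreNode &N) {
  return {N.Ptr, N.Val}; // needs its own pattern: operands arrive swapped
}

int main() {
  WasmStoreInst A = selectStore({42, 0x100});
  WasmStoreInst B = selectAtomicStore({0x100, 42});
  std::printf("%u:%llu %u:%llu\n", A.Addr, (unsigned long long)A.Val, B.Addr,
              (unsigned long long)B.Val); // both 256:42
}
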
@@ -302,12 +380,6 @@ def : AStorePatGlobalAddr; def : AStorePatGlobalAddr; def : AStorePatGlobalAddr; -def : AStorePatExternalSym; -def : AStorePatExternalSym; -def : AStorePatExternalSym; -def : AStorePatExternalSym; -def : AStorePatExternalSym; - // Truncating stores with just a constant offset def : AStorePatOffsetOnly; def : AStorePatOffsetOnly; @@ -321,105 +393,101 @@ def : AStorePatGlobalAddrOffOnly; def : AStorePatGlobalAddrOffOnly; def : AStorePatGlobalAddrOffOnly; -def : AStorePatExternSymOffOnly; -def : AStorePatExternSymOffOnly; -def : AStorePatExternSymOffOnly; -def : AStorePatExternSymOffOnly; -def : AStorePatExternSymOffOnly; - } // Predicates = [HasAtomics] //===----------------------------------------------------------------------===// // Atomic binary read-modify-writes //===----------------------------------------------------------------------===// -multiclass WebAssemblyBinRMW { - defm "" : I<(outs rc:$dst), - (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val), - (outs), (ins P2Align:$p2align, offset32_op:$off), [], - !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}, $val"), - !strconcat(Name, "\t${off}, ${p2align}"), Opcode>; +multiclass WebAssemblyBinRMW { + defm "" : + ATOMIC_I<(outs rc:$dst), + (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + !strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $val"), + !strconcat(name, "\t${off}${p2align}"), atomic_op>; } -defm ATOMIC_RMW_ADD_I32 : WebAssemblyBinRMW; -defm ATOMIC_RMW_ADD_I64 : WebAssemblyBinRMW; +defm ATOMIC_RMW_ADD_I32 : WebAssemblyBinRMW; +defm ATOMIC_RMW_ADD_I64 : WebAssemblyBinRMW; defm ATOMIC_RMW8_U_ADD_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_ADD_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW8_U_ADD_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_ADD_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW32_U_ADD_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; -defm ATOMIC_RMW_SUB_I32 : WebAssemblyBinRMW; -defm ATOMIC_RMW_SUB_I64 : WebAssemblyBinRMW; +defm ATOMIC_RMW_SUB_I32 : WebAssemblyBinRMW; +defm ATOMIC_RMW_SUB_I64 : WebAssemblyBinRMW; defm ATOMIC_RMW8_U_SUB_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_SUB_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW8_U_SUB_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_SUB_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW32_U_SUB_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; -defm ATOMIC_RMW_AND_I32 : WebAssemblyBinRMW; -defm ATOMIC_RMW_AND_I64 : WebAssemblyBinRMW; +defm ATOMIC_RMW_AND_I32 : WebAssemblyBinRMW; +defm ATOMIC_RMW_AND_I64 : WebAssemblyBinRMW; defm ATOMIC_RMW8_U_AND_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_AND_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW8_U_AND_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_AND_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW32_U_AND_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; -defm ATOMIC_RMW_OR_I32 : WebAssemblyBinRMW; -defm ATOMIC_RMW_OR_I64 : WebAssemblyBinRMW; +defm ATOMIC_RMW_OR_I32 : WebAssemblyBinRMW; +defm ATOMIC_RMW_OR_I64 : WebAssemblyBinRMW; defm ATOMIC_RMW8_U_OR_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_OR_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW8_U_OR_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_OR_I64 : - WebAssemblyBinRMW; 
+ WebAssemblyBinRMW; defm ATOMIC_RMW32_U_OR_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; -defm ATOMIC_RMW_XOR_I32 : WebAssemblyBinRMW; -defm ATOMIC_RMW_XOR_I64 : WebAssemblyBinRMW; +defm ATOMIC_RMW_XOR_I32 : WebAssemblyBinRMW; +defm ATOMIC_RMW_XOR_I64 : WebAssemblyBinRMW; defm ATOMIC_RMW8_U_XOR_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_XOR_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW8_U_XOR_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_XOR_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW32_U_XOR_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW_XCHG_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW_XCHG_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW8_U_XCHG_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_XCHG_I32 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW8_U_XCHG_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW16_U_XCHG_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; defm ATOMIC_RMW32_U_XCHG_I64 : - WebAssemblyBinRMW; + WebAssemblyBinRMW; // Select binary RMWs with no constant offset. class BinRMWPatNoOffset : @@ -437,11 +505,6 @@ class BinRMWPatGlobalAddr : ty:$val)), (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>; -class BinRMWPatExternalSym : - Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)), - ty:$val)), - (inst 0, texternalsym:$off, I32:$addr, ty:$val)>; - // Select binary RMWs with just a constant offset. class BinRMWPatOffsetOnly : Pat<(ty (kind imm:$off, ty:$val)), @@ -451,10 +514,6 @@ class BinRMWPatGlobalAddrOffOnly : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)), (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>; -class BinRMWPatExternSymOffOnly : - Pat<(ty (kind (WebAssemblywrapper texternalsym:$off), ty:$val)), - (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>; - // Patterns for various addressing modes. 
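
The four pattern shapes collected by the multiclass below (no offset, register plus immediate, register plus wrapped global address, and a bare constant or global with an implicit zero base) all fold the same address computation, addr = base + offset, into the instruction's static offset field. A standalone sketch of that decomposition under those assumptions, with registers and offsets reduced to plain integers (names here are illustrative, not LLVM's):

#include <cstdint>
#include <cstdio>

// Toy address expression: a base register plus a constant offset, where the
// base may be absent. Mirrors the BinRMWPat* cases below: NoOffset, ImmOff /
// GlobalAddr, and OffsetOnly (base synthesized as CONST_I32 0).
struct Addr {
  bool HasBase;
  uint32_t Base;   // register id, illustrative
  uint32_t Offset; // constant or global-address offset
};

// Mirrors the (offset, addr-operand) pair every RMW instruction takes.
struct Folded { uint32_t Offset; uint32_t BaseReg; };

Folded fold(const Addr &A) {
  if (!A.HasBase)
    return {A.Offset, 0}; // OffsetOnly: CONST_I32 0 becomes the base
  return {A.Offset, A.Base};
}

int main() {
  Folded F1 = fold({true, 3, 0});     // like BinRMWPatNoOffset
  Folded F2 = fold({true, 3, 16});    // like BinRMWPatImmOff / GlobalAddr
  Folded F3 = fold({false, 0, 1024}); // like BinRMWPatOffsetOnly
  std::printf("r%u+%u r%u+%u r%u+%u\n", F1.BaseReg, F1.Offset, F2.BaseReg,
              F2.Offset, F3.BaseReg, F3.Offset);
}
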
multiclass BinRMWPattern { @@ -469,17 +528,11 @@ multiclass BinRMWPattern; def : BinRMWPatGlobalAddr; - def : BinRMWPatExternalSym; - def : BinRMWPatExternalSym; - def : BinRMWPatOffsetOnly; def : BinRMWPatOffsetOnly; def : BinRMWPatGlobalAddrOffOnly; def : BinRMWPatGlobalAddrOffOnly; - - def : BinRMWPatExternSymOffOnly; - def : BinRMWPatExternSymOffOnly; } let Predicates = [HasAtomics] in { @@ -580,17 +633,6 @@ multiclass BinRMWTruncExtPattern< def : BinRMWPatGlobalAddr, inst8_64>; def : BinRMWPatGlobalAddr, inst16_64>; - def : BinRMWPatExternalSym, inst8_32>; - def : BinRMWPatExternalSym, inst16_32>; - def : BinRMWPatExternalSym, inst8_64>; - def : BinRMWPatExternalSym, inst16_64>; - def : BinRMWPatExternalSym, inst32_64>; - - def : BinRMWPatExternalSym, inst8_32>; - def : BinRMWPatExternalSym, inst16_32>; - def : BinRMWPatExternalSym, inst8_64>; - def : BinRMWPatExternalSym, inst16_64>; - // Truncating-extending binary RMWs with just a constant offset def : BinRMWPatOffsetOnly, inst8_32>; def : BinRMWPatOffsetOnly, inst16_32>; @@ -613,17 +655,6 @@ multiclass BinRMWTruncExtPattern< def : BinRMWPatGlobalAddrOffOnly, inst16_32>; def : BinRMWPatGlobalAddrOffOnly, inst8_64>; def : BinRMWPatGlobalAddrOffOnly, inst16_64>; - - def : BinRMWPatExternSymOffOnly, inst8_32>; - def : BinRMWPatExternSymOffOnly, inst16_32>; - def : BinRMWPatExternSymOffOnly, inst8_64>; - def : BinRMWPatExternSymOffOnly, inst16_64>; - def : BinRMWPatExternSymOffOnly, inst32_64>; - - def : BinRMWPatExternSymOffOnly, inst8_32>; - def : BinRMWPatExternSymOffOnly, inst16_32>; - def : BinRMWPatExternSymOffOnly, inst8_64>; - def : BinRMWPatExternSymOffOnly, inst16_64>; } let Predicates = [HasAtomics] in { @@ -663,29 +694,31 @@ defm : BinRMWTruncExtPattern< // Consider adding a pass after instruction selection that optimizes this case // if it is frequent. -multiclass WebAssemblyTerRMW { - defm "" : I<(outs rc:$dst), - (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp, - rc:$new), - (outs), (ins P2Align:$p2align, offset32_op:$off), [], - !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}, $exp, $new"), - !strconcat(Name, "\t${off}, ${p2align}"), Opcode>; +multiclass WebAssemblyTerRMW { + defm "" : + ATOMIC_I<(outs rc:$dst), + (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp, + rc:$new_), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + !strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $exp, $new_"), + !strconcat(name, "\t${off}${p2align}"), atomic_op>; } defm ATOMIC_RMW_CMPXCHG_I32 : - WebAssemblyTerRMW; + WebAssemblyTerRMW; defm ATOMIC_RMW_CMPXCHG_I64 : - WebAssemblyTerRMW; + WebAssemblyTerRMW; defm ATOMIC_RMW8_U_CMPXCHG_I32 : - WebAssemblyTerRMW; + WebAssemblyTerRMW; defm ATOMIC_RMW16_U_CMPXCHG_I32 : - WebAssemblyTerRMW; + WebAssemblyTerRMW; defm ATOMIC_RMW8_U_CMPXCHG_I64 : - WebAssemblyTerRMW; + WebAssemblyTerRMW; defm ATOMIC_RMW16_U_CMPXCHG_I64 : - WebAssemblyTerRMW; + WebAssemblyTerRMW; defm ATOMIC_RMW32_U_CMPXCHG_I64 : - WebAssemblyTerRMW; + WebAssemblyTerRMW; // Select ternary RMWs with no constant offset. class TerRMWPatNoOffset : @@ -704,11 +737,6 @@ class TerRMWPatGlobalAddr : ty:$exp, ty:$new)), (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, ty:$new)>; -class TerRMWPatExternalSym : - Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)), - ty:$exp, ty:$new)), - (inst 0, texternalsym:$off, I32:$addr, ty:$exp, ty:$new)>; - // Select ternary RMWs with just a constant offset. 
class TerRMWPatOffsetOnly : Pat<(ty (kind imm:$off, ty:$exp, ty:$new)), @@ -718,10 +746,6 @@ class TerRMWPatGlobalAddrOffOnly : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, ty:$new)), (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp, ty:$new)>; -class TerRMWPatExternSymOffOnly : - Pat<(ty (kind (WebAssemblywrapper texternalsym:$off), ty:$exp, ty:$new)), - (inst 0, texternalsym:$off, (CONST_I32 0), ty:$exp, ty:$new)>; - // Patterns for various addressing modes. multiclass TerRMWPattern { @@ -736,23 +760,16 @@ multiclass TerRMWPattern; def : TerRMWPatGlobalAddr; - def : TerRMWPatExternalSym; - def : TerRMWPatExternalSym; - def : TerRMWPatOffsetOnly; def : TerRMWPatOffsetOnly; def : TerRMWPatGlobalAddrOffOnly; def : TerRMWPatGlobalAddrOffOnly; - - def : TerRMWPatExternSymOffOnly; - def : TerRMWPatExternSymOffOnly; } -let Predicates = [HasAtomics] in { +let Predicates = [HasAtomics] in defm : TerRMWPattern; -} // Predicates = [HasAtomics] // Truncating & zero-extending ternary RMW patterns. // DAG legalization & optimization before instruction selection may introduce @@ -840,17 +857,6 @@ multiclass TerRMWTruncExtPattern< def : TerRMWPatGlobalAddr, inst8_64>; def : TerRMWPatGlobalAddr, inst16_64>; - def : TerRMWPatExternalSym, inst8_32>; - def : TerRMWPatExternalSym, inst16_32>; - def : TerRMWPatExternalSym, inst8_64>; - def : TerRMWPatExternalSym, inst16_64>; - def : TerRMWPatExternalSym, inst32_64>; - - def : TerRMWPatExternalSym, inst8_32>; - def : TerRMWPatExternalSym, inst16_32>; - def : TerRMWPatExternalSym, inst8_64>; - def : TerRMWPatExternalSym, inst16_64>; - // Truncating-extending ternary RMWs with just a constant offset def : TerRMWPatOffsetOnly, inst8_32>; def : TerRMWPatOffsetOnly, inst16_32>; @@ -873,147 +879,21 @@ multiclass TerRMWTruncExtPattern< def : TerRMWPatGlobalAddrOffOnly, inst16_32>; def : TerRMWPatGlobalAddrOffOnly, inst8_64>; def : TerRMWPatGlobalAddrOffOnly, inst16_64>; - - def : TerRMWPatExternSymOffOnly, inst8_32>; - def : TerRMWPatExternSymOffOnly, inst16_32>; - def : TerRMWPatExternSymOffOnly, inst8_64>; - def : TerRMWPatExternSymOffOnly, inst16_64>; - def : TerRMWPatExternSymOffOnly, inst32_64>; - - def : TerRMWPatExternSymOffOnly, inst8_32>; - def : TerRMWPatExternSymOffOnly, inst16_32>; - def : TerRMWPatExternSymOffOnly, inst8_64>; - def : TerRMWPatExternSymOffOnly, inst16_64>; } -let Predicates = [HasAtomics] in { +let Predicates = [HasAtomics] in defm : TerRMWTruncExtPattern< atomic_cmp_swap_8, atomic_cmp_swap_16, atomic_cmp_swap_32, atomic_cmp_swap_64, ATOMIC_RMW8_U_CMPXCHG_I32, ATOMIC_RMW16_U_CMPXCHG_I32, ATOMIC_RMW8_U_CMPXCHG_I64, ATOMIC_RMW16_U_CMPXCHG_I64, ATOMIC_RMW32_U_CMPXCHG_I64>; -} //===----------------------------------------------------------------------===// -// Atomic wait / notify +// Atomic fences //===----------------------------------------------------------------------===// -let hasSideEffects = 1 in { -defm ATOMIC_NOTIFY : - I<(outs I32:$dst), - (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$count), - (outs), (ins P2Align:$p2align, offset32_op:$off), [], - "atomic.notify \t$dst, ${off}(${addr})${p2align}, $count", - "atomic.notify \t${off}, ${p2align}", 0xfe00>; -let mayLoad = 1 in { -defm ATOMIC_WAIT_I32 : - I<(outs I32:$dst), - (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$exp, I64:$timeout), - (outs), (ins P2Align:$p2align, offset32_op:$off), [], - "i32.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout", - "i32.atomic.wait \t${off}, ${p2align}", 0xfe01>; -defm ATOMIC_WAIT_I64 : - 
I<(outs I32:$dst), - (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I64:$exp, I64:$timeout), - (outs), (ins P2Align:$p2align, offset32_op:$off), [], - "i64.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout", - "i64.atomic.wait \t${off}, ${p2align}", 0xfe02>; -} // mayLoad = 1 -} // hasSideEffects = 1 - -let Predicates = [HasAtomics] in { -// Select notifys with no constant offset. -class NotifyPatNoOffset : - Pat<(i32 (kind I32:$addr, I32:$count)), - (ATOMIC_NOTIFY 0, 0, I32:$addr, I32:$count)>; -def : NotifyPatNoOffset; - -// Select notifys with a constant offset. - -// Pattern with address + immediate offset -class NotifyPatImmOff : - Pat<(i32 (kind (operand I32:$addr, imm:$off), I32:$count)), - (ATOMIC_NOTIFY 0, imm:$off, I32:$addr, I32:$count)>; -def : NotifyPatImmOff; -def : NotifyPatImmOff; - -class NotifyPatGlobalAddr : - Pat<(i32 (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - I32:$count)), - (ATOMIC_NOTIFY 0, tglobaladdr:$off, I32:$addr, I32:$count)>; -def : NotifyPatGlobalAddr; - -class NotifyPatExternalSym : - Pat<(i32 (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)), - I32:$count)), - (ATOMIC_NOTIFY 0, texternalsym:$off, I32:$addr, I32:$count)>; -def : NotifyPatExternalSym; - -// Select notifys with just a constant offset. -class NotifyPatOffsetOnly : - Pat<(i32 (kind imm:$off, I32:$count)), - (ATOMIC_NOTIFY 0, imm:$off, (CONST_I32 0), I32:$count)>; -def : NotifyPatOffsetOnly; - -class NotifyPatGlobalAddrOffOnly : - Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), I32:$count)), - (ATOMIC_NOTIFY 0, tglobaladdr:$off, (CONST_I32 0), I32:$count)>; -def : NotifyPatGlobalAddrOffOnly; - -class NotifyPatExternSymOffOnly : - Pat<(i32 (kind (WebAssemblywrapper texternalsym:$off), I32:$count)), - (ATOMIC_NOTIFY 0, texternalsym:$off, (CONST_I32 0), I32:$count)>; -def : NotifyPatExternSymOffOnly; - -// Select waits with no constant offset. -class WaitPatNoOffset : - Pat<(i32 (kind I32:$addr, ty:$exp, I64:$timeout)), - (inst 0, 0, I32:$addr, ty:$exp, I64:$timeout)>; -def : WaitPatNoOffset; -def : WaitPatNoOffset; - -// Select waits with a constant offset. - -// Pattern with address + immediate offset -class WaitPatImmOff : - Pat<(i32 (kind (operand I32:$addr, imm:$off), ty:$exp, I64:$timeout)), - (inst 0, imm:$off, I32:$addr, ty:$exp, I64:$timeout)>; -def : WaitPatImmOff; -def : WaitPatImmOff; -def : WaitPatImmOff; -def : WaitPatImmOff; - -class WaitPatGlobalAddr : - Pat<(i32 (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - ty:$exp, I64:$timeout)), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, I64:$timeout)>; -def : WaitPatGlobalAddr; -def : WaitPatGlobalAddr; - -class WaitPatExternalSym : - Pat<(i32 (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)), - ty:$exp, I64:$timeout)), - (inst 0, texternalsym:$off, I32:$addr, ty:$exp, I64:$timeout)>; -def : WaitPatExternalSym; -def : WaitPatExternalSym; - -// Select wait_i32, ATOMIC_WAIT_I32s with just a constant offset. 
-class WaitPatOffsetOnly : - Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)), - (inst 0, imm:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>; -def : WaitPatOffsetOnly; -def : WaitPatOffsetOnly; - -class WaitPatGlobalAddrOffOnly : - Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, I64:$timeout)), - (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>; -def : WaitPatGlobalAddrOffOnly; -def : WaitPatGlobalAddrOffOnly; - -class WaitPatExternSymOffOnly : - Pat<(i32 (kind (WebAssemblywrapper texternalsym:$off), ty:$exp, - I64:$timeout)), - (inst 0, texternalsym:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>; -def : WaitPatExternSymOffOnly; -def : WaitPatExternSymOffOnly; -} // Predicates = [HasAtomics] +// A compiler fence instruction that prevents reordering of instructions. +let Defs = [ARGUMENTS] in { +let isPseudo = 1, hasSideEffects = 1 in +defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">; +} // Defs = [ARGUMENTS] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td new file mode 100644 index 000000000000..f4352e3d12ec --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td @@ -0,0 +1,71 @@ +// WebAssemblyInstrBulkMemory.td - bulk memory codegen support --*- tablegen -*- +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// WebAssembly bulk memory codegen constructs. +/// +//===----------------------------------------------------------------------===// + +// Instruction requiring HasBulkMemory and the bulk memory prefix byte +multiclass BULK_I pattern_r, string asmstr_r = "", + string asmstr_s = "", bits<32> simdop = -1> { + defm "" : I, + Requires<[HasBulkMemory]>; +} + +// Bespoke types and nodes for bulk memory ops +def wasm_memcpy_t : SDTypeProfile<0, 5, + [SDTCisInt<0>, SDTCisInt<1>, SDTCisPtrTy<2>, SDTCisPtrTy<3>, SDTCisInt<4>] +>; +def wasm_memcpy : SDNode<"WebAssemblyISD::MEMORY_COPY", wasm_memcpy_t, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; + +def wasm_memset_t : SDTypeProfile<0, 4, + [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>, SDTCisInt<3>] +>; +def wasm_memset : SDNode<"WebAssemblyISD::MEMORY_FILL", wasm_memset_t, + [SDNPHasChain, SDNPMayStore]>; + +let mayStore = 1, hasSideEffects = 1 in +defm MEMORY_INIT : + BULK_I<(outs), + (ins i32imm_op:$seg, i32imm_op:$idx, I32:$dest, + I32:$offset, I32:$size), + (outs), (ins i32imm_op:$seg, i32imm_op:$idx), + [(int_wasm_memory_init (i32 imm:$seg), (i32 imm:$idx), I32:$dest, + I32:$offset, I32:$size + )], + "memory.init\t$seg, $idx, $dest, $offset, $size", + "memory.init\t$seg, $idx", 0x08>; + +let hasSideEffects = 1 in +defm DATA_DROP : + BULK_I<(outs), (ins i32imm_op:$seg), (outs), (ins i32imm_op:$seg), + [(int_wasm_data_drop (i32 imm:$seg))], + "data.drop\t$seg", "data.drop\t$seg", 0x09>; + +let mayLoad = 1, mayStore = 1 in +defm MEMORY_COPY : + BULK_I<(outs), (ins i32imm_op:$src_idx, i32imm_op:$dst_idx, + I32:$dst, I32:$src, I32:$len), + (outs), (ins i32imm_op:$src_idx, i32imm_op:$dst_idx), + [(wasm_memcpy (i32 imm:$src_idx), (i32 imm:$dst_idx), + I32:$dst, I32:$src, I32:$len + )], + "memory.copy\t$src_idx, $dst_idx, $dst, $src, $len", + "memory.copy\t$src_idx, $dst_idx", 0x0a>; + +let mayStore = 1 in +defm MEMORY_FILL : + BULK_I<(outs), (ins 
i32imm_op:$idx, I32:$dst, I32:$value, I32:$size), + (outs), (ins i32imm_op:$idx), + [(wasm_memset (i32 imm:$idx), I32:$dst, I32:$value, I32:$size)], + "memory.fill\t$idx, $dst, $value, $size", + "memory.fill\t$idx", 0x0b>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 07839b790114..703c15d58c93 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -1,9 +1,8 @@ //===- WebAssemblyInstrCall.td-WebAssembly Call codegen support -*- tablegen -*- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -22,109 +21,112 @@ defm ADJCALLSTACKDOWN : NRI<(outs), (ins i32imm:$amt, i32imm:$amt2), [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>; defm ADJCALLSTACKUP : NRI<(outs), (ins i32imm:$amt, i32imm:$amt2), [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>; -} // isCodeGenOnly = 1 +} // Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 -multiclass CALL { - defm CALL_#vt : I<(outs vt:$dst), (ins function32_op:$callee, variable_ops), - (outs), (ins function32_op:$callee), - [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))], - !strconcat(prefix, "call\t$dst, $callee"), - !strconcat(prefix, "call\t$callee"), - 0x10>; +multiclass CALL preds = []> { + defm CALL_#vt : + I<(outs rt:$dst), (ins function32_op:$callee, variable_ops), + (outs), (ins function32_op:$callee), + [(set (vt rt:$dst), (WebAssemblycall1 (i32 imm:$callee)))], + !strconcat(prefix, "call\t$dst, $callee"), + !strconcat(prefix, "call\t$callee"), + 0x10>, + Requires; - let isCodeGenOnly = 1 in { - defm PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops), - (outs), (ins I32:$callee), - [(set vt:$dst, (WebAssemblycall1 I32:$callee))], - "PSEUDO CALL INDIRECT\t$callee", - "PSEUDO CALL INDIRECT\t$callee">; - } // isCodeGenOnly = 1 + let isCodeGenOnly = 1 in + defm PCALL_INDIRECT_#vt : + I<(outs rt:$dst), (ins I32:$callee, variable_ops), + (outs), (ins I32:$callee), + [(set (vt rt:$dst), (WebAssemblycall1 I32:$callee))], + "PSEUDO CALL INDIRECT\t$callee", + "PSEUDO CALL INDIRECT\t$callee">, + Requires; - defm CALL_INDIRECT_#vt : I<(outs vt:$dst), - (ins TypeIndex:$type, i32imm:$flags, variable_ops), - (outs), (ins TypeIndex:$type, i32imm:$flags), - [], - !strconcat(prefix, "call_indirect\t$dst"), - !strconcat(prefix, "call_indirect\t$type"), - 0x11>; + defm CALL_INDIRECT_#vt : + I<(outs rt:$dst), + (ins TypeIndex:$type, i32imm:$flags, variable_ops), + (outs), (ins TypeIndex:$type, i32imm:$flags), + [], + !strconcat(prefix, "call_indirect\t$dst"), + !strconcat(prefix, "call_indirect\t$type"), + 0x11>, + Requires; } -multiclass SIMD_CALL { +let Uses = [SP32, SP64], isCall = 1 in { +defm "" : CALL; +defm "" : CALL; +defm "" : CALL; +defm "" : CALL; +defm "" : CALL; +defm "" : CALL; +defm "" : CALL; +defm "" : CALL; +defm "" : CALL; +defm "" : CALL; +defm "" : CALL; - defm CALL_#vt : I<(outs V128:$dst), (ins function32_op:$callee, variable_ops), - (outs), (ins function32_op:$callee), - [(set (vt V128:$dst), - (WebAssemblycall1 (i32 imm:$callee)))], - !strconcat(prefix, "call\t$dst, $callee"), - 
!strconcat(prefix, "call\t$callee"), - 0x10>, - Requires<[HasSIMD128]>; +let IsCanonical = 1 in { +defm CALL_VOID : + I<(outs), (ins function32_op:$callee, variable_ops), + (outs), (ins function32_op:$callee), + [(WebAssemblycall0 (i32 imm:$callee))], + "call \t$callee", "call\t$callee", 0x10>; - let isCodeGenOnly = 1 in { - defm PCALL_INDIRECT_#vt : I<(outs V128:$dst), - (ins I32:$callee, variable_ops), - (outs), (ins I32:$callee), - [(set (vt V128:$dst), - (WebAssemblycall1 I32:$callee))], - "PSEUDO CALL INDIRECT\t$callee", - "PSEUDO CALL INDIRECT\t$callee">, - Requires<[HasSIMD128]>; - } // isCodeGenOnly = 1 +let isReturn = 1 in +defm RET_CALL : + I<(outs), (ins function32_op:$callee, variable_ops), + (outs), (ins function32_op:$callee), + [(WebAssemblyretcall (i32 imm:$callee))], + "return_call \t$callee", "return_call\t$callee", 0x12>, + Requires<[HasTailCall]>; - defm CALL_INDIRECT_#vt : I<(outs V128:$dst), - (ins TypeIndex:$type, i32imm:$flags, variable_ops), - (outs), (ins TypeIndex:$type, i32imm:$flags), - [], - !strconcat(prefix, "call_indirect\t$dst"), - !strconcat(prefix, "call_indirect\t$type"), - 0x11>, - Requires<[HasSIMD128]>; -} +let isCodeGenOnly = 1 in +defm PCALL_INDIRECT_VOID : + I<(outs), (ins I32:$callee, variable_ops), + (outs), (ins I32:$callee), + [(WebAssemblycall0 I32:$callee)], + "PSEUDO CALL INDIRECT\t$callee", + "PSEUDO CALL INDIRECT\t$callee">; -let Uses = [SP32, SP64], isCall = 1 in { - defm "" : CALL; - defm "" : CALL; - defm "" : CALL; - defm "" : CALL; - defm "" : CALL; - defm "" : SIMD_CALL; - defm "" : SIMD_CALL; - defm "" : SIMD_CALL; - defm "" : SIMD_CALL; - defm "" : SIMD_CALL; - defm "" : SIMD_CALL; +defm CALL_INDIRECT_VOID : + I<(outs), (ins TypeIndex:$type, i32imm:$flags, variable_ops), + (outs), (ins TypeIndex:$type, i32imm:$flags), + [], + "call_indirect\t", "call_indirect\t$type", + 0x11>; - defm CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops), - (outs), (ins function32_op:$callee), - [(WebAssemblycall0 (i32 imm:$callee))], - "call \t$callee", "call\t$callee", 0x10>; +let isReturn = 1 in +defm RET_CALL_INDIRECT : + I<(outs), (ins TypeIndex:$type, i32imm:$flags, variable_ops), + (outs), (ins TypeIndex:$type, i32imm:$flags), + [], + "return_call_indirect\t", "return_call_indirect\t$type", + 0x13>, + Requires<[HasTailCall]>; - let isCodeGenOnly = 1 in { - defm PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops), - (outs), (ins I32:$callee), - [(WebAssemblycall0 I32:$callee)], - "PSEUDO CALL INDIRECT\t$callee", - "PSEUDO CALL INDIRECT\t$callee">; - } // isCodeGenOnly = 1 +let isCodeGenOnly = 1, isReturn = 1 in +defm PRET_CALL_INDIRECT: + I<(outs), (ins I32:$callee, variable_ops), + (outs), (ins I32:$callee), + [(WebAssemblyretcall I32:$callee)], + "PSEUDO RET_CALL INDIRECT\t$callee", + "PSEUDO RET_CALL INDIRECT\t$callee">, + Requires<[HasTailCall]>; - defm CALL_INDIRECT_VOID : I<(outs), - (ins TypeIndex:$type, i32imm:$flags, - variable_ops), - (outs), (ins TypeIndex:$type, i32imm:$flags), - [], - "call_indirect\t", "call_indirect\t$type", - 0x11>; +} // IsCanonical = 1 } // Uses = [SP32,SP64], isCall = 1 // Patterns for matching a direct call to a global address. 
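
The def : Pat list that follows handles direct calls: when the callee is a WebAssemblywrapper around a tglobaladdr or texternalsym, selection folds the symbol straight into the immediate field of the CALL_* instruction, while any other callee value falls back to the PCALL_INDIRECT_* pseudos defined earlier. A toy sketch of that decision, with the two callee forms reduced to a flag (illustrative model, not LLVM's matcher):

#include <cstdio>

// Toy callee operand: either a known symbol (a wrapped tglobaladdr or
// texternalsym) or a dynamic i32 function-table index in a register.
struct Callee { bool IsSymbol; unsigned SymbolOrReg; };

// Mirrors the pattern split below: a wrapped symbol selects the direct
// CALL_* form with an immediate; anything else goes through
// PCALL_INDIRECT_* and is later emitted as call_indirect.
const char *select(const Callee &C) {
  return C.IsSymbol ? "call $sym (direct, immediate callee)"
                    : "call_indirect (table index in register)";
}

int main() {
  std::printf("%s\n%s\n", select({true, 42}), select({false, 3}));
}
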
def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), - (CALL_I32 tglobaladdr:$callee)>; + (CALL_i32 tglobaladdr:$callee)>; def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), - (CALL_I64 tglobaladdr:$callee)>; + (CALL_i64 tglobaladdr:$callee)>; def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), - (CALL_F32 tglobaladdr:$callee)>; + (CALL_f32 tglobaladdr:$callee)>; def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), - (CALL_F64 tglobaladdr:$callee)>; + (CALL_f64 tglobaladdr:$callee)>; def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), (CALL_v16i8 tglobaladdr:$callee)>, Requires<[HasSIMD128]>; def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), @@ -137,21 +139,23 @@ def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), (CALL_v4f32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>; def : Pat<(v2f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), (CALL_v2f64 tglobaladdr:$callee)>, Requires<[HasSIMD128]>; -def : Pat<(ExceptRef - (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), - (CALL_EXCEPT_REF tglobaladdr:$callee)>; +def : Pat<(exnref (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_exnref tglobaladdr:$callee)>, + Requires<[HasExceptionHandling]>; def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)), (CALL_VOID tglobaladdr:$callee)>; +def : Pat<(WebAssemblyretcall (WebAssemblywrapper tglobaladdr:$callee)), + (RET_CALL tglobaladdr:$callee)>, Requires<[HasTailCall]>; // Patterns for matching a direct call to an external symbol. def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), - (CALL_I32 texternalsym:$callee)>; + (CALL_i32 texternalsym:$callee)>; def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), - (CALL_I64 texternalsym:$callee)>; + (CALL_i64 texternalsym:$callee)>; def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), - (CALL_F32 texternalsym:$callee)>; + (CALL_f32 texternalsym:$callee)>; def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), - (CALL_F64 texternalsym:$callee)>; + (CALL_f64 texternalsym:$callee)>; def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), (CALL_v16i8 texternalsym:$callee)>, Requires<[HasSIMD128]>; def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), @@ -164,8 +168,10 @@ def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), (CALL_v4f32 texternalsym:$callee)>, Requires<[HasSIMD128]>; def : Pat<(v2f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), (CALL_v2f64 texternalsym:$callee)>, Requires<[HasSIMD128]>; -def : Pat<(ExceptRef - (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), - (CALL_EXCEPT_REF texternalsym:$callee)>; +def : Pat<(exnref (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_exnref texternalsym:$callee)>, + Requires<[HasExceptionHandling]>; def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)), (CALL_VOID texternalsym:$callee)>; +def : Pat<(WebAssemblyretcall (WebAssemblywrapper texternalsym:$callee)), + (RET_CALL texternalsym:$callee)>, Requires<[HasTailCall]>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 7eb6cbf4d249..1870c5bc34b0 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ 
b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -1,9 +1,8 @@ //===- WebAssemblyInstrControl.td-WebAssembly control-flow ------*- tablegen -*- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -21,11 +20,10 @@ defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond), let isCodeGenOnly = 1 in defm BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond), (outs), (ins bb_op:$dst), []>; -let isBarrier = 1 in { +let isBarrier = 1 in defm BR : NRI<(outs), (ins bb_op:$dst), [(br bb:$dst)], "br \t$dst", 0x0c>; -} // isBarrier = 1 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1 def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), @@ -36,14 +34,11 @@ def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), // A list of branch targets enclosed in {} and separated by comma. // Used by br_table only. def BrListAsmOperand : AsmOperandClass { let Name = "BrList"; } -let OperandNamespace = "WebAssembly" in { -let OperandType = "OPERAND_BRLIST" in { +let OperandNamespace = "WebAssembly", OperandType = "OPERAND_BRLIST" in def brlist : Operand { let ParserMatchClass = BrListAsmOperand; let PrintMethod = "printBrList"; } -} // OPERAND_BRLIST -} // OperandNamespace = "WebAssembly" // TODO: SelectionDAG's lowering insists on using a pointer as the index for // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode @@ -82,6 +77,9 @@ defm ELSE : NRI<(outs), (ins), [], "else", 0x05>; defm END_BLOCK : NRI<(outs), (ins), [], "end_block", 0x0b>; defm END_LOOP : NRI<(outs), (ins), [], "end_loop", 0x0b>; defm END_IF : NRI<(outs), (ins), [], "end_if", 0x0b>; +// Generic instruction, for disassembler. +let IsCanonical = 1 in +defm END : NRI<(outs), (ins), [], "end", 0x0b>; let isTerminator = 1, isBarrier = 1 in defm END_FUNCTION : NRI<(outs), (ins), [], "end_function", 0x0b>; } // Uses = [VALUE_STACK], Defs = [VALUE_STACK] @@ -106,7 +104,7 @@ multiclass SIMD_RETURN { let isCodeGenOnly = 1 in defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins V128:$val), (outs), (ins), []>, - Requires<[HasSIMD128]>; + Requires<[HasSIMD128]>; } let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { @@ -116,7 +114,7 @@ let isReturn = 1 in { defm "": RETURN; defm "": RETURN; defm "": RETURN; - defm "": RETURN; + defm "": RETURN; defm "": SIMD_RETURN; defm "": SIMD_RETURN; defm "": SIMD_RETURN; @@ -142,23 +140,17 @@ let Predicates = [HasExceptionHandling] in { // Throwing an exception: throw / rethrow let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { -defm THROW_I32 : I<(outs), (ins event_op:$tag, I32:$val), - (outs), (ins event_op:$tag), - [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag), - I32:$val)], - "throw \t$tag, $val", "throw \t$tag", - 0x08>; -defm THROW_I64 : I<(outs), (ins event_op:$tag, I64:$val), - (outs), (ins event_op:$tag), - [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag), - I64:$val)], - "throw \t$tag, $val", "throw \t$tag", - 0x08>; -defm RETHROW : NRI<(outs), (ins bb_op:$dst), [], "rethrow \t$dst", 0x09>; -let isCodeGenOnly = 1 in -// This is used when the destination for rethrow is the caller function. This -// will be converted to a rethrow in CFGStackify. 
-defm RETHROW_TO_CALLER : NRI<(outs), (ins), [], "rethrow">; +defm THROW : I<(outs), (ins event_op:$tag, variable_ops), + (outs), (ins event_op:$tag), + [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag))], + "throw \t$tag", "throw \t$tag", 0x08>; +defm RETHROW : I<(outs), (ins EXNREF:$exn), (outs), (ins), [], + "rethrow \t$exn", "rethrow", 0x09>; +// Pseudo instruction to be the lowering target of int_wasm_rethrow_in_catch +// intrinsic. Will be converted to the real rethrow instruction later. +let isPseudo = 1 in +defm RETHROW_IN_CATCH : NRI<(outs), (ins), [(int_wasm_rethrow_in_catch)], + "rethrow_in_catch", 0>; } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 // Region within which an exception is caught: try / end_try @@ -167,24 +159,33 @@ defm TRY : NRI<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>; defm END_TRY : NRI<(outs), (ins), [], "end_try", 0x0b>; } // Uses = [VALUE_STACK], Defs = [VALUE_STACK] -// Catching an exception: catch / catch_all -let hasCtrlDep = 1, hasSideEffects = 1 in { -defm CATCH_I32 : I<(outs I32:$dst), (ins i32imm:$tag), - (outs), (ins i32imm:$tag), - [(set I32:$dst, (int_wasm_catch imm:$tag))], - "i32.catch \t$dst, $tag", "i32.catch \t$tag", 0x07>; -defm CATCH_I64 : I<(outs I64:$dst), (ins i32imm:$tag), - (outs), (ins i32imm:$tag), - [(set I64:$dst, (int_wasm_catch imm:$tag))], - "i64.catch \t$dst, $tag", "i64.catch \t$tag", 0x07>; -defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x05>; -} +// Catching an exception: catch / extract_exception +let hasCtrlDep = 1, hasSideEffects = 1 in +defm CATCH : I<(outs EXNREF:$dst), (ins), (outs), (ins), [], + "catch \t$dst", "catch", 0x07>; + +// Querying / extracing exception: br_on_exn +// br_on_exn queries an exnref to see if it matches the corresponding exception +// tag index. If true it branches to the given label and pushes the +// corresponding argument values of the exception onto the stack. 
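
Since br_on_exn is new in this patch, a small model of the dynamic behavior the comment above describes may help: the instruction tests an exnref against an event tag; on a match it branches and pushes the exception's payload (later popped by EXTRACT_EXCEPTION_I32), otherwise it falls through with the exnref intact. A hedged sketch of that control-flow shape only, assuming a single-i32 payload as the C++ personality uses; this is not executable wasm semantics:

#include <cstdio>
#include <optional>

// Toy exnref: an event tag plus a single i32 payload, which is all the C++
// "__cpp_exception" event carries in this scheme.
struct ExnRef { unsigned Tag; int Payload; };

// Models br_on_exn: returns the payload if the tag matches (the branch-taken
// case, where the pushed value is then popped), or nothing if it does not
// (fall through, the exnref is still live for a rethrow).
std::optional<int> brOnExn(const ExnRef &Exn, unsigned WantedTag) {
  if (Exn.Tag == WantedTag)
    return Exn.Payload;
  return std::nullopt;
}

int main() {
  ExnRef E{/*tag*/ 0, /*thrown value*/ 1234};
  if (auto Payload = brOnExn(E, 0))
    std::printf("caught C++ exception, payload %d\n", *Payload);
  else
    std::printf("not ours; rethrow\n");
}
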
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in +defm BR_ON_EXN : I<(outs), (ins bb_op:$dst, event_op:$tag, EXNREF:$exn), + (outs), (ins bb_op:$dst, event_op:$tag), [], + "br_on_exn \t$dst, $tag, $exn", "br_on_exn \t$dst, $tag", + 0x0a>; +// This is a pseudo instruction that simulates popping a value from stack, which +// has been pushed by br_on_exn +let isCodeGenOnly = 1, hasSideEffects = 1 in +defm EXTRACT_EXCEPTION_I32 : NRI<(outs I32:$dst), (ins), + [(set I32:$dst, (int_wasm_extract_exception))], + "extract_exception\t$dst">; // Pseudo instructions: cleanupret / catchret let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, - isCodeGenOnly = 1, isEHScopeReturn = 1 in { - defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "", 0>; + isPseudo = 1, isEHScopeReturn = 1 in { + defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "cleanupret", 0>; defm CATCHRET : NRI<(outs), (ins bb_op:$dst, bb_op:$from), - [(catchret bb:$dst, bb:$from)], "", 0>; -} -} + [(catchret bb:$dst, bb:$from)], "catchret", 0>; +} // isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + // isPseudo = 1, isEHScopeReturn = 1 +} // Predicates = [HasExceptionHandling] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td index e128656a142c..661fee2715ba 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -1,9 +1,8 @@ //===-- WebAssemblyInstrConv.td-WebAssembly Conversion support -*- tablegen -*-= // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td deleted file mode 100644 index a251d60b89ee..000000000000 --- a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td +++ /dev/null @@ -1,27 +0,0 @@ -// WebAssemblyInstrExceptRef.td-WebAssembly except_ref codegen --*- tablegen -*- -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// WebAssembly except_ref operand code-gen constructs. 
-/// -//===----------------------------------------------------------------------===// - -defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst), - (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond), - (outs), (ins), - [(set EXCEPT_REF:$dst, - (select I32:$cond, EXCEPT_REF:$lhs, - EXCEPT_REF:$rhs))], - "except_ref.select\t$dst, $lhs, $rhs, $cond", - "except_ref.select", 0x1b>; - -def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs), - (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>; -def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs), - (SELECT_EXCEPT_REF EXCEPT_REF:$rhs, EXCEPT_REF:$lhs, I32:$cond)>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td index c5290f00b431..5c9b34f44734 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -1,9 +1,8 @@ // WebAssemblyInstrFloat.td-WebAssembly Float codegen support ---*- tablegen -*- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 15a9714a55a1..aff4d20d8d82 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -1,9 +1,8 @@ //=- WebAssemblyInstrFormats.td - WebAssembly Instr. Formats -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -23,6 +22,9 @@ class WebAssemblyInst inst, string asmstr, string stack> : StackRel, let Namespace = "WebAssembly"; let Pattern = []; let AsmString = asmstr; + // When there are multiple instructions that map to the same encoding (in + // e.g. the disassembler use case) prefer the one where IsCanonical == 1. + bit IsCanonical = 0; } // Normal instructions. Default instantiation of a WebAssemblyInst. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 5efff32d6167..a86c9af28f0d 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyInstrInfo.cpp - WebAssembly Instruction Information ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -28,6 +27,10 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "WebAssemblyGenInstrInfo.inc" +// defines WebAssembly::getNamedOperandIdx +#define GET_INSTRINFO_NAMED_OPS +#include "WebAssemblyGenInstrInfo.inc" + WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN, WebAssembly::ADJCALLSTACKUP, @@ -72,6 +75,8 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, CopyOpcode = WebAssembly::COPY_F64; else if (RC == &WebAssembly::V128RegClass) CopyOpcode = WebAssembly::COPY_V128; + else if (RC == &WebAssembly::EXNREFRegClass) + CopyOpcode = WebAssembly::COPY_EXNREF; else llvm_unreachable("Unexpected register class"); @@ -98,6 +103,13 @@ bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool /*AllowModify*/) const { + const auto &MFI = *MBB.getParent()->getInfo(); + // WebAssembly has control flow that doesn't have explicit branches or direct + // fallthrough (e.g. try/catch), which can't be modeled by analyzeBranch. It + // is created after CFGStackify. + if (MFI.isCFGStackified()) + return true; + bool HaveCond = false; for (MachineInstr &MI : MBB.terminators()) { switch (MI.getOpcode()) { @@ -107,9 +119,6 @@ bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB, case WebAssembly::BR_IF: if (HaveCond) return true; - // If we're running after CFGStackify, we can't optimize further. - if (!MI.getOperand(0).isMBB()) - return true; Cond.push_back(MachineOperand::CreateImm(true)); Cond.push_back(MI.getOperand(1)); TBB = MI.getOperand(0).getMBB(); @@ -118,23 +127,25 @@ bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB, case WebAssembly::BR_UNLESS: if (HaveCond) return true; - // If we're running after CFGStackify, we can't optimize further. - if (!MI.getOperand(0).isMBB()) - return true; Cond.push_back(MachineOperand::CreateImm(false)); Cond.push_back(MI.getOperand(1)); TBB = MI.getOperand(0).getMBB(); HaveCond = true; break; case WebAssembly::BR: - // If we're running after CFGStackify, we can't optimize further. 
- if (!MI.getOperand(0).isMBB()) - return true; if (!HaveCond) TBB = MI.getOperand(0).getMBB(); else FBB = MI.getOperand(0).getMBB(); break; + case WebAssembly::BR_ON_EXN: + if (HaveCond) + return true; + Cond.push_back(MachineOperand::CreateImm(true)); + Cond.push_back(MI.getOperand(2)); + TBB = MI.getOperand(0).getMBB(); + HaveCond = true; + break; } if (MI.isBarrier()) break; @@ -180,9 +191,22 @@ unsigned WebAssemblyInstrInfo::insertBranch( assert(Cond.size() == 2 && "Expected a flag and a successor block"); + MachineFunction &MF = *MBB.getParent(); + auto &MRI = MF.getRegInfo(); + bool IsBrOnExn = Cond[1].isReg() && MRI.getRegClass(Cond[1].getReg()) == + &WebAssembly::EXNREFRegClass; + if (Cond[0].getImm()) { - BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).add(Cond[1]); + if (IsBrOnExn) { + const char *CPPExnSymbol = MF.createExternalSymbolName("__cpp_exception"); + BuildMI(&MBB, DL, get(WebAssembly::BR_ON_EXN)) + .addMBB(TBB) + .addExternalSymbol(CPPExnSymbol) + .add(Cond[1]); + } else + BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).add(Cond[1]); } else { + assert(!IsBrOnExn && "br_on_exn does not have a reversed condition"); BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)).addMBB(TBB).add(Cond[1]); } if (!FBB) @@ -194,7 +218,15 @@ unsigned WebAssemblyInstrInfo::insertBranch( bool WebAssemblyInstrInfo::reverseBranchCondition( SmallVectorImpl &Cond) const { - assert(Cond.size() == 2 && "Expected a flag and a successor block"); + assert(Cond.size() == 2 && "Expected a flag and a condition expression"); + + // br_on_exn's condition cannot be reversed + MachineFunction &MF = *Cond[1].getParent()->getParent()->getParent(); + auto &MRI = MF.getRegInfo(); + if (Cond[1].isReg() && + MRI.getRegClass(Cond[1].getReg()) == &WebAssembly::EXNREFRegClass) + return true; + Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm()); return false; } diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index 4a3763c345b0..df1051b4f42c 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -1,9 +1,8 @@ //=- WebAssemblyInstrInfo.h - WebAssembly Instruction Information -*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -22,8 +21,17 @@ #define GET_INSTRINFO_HEADER #include "WebAssemblyGenInstrInfo.inc" +#define GET_INSTRINFO_OPERAND_ENUM +#include "WebAssemblyGenInstrInfo.inc" + namespace llvm { +namespace WebAssembly { + +int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); + +} + class WebAssemblySubtarget; class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo { diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index e3d795f2aab1..73ddbe85d551 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -1,9 +1,8 @@ // WebAssemblyInstrInfo.td-Describe the WebAssembly Instructions-*- tablegen -*- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -16,41 +15,52 @@ // WebAssembly Instruction Predicate Definitions. //===----------------------------------------------------------------------===// +def IsPIC : Predicate<"TM.isPositionIndependent()">; +def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; + def HasAddr32 : Predicate<"!Subtarget->hasAddr64()">; + def HasAddr64 : Predicate<"Subtarget->hasAddr64()">; -def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, - AssemblerPredicate<"FeatureSIMD128", "simd128">; + +def HasSIMD128 : + Predicate<"Subtarget->hasSIMD128()">, + AssemblerPredicate<"FeatureSIMD128", "simd128">; + def HasUnimplementedSIMD128 : Predicate<"Subtarget->hasUnimplementedSIMD128()">, AssemblerPredicate<"FeatureUnimplementedSIMD128", "unimplemented-simd128">; -def HasAtomics : Predicate<"Subtarget->hasAtomics()">, - AssemblerPredicate<"FeatureAtomics", "atomics">; + +def HasAtomics : + Predicate<"Subtarget->hasAtomics()">, + AssemblerPredicate<"FeatureAtomics", "atomics">; + +def HasMultivalue : + Predicate<"Subtarget->hasMultivalue()">, + AssemblerPredicate<"FeatureMultivalue", "multivalue">; + def HasNontrappingFPToInt : Predicate<"Subtarget->hasNontrappingFPToInt()">, - AssemblerPredicate<"FeatureNontrappingFPToInt", - "nontrapping-fptoint">; + AssemblerPredicate<"FeatureNontrappingFPToInt", "nontrapping-fptoint">; + def NotHasNontrappingFPToInt : Predicate<"!Subtarget->hasNontrappingFPToInt()">, - AssemblerPredicate<"!FeatureNontrappingFPToInt", - "nontrapping-fptoint">; + AssemblerPredicate<"!FeatureNontrappingFPToInt", "nontrapping-fptoint">; + def HasSignExt : Predicate<"Subtarget->hasSignExt()">, - AssemblerPredicate<"FeatureSignExt", - "sign-ext">; -def NotHasSignExt : - Predicate<"!Subtarget->hasSignExt()">, - AssemblerPredicate<"!FeatureSignExt", - "sign-ext">; + AssemblerPredicate<"FeatureSignExt", "sign-ext">; + +def HasTailCall : + Predicate<"Subtarget->hasTailCall()">, + AssemblerPredicate<"FeatureTailCall", "tail-call">; def HasExceptionHandling : Predicate<"Subtarget->hasExceptionHandling()">, - AssemblerPredicate<"FeatureExceptionHandling", - "exception-handling">; + AssemblerPredicate<"FeatureExceptionHandling", "exception-handling">; -def NotHasExceptionHandling : - Predicate<"!Subtarget->hasExceptionHandling()">, - AssemblerPredicate<"!FeatureExceptionHandling", - "exception-handling">; +def HasBulkMemory : + Predicate<"Subtarget->hasBulkMemory()">, + AssemblerPredicate<"FeatureBulkMemory", "bulk-memory">; //===----------------------------------------------------------------------===// // WebAssembly-specific DAG Node Types. 
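// A minimal sketch (hypothetical instruction, not part of this patch) of why
// these records come in Predicate/AssemblerPredicate pairs: the Predicate
// half gates instruction-selection patterns when the feature is off, while
// the AssemblerPredicate half makes the MC layer reject the mnemonic unless
// the matching -mattr flag (e.g. +bulk-memory) is enabled. A definition
// opts in through Requires<>; the name and opcode below are invented purely
// to show the shape:
defm EXAMPLE_NOP : I<(outs), (ins), (outs), (ins), [],
                     "example.nop", "example.nop", 0xff>,
                   Requires<[HasBulkMemory]>;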
@@ -60,14 +70,16 @@ def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; def SDT_WebAssemblyCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; -def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; -def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>; -def SDT_WebAssemblyBrTable : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; -def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>; -def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>; -def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, - SDTCisPtrTy<0>]>; -def SDT_WebAssemblyThrow : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; +def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>; +def SDT_WebAssemblyBrTable : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>; +def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>; +def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<0>]>; +def SDT_WebAssemblyWrapperPIC : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<0>]>; +def SDT_WebAssemblyThrow : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; //===----------------------------------------------------------------------===// // WebAssembly-specific DAG Nodes. @@ -85,6 +97,9 @@ def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0", def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1", SDT_WebAssemblyCall1, [SDNPHasChain, SDNPVariadic]>; +def WebAssemblyretcall : SDNode<"WebAssemblyISD::RET_CALL", + SDT_WebAssemblyCall0, + [SDNPHasChain, SDNPVariadic]>; def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE", SDT_WebAssemblyBrTable, [SDNPHasChain, SDNPVariadic]>; @@ -94,13 +109,26 @@ def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN", SDT_WebAssemblyReturn, [SDNPHasChain]>; def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", SDT_WebAssemblyWrapper>; +def WebAssemblywrapperPIC : SDNode<"WebAssemblyISD::WrapperPIC", + SDT_WebAssemblyWrapperPIC>; def WebAssemblythrow : SDNode<"WebAssemblyISD::THROW", SDT_WebAssemblyThrow, - [SDNPHasChain]>; + [SDNPHasChain, SDNPVariadic]>; //===----------------------------------------------------------------------===// // WebAssembly-specific Operands. 
//===----------------------------------------------------------------------===// +// Default Operand has AsmOperandClass "Imm" which is for integers (and +// symbols), so specialize one for floats: +def FPImmAsmOperand : AsmOperandClass { + let Name = "FPImm"; + let PredicateMethod = "isFPImm"; +} + +class FPOperand : Operand { + AsmOperandClass ParserMatchClass = FPImmAsmOperand; +} + let OperandNamespace = "WebAssembly" in { let OperandType = "OPERAND_BASIC_BLOCK" in @@ -119,10 +147,10 @@ let OperandType = "OPERAND_I64IMM" in def i64imm_op : Operand; let OperandType = "OPERAND_F32IMM" in -def f32imm_op : Operand; +def f32imm_op : FPOperand; let OperandType = "OPERAND_F64IMM" in -def f64imm_op : Operand; +def f64imm_op : FPOperand; let OperandType = "OPERAND_VEC_I8IMM" in def vec_i8imm_op : Operand; @@ -152,11 +180,10 @@ def event_op : Operand; } // OperandType = "OPERAND_P2ALIGN" -let OperandType = "OPERAND_SIGNATURE" in { +let OperandType = "OPERAND_SIGNATURE" in def Signature : Operand { let PrintMethod = "printWebAssemblySignatureOperand"; } -} // OperandType = "OPERAND_SIGNATURE" let OperandType = "OPERAND_TYPEINDEX" in def TypeIndex : Operand; @@ -187,8 +214,8 @@ include "WebAssemblyInstrFormats.td" //===----------------------------------------------------------------------===// multiclass ARGUMENT { - let hasSideEffects = 1, isCodeGenOnly = 1, - Defs = [], Uses = [ARGUMENTS] in + let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [], + Uses = [ARGUMENTS] in defm ARGUMENT_#vt : I<(outs reg:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno), [(set (vt reg:$res), (WebAssemblyargument timm:$argno))]>; @@ -197,12 +224,12 @@ defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; -defm "": ARGUMENT; +defm "": ARGUMENT; // local.get and local.set are not generated by instruction selection; they // are implied by virtual register uses and defs. 
multiclass LOCAL { -let hasSideEffects = 0 in { + let hasSideEffects = 0 in { // COPY is not an actual instruction in wasm, but since we allow local.get and // local.set to be implicit during most of codegen, we can have a COPY which // is actually a no-op because all the work is done in the implied local.get @@ -267,7 +294,7 @@ defm "" : LOCAL; defm "" : LOCAL; defm "" : LOCAL; defm "" : LOCAL, Requires<[HasSIMD128]>; -defm "" : LOCAL, Requires<[HasExceptionHandling]>; +defm "" : LOCAL, Requires<[HasExceptionHandling]>; let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in { defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm), @@ -289,9 +316,20 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm), } // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), - (CONST_I32 tglobaladdr:$addr)>; + (CONST_I32 tglobaladdr:$addr)>, Requires<[IsNotPIC]>; + +def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), + (GLOBAL_GET_I32 tglobaladdr:$addr)>, Requires<[IsPIC]>; + +def : Pat<(i32 (WebAssemblywrapperPIC tglobaladdr:$addr)), + (CONST_I32 tglobaladdr:$addr)>, Requires<[IsPIC]>; + def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), - (CONST_I32 texternalsym:$addr)>; + (GLOBAL_GET_I32 texternalsym:$addr)>, Requires<[IsPIC]>; + +def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), + (CONST_I32 texternalsym:$addr)>, Requires<[IsNotPIC]>; + def : Pat<(i32 (WebAssemblywrapper mcsym:$sym)), (CONST_I32 mcsym:$sym)>; def : Pat<(i64 (WebAssemblywrapper mcsym:$sym)), (CONST_I64 mcsym:$sym)>; @@ -307,4 +345,5 @@ include "WebAssemblyInstrConv.td" include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrAtomics.td" include "WebAssemblyInstrSIMD.td" -include "WebAssemblyInstrExceptRef.td" +include "WebAssemblyInstrRef.td" +include "WebAssemblyInstrBulkMemory.td" diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index d5b63d643697..18250cf8ef85 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -1,9 +1,8 @@ // WebAssemblyInstrInteger.td-WebAssembly Integer codegen -------*- tablegen -*- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -122,10 +121,3 @@ def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs), (SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>; def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs), (SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>; - -// The legalizer inserts an unnecessary `and 1` to make input conform -// to getBooleanContents, which we can lower away. 
-def : Pat<(select (i32 (and I32:$cond, 1)), I32:$lhs, I32:$rhs), - (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>; -def : Pat<(select (i32 (and I32:$cond, 1)), I64:$lhs, I64:$rhs), - (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 518f81c61dc4..6916b165f970 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -1,9 +1,8 @@ // WebAssemblyInstrMemory.td-WebAssembly Memory codegen support -*- tablegen -*- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -53,7 +52,7 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off), // Defines atomic and non-atomic loads, regular and extending. multiclass WebAssemblyLoad { - let mayLoad = 1 in + let mayLoad = 1, UseNamedOperandTable = 1 in defm "": I<(outs rc:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), (outs), (ins P2Align:$p2align, offset32_op:$off), @@ -96,22 +95,13 @@ def : LoadPatImmOff; class LoadPatGlobalAddr : Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)))), - (inst 0, tglobaladdr:$off, I32:$addr)>; + (inst 0, tglobaladdr:$off, I32:$addr)>, Requires<[IsNotPIC]>; def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; -class LoadPatExternalSym : - Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), - (inst 0, texternalsym:$off, I32:$addr)>; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; - - // Select loads with just a constant offset. class LoadPatOffsetOnly : Pat<(ty (kind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>; @@ -123,21 +113,13 @@ def : LoadPatOffsetOnly; class LoadPatGlobalAddrOffOnly : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))), - (inst 0, tglobaladdr:$off, (CONST_I32 0))>; + (inst 0, tglobaladdr:$off, (CONST_I32 0))>, Requires<[IsNotPIC]>; def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; -class LoadPatExternSymOffOnly : - Pat<(ty (kind (WebAssemblywrapper texternalsym:$off))), - (inst 0, texternalsym:$off, (CONST_I32 0))>; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; - // Extending load. defm LOAD8_S_I32 : WebAssemblyLoad; defm LOAD8_U_I32 : WebAssemblyLoad; @@ -197,18 +179,6 @@ def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; - - // Select extending loads with just a constant offset. 
def : LoadPatOffsetOnly; def : LoadPatOffsetOnly; @@ -233,17 +203,6 @@ def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; - // Resolve "don't care" extending loads to zero-extending loads. This is // somewhat arbitrary, but zero-extending is conceptually simpler. @@ -270,11 +229,6 @@ def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; def : LoadPatGlobalAddr; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; -def : LoadPatExternalSym; // Select "don't care" extending loads with just a constant offset. def : LoadPatOffsetOnly; @@ -287,15 +241,10 @@ def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; def : LoadPatGlobalAddrOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; -def : LoadPatExternSymOffOnly; // Defines atomic and non-atomic stores, regular and truncating multiclass WebAssemblyStore { - let mayStore = 1 in + let mayStore = 1, UseNamedOperandTable = 1 in defm "" : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val), (outs), @@ -336,20 +285,12 @@ def : StorePatImmOff; class StorePatGlobalAddr : Pat<(kind ty:$val, (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off))), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>; + (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>, Requires<[IsNotPIC]>; def : StorePatGlobalAddr; def : StorePatGlobalAddr; def : StorePatGlobalAddr; def : StorePatGlobalAddr; -class StorePatExternalSym : - Pat<(kind ty:$val, (add I32:$addr, (WebAssemblywrapper texternalsym:$off))), - (inst 0, texternalsym:$off, I32:$addr, ty:$val)>; -def : StorePatExternalSym; -def : StorePatExternalSym; -def : StorePatExternalSym; -def : StorePatExternalSym; - // Select stores with just a constant offset. class StorePatOffsetOnly : Pat<(kind ty:$val, imm:$off), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>; @@ -360,20 +301,12 @@ def : StorePatOffsetOnly; class StorePatGlobalAddrOffOnly : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)), - (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>; + (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>, Requires<[IsNotPIC]>; def : StorePatGlobalAddrOffOnly; def : StorePatGlobalAddrOffOnly; def : StorePatGlobalAddrOffOnly; def : StorePatGlobalAddrOffOnly; -class StorePatExternSymOffOnly : - Pat<(kind ty:$val, (WebAssemblywrapper texternalsym:$off)), - (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>; -def : StorePatExternSymOffOnly; -def : StorePatExternSymOffOnly; -def : StorePatExternSymOffOnly; -def : StorePatExternSymOffOnly; - // Truncating store. defm STORE8_I32 : WebAssemblyStore; defm STORE16_I32 : WebAssemblyStore; @@ -405,11 +338,6 @@ def : StorePatGlobalAddr; def : StorePatGlobalAddr; def : StorePatGlobalAddr; def : StorePatGlobalAddr; -def : StorePatExternalSym; -def : StorePatExternalSym; -def : StorePatExternalSym; -def : StorePatExternalSym; -def : StorePatExternalSym; // Select truncating stores with just a constant offset. 
def : StorePatOffsetOnly; @@ -422,11 +350,6 @@ def : StorePatGlobalAddrOffOnly; def : StorePatGlobalAddrOffOnly; def : StorePatGlobalAddrOffOnly; def : StorePatGlobalAddrOffOnly; -def : StorePatExternSymOffOnly; -def : StorePatExternSymOffOnly; -def : StorePatExternSymOffOnly; -def : StorePatExternSymOffOnly; -def : StorePatExternSymOffOnly; // Current memory size. defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/lib/Target/WebAssembly/WebAssemblyInstrRef.td new file mode 100644 index 000000000000..afe89de60b36 --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -0,0 +1,25 @@ +// WebAssemblyInstrRef.td - WebAssembly reference type codegen --*- tablegen -*- +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// WebAssembly reference type operand codegen constructs. +/// +//===----------------------------------------------------------------------===// + +defm SELECT_EXNREF : I<(outs EXNREF:$dst), + (ins EXNREF:$lhs, EXNREF:$rhs, I32:$cond), + (outs), (ins), + [(set EXNREF:$dst, + (select I32:$cond, EXNREF:$lhs, EXNREF:$rhs))], + "exnref.select\t$dst, $lhs, $rhs, $cond", + "exnref.select", 0x1b>; + +def : Pat<(select (i32 (setne I32:$cond, 0)), EXNREF:$lhs, EXNREF:$rhs), + (SELECT_EXNREF EXNREF:$lhs, EXNREF:$rhs, I32:$cond)>; +def : Pat<(select (i32 (seteq I32:$cond, 0)), EXNREF:$lhs, EXNREF:$rhs), + (SELECT_EXNREF EXNREF:$rhs, EXNREF:$lhs, I32:$cond)>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 587515c5b299..dd8930f079b0 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1,9 +1,8 @@ // WebAssemblyInstrSIMD.td - WebAssembly SIMD codegen support -*- tablegen -*-// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -31,7 +30,7 @@ defm "" : ARGUMENT; // Constrained immediate argument types foreach SIZE = [8, 16] in def ImmI#SIZE : ImmLeaf; foreach SIZE = [2, 4, 8, 16, 32] in def LaneIdx#SIZE : ImmLeaf; @@ -42,12 +41,12 @@ def LaneIdx#SIZE : ImmLeaf; // Load: v128.load multiclass SIMDLoad { - let mayLoad = 1 in + let mayLoad = 1, UseNamedOperandTable = 1 in defm LOAD_#vec_t : - SIMD_I<(outs V128:$dst), (ins P2Align:$align, offset32_op:$off, I32:$addr), - (outs), (ins P2Align:$align, offset32_op:$off), [], - "v128.load\t$dst, ${off}(${addr})$align", - "v128.load\t$off$align", 0>; + SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + "v128.load\t$dst, ${off}(${addr})$p2align", + "v128.load\t$off$p2align", 0>; } foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { @@ -58,20 +57,18 @@ def : LoadPatNoOffset("LOAD_"#vec_t)>; def : LoadPatImmOff("LOAD_"#vec_t)>; def : LoadPatImmOff("LOAD_"#vec_t)>; def : LoadPatGlobalAddr("LOAD_"#vec_t)>; -def : LoadPatExternalSym("LOAD_"#vec_t)>; def : LoadPatOffsetOnly("LOAD_"#vec_t)>; def : LoadPatGlobalAddrOffOnly("LOAD_"#vec_t)>; -def : LoadPatExternSymOffOnly("LOAD_"#vec_t)>; } // Store: v128.store multiclass SIMDStore { - let mayStore = 1 in + let mayStore = 1, UseNamedOperandTable = 1 in defm STORE_#vec_t : - SIMD_I<(outs), (ins P2Align:$align, offset32_op:$off, I32:$addr, V128:$vec), - (outs), (ins P2Align:$align, offset32_op:$off), [], - "v128.store\t${off}(${addr})$align, $vec", - "v128.store\t$off$align", 1>; + SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, V128:$vec), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + "v128.store\t${off}(${addr})$p2align, $vec", + "v128.store\t$off$p2align", 1>; } foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { @@ -82,10 +79,8 @@ def : StorePatNoOffset("STORE_"#vec_t)>; def : StorePatImmOff("STORE_"#vec_t)>; def : StorePatImmOff("STORE_"#vec_t)>; def : StorePatGlobalAddr("STORE_"#vec_t)>; -def : StorePatExternalSym("STORE_"#vec_t)>; def : StorePatOffsetOnly("STORE_"#vec_t)>; def : StorePatGlobalAddrOffOnly("STORE_"#vec_t)>; -def : StorePatExternSymOffOnly("STORE_"#vec_t)>; } //===----------------------------------------------------------------------===// @@ -95,7 +90,7 @@ def : StorePatExternSymOffOnly("STORE_"#vec_t)>; // Constant: v128.const multiclass ConstVec { let isMoveImm = 1, isReMaterializable = 1, - Predicates = [HasSIMD128, HasUnimplementedSIMD128] in + Predicates = [HasSIMD128, HasUnimplementedSIMD128] in defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops, [(set V128:$dst, (vec_t pat))], "v128.const\t$dst, "#args, @@ -126,6 +121,7 @@ defm "" : ConstVec; +let IsCanonical = 1 in defm "" : ConstVec; defm "" : Splat; defm "" : Splat; +// scalar_to_vector leaves high lanes undefined, so can be a splat +class ScalarSplatPat : + Pat<(vec_t (scalar_to_vector (lane_t reg_t:$x))), + (!cast("SPLAT_"#vec_t) reg_t:$x)>; + +def : ScalarSplatPat; +def : ScalarSplatPat; +def : ScalarSplatPat; +def : ScalarSplatPat; +def : ScalarSplatPat; +def : ScalarSplatPat; + //===----------------------------------------------------------------------===// // Accessing lanes //===----------------------------------------------------------------------===// @@ -347,118 +356,6 @@ def : Pat<(vector_insert (v4f32 V128:$vec), F32:$x, undef), def : 
Pat<(vector_insert (v2f64 V128:$vec), F64:$x, undef), (REPLACE_LANE_v2f64 V128:$vec, 0, F64:$x)>; -// Arbitrary other BUILD_VECTOR patterns -def : Pat<(v16i8 (build_vector - (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3), - (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7), - (i32 I32:$x8), (i32 I32:$x9), (i32 I32:$x10), (i32 I32:$x11), - (i32 I32:$x12), (i32 I32:$x13), (i32 I32:$x14), (i32 I32:$x15) - )), - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (SPLAT_v16i8 (i32 I32:$x0))), - 1, I32:$x1 - )), - 2, I32:$x2 - )), - 3, I32:$x3 - )), - 4, I32:$x4 - )), - 5, I32:$x5 - )), - 6, I32:$x6 - )), - 7, I32:$x7 - )), - 8, I32:$x8 - )), - 9, I32:$x9 - )), - 10, I32:$x10 - )), - 11, I32:$x11 - )), - 12, I32:$x12 - )), - 13, I32:$x13 - )), - 14, I32:$x14 - )), - 15, I32:$x15 - ))>; -def : Pat<(v8i16 (build_vector - (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3), - (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7) - )), - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (SPLAT_v8i16 (i32 I32:$x0))), - 1, I32:$x1 - )), - 2, I32:$x2 - )), - 3, I32:$x3 - )), - 4, I32:$x4 - )), - 5, I32:$x5 - )), - 6, I32:$x6 - )), - 7, I32:$x7 - ))>; -def : Pat<(v4i32 (build_vector - (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3) - )), - (v4i32 (REPLACE_LANE_v4i32 - (v4i32 (REPLACE_LANE_v4i32 - (v4i32 (REPLACE_LANE_v4i32 - (v4i32 (SPLAT_v4i32 (i32 I32:$x0))), - 1, I32:$x1 - )), - 2, I32:$x2 - )), - 3, I32:$x3 - ))>; -def : Pat<(v2i64 (build_vector (i64 I64:$x0), (i64 I64:$x1))), - (v2i64 (REPLACE_LANE_v2i64 - (v2i64 (SPLAT_v2i64 (i64 I64:$x0))), 1, I64:$x1))>; -def : Pat<(v4f32 (build_vector - (f32 F32:$x0), (f32 F32:$x1), (f32 F32:$x2), (f32 F32:$x3) - )), - (v4f32 (REPLACE_LANE_v4f32 - (v4f32 (REPLACE_LANE_v4f32 - (v4f32 (REPLACE_LANE_v4f32 - (v4f32 (SPLAT_v4f32 (f32 F32:$x0))), - 1, F32:$x1 - )), - 2, F32:$x2 - )), - 3, F32:$x3 - ))>; -def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))), - (v2f64 (REPLACE_LANE_v2f64 - (v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>; - //===----------------------------------------------------------------------===// // Comparisons //===----------------------------------------------------------------------===// @@ -520,16 +417,18 @@ defm GE_U : SIMDConditionInt<"ge_u", SETUGE, 33>; defm GE : SIMDConditionFP<"ge", SETOGE, 69>; // Lower float comparisons that don't care about NaN to standard WebAssembly -// float comparisons. These instructions are generated in the target-independent -// expansion of unordered comparisons and ordered ne. 
-def : Pat<(v4i32 (seteq (v4f32 V128:$lhs), (v4f32 V128:$rhs))), - (v4i32 (EQ_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>; -def : Pat<(v4i32 (setne (v4f32 V128:$lhs), (v4f32 V128:$rhs))), - (v4i32 (NE_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>; -def : Pat<(v2i64 (seteq (v2f64 V128:$lhs), (v2f64 V128:$rhs))), - (v2i64 (EQ_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>; -def : Pat<(v2i64 (setne (v2f64 V128:$lhs), (v2f64 V128:$rhs))), - (v2i64 (NE_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>; +// float comparisons. These instructions are generated with nnan and in the +// target-independent expansion of unordered comparisons and ordered ne. +foreach nodes = [[seteq, EQ_v4f32], [setne, NE_v4f32], [setlt, LT_v4f32], + [setgt, GT_v4f32], [setle, LE_v4f32], [setge, GE_v4f32]] in +def : Pat<(v4i32 (nodes[0] (v4f32 V128:$lhs), (v4f32 V128:$rhs))), + (v4i32 (nodes[1] (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>; + +foreach nodes = [[seteq, EQ_v2f64], [setne, NE_v2f64], [setlt, LT_v2f64], + [setgt, GT_v2f64], [setle, LE_v2f64], [setge, GE_v2f64]] in +def : Pat<(v2i64 (nodes[0] (v2f64 V128:$lhs), (v2f64 V128:$rhs))), + (v2i64 (nodes[1] (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>; + //===----------------------------------------------------------------------===// // Bitwise operations @@ -628,6 +527,28 @@ defm ANYTRUE : SIMDReduce; // All lanes true: all_true defm ALLTRUE : SIMDReduce; +// Reductions already return 0 or 1, so and 1, setne 0, and seteq 1 +// can be folded out +foreach reduction = + [["int_wasm_anytrue", "ANYTRUE"], ["int_wasm_alltrue", "ALLTRUE"]] in +foreach ty = [v16i8, v8i16, v4i32, v2i64] in { +def : Pat<(i32 (and + (i32 (!cast(reduction[0]) (ty V128:$x))), + (i32 1) + )), + (i32 (!cast(reduction[1]#"_"#ty) (ty V128:$x)))>; +def : Pat<(i32 (setne + (i32 (!cast(reduction[0]) (ty V128:$x))), + (i32 0) + )), + (i32 (!cast(reduction[1]#"_"#ty) (ty V128:$x)))>; +def : Pat<(i32 (seteq + (i32 (!cast(reduction[0]) (ty V128:$x))), + (i32 1) + )), + (i32 (!cast(reduction[1]#"_"#ty) (ty V128:$x)))>; +} + //===----------------------------------------------------------------------===// // Bit shifts //===----------------------------------------------------------------------===// @@ -658,10 +579,16 @@ defm SHL : SIMDShiftInt; defm SHR_S : SIMDShiftInt; defm SHR_U : SIMDShiftInt; -// Truncate i64 shift operands to i32s -foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in +// Truncate i64 shift operands to i32s, except if they are already i32s +foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in { +def : Pat<(v2i64 (shifts[0] + (v2i64 V128:$vec), + (v2i64 (splat2 (i64 (sext I32:$x)))) + )), + (v2i64 (shifts[1] (v2i64 V128:$vec), (i32 I32:$x)))>; def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), (v2i64 (splat2 I64:$x)))), (v2i64 (shifts[1] (v2i64 V128:$vec), (I32_WRAP_I64 I64:$x)))>; +} // 2xi64 shifts with constant shift amounts are custom lowered to avoid wrapping def wasm_shift_t : SDTypeProfile<1, 2, diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp index ad838dfb574a..e92b34430272 100644 --- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp @@ -1,9 +1,8 @@ //=== WebAssemblyLateEHPrepare.cpp - WebAssembly Exception Preparation -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -16,29 +15,26 @@ #include "WebAssembly.h" #include "WebAssemblySubtarget.h" #include "WebAssemblyUtilities.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/WasmEHFuncInfo.h" #include "llvm/MC/MCAsmInfo.h" using namespace llvm; -#define DEBUG_TYPE "wasm-exception-prepare" +#define DEBUG_TYPE "wasm-late-eh-prepare" namespace { class WebAssemblyLateEHPrepare final : public MachineFunctionPass { StringRef getPassName() const override { - return "WebAssembly Prepare Exception"; + return "WebAssembly Late Prepare Exception"; } bool runOnMachineFunction(MachineFunction &MF) override; - - bool removeUnnecessaryUnreachables(MachineFunction &MF); + bool addCatches(MachineFunction &MF); bool replaceFuncletReturns(MachineFunction &MF); - bool hoistCatches(MachineFunction &MF); - bool addCatchAlls(MachineFunction &MF); - bool addRethrows(MachineFunction &MF); - bool ensureSingleBBTermPads(MachineFunction &MF); - bool mergeTerminatePads(MachineFunction &MF); - bool addCatchAllTerminatePads(MachineFunction &MF); + bool removeUnnecessaryUnreachables(MachineFunction &MF); + bool addExceptionExtraction(MachineFunction &MF); + bool restoreStackPointer(MachineFunction &MF); public: static char ID; // Pass identification, replacement for typeid @@ -112,48 +108,40 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) { return false; bool Changed = false; + if (MF.getFunction().hasPersonalityFn()) { + Changed |= addCatches(MF); + Changed |= replaceFuncletReturns(MF); + } Changed |= removeUnnecessaryUnreachables(MF); - Changed |= addRethrows(MF); - if (!MF.getFunction().hasPersonalityFn()) - return Changed; - Changed |= replaceFuncletReturns(MF); - Changed |= hoistCatches(MF); - Changed |= addCatchAlls(MF); - Changed |= ensureSingleBBTermPads(MF); - Changed |= mergeTerminatePads(MF); - Changed |= addCatchAllTerminatePads(MF); + if (MF.getFunction().hasPersonalityFn()) { + Changed |= addExceptionExtraction(MF); + Changed |= restoreStackPointer(MF); + } return Changed; } -bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables( - MachineFunction &MF) { +// Add catch instruction to beginning of catchpads and cleanuppads. +bool WebAssemblyLateEHPrepare::addCatches(MachineFunction &MF) { bool Changed = false; + const auto &TII = *MF.getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); for (auto &MBB : MF) { - for (auto &MI : MBB) { - if (!WebAssembly::isThrow(MI)) - continue; + if (MBB.isEHPad()) { Changed = true; - - // The instruction after the throw should be an unreachable or a branch to - // another BB that should eventually lead to an unreachable. Delete it - // because throw itself is a terminator, and also delete successors if - // any. 
- MBB.erase(std::next(MachineBasicBlock::iterator(MI)), MBB.end()); - SmallVector Succs(MBB.succ_begin(), - MBB.succ_end()); - for (auto *Succ : Succs) - MBB.removeSuccessor(Succ); - eraseDeadBBsAndChildren(Succs); + auto InsertPos = MBB.begin(); + if (InsertPos->isEHLabel()) // EH pad starts with an EH label + ++InsertPos; + unsigned DstReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); + BuildMI(MBB, InsertPos, MBB.begin()->getDebugLoc(), + TII.get(WebAssembly::CATCH), DstReg); } } - return Changed; } bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) { bool Changed = false; const auto &TII = *MF.getSubtarget().getInstrInfo(); - auto *EHInfo = MF.getWasmEHFuncInfo(); for (auto &MBB : MF) { auto Pos = MBB.getFirstTerminator(); @@ -172,15 +160,17 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) { Changed = true; break; } - case WebAssembly::CLEANUPRET: { - // Replace a cleanupret with a rethrow - if (EHInfo->hasThrowUnwindDest(&MBB)) - BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW)) - .addMBB(EHInfo->getThrowUnwindDest(&MBB)); - else - BuildMI(MBB, TI, TI->getDebugLoc(), - TII.get(WebAssembly::RETHROW_TO_CALLER)); - + case WebAssembly::CLEANUPRET: + case WebAssembly::RETHROW_IN_CATCH: { + // Replace a cleanupret/rethrow_in_catch with a rethrow + auto *EHPad = getMatchingEHPad(TI); + auto CatchPos = EHPad->begin(); + if (CatchPos->isEHLabel()) // EH pad starts with an EH label + ++CatchPos; + MachineInstr *Catch = &*CatchPos; + unsigned ExnReg = Catch->getOperand(0).getReg(); + BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW)) + .addReg(ExnReg); TI->eraseFromParent(); Changed = true; break; @@ -190,233 +180,208 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) { return Changed; } -// Hoist catch instructions to the beginning of their matching EH pad BBs in -// case, -// (1) catch instruction is not the first instruction in EH pad. -// ehpad: -// some_other_instruction -// ... -// %exn = catch 0 -// (2) catch instruction is in a non-EH pad BB. For example, -// ehpad: -// br bb0 -// bb0: -// %exn = catch 0 -bool WebAssemblyLateEHPrepare::hoistCatches(MachineFunction &MF) { - bool Changed = false; - SmallVector Catches; - for (auto &MBB : MF) - for (auto &MI : MBB) - if (WebAssembly::isCatch(MI)) - Catches.push_back(&MI); - - for (auto *Catch : Catches) { - MachineBasicBlock *EHPad = getMatchingEHPad(Catch); - assert(EHPad && "No matching EH pad for catch"); - if (EHPad->begin() == Catch) - continue; - Changed = true; - EHPad->insert(EHPad->begin(), Catch->removeFromParent()); - } - return Changed; -} - -// Add catch_all to beginning of cleanup pads. -bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) { +bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables( + MachineFunction &MF) { bool Changed = false; - const auto &TII = *MF.getSubtarget().getInstrInfo(); - for (auto &MBB : MF) { - if (!MBB.isEHPad()) - continue; - // This runs after hoistCatches(), so we assume that if there is a catch, - // that should be the first instruction in an EH pad. 
- if (!WebAssembly::isCatch(*MBB.begin())) { - Changed = true; - BuildMI(MBB, MBB.begin(), MBB.begin()->getDebugLoc(), - TII.get(WebAssembly::CATCH_ALL)); - } - } - return Changed; -} - -// Add a 'rethrow' instruction after __cxa_rethrow() call -bool WebAssemblyLateEHPrepare::addRethrows(MachineFunction &MF) { - bool Changed = false; - const auto &TII = *MF.getSubtarget().getInstrInfo(); - auto *EHInfo = MF.getWasmEHFuncInfo(); - - for (auto &MBB : MF) for (auto &MI : MBB) { - // Check if it is a call to __cxa_rethrow() - if (!MI.isCall()) + if (MI.getOpcode() != WebAssembly::THROW && + MI.getOpcode() != WebAssembly::RETHROW) continue; - MachineOperand &CalleeOp = MI.getOperand(0); - if (!CalleeOp.isGlobal() || - CalleeOp.getGlobal()->getName() != WebAssembly::CxaRethrowFn) - continue; - - // Now we have __cxa_rethrow() call Changed = true; - auto InsertPt = std::next(MachineBasicBlock::iterator(MI)); - while (InsertPt != MBB.end() && InsertPt->isLabel()) // Skip EH_LABELs - ++InsertPt; - MachineInstr *Rethrow = nullptr; - if (EHInfo->hasThrowUnwindDest(&MBB)) - Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(), - TII.get(WebAssembly::RETHROW)) - .addMBB(EHInfo->getThrowUnwindDest(&MBB)); - else - Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(), - TII.get(WebAssembly::RETHROW_TO_CALLER)); - // Because __cxa_rethrow does not return, the instruction after the - // rethrow should be an unreachable or a branch to another BB that should - // eventually lead to an unreachable. Delete it because rethrow itself is - // a terminator, and also delete non-EH pad successors if any. - MBB.erase(std::next(MachineBasicBlock::iterator(Rethrow)), MBB.end()); - SmallVector NonPadSuccessors; - for (auto *Succ : MBB.successors()) + // The instruction after the throw should be an unreachable or a branch to + // another BB that should eventually lead to an unreachable. Delete it + // because throw itself is a terminator, and also delete successors if + // any. + MBB.erase(std::next(MI.getIterator()), MBB.end()); + SmallVector Succs(MBB.succ_begin(), + MBB.succ_end()); + for (auto *Succ : Succs) if (!Succ->isEHPad()) - NonPadSuccessors.push_back(Succ); - for (auto *Succ : NonPadSuccessors) - MBB.removeSuccessor(Succ); - eraseDeadBBsAndChildren(NonPadSuccessors); + MBB.removeSuccessor(Succ); + eraseDeadBBsAndChildren(Succs); } + } + return Changed; } -// Terminate pads are an single-BB EH pad in the form of -// termpad: -// %exn = catch 0 -// call @__clang_call_terminate(%exn) -// unreachable -// (There can be local.set and local.gets before the call if we didn't run -// RegStackify) -// But code transformations can change or add more control flow, so the call to -// __clang_call_terminate() function may not be in the original EH pad anymore. -// This ensures every terminate pad is a single BB in the form illustrated -// above. -bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) { +// Wasm uses 'br_on_exn' instruction to check the tag of an exception. It takes +// exnref type object returned by 'catch', and branches to the destination if it +// matches a given tag. We currently use __cpp_exception symbol to represent the +// tag for all C++ exceptions. +// +// block $l (result i32) +// ... +// ;; exnref $e is on the stack at this point +// br_on_exn $l $e ;; branch to $l with $e's arguments +// ... +// end +// ;; Here we expect the extracted values are on top of the wasm value stack +// ... Handle exception using values ... 
+// +// br_on_exn takes an exnref object and branches if it matches the given tag. +// There can be multiple br_on_exn instructions if we want to match for another +// tag, but for now we only test for __cpp_exception tag, and if it does not +// match, i.e., it is a foreign exception, we rethrow it. +// +// In the destination BB that's the target of br_on_exn, extracted exception +// values (in C++'s case a single i32, which represents an exception pointer) +// are placed on top of the wasm stack. Because we can't model wasm stack in +// LLVM instruction, we use 'extract_exception' pseudo instruction to retrieve +// it. The pseudo instruction will be deleted later. +bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) { const auto &TII = *MF.getSubtarget().getInstrInfo(); + auto *EHInfo = MF.getWasmEHFuncInfo(); + SmallVector ExtractInstrs; + SmallVector ToDelete; + for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (MI.getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32) { + if (MI.getOperand(0).isDead()) + ToDelete.push_back(&MI); + else + ExtractInstrs.push_back(&MI); + } + } + } + bool Changed = !ToDelete.empty() || !ExtractInstrs.empty(); + for (auto *MI : ToDelete) + MI->eraseFromParent(); + if (ExtractInstrs.empty()) + return Changed; - // Find calls to __clang_call_terminate() - SmallVector ClangCallTerminateCalls; - for (auto &MBB : MF) - for (auto &MI : MBB) + // Find terminate pads. + SmallSet TerminatePads; + for (auto &MBB : MF) { + for (auto &MI : MBB) { if (MI.isCall()) { const MachineOperand &CalleeOp = MI.getOperand(0); if (CalleeOp.isGlobal() && CalleeOp.getGlobal()->getName() == WebAssembly::ClangCallTerminateFn) - ClangCallTerminateCalls.push_back(&MI); + TerminatePads.insert(getMatchingEHPad(&MI)); } - - bool Changed = false; - for (auto *Call : ClangCallTerminateCalls) { - MachineBasicBlock *EHPad = getMatchingEHPad(Call); - assert(EHPad && "No matching EH pad for catch"); - - // If it is already the form we want, skip it - if (Call->getParent() == EHPad && - Call->getNextNode()->getOpcode() == WebAssembly::UNREACHABLE) - continue; - - // In case the __clang_call_terminate() call is not in its matching EH pad, - // move the call to the end of EH pad and add an unreachable instruction - // after that. Delete all successors and their children if any, because here - // the program terminates. - Changed = true; - MachineInstr *Catch = &*EHPad->begin(); - // This runs after hoistCatches(), so catch instruction should be at the top - assert(WebAssembly::isCatch(*Catch)); - // Takes the result register of the catch instruction as argument. There may - // have been some other local.set/local.gets in between, but at this point - // we don't care. - Call->getOperand(1).setReg(Catch->getOperand(0).getReg()); - auto InsertPos = std::next(MachineBasicBlock::iterator(Catch)); - EHPad->insert(InsertPos, Call->removeFromParent()); - BuildMI(*EHPad, InsertPos, Call->getDebugLoc(), - TII.get(WebAssembly::UNREACHABLE)); - EHPad->erase(InsertPos, EHPad->end()); - SmallVector Succs(EHPad->succ_begin(), - EHPad->succ_end()); - for (auto *Succ : Succs) - EHPad->removeSuccessor(Succ); - eraseDeadBBsAndChildren(Succs); + } } - return Changed; -} -// In case there are multiple terminate pads, merge them into one for code size. -// This runs after ensureSingleBBTermPads() and assumes every terminate pad is a -// single BB. -// In principle this violates EH scope relationship because it can merge -// multiple inner EH scopes, each of which is in different outer EH scope. 
But -// getEHScopeMembership() function will not be called after this, so it is fine. -bool WebAssemblyLateEHPrepare::mergeTerminatePads(MachineFunction &MF) { - SmallVector TermPads; - for (auto &MBB : MF) - if (WebAssembly::isCatchTerminatePad(MBB)) - TermPads.push_back(&MBB); - if (TermPads.empty()) - return false; - - MachineBasicBlock *UniqueTermPad = TermPads.front(); - for (auto *TermPad : - llvm::make_range(std::next(TermPads.begin()), TermPads.end())) { - SmallVector Preds(TermPad->pred_begin(), - TermPad->pred_end()); - for (auto *Pred : Preds) - Pred->replaceSuccessor(TermPad, UniqueTermPad); - TermPad->eraseFromParent(); + for (auto *Extract : ExtractInstrs) { + MachineBasicBlock *EHPad = getMatchingEHPad(Extract); + assert(EHPad && "No matching EH pad for extract_exception"); + auto CatchPos = EHPad->begin(); + if (CatchPos->isEHLabel()) // EH pad starts with an EH label + ++CatchPos; + MachineInstr *Catch = &*CatchPos; + + if (Catch->getNextNode() != Extract) + EHPad->insert(Catch->getNextNode(), Extract->removeFromParent()); + + // - Before: + // ehpad: + // %exnref:exnref = catch + // %exn:i32 = extract_exception + // ... use exn ... + // + // - After: + // ehpad: + // %exnref:exnref = catch + // br_on_exn %thenbb, $__cpp_exception, %exnref + // br %elsebb + // elsebb: + // rethrow + // thenbb: + // %exn:i32 = extract_exception + // ... use exn ... + unsigned ExnReg = Catch->getOperand(0).getReg(); + auto *ThenMBB = MF.CreateMachineBasicBlock(); + auto *ElseMBB = MF.CreateMachineBasicBlock(); + MF.insert(std::next(MachineFunction::iterator(EHPad)), ElseMBB); + MF.insert(std::next(MachineFunction::iterator(ElseMBB)), ThenMBB); + ThenMBB->splice(ThenMBB->end(), EHPad, Extract, EHPad->end()); + ThenMBB->transferSuccessors(EHPad); + EHPad->addSuccessor(ThenMBB); + EHPad->addSuccessor(ElseMBB); + + DebugLoc DL = Extract->getDebugLoc(); + const char *CPPExnSymbol = MF.createExternalSymbolName("__cpp_exception"); + BuildMI(EHPad, DL, TII.get(WebAssembly::BR_ON_EXN)) + .addMBB(ThenMBB) + .addExternalSymbol(CPPExnSymbol) + .addReg(ExnReg); + BuildMI(EHPad, DL, TII.get(WebAssembly::BR)).addMBB(ElseMBB); + + // When this is a terminate pad with __clang_call_terminate() call, we don't + // rethrow it anymore and call __clang_call_terminate() with a nullptr + // argument, which will call std::terminate(). + // + // - Before: + // ehpad: + // %exnref:exnref = catch + // %exn:i32 = extract_exception + // call @__clang_call_terminate(%exn) + // unreachable + // + // - After: + // ehpad: + // %exnref:exnref = catch + // br_on_exn %thenbb, $__cpp_exception, %exnref + // br %elsebb + // elsebb: + // call @__clang_call_terminate(0) + // unreachable + // thenbb: + // %exn:i32 = extract_exception + // call @__clang_call_terminate(%exn) + // unreachable + if (TerminatePads.count(EHPad)) { + Function *ClangCallTerminateFn = + MF.getFunction().getParent()->getFunction( + WebAssembly::ClangCallTerminateFn); + assert(ClangCallTerminateFn && + "There is no __clang_call_terminate() function"); + BuildMI(ElseMBB, DL, TII.get(WebAssembly::CALL_VOID)) + .addGlobalAddress(ClangCallTerminateFn) + .addImm(0); + BuildMI(ElseMBB, DL, TII.get(WebAssembly::UNREACHABLE)); + + } else { + BuildMI(ElseMBB, DL, TII.get(WebAssembly::RETHROW)).addReg(ExnReg); + if (EHInfo->hasEHPadUnwindDest(EHPad)) + ElseMBB->addSuccessor(EHInfo->getEHPadUnwindDest(EHPad)); + } } + return true; } -// Terminate pads are cleanup pads, so they should start with a 'catch_all' -// instruction. 
But in the Itanium model, when we have a C++ exception object, -// we pass them to __clang_call_terminate function, which calls __cxa_end_catch -// with the passed exception pointer and then std::terminate. This is the reason -// that terminate pads are generated with not a catch_all but a catch -// instruction in clang and earlier llvm passes. Here we append a terminate pad -// with a catch_all after each existing terminate pad so we can also catch -// foreign exceptions. For every terminate pad: -// %exn = catch 0 -// call @__clang_call_terminate(%exn) -// unreachable -// We append this BB right after that: -// catch_all -// call @std::terminate() -// unreachable -bool WebAssemblyLateEHPrepare::addCatchAllTerminatePads(MachineFunction &MF) { - const auto &TII = *MF.getSubtarget().getInstrInfo(); - SmallVector TermPads; - for (auto &MBB : MF) - if (WebAssembly::isCatchTerminatePad(MBB)) - TermPads.push_back(&MBB); - if (TermPads.empty()) +// After the stack is unwound due to a thrown exception, the __stack_pointer +// global can point to an invalid address. This inserts instructions that +// restore __stack_pointer global. +bool WebAssemblyLateEHPrepare::restoreStackPointer(MachineFunction &MF) { + const auto *FrameLowering = static_cast( + MF.getSubtarget().getFrameLowering()); + if (!FrameLowering->needsPrologForEH(MF)) return false; + bool Changed = false; - Function *StdTerminateFn = - MF.getFunction().getParent()->getFunction(WebAssembly::StdTerminateFn); - assert(StdTerminateFn && "There is no std::terminate() function"); - for (auto *CatchTermPad : TermPads) { - DebugLoc DL = CatchTermPad->findDebugLoc(CatchTermPad->begin()); - auto *CatchAllTermPad = MF.CreateMachineBasicBlock(); - MF.insert(std::next(MachineFunction::iterator(CatchTermPad)), - CatchAllTermPad); - CatchAllTermPad->setIsEHPad(); - BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::CATCH_ALL)); - BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::CALL_VOID)) - .addGlobalAddress(StdTerminateFn); - BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::UNREACHABLE)); + for (auto &MBB : MF) { + if (!MBB.isEHPad()) + continue; + Changed = true; - // Actually this CatchAllTermPad (new terminate pad with a catch_all) is not - // a successor of an existing terminate pad. CatchAllTermPad should have all - // predecessors CatchTermPad has instead. This is a hack to force - // CatchAllTermPad be always sorted right after CatchTermPad; the correct - // predecessor-successor relationships will be restored in CFGStackify pass. - CatchTermPad->addSuccessor(CatchAllTermPad); + // Insert __stack_pointer restoring instructions at the beginning of each EH + // pad, after the catch instruction. Here it is safe to assume that SP32 + // holds the latest value of __stack_pointer, because the only exception for + // this case is when a function uses the red zone, but that only happens + // with leaf functions, and we don't restore __stack_pointer in leaf + // functions anyway. 
+ auto InsertPos = MBB.begin(); + if (InsertPos->isEHLabel()) // EH pad starts with an EH label + ++InsertPos; + if (InsertPos->getOpcode() == WebAssembly::CATCH) + ++InsertPos; + FrameLowering->writeSPToGlobal(WebAssembly::SP32, MF, MBB, InsertPos, + MBB.begin()->getDebugLoc()); } - return true; + return Changed; } diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp index c9a3527d3fbd..34a8195ac4b4 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 0491f71cea7f..960d5134f6e9 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -1,9 +1,8 @@ //=== WebAssemblyLowerEmscriptenEHSjLj.cpp - Lower exceptions for Emscripten =// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -240,16 +239,16 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass { bool EnableEH; // Enable exception handling bool EnableSjLj; // Enable setjmp/longjmp handling - GlobalVariable *ThrewGV; - GlobalVariable *ThrewValueGV; - Function *GetTempRet0Func; - Function *SetTempRet0Func; - Function *ResumeF; - Function *EHTypeIDF; - Function *EmLongjmpF; - Function *EmLongjmpJmpbufF; - Function *SaveSetjmpF; - Function *TestSetjmpF; + GlobalVariable *ThrewGV = nullptr; + GlobalVariable *ThrewValueGV = nullptr; + Function *GetTempRet0Func = nullptr; + Function *SetTempRet0Func = nullptr; + Function *ResumeF = nullptr; + Function *EHTypeIDF = nullptr; + Function *EmLongjmpF = nullptr; + Function *EmLongjmpJmpbufF = nullptr; + Function *SaveSetjmpF = nullptr; + Function *TestSetjmpF = nullptr; // __cxa_find_matching_catch_N functions. // Indexed by the number of clauses in an original landingpad instruction. 
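// A minimal sketch, assuming only standard C++11, of the refactoring in the
// hunks around this point: initialization moves from the constructor's
// init-list onto the member declarations (default member initializers), so
// the constructor below can shrink. ExamplePass and its members are
// hypothetical names, not types from this patch.
struct ExamplePass {
  int *ResumeF = nullptr;   // default member initializer; every constructor
  int *EHTypeIDF = nullptr; // now starts these members as nullptr
  ExamplePass() {}          // no long initializer list needed anymore
};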
@@ -282,11 +281,7 @@ public:
   static char ID;
   WebAssemblyLowerEmscriptenEHSjLj(bool EnableEH = true, bool EnableSjLj = true)
-      : ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj),
-        ThrewGV(nullptr), ThrewValueGV(nullptr), GetTempRet0Func(nullptr),
-        SetTempRet0Func(nullptr), ResumeF(nullptr), EHTypeIDF(nullptr),
-        EmLongjmpF(nullptr), EmLongjmpJmpbufF(nullptr), SaveSetjmpF(nullptr),
-        TestSetjmpF(nullptr) {
+      : ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj) {
     EHWhitelistSet.insert(EHWhitelist.begin(), EHWhitelist.end());
   }
   bool runOnModule(Module &M) override;
@@ -339,11 +334,12 @@ static bool canThrow(const Value *V) {
 // which will generate an import and assumes that it will exist at link time.
 static GlobalVariable *getGlobalVariableI32(Module &M, IRBuilder<> &IRB,
                                             const char *Name) {
-  if (M.getNamedGlobal(Name))
-    report_fatal_error(Twine("variable name is reserved: ") + Name);
-  return new GlobalVariable(M, IRB.getInt32Ty(), false,
-                            GlobalValue::ExternalLinkage, nullptr, Name);
+  auto *GV =
+      dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, IRB.getInt32Ty()));
+  if (!GV)
+    report_fatal_error(Twine("unable to create global: ") + Name);
+
+  return GV;
 }
 // Simple function name mangler.
@@ -433,8 +429,8 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
   // No attributes for the callee pointer.
   ArgAttributes.push_back(AttributeSet());
   // Copy the argument attributes from the original
-  for (unsigned i = 0, e = CI->getNumArgOperands(); i < e; ++i)
-    ArgAttributes.push_back(InvokeAL.getParamAttributes(i));
+  for (unsigned I = 0, E = CI->getNumArgOperands(); I < E; ++I)
+    ArgAttributes.push_back(InvokeAL.getParamAttributes(I));
   // Reconstruct the AttributesList based on the vector we constructed.
   AttributeList NewCallAL =
@@ -446,7 +442,8 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
   // Post-invoke
   // %__THREW__.val = __THREW__; __THREW__ = 0;
-  Value *Threw = IRB.CreateLoad(ThrewGV, ThrewGV->getName() + ".val");
+  Value *Threw =
+      IRB.CreateLoad(IRB.getInt32Ty(), ThrewGV, ThrewGV->getName() + ".val");
   IRB.CreateStore(IRB.getInt32(0), ThrewGV);
   return Threw;
 }
@@ -488,6 +485,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M,
   if (CalleeF->isIntrinsic())
     return false;
+  // Attempting to transform inline assembly will result in something like:
+  //     call void @__invoke_void(void ()* asm ...)
+  // which is invalid because inline assembly blocks do not have addresses
+  // and can't be passed by pointer. The result is a crash with illegal IR.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
   // The reason we include malloc/free here is to exclude the malloc/free
   // calls generated in setjmp prep / cleanup routines.
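A minimal sketch of the getOrInsertGlobal pattern adopted in the getGlobalVariableI32 hunk above. If the module already declares Name with a different type, getOrInsertGlobal returns a bitcasted constant expression rather than a GlobalVariable, so the dyn_cast yields null and the error path fires instead of silently reusing a mistyped global. The Module/IRBuilder calls are the standard LLVM ones; the wrapper name is invented for the example:

#include "llvm/ADT/Twine.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

static GlobalVariable *getOrCreateI32Global(Module &M, const char *Name) {
  IRBuilder<> IRB(M.getContext());
  // Returns the existing global if one of the right type is present,
  // creates an external i32 global otherwise.
  auto *GV = dyn_cast<GlobalVariable>(
      M.getOrInsertGlobal(Name, IRB.getInt32Ty()));
  if (!GV) // A type clash produced a ConstantExpr cast instead.
    report_fatal_error(Twine("unable to create global: ") + Name);
  return GV;
}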
Function *SetjmpF = M.getFunction("setjmp"); @@ -549,8 +553,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp( BasicBlock *ElseBB1 = BasicBlock::Create(C, "if.else1", F); BasicBlock *EndBB1 = BasicBlock::Create(C, "if.end", F); Value *ThrewCmp = IRB.CreateICmpNE(Threw, IRB.getInt32(0)); - Value *ThrewValue = - IRB.CreateLoad(ThrewValueGV, ThrewValueGV->getName() + ".val"); + Value *ThrewValue = IRB.CreateLoad(IRB.getInt32Ty(), ThrewValueGV, + ThrewValueGV->getName() + ".val"); Value *ThrewValueCmp = IRB.CreateICmpNE(ThrewValue, IRB.getInt32(0)); Value *Cmp1 = IRB.CreateAnd(ThrewCmp, ThrewValueCmp, "cmp1"); IRB.CreateCondBr(Cmp1, ThenBB1, ElseBB1); @@ -562,8 +566,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp( BasicBlock *EndBB2 = BasicBlock::Create(C, "if.end2", F); Value *ThrewInt = IRB.CreateIntToPtr(Threw, Type::getInt32PtrTy(C), Threw->getName() + ".i32p"); - Value *LoadedThrew = - IRB.CreateLoad(ThrewInt, ThrewInt->getName() + ".loaded"); + Value *LoadedThrew = IRB.CreateLoad(IRB.getInt32Ty(), ThrewInt, + ThrewInt->getName() + ".loaded"); Value *ThenLabel = IRB.CreateCall( TestSetjmpF, {LoadedThrew, SetjmpTable, SetjmpTableSize}, "label"); Value *Cmp2 = IRB.CreateICmpEQ(ThenLabel, IRB.getInt32(0)); @@ -606,11 +610,11 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) { ++UI; SSA.Initialize(I.getType(), I.getName()); SSA.AddAvailableValue(&BB, &I); - Instruction *User = cast(U.getUser()); + auto *User = cast(U.getUser()); if (User->getParent() == &BB) continue; - if (PHINode *UserPN = dyn_cast(User)) + if (auto *UserPN = dyn_cast(User)) if (UserPN->getIncomingBlock(U) == &BB) continue; @@ -769,7 +773,8 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { // This can't throw, and we don't need this invoke, just replace it with a // call+branch SmallVector Args(II->arg_begin(), II->arg_end()); - CallInst *NewCall = IRB.CreateCall(II->getCalledValue(), Args); + CallInst *NewCall = + IRB.CreateCall(II->getFunctionType(), II->getCalledValue(), Args); NewCall->takeName(II); NewCall->setCallingConv(II->getCallingConv()); NewCall->setDebugLoc(II->getDebugLoc()); @@ -836,15 +841,15 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { for (LandingPadInst *LPI : LandingPads) { IRB.SetInsertPoint(LPI); SmallVector FMCArgs; - for (unsigned i = 0, e = LPI->getNumClauses(); i < e; ++i) { - Constant *Clause = LPI->getClause(i); + for (unsigned I = 0, E = LPI->getNumClauses(); I < E; ++I) { + Constant *Clause = LPI->getClause(I); // As a temporary workaround for the lack of aggregate varargs support // in the interface between JS and wasm, break out filter operands into // their component elements. 
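The CreateLoad changes in the two hunks above are part of a wider LLVM migration away from loads that infer their result type from the pointer operand's pointee type. A before/after sketch, assuming the same i32 globals the pass uses:

#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

Value *loadI32Global(IRBuilder<> &IRB, GlobalVariable *GV) {
  // Old, pointee-typed form (being phased out):
  //   IRB.CreateLoad(GV, GV->getName() + ".val");
  // New form: the loaded value type is stated explicitly.
  return IRB.CreateLoad(IRB.getInt32Ty(), GV, GV->getName() + ".val");
}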
- if (LPI->isFilter(i)) { + if (LPI->isFilter(I)) { auto *ATy = cast(Clause->getType()); - for (unsigned j = 0, e = ATy->getNumElements(); j < e; ++j) { - Value *EV = IRB.CreateExtractValue(Clause, makeArrayRef(j), "filter"); + for (unsigned J = 0, E = ATy->getNumElements(); J < E; ++J) { + Value *EV = IRB.CreateExtractValue(Clause, makeArrayRef(J), "filter"); FMCArgs.push_back(EV); } } else @@ -954,8 +959,8 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { BBs.push_back(&BB); // BBs.size() will change within the loop, so we query it every time - for (unsigned i = 0; i < BBs.size(); i++) { - BasicBlock *BB = BBs[i]; + for (unsigned I = 0; I < BBs.size(); I++) { + BasicBlock *BB = BBs[I]; for (Instruction &I : *BB) { assert(!isa(&I)); auto *CI = dyn_cast(&I); @@ -1028,9 +1033,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { // switch case). 0 means a longjmp that is not ours to handle, needs a // rethrow. Otherwise the index is the same as the index in P+1 (to avoid // 0). - for (unsigned i = 0; i < SetjmpRetPHIs.size(); i++) { - SI->addCase(IRB.getInt32(i + 1), SetjmpRetPHIs[i]->getParent()); - SetjmpRetPHIs[i]->addIncoming(LongjmpResult, EndBB); + for (unsigned I = 0; I < SetjmpRetPHIs.size(); I++) { + SI->addCase(IRB.getInt32(I + 1), SetjmpRetPHIs[I]->getParent()); + SetjmpRetPHIs[I]->addIncoming(LongjmpResult, EndBB); } // We are splitting the block here, and must continue to find other calls @@ -1077,7 +1082,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { Use &U = *UI; // Increment the iterator before removing the use from the list. ++UI; - if (Instruction *I = dyn_cast(U.getUser())) + if (auto *I = dyn_cast(U.getUser())) if (I->getParent() != &EntryBB) SetjmpTableSSA.RewriteUse(U); } @@ -1085,7 +1090,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { UI != UE;) { Use &U = *UI; ++UI; - if (Instruction *I = dyn_cast(U.getUser())) + if (auto *I = dyn_cast(U.getUser())) if (I->getParent() != &EntryBB) SetjmpTableSizeSSA.RewriteUse(U); } diff --git a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp index 84c877cb8d02..494d3fadbc8c 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyLowerGlobalDtors.cpp - Lower @llvm.global_dtors --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -62,7 +61,7 @@ bool LowerGlobalDtors::runOnModule(Module &M) { LLVM_DEBUG(dbgs() << "********** Lower Global Destructors **********\n"); GlobalVariable *GV = M.getGlobalVariable("llvm.global_dtors"); - if (!GV) + if (!GV || !GV->hasInitializer()) return false; const ConstantArray *InitList = dyn_cast(GV->getInitializer()); @@ -70,7 +69,7 @@ bool LowerGlobalDtors::runOnModule(Module &M) { return false; // Sanity-check @llvm.global_dtor's type. 
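One step of runSjLjOnFunction above is worth restating: each setjmp call site I gets switch case I+1, with case 0 reserved for a longjmp that is not ours to handle and must be rethrown to an enclosing frame. A sketch of that dispatch wiring with the same IR APIs; the function name is invented for the example:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hook every setjmp return point into the longjmp dispatch switch.
// SetjmpRetPHIs[I] receives the longjmp payload when control re-enters
// call site I; cases start at 1 because 0 means "rethrow".
static void addDispatchCases(SwitchInst *SI, Value *LongjmpResult,
                             BasicBlock *EndBB, IRBuilder<> &IRB,
                             ArrayRef<PHINode *> SetjmpRetPHIs) {
  for (unsigned I = 0; I < SetjmpRetPHIs.size(); I++) {
    SI->addCase(IRB.getInt32(I + 1), SetjmpRetPHIs[I]->getParent());
    SetjmpRetPHIs[I]->addIncoming(LongjmpResult, EndBB);
  }
}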
- StructType *ETy = dyn_cast(InitList->getType()->getElementType()); + auto *ETy = dyn_cast(InitList->getType()->getElementType()); if (!ETy || ETy->getNumElements() != 3 || !ETy->getTypeAtIndex(0U)->isIntegerTy() || !ETy->getTypeAtIndex(1U)->isPointerTy() || @@ -81,11 +80,11 @@ bool LowerGlobalDtors::runOnModule(Module &M) { // associated symbol. std::map>> DtorFuncs; for (Value *O : InitList->operands()) { - ConstantStruct *CS = dyn_cast(O); + auto *CS = dyn_cast(O); if (!CS) continue; // Malformed. - ConstantInt *Priority = dyn_cast(CS->getOperand(0)); + auto *Priority = dyn_cast(CS->getOperand(0)); if (!Priority) continue; // Malformed. uint16_t PriorityValue = Priority->getLimitedValue(UINT16_MAX); @@ -110,10 +109,11 @@ bool LowerGlobalDtors::runOnModule(Module &M) { FunctionType::get(Type::getVoidTy(C), AtExitFuncArgs, /*isVarArg=*/false); - Type *AtExitArgs[] = {PointerType::get(AtExitFuncTy, 0), VoidStar, VoidStar}; - FunctionType *AtExitTy = FunctionType::get(Type::getInt32Ty(C), AtExitArgs, - /*isVarArg=*/false); - Constant *AtExit = M.getOrInsertFunction("__cxa_atexit", AtExitTy); + FunctionCallee AtExit = M.getOrInsertFunction( + "__cxa_atexit", + FunctionType::get(Type::getInt32Ty(C), + {PointerType::get(AtExitFuncTy, 0), VoidStar, VoidStar}, + /*isVarArg=*/false)); // Declare __dso_local. Constant *DsoHandle = M.getNamedValue("__dso_handle"); @@ -143,13 +143,13 @@ bool LowerGlobalDtors::runOnModule(Module &M) { : Twine()), &M); BasicBlock *BB = BasicBlock::Create(C, "body", CallDtors); + FunctionType *VoidVoid = FunctionType::get(Type::getVoidTy(C), + /*isVarArg=*/false); for (auto Dtor : AssociatedAndMore.second) - CallInst::Create(Dtor, "", BB); + CallInst::Create(VoidVoid, Dtor, "", BB); ReturnInst::Create(C, BB); - FunctionType *VoidVoid = FunctionType::get(Type::getVoidTy(C), - /*isVarArg=*/false); Function *RegisterCallDtors = Function::Create( VoidVoid, Function::PrivateLinkage, "register_call_dtors" + diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index fa862fbaa634..288b991ae2c5 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -1,9 +1,8 @@ // WebAssemblyMCInstLower.cpp - Convert WebAssembly MachineInstr to an MCInst // // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -17,7 +16,7 @@ #include "WebAssemblyAsmPrinter.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblyRuntimeLibcallSignatures.h" -#include "WebAssemblyUtilities.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Constants.h" @@ -37,7 +36,7 @@ using namespace llvm; // This disables the removal of registers when lowering into MC, as required // by some current tests. 
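For readers without the Itanium ABI at hand, the contract LowerGlobalDtors compiles down to is sketched below: the pass emits one private "call dtors" thunk per priority/associated-symbol group and registers it through __cxa_atexit, so destructor groups run LIFO at exit. A plain C++ sketch of the shape, not taken from the patch:

extern "C" int __cxa_atexit(void (*Func)(void *), void *Arg, void *DsoHandle);
extern "C" void *__dso_handle;

static void callDtorsGroup(void *) {
  // Body: invoke each destructor of one priority/associated-symbol group,
  // in reverse order of construction.
}

static void registerDtors() {
  // Mirrors the call the pass synthesizes in its register thunk.
  __cxa_atexit(&callDtorsGroup, nullptr, &__dso_handle);
}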
-static cl::opt +cl::opt WasmKeepRegisters("wasm-keep-registers", cl::Hidden, cl::desc("WebAssembly: output stack registers in" " instruction output for test purposes only."), @@ -48,7 +47,7 @@ static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI); MCSymbol * WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { const GlobalValue *Global = MO.getGlobal(); - MCSymbolWasm *WasmSym = cast(Printer.getSymbol(Global)); + auto *WasmSym = cast(Printer.getSymbol(Global)); if (const auto *FuncTy = dyn_cast(Global->getValueType())) { const MachineFunction &MF = *MO.getParent()->getParent()->getParent(); @@ -57,9 +56,9 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { SmallVector ResultMVTs; SmallVector ParamMVTs; - ComputeSignatureVTs(FuncTy, CurrentFunc, TM, ParamMVTs, ResultMVTs); + computeSignatureVTs(FuncTy, CurrentFunc, TM, ParamMVTs, ResultMVTs); - auto Signature = SignatureFromMVTs(ResultMVTs, ParamMVTs); + auto Signature = signatureFromMVTs(ResultMVTs, ParamMVTs); WasmSym->setSignature(Signature.get()); Printer.addSignature(std::move(Signature)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); @@ -71,20 +70,23 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( const MachineOperand &MO) const { const char *Name = MO.getSymbolName(); - MCSymbolWasm *WasmSym = - cast(Printer.GetExternalSymbolSymbol(Name)); + auto *WasmSym = cast(Printer.GetExternalSymbolSymbol(Name)); const WebAssemblySubtarget &Subtarget = Printer.getSubtarget(); - // Except for the two exceptions (__stack_pointer and __cpp_exception), all - // other external symbols used by CodeGen are functions. It's OK to hardcode - // knowledge of specific symbols here; this method is precisely there for - // fetching the signatures of known Clang-provided symbols. - if (strcmp(Name, "__stack_pointer") == 0) { + // Except for certain known symbols, all symbols used by CodeGen are + // functions. It's OK to hardcode knowledge of specific symbols here; this + // method is precisely there for fetching the signatures of known + // Clang-provided symbols. + if (strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0 || + strcmp(Name, "__memory_base") == 0 || strcmp(Name, "__table_base") == 0 || + strcmp(Name, "__tls_size") == 0) { + bool Mutable = + strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0; WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); WasmSym->setGlobalType(wasm::WasmGlobalType{ uint8_t(Subtarget.hasAddr64() ? wasm::WASM_TYPE_I64 : wasm::WASM_TYPE_I32), - true}); + Mutable}); return WasmSym; } @@ -110,7 +112,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( : wasm::ValType::I32); } else { // Function symbols WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); - GetLibcallSignature(Subtarget, Name, Returns, Params); + getLibcallSignature(Subtarget, Name, Returns, Params); } auto Signature = make_unique(std::move(Returns), std::move(Params)); @@ -120,27 +122,42 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( return WasmSym; } -MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym, - int64_t Offset, - bool IsFunc, bool IsGlob, - bool IsEvent) const { - MCSymbolRefExpr::VariantKind VK = - IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION - : IsGlob ? MCSymbolRefExpr::VK_WebAssembly_GLOBAL - : IsEvent ? 
MCSymbolRefExpr::VK_WebAssembly_EVENT - : MCSymbolRefExpr::VK_None; +MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; + unsigned TargetFlags = MO.getTargetFlags(); + + switch (TargetFlags) { + case WebAssemblyII::MO_NO_FLAG: + break; + case WebAssemblyII::MO_GOT: + Kind = MCSymbolRefExpr::VK_GOT; + break; + case WebAssemblyII::MO_MEMORY_BASE_REL: + Kind = MCSymbolRefExpr::VK_WASM_MBREL; + break; + case WebAssemblyII::MO_TABLE_BASE_REL: + Kind = MCSymbolRefExpr::VK_WASM_TBREL; + break; + default: + llvm_unreachable("Unknown target flag on GV operand"); + } - const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx); + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Kind, Ctx); - if (Offset != 0) { - if (IsFunc) + if (MO.getOffset() != 0) { + const auto *WasmSym = cast(Sym); + if (TargetFlags == WebAssemblyII::MO_GOT) + report_fatal_error("GOT symbol references do not support offsets"); + if (WasmSym->isFunction()) report_fatal_error("Function addresses with offsets not supported"); - if (IsGlob) + if (WasmSym->isGlobal()) report_fatal_error("Global indexes with offsets not supported"); - if (IsEvent) + if (WasmSym->isEvent()) report_fatal_error("Event indexes with offsets not supported"); - Expr = - MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx); + + Expr = MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); } return MCOperand::createExpr(Expr); @@ -161,13 +178,13 @@ static wasm::ValType getType(const TargetRegisterClass *RC) { llvm_unreachable("Unexpected register class"); } -void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, +void WebAssemblyMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); const MCInstrDesc &Desc = MI->getDesc(); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { + const MachineOperand &MO = MI->getOperand(I); MCOperand MCOp; switch (MO.getType()) { @@ -188,8 +205,8 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, break; } case MachineOperand::MO_Immediate: - if (i < Desc.NumOperands) { - const MCOperandInfo &Info = Desc.OpInfo[i]; + if (I < Desc.NumOperands) { + const MCOperandInfo &Info = Desc.OpInfo[I]; if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) { MCSymbol *Sym = Printer.createTempSymbol("typeindex"); @@ -206,10 +223,10 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, // call_indirect instructions have a callee operand at the end which // doesn't count as a param. 
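Two details of the symbol lowering above are easy to miss: among the specially-known runtime globals only __stack_pointer and __tls_base are mutable (the linker materializes __memory_base, __table_base and __tls_size as immutable constants), and each machine-operand target flag maps to exactly one relocation variant (MO_GOT to VK_GOT, MO_MEMORY_BASE_REL to VK_WASM_MBREL, MO_TABLE_BASE_REL to VK_WASM_TBREL), with GOT references additionally rejecting offsets. A compact restatement of the mutability rule, sketch only:

#include <cstring>

// Writable wasm globals among the known special symbols: only the two the
// running program itself updates; the rest are link-time constants.
static bool isMutableSpecialGlobal(const char *Name) {
  return std::strcmp(Name, "__stack_pointer") == 0 ||
         std::strcmp(Name, "__tls_base") == 0;
}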
- if (WebAssembly::isCallIndirect(*MI)) + if (WebAssembly::isCallIndirect(MI->getOpcode())) Params.pop_back(); - MCSymbolWasm *WasmSym = cast(Sym); + auto *WasmSym = cast(Sym); auto Signature = make_unique(std::move(Returns), std::move(Params)); WasmSym->setSignature(Signature.get()); @@ -217,7 +234,7 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); const MCExpr *Expr = MCSymbolRefExpr::create( - WasmSym, MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX, Ctx); + WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx); MCOp = MCOperand::createExpr(Expr); break; } @@ -237,30 +254,21 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, break; } case MachineOperand::MO_GlobalAddress: - assert(MO.getTargetFlags() == WebAssemblyII::MO_NO_FLAG && - "WebAssembly does not use target flags on GlobalAddresses"); - MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(), - MO.getGlobal()->getValueType()->isFunctionTy(), - false, false); + MCOp = lowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); break; case MachineOperand::MO_ExternalSymbol: // The target flag indicates whether this is a symbol for a // variable or a function. - assert((MO.getTargetFlags() & ~WebAssemblyII::MO_SYMBOL_MASK) == 0 && + assert(MO.getTargetFlags() == 0 && "WebAssembly uses only symbol flags on ExternalSymbols"); - MCOp = LowerSymbolOperand( - GetExternalSymbolSymbol(MO), /*Offset=*/0, - (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_FUNCTION) != 0, - (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0, - (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_EVENT) != 0); + MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); break; case MachineOperand::MO_MCSymbol: // This is currently used only for LSDA symbols (GCC_except_table), // because global addresses or other external symbols are handled above. assert(MO.getTargetFlags() == 0 && "WebAssembly does not use target flags on MCSymbol"); - MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false, - false); + MCOp = lowerSymbolOperand(MO, MO.getMCSymbol()); break; } diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h index fa7a0ea61b3b..2c375a01a7f5 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h @@ -1,9 +1,8 @@ //===-- WebAssemblyMCInstLower.h - Lower MachineInstr to MCInst -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -33,13 +32,12 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower { MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; - MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset, bool IsFunc, - bool IsGlob, bool IsEvent) const; + MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; public: WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer) : Ctx(ctx), Printer(printer) {} - void Lower(const MachineInstr *MI, MCInst &OutMI) const; + void lower(const MachineInstr *MI, MCInst &OutMI) const; }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index 0157af0f8510..d31c1226bfdb 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //=- WebAssemblyMachineFunctionInfo.cpp - WebAssembly Machine Function Info -=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -19,7 +18,7 @@ #include "llvm/CodeGen/Analysis.h" using namespace llvm; -WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() {} +WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor. 
void WebAssemblyFunctionInfo::initWARegs() { assert(WARegs.empty()); @@ -27,7 +26,7 @@ void WebAssemblyFunctionInfo::initWARegs() { WARegs.resize(MF.getRegInfo().getNumVirtRegs(), Reg); } -void llvm::ComputeLegalValueVTs(const Function &F, const TargetMachine &TM, +void llvm::computeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty, SmallVectorImpl &ValueVTs) { const DataLayout &DL(F.getParent()->getDataLayout()); const WebAssemblyTargetLowering &TLI = @@ -38,16 +37,16 @@ void llvm::ComputeLegalValueVTs(const Function &F, const TargetMachine &TM, for (EVT VT : VTs) { unsigned NumRegs = TLI.getNumRegisters(F.getContext(), VT); MVT RegisterVT = TLI.getRegisterType(F.getContext(), VT); - for (unsigned i = 0; i != NumRegs; ++i) + for (unsigned I = 0; I != NumRegs; ++I) ValueVTs.push_back(RegisterVT); } } -void llvm::ComputeSignatureVTs(const FunctionType *Ty, const Function &F, +void llvm::computeSignatureVTs(const FunctionType *Ty, const Function &F, const TargetMachine &TM, SmallVectorImpl &Params, SmallVectorImpl &Results) { - ComputeLegalValueVTs(F, TM, Ty->getReturnType(), Results); + computeLegalValueVTs(F, TM, Ty->getReturnType(), Results); MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()); if (Results.size() > 1) { @@ -59,22 +58,35 @@ void llvm::ComputeSignatureVTs(const FunctionType *Ty, const Function &F, } for (auto *Param : Ty->params()) - ComputeLegalValueVTs(F, TM, Param, Params); + computeLegalValueVTs(F, TM, Param, Params); if (Ty->isVarArg()) Params.push_back(PtrVT); } -void llvm::ValTypesFromMVTs(const ArrayRef &In, +void llvm::valTypesFromMVTs(const ArrayRef &In, SmallVectorImpl &Out) { for (MVT Ty : In) Out.push_back(WebAssembly::toValType(Ty)); } std::unique_ptr -llvm::SignatureFromMVTs(const SmallVectorImpl &Results, +llvm::signatureFromMVTs(const SmallVectorImpl &Results, const SmallVectorImpl &Params) { auto Sig = make_unique(); - ValTypesFromMVTs(Results, Sig->Returns); - ValTypesFromMVTs(Params, Sig->Params); + valTypesFromMVTs(Results, Sig->Returns); + valTypesFromMVTs(Params, Sig->Params); return Sig; } + +yaml::WebAssemblyFunctionInfo::WebAssemblyFunctionInfo( + const llvm::WebAssemblyFunctionInfo &MFI) + : CFGStackified(MFI.isCFGStackified()) {} + +void yaml::WebAssemblyFunctionInfo::mappingImpl(yaml::IO &YamlIO) { + MappingTraits::mapping(YamlIO, *this); +} + +void WebAssemblyFunctionInfo::initializeBaseYamlFields( + const yaml::WebAssemblyFunctionInfo &YamlMFI) { + CFGStackified = YamlMFI.CFGStackified; +} diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 4be4beb85d04..4b9ba491dee6 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -1,9 +1,8 @@ // WebAssemblyMachineFunctionInfo.h-WebAssembly machine function info-*- C++ -*- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -18,11 +17,16 @@
 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCSymbolWasm.h"
 namespace llvm {
+namespace yaml {
+struct WebAssemblyFunctionInfo;
+}
+
 /// This class is derived from MachineFunctionInfo and contains private
 /// WebAssembly-specific information for each MachineFunction.
 class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
@@ -52,9 +56,13 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
   // overaligned values on the user stack.
   unsigned BasePtrVreg = -1U;
+  // Function properties.
+  bool CFGStackified = false;
+
 public:
   explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
   ~WebAssemblyFunctionInfo() override;
+  void initializeBaseYamlFields(const yaml::WebAssemblyFunctionInfo &YamlMFI);
   void addParam(MVT VT) { Params.push_back(VT); }
   const std::vector<MVT> &getParams() const { return Params; }
@@ -118,24 +126,47 @@ public:
     assert(Reg & INT32_MIN);
     return Reg & INT32_MAX;
   }
+
+  bool isCFGStackified() const { return CFGStackified; }
+  void setCFGStackified(bool Value = true) { CFGStackified = Value; }
 };
-void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty,
+void computeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty,
                           SmallVectorImpl<MVT> &ValueVTs);
 // Compute the signature for a given FunctionType (Ty). Note that it's not the
 // signature for F (F is just used to get various context)
-void ComputeSignatureVTs(const FunctionType *Ty, const Function &F,
+void computeSignatureVTs(const FunctionType *Ty, const Function &F,
                          const TargetMachine &TM, SmallVectorImpl<MVT> &Params,
                          SmallVectorImpl<MVT> &Results);
-void ValTypesFromMVTs(const ArrayRef<MVT> &In,
+void valTypesFromMVTs(const ArrayRef<MVT> &In,
                       SmallVectorImpl<wasm::ValType> &Out);
 std::unique_ptr<wasm::WasmSignature>
-SignatureFromMVTs(const SmallVectorImpl<MVT> &Results,
+signatureFromMVTs(const SmallVectorImpl<MVT> &Results,
                   const SmallVectorImpl<MVT> &Params);
+namespace yaml {
+
+struct WebAssemblyFunctionInfo final : public yaml::MachineFunctionInfo {
+  bool CFGStackified = false;
+
+  WebAssemblyFunctionInfo() = default;
+  WebAssemblyFunctionInfo(const llvm::WebAssemblyFunctionInfo &MFI);
+
+  void mappingImpl(yaml::IO &YamlIO) override;
+  ~WebAssemblyFunctionInfo() = default;
+};
+
+template <> struct MappingTraits<WebAssemblyFunctionInfo> {
+  static void mapping(IO &YamlIO, WebAssemblyFunctionInfo &MFI) {
+    YamlIO.mapOptional("isCFGStackified", MFI.CFGStackified, false);
+  }
+};
+
+} // end namespace yaml
+
 } // end namespace llvm
 #endif
diff --git a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
index c4b5e96db0c7..7ac0511c28b0 100644
--- a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
@@ -1,9 +1,8 @@
 //== WebAssemblyMemIntrinsicResults.cpp - Optimize memory intrinsic results ==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -82,7 +81,7 @@ FunctionPass *llvm::createWebAssemblyMemIntrinsicResults() { } // Replace uses of FromReg with ToReg if they are dominated by MI. -static bool ReplaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI, +static bool replaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI, unsigned FromReg, unsigned ToReg, const MachineRegisterInfo &MRI, MachineDominatorTree &MDT, @@ -157,10 +156,10 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI, return false; StringRef Name(Op1.getSymbolName()); - bool callReturnsInput = Name == TLI.getLibcallName(RTLIB::MEMCPY) || + bool CallReturnsInput = Name == TLI.getLibcallName(RTLIB::MEMCPY) || Name == TLI.getLibcallName(RTLIB::MEMMOVE) || Name == TLI.getLibcallName(RTLIB::MEMSET); - if (!callReturnsInput) + if (!CallReturnsInput) return false; LibFunc Func; @@ -172,7 +171,7 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI, if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg)) report_fatal_error("Memory Intrinsic results: call to builtin function " "with wrong signature, from/to mismatch"); - return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS); + return replaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS); } bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) { @@ -182,11 +181,11 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) { }); MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineDominatorTree &MDT = getAnalysis(); + auto &MDT = getAnalysis(); const WebAssemblyTargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); const auto &LibInfo = getAnalysis().getTLI(); - LiveIntervals &LIS = getAnalysis(); + auto &LIS = getAnalysis(); bool Changed = false; // We don't preserve SSA form. @@ -201,8 +200,8 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) { switch (MI.getOpcode()) { default: break; - case WebAssembly::CALL_I32: - case WebAssembly::CALL_I64: + case WebAssembly::CALL_i32: + case WebAssembly::CALL_i64: Changed |= optimizeCall(MBB, MI, MRI, MDT, LIS, TLI, LibInfo); break; } diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp index 3d0a15244ee0..8c7c3305c201 100644 --- a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp +++ b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp @@ -1,9 +1,8 @@ //===--- WebAssemblyOptimizeLiveIntervals.cpp - LiveInterval processing ---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -72,7 +71,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction( << MF.getName() << '\n'); MachineRegisterInfo &MRI = MF.getRegInfo(); - LiveIntervals &LIS = getAnalysis(); + auto &LIS = getAnalysis(); // We don't preserve SSA form. MRI.leaveSSA(); @@ -81,8 +80,8 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction( // Split multiple-VN LiveIntervals into multiple LiveIntervals. 
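The MemIntrinsicResults rewrite above leans on the libc contract that memcpy, memmove and memset return their destination argument. In source terms, that contract is what lets the call's result register stand in for the destination register in all dominated uses (illustrative only):

#include <cstring>

void *copyAndUse(void *Dst, const void *Src, std::size_t N) {
  void *Ret = std::memcpy(Dst, Src, N); // Ret == Dst per the C library.
  // Every use of Dst dominated by the call can be rewritten to use Ret,
  // which is what replaceDominatedUses does at the MachineInstr level; the
  // destination then need not stay live in a separate wasm local.
  return Ret;
}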
 SmallVector SplitLIs;
-  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
-    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
     if (MRI.reg_nodbg_empty(Reg))
       continue;
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 2c018d0785a7..d20352259e07 100644
--- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -1,9 +1,8 @@
 //===-- WebAssemblyOptimizeReturned.cpp - Optimize "returned" attributes --===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -37,11 +36,11 @@ class OptimizeReturned final : public FunctionPass,
   bool runOnFunction(Function &F) override;
-  DominatorTree *DT;
+  DominatorTree *DT = nullptr;
 public:
   static char ID;
-  OptimizeReturned() : FunctionPass(ID), DT(nullptr) {}
+  OptimizeReturned() : FunctionPass(ID) {}
   void visitCallSite(CallSite CS);
 };
@@ -57,10 +56,10 @@ FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
 }
 void OptimizeReturned::visitCallSite(CallSite CS) {
-  for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i)
-    if (CS.paramHasAttr(i, Attribute::Returned)) {
+  for (unsigned I = 0, E = CS.getNumArgOperands(); I < E; ++I)
+    if (CS.paramHasAttr(I, Attribute::Returned)) {
       Instruction *Inst = CS.getInstruction();
-      Value *Arg = CS.getArgOperand(i);
+      Value *Arg = CS.getArgOperand(I);
       // Ignore constants, globals, undef, etc.
       if (isa<Constant>(Arg))
         continue;
diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index 2dfd85953f14..e11cdeaa0e79 100644
--- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -1,9 +1,8 @@
 //===-- WebAssemblyPeephole.cpp - WebAssembly Peephole Optimizations ------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -58,7 +57,7 @@ FunctionPass *llvm::createWebAssemblyPeephole() {
 }
 /// If desirable, rewrite NewReg to a drop register.
-static bool MaybeRewriteToDrop(unsigned OldReg, unsigned NewReg, +static bool maybeRewriteToDrop(unsigned OldReg, unsigned NewReg, MachineOperand &MO, WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI) { bool Changed = false; @@ -72,7 +71,7 @@ static bool MaybeRewriteToDrop(unsigned OldReg, unsigned NewReg, return Changed; } -static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, +static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, const MachineFunction &MF, WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI, @@ -129,8 +128,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { switch (MI.getOpcode()) { default: break; - case WebAssembly::CALL_I32: - case WebAssembly::CALL_I64: { + case WebAssembly::CALL_i32: + case WebAssembly::CALL_i64: { MachineOperand &Op1 = MI.getOperand(1); if (Op1.isSymbol()) { StringRef Name(Op1.getSymbolName()); @@ -150,7 +149,7 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { if (MRI.getRegClass(NewReg) != MRI.getRegClass(OldReg)) report_fatal_error("Peephole: call to builtin function with " "wrong signature, from/to mismatch"); - Changed |= MaybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI); + Changed |= maybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI); } } } @@ -158,57 +157,57 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { } // Optimize away an explicit void return at the end of the function. case WebAssembly::RETURN_I32: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I32, WebAssembly::COPY_I32); break; case WebAssembly::RETURN_I64: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I64, WebAssembly::COPY_I64); break; case WebAssembly::RETURN_F32: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F32, WebAssembly::COPY_F32); break; case WebAssembly::RETURN_F64: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64, WebAssembly::COPY_F64); break; case WebAssembly::RETURN_v16i8: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v16i8, WebAssembly::COPY_V128); break; case WebAssembly::RETURN_v8i16: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v8i16, WebAssembly::COPY_V128); break; case WebAssembly::RETURN_v4i32: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4i32, WebAssembly::COPY_V128); break; case WebAssembly::RETURN_v2i64: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2i64, WebAssembly::COPY_V128); break; case WebAssembly::RETURN_v4f32: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4f32, WebAssembly::COPY_V128); break; case WebAssembly::RETURN_v2f64: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2f64, 
WebAssembly::COPY_V128); break; case WebAssembly::RETURN_VOID: - Changed |= MaybeRewriteToFallthrough( + Changed |= maybeRewriteToFallthrough( MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_VOID, WebAssembly::INSTRUCTION_LIST_END); break; diff --git a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp index 0be0ba657830..3bfbf607344d 100644 --- a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp +++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp @@ -1,9 +1,8 @@ //===- WebAssemblyPrepareForLiveIntervals.cpp - Prepare for LiveIntervals -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -63,9 +62,9 @@ FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() { } // Test whether the given register has an ARGUMENT def. -static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) { +static bool hasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) { for (const auto &Def : MRI.def_instructions(Reg)) - if (WebAssembly::isArgument(Def)) + if (WebAssembly::isArgument(Def.getOpcode())) return true; return false; } @@ -95,15 +94,15 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction( // // TODO: This is fairly heavy-handed; find a better approach. // - for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(I); // Skip unused registers. if (MRI.use_nodbg_empty(Reg)) continue; // Skip registers that have an ARGUMENT definition. - if (HasArgumentDef(Reg, MRI)) + if (hasArgumentDef(Reg, MRI)) continue; BuildMI(Entry, Entry.begin(), DebugLoc(), @@ -115,7 +114,7 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction( // liveness reflects the fact that these really are live-in values. for (auto MII = Entry.begin(), MIE = Entry.end(); MII != MIE;) { MachineInstr &MI = *MII++; - if (WebAssembly::isArgument(MI)) { + if (WebAssembly::isArgument(MI.getOpcode())) { MI.removeFromParent(); Entry.insert(Entry.begin(), &MI); } diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp index d97b13a8d699..6f09c45b6642 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyRegColoring.cpp - Register coloring --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -66,11 +65,11 @@ FunctionPass *llvm::createWebAssemblyRegColoring() { static float computeWeight(const MachineRegisterInfo *MRI, const MachineBlockFrequencyInfo *MBFI, unsigned VReg) { - float weight = 0.0f; + float Weight = 0.0f; for (MachineOperand &MO : MRI->reg_nodbg_operands(VReg)) - weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI, + Weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI, *MO.getParent()); - return weight; + return Weight; } bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { @@ -98,8 +97,8 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { SortedIntervals.reserve(NumVRegs); LLVM_DEBUG(dbgs() << "Interesting register intervals:\n"); - for (unsigned i = 0; i < NumVRegs; ++i) { - unsigned VReg = TargetRegisterInfo::index2VirtReg(i); + for (unsigned I = 0; I < NumVRegs; ++I) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(I); if (MFI.isVRegStackified(VReg)) continue; // Skip unused registers, which can use $drop. @@ -134,10 +133,10 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { SortedIntervals.size()); BitVector UsedColors(SortedIntervals.size()); bool Changed = false; - for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) { - LiveInterval *LI = SortedIntervals[i]; + for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) { + LiveInterval *LI = SortedIntervals[I]; unsigned Old = LI->reg; - size_t Color = i; + size_t Color = I; const TargetRegisterClass *RC = MRI->getRegClass(Old); // Check if it's possible to reuse any of the used colors. @@ -154,7 +153,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { } unsigned New = SortedIntervals[Color]->reg; - SlotMapping[i] = New; + SlotMapping[I] = New; Changed |= Old != New; UsedColors.set(Color); Assignments[Color].push_back(LI); @@ -166,9 +165,9 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { return false; // Rewrite register operands. - for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) { - unsigned Old = SortedIntervals[i]->reg; - unsigned New = SlotMapping[i]; + for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) { + unsigned Old = SortedIntervals[I]->reg; + unsigned New = SlotMapping[I]; if (Old != New) MRI->replaceRegWith(Old, New); } diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp index 1e2a248f097e..cdca23f55b29 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyRegNumbering.cpp - Register Numbering ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -73,7 +72,7 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) { // variables. Assign the numbers for them first. 
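The coloring loop in the WebAssemblyRegColoring hunk above is a plain greedy scheme: walk the intervals in decreasing spill weight and reuse the first already-allocated color whose intervals don't conflict. A self-contained sketch of the same idea, stripped of the LLVM types (the real pass additionally requires matching register classes and uses LiveInterval::overlaps):

#include <cstddef>
#include <vector>

struct Interval { unsigned Begin, End; };

static bool overlaps(const Interval &A, const Interval &B) {
  return A.Begin < B.End && B.Begin < A.End;
}

// Intervals arrive sorted by decreasing weight; each takes the first
// existing color it doesn't overlap, or opens a new one.
std::vector<std::size_t> colorIntervals(const std::vector<Interval> &Sorted) {
  std::vector<std::vector<Interval>> Colors; // intervals held per color
  std::vector<std::size_t> Assignment(Sorted.size());
  for (std::size_t I = 0; I < Sorted.size(); ++I) {
    std::size_t Color = Colors.size();
    for (std::size_t C = 0; C < Colors.size(); ++C) {
      bool Clash = false;
      for (const Interval &Other : Colors[C])
        if (overlaps(Sorted[I], Other)) { Clash = true; break; }
      if (!Clash) { Color = C; break; }
    }
    if (Color == Colors.size())
      Colors.emplace_back();
    Colors[Color].push_back(Sorted[I]);
    Assignment[I] = Color;
  }
  return Assignment;
}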
MachineBasicBlock &EntryMBB = MF.front(); for (MachineInstr &MI : EntryMBB) { - if (!WebAssembly::isArgument(MI)) + if (!WebAssembly::isArgument(MI.getOpcode())) break; int64_t Imm = MI.getOperand(1).getImm(); diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 1eb32ed64494..a120a6471014 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyRegStackify.cpp - Register Stackification --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -80,7 +79,7 @@ FunctionPass *llvm::createWebAssemblyRegStackify() { // Decorate the given instruction with implicit operands that enforce the // expression stack ordering constraints for an instruction which is on // the expression stack. -static void ImposeStackOrdering(MachineInstr *MI) { +static void imposeStackOrdering(MachineInstr *MI) { // Write the opaque VALUE_STACK register. if (!MI->definesRegister(WebAssembly::VALUE_STACK)) MI->addOperand(MachineOperand::CreateReg(WebAssembly::VALUE_STACK, @@ -96,7 +95,7 @@ static void ImposeStackOrdering(MachineInstr *MI) { // Convert an IMPLICIT_DEF instruction into an instruction which defines // a constant zero value. -static void ConvertImplicitDefToConstZero(MachineInstr *MI, +static void convertImplicitDefToConstZero(MachineInstr *MI, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineFunction &MF, @@ -112,12 +111,12 @@ static void ConvertImplicitDefToConstZero(MachineInstr *MI, MI->addOperand(MachineOperand::CreateImm(0)); } else if (RegClass == &WebAssembly::F32RegClass) { MI->setDesc(TII->get(WebAssembly::CONST_F32)); - ConstantFP *Val = cast(Constant::getNullValue( + auto *Val = cast(Constant::getNullValue( Type::getFloatTy(MF.getFunction().getContext()))); MI->addOperand(MachineOperand::CreateFPImm(Val)); } else if (RegClass == &WebAssembly::F64RegClass) { MI->setDesc(TII->get(WebAssembly::CONST_F64)); - ConstantFP *Val = cast(Constant::getNullValue( + auto *Val = cast(Constant::getNullValue( Type::getDoubleTy(MF.getFunction().getContext()))); MI->addOperand(MachineOperand::CreateFPImm(Val)); } else if (RegClass == &WebAssembly::V128RegClass) { @@ -136,7 +135,7 @@ static void ConvertImplicitDefToConstZero(MachineInstr *MI, // Determine whether a call to the callee referenced by // MI->getOperand(CalleeOpNo) reads memory, writes memory, and/or has side // effects. -static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read, +static void queryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read, bool &Write, bool &Effects, bool &StackPointer) { // All calls can use the stack pointer. 
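imposeStackOrdering above deserves a gloss: stackified instructions must keep their LIFO order, and the pass enforces that by making each of them both define and read the opaque VALUE_STACK pseudo-register, creating artificial dependencies that later passes cannot reorder across. A sketch using the same MachineInstr calls; the helper name is invented, and VALUE_STACK is the WebAssembly backend's pseudo-register:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"

using namespace llvm;

static void tieToValueStack(MachineInstr *MI, unsigned ValueStackReg) {
  // Implicit def: this instruction pushes onto the expression stack.
  if (!MI->definesRegister(ValueStackReg))
    MI->addOperand(MachineOperand::CreateReg(ValueStackReg,
                                             /*isDef=*/true, /*isImp=*/true));
  // Implicit use: it also consumes whatever is already on the stack.
  if (!MI->readsRegister(ValueStackReg))
    MI->addOperand(MachineOperand::CreateReg(ValueStackReg,
                                             /*isDef=*/false, /*isImp=*/true));
}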
StackPointer = true; @@ -144,11 +143,11 @@ static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read, const MachineOperand &MO = MI.getOperand(CalleeOpNo); if (MO.isGlobal()) { const Constant *GV = MO.getGlobal(); - if (const GlobalAlias *GA = dyn_cast(GV)) + if (const auto *GA = dyn_cast(GV)) if (!GA->isInterposable()) GV = GA->getAliasee(); - if (const Function *F = dyn_cast(GV)) { + if (const auto *F = dyn_cast(GV)) { if (!F->doesNotThrow()) Effects = true; if (F->doesNotAccessMemory()) @@ -168,7 +167,7 @@ static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read, // Determine whether MI reads memory, writes memory, has side effects, // and/or uses the stack pointer value. -static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read, +static void query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read, bool &Write, bool &Effects, bool &StackPointer) { assert(!MI.isTerminator()); @@ -253,13 +252,13 @@ static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read, // Analyze calls. if (MI.isCall()) { - unsigned CalleeOpNo = WebAssembly::getCalleeOpNo(MI); - QueryCallee(MI, CalleeOpNo, Read, Write, Effects, StackPointer); + unsigned CalleeOpNo = WebAssembly::getCalleeOpNo(MI.getOpcode()); + queryCallee(MI, CalleeOpNo, Read, Write, Effects, StackPointer); } } // Test whether Def is safe and profitable to rematerialize. -static bool ShouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA, +static bool shouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA, const WebAssemblyInstrInfo *TII) { return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def, &AA); } @@ -267,7 +266,7 @@ static bool ShouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA, // Identify the definition for this register at this point. This is a // generalization of MachineRegisterInfo::getUniqueVRegDef that uses // LiveIntervals to handle complex cases. -static MachineInstr *GetVRegDef(unsigned Reg, const MachineInstr *Insert, +static MachineInstr *getVRegDef(unsigned Reg, const MachineInstr *Insert, const MachineRegisterInfo &MRI, const LiveIntervals &LIS) { // Most registers are in SSA form here so we try a quick MRI query first. @@ -285,7 +284,7 @@ static MachineInstr *GetVRegDef(unsigned Reg, const MachineInstr *Insert, // Test whether Reg, as defined at Def, has exactly one use. This is a // generalization of MachineRegisterInfo::hasOneUse that uses LiveIntervals // to handle complex cases. -static bool HasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI, +static bool hasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI, MachineDominatorTree &MDT, LiveIntervals &LIS) { // Most registers are in SSA form here so we try a quick MRI query first. if (MRI.hasOneUse(Reg)) @@ -314,10 +313,22 @@ static bool HasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI, // walking the block. // TODO: Compute memory dependencies in a way that uses AliasAnalysis to be // more precise. -static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, +static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, AliasAnalysis &AA, const MachineRegisterInfo &MRI) { assert(Def->getParent() == Insert->getParent()); + // 'catch' and 'extract_exception' should be the first instruction of a BB and + // cannot move. 
+ if (Def->getOpcode() == WebAssembly::CATCH || + Def->getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32) { + const MachineBasicBlock *MBB = Def->getParent(); + auto NextI = std::next(MachineBasicBlock::const_iterator(Def)); + for (auto E = MBB->end(); NextI != E && NextI->isDebugInstr(); ++NextI) + ; + if (NextI != Insert) + return false; + } + // Check for register dependencies. SmallVector MutableRegisters; for (const MachineOperand &MO : Def->operands()) { @@ -350,7 +361,7 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, } bool Read = false, Write = false, Effects = false, StackPointer = false; - Query(*Def, AA, Read, Write, Effects, StackPointer); + query(*Def, AA, Read, Write, Effects, StackPointer); // If the instruction does not access memory and has no side effects, it has // no additional dependencies. @@ -365,7 +376,7 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, bool InterveningWrite = false; bool InterveningEffects = false; bool InterveningStackPointer = false; - Query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects, + query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects, InterveningStackPointer); if (Effects && InterveningEffects) return false; @@ -386,7 +397,7 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, } /// Test whether OneUse, a use of Reg, dominates all of Reg's other uses. -static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse, +static bool oneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse, const MachineBasicBlock &MBB, const MachineRegisterInfo &MRI, const MachineDominatorTree &MDT, @@ -445,7 +456,7 @@ static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse, } /// Get the appropriate tee opcode for the given register class. -static unsigned GetTeeOpcode(const TargetRegisterClass *RC) { +static unsigned getTeeOpcode(const TargetRegisterClass *RC) { if (RC == &WebAssembly::I32RegClass) return WebAssembly::TEE_I32; if (RC == &WebAssembly::I64RegClass) @@ -460,7 +471,7 @@ static unsigned GetTeeOpcode(const TargetRegisterClass *RC) { } // Shrink LI to its uses, cleaning up LI. -static void ShrinkToUses(LiveInterval &LI, LiveIntervals &LIS) { +static void shrinkToUses(LiveInterval &LI, LiveIntervals &LIS) { if (LIS.shrinkToUses(&LI)) { SmallVector SplitLIs; LIS.splitSeparateComponents(LI, SplitLIs); @@ -469,7 +480,7 @@ static void ShrinkToUses(LiveInterval &LI, LiveIntervals &LIS) { /// A single-use def in the same block with no intervening memory or register /// dependencies; move the def down and nest it with the current instruction. -static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand &Op, +static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB, MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI, @@ -508,13 +519,13 @@ static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand &Op, LLVM_DEBUG(dbgs() << " - Replaced register: "; Def->dump()); } - ImposeStackOrdering(Def); + imposeStackOrdering(Def); return Def; } /// A trivially cloneable instruction; clone it and nest the new copy with the /// current instruction. 
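The new CATCH/EXTRACT_EXCEPTION_I32 check in isSafeToMove above encodes a placement rule: those instructions must stay at the start of their EH pad, so a "move" is only legal when the insertion point already sits immediately after them, ignoring debug instructions. The check reduces to the following; a hypothetical helper over the same iterator APIs:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <iterator>

using namespace llvm;

static bool isImmediatelyBefore(const MachineInstr *Def,
                                const MachineInstr *Insert) {
  // Step past Def and any debug instructions; a legal "move" must target
  // exactly the next real position, leaving Def first in its block.
  auto I = std::next(MachineBasicBlock::const_iterator(Def));
  auto E = Def->getParent()->end();
  while (I != E && I->isDebugInstr())
    ++I;
  return I != E && &*I == Insert;
}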
-static MachineInstr *RematerializeCheapDef( +static MachineInstr *rematerializeCheapDef( unsigned Reg, MachineOperand &Op, MachineInstr &Def, MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI, @@ -531,7 +542,7 @@ static MachineInstr *RematerializeCheapDef( LIS.InsertMachineInstrInMaps(*Clone); LIS.createAndComputeVirtRegInterval(NewReg); MFI.stackifyVReg(NewReg); - ImposeStackOrdering(Clone); + imposeStackOrdering(Clone); LLVM_DEBUG(dbgs() << " - Cloned to "; Clone->dump()); @@ -539,7 +550,7 @@ static MachineInstr *RematerializeCheapDef( bool IsDead = MRI.use_empty(Reg); if (!IsDead) { LiveInterval &LI = LIS.getInterval(Reg); - ShrinkToUses(LI, LIS); + shrinkToUses(LI, LIS); IsDead = !LI.liveAt(LIS.getInstructionIndex(Def).getDeadSlot()); } @@ -582,7 +593,7 @@ static MachineInstr *RematerializeCheapDef( /// /// with DefReg and TeeReg stackified. This eliminates a local.get from the /// resulting code. -static MachineInstr *MoveAndTeeForMultiUse( +static MachineInstr *moveAndTeeForMultiUse( unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB, MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) { @@ -600,7 +611,7 @@ static MachineInstr *MoveAndTeeForMultiUse( unsigned DefReg = MRI.createVirtualRegister(RegClass); MachineOperand &DefMO = Def->getOperand(0); MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(), - TII->get(GetTeeOpcode(RegClass)), TeeReg) + TII->get(getTeeOpcode(RegClass)), TeeReg) .addReg(Reg, RegState::Define) .addReg(DefReg, getUndefRegState(DefMO.isDead())); Op.setReg(TeeReg); @@ -616,15 +627,15 @@ static MachineInstr *MoveAndTeeForMultiUse( VNInfo *ValNo = LI.getVNInfoAt(DefIdx); I->start = TeeIdx; ValNo->def = TeeIdx; - ShrinkToUses(LI, LIS); + shrinkToUses(LI, LIS); // Finish stackifying the new regs. LIS.createAndComputeVirtRegInterval(TeeReg); LIS.createAndComputeVirtRegInterval(DefReg); MFI.stackifyVReg(DefReg); MFI.stackifyVReg(TeeReg); - ImposeStackOrdering(Def); - ImposeStackOrdering(Tee); + imposeStackOrdering(Def); + imposeStackOrdering(Tee); DefDIs.clone(Tee, DefReg); DefDIs.clone(Insert, TeeReg); @@ -638,9 +649,9 @@ namespace { /// A stack for walking the tree of instructions being built, visiting the /// MachineOperands in DFS order. class TreeWalkerState { - typedef MachineInstr::mop_iterator mop_iterator; - typedef std::reverse_iterator mop_reverse_iterator; - typedef iterator_range RangeTy; + using mop_iterator = MachineInstr::mop_iterator; + using mop_reverse_iterator = std::reverse_iterator; + using RangeTy = iterator_range; SmallVector Worklist; public: @@ -650,9 +661,9 @@ public: Worklist.push_back(reverse(Range)); } - bool Done() const { return Worklist.empty(); } + bool done() const { return Worklist.empty(); } - MachineOperand &Pop() { + MachineOperand &pop() { RangeTy &Range = Worklist.back(); MachineOperand &Op = *Range.begin(); Range = drop_begin(Range, 1); @@ -665,7 +676,7 @@ public: } /// Push Instr's operands onto the stack to be visited. - void PushOperands(MachineInstr *Instr) { + void pushOperands(MachineInstr *Instr) { const iterator_range &Range(Instr->explicit_uses()); if (Range.begin() != Range.end()) Worklist.push_back(reverse(Range)); @@ -673,8 +684,8 @@ public: /// Some of Instr's operands are on the top of the stack; remove them and /// re-insert them starting from the beginning (because we've commuted them). 
@@ -673,8 +684,8 @@ public:
   }
 
   /// Some of Instr's operands are on the top of the stack; remove them and
   /// re-insert them starting from the beginning (because we've commuted them).
-  void ResetTopOperands(MachineInstr *Instr) {
-    assert(HasRemainingOperands(Instr) &&
+  void resetTopOperands(MachineInstr *Instr) {
+    assert(hasRemainingOperands(Instr) &&
           "Resetting operands should only be done when the instruction has "
           "an operand still on the stack");
     Worklist.back() = reverse(Instr->explicit_uses());
@@ -682,7 +693,7 @@ public:
 
   /// Test whether Instr has operands remaining to be visited at the top of
   /// the stack.
-  bool HasRemainingOperands(const MachineInstr *Instr) const {
+  bool hasRemainingOperands(const MachineInstr *Instr) const {
     if (Worklist.empty())
       return false;
     const RangeTy &Range = Worklist.back();
@@ -695,7 +706,7 @@ public:
   ///
   /// This is needed as a consequence of using implicit local.gets for
   /// uses and implicit local.sets for defs.
-  bool IsOnStack(unsigned Reg) const {
+  bool isOnStack(unsigned Reg) const {
     for (const RangeTy &Range : Worklist)
       for (const MachineOperand &MO : Range)
         if (MO.isReg() && MO.getReg() == Reg)
@@ -712,20 +723,18 @@ class CommutingState {
   /// state where we've commuted the operands of the current instruction and are
   /// revisiting it, and the declined state where we've reverted the operands
   /// back to their original order and will no longer commute it further.
-  bool TentativelyCommuting;
-  bool Declined;
+  bool TentativelyCommuting = false;
+  bool Declined = false;
 
   /// During the tentative state, these hold the operand indices of the commuted
   /// operands.
   unsigned Operand0, Operand1;
 
 public:
-  CommutingState() : TentativelyCommuting(false), Declined(false) {}
-
   /// Stackification for an operand was not successful due to ordering
   /// constraints. If possible, and if we haven't already tried it and declined
   /// it, commute Insert's operands and prepare to revisit it.
-  void MaybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
+  void maybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
                     const WebAssemblyInstrInfo *TII) {
     if (TentativelyCommuting) {
       assert(!Declined &&
@@ -734,13 +743,13 @@ public:
       TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
       TentativelyCommuting = false;
       Declined = true;
-    } else if (!Declined && TreeWalker.HasRemainingOperands(Insert)) {
+    } else if (!Declined && TreeWalker.hasRemainingOperands(Insert)) {
       Operand0 = TargetInstrInfo::CommuteAnyOperandIndex;
       Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
       if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) {
         // Tentatively commute the operands and try again.
         TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
-        TreeWalker.ResetTopOperands(Insert);
+        TreeWalker.resetTopOperands(Insert);
         TentativelyCommuting = true;
         Declined = false;
       }
@@ -749,7 +758,7 @@ public:
 
   /// Stackification for some operand was successful. Reset to the default
   /// state.
-  void Reset() {
+  void reset() {
     TentativelyCommuting = false;
     Declined = false;
   }
@@ -767,8 +776,8 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
   const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
-  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
-  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+  auto &MDT = getAnalysis<MachineDominatorTree>();
+  auto &LIS = getAnalysis<LiveIntervals>();
 
   // Walk the instructions from the bottom up.
Currently we don't look past // block boundaries, and the blocks aren't ordered so the block visitation @@ -780,19 +789,19 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { MachineInstr *Insert = &*MII; // Don't nest anything inside an inline asm, because we don't have // constraints for $push inputs. - if (Insert->getOpcode() == TargetOpcode::INLINEASM) + if (Insert->isInlineAsm()) continue; // Ignore debugging intrinsics. - if (Insert->getOpcode() == TargetOpcode::DBG_VALUE) + if (Insert->isDebugValue()) continue; // Iterate through the inputs in reverse order, since we'll be pulling // operands off the stack in LIFO order. CommutingState Commuting; TreeWalkerState TreeWalker(Insert); - while (!TreeWalker.Done()) { - MachineOperand &Op = TreeWalker.Pop(); + while (!TreeWalker.done()) { + MachineOperand &Op = TreeWalker.pop(); // We're only interested in explicit virtual register operands. if (!Op.isReg()) @@ -806,18 +815,36 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { continue; // Identify the definition for this register at this point. - MachineInstr *Def = GetVRegDef(Reg, Insert, MRI, LIS); + MachineInstr *Def = getVRegDef(Reg, Insert, MRI, LIS); if (!Def) continue; // Don't nest an INLINE_ASM def into anything, because we don't have // constraints for $pop outputs. - if (Def->getOpcode() == TargetOpcode::INLINEASM) + if (Def->isInlineAsm()) continue; // Argument instructions represent live-in registers and not real // instructions. - if (WebAssembly::isArgument(*Def)) + if (WebAssembly::isArgument(Def->getOpcode())) + continue; + + // Currently catch's return value register cannot be stackified, because + // the wasm LLVM backend currently does not support live-in values + // entering blocks, which is a part of multi-value proposal. + // + // Once we support live-in values of wasm blocks, this can be: + // catch ; push exnref value onto stack + // block exnref -> i32 + // br_on_exn $__cpp_exception ; pop the exnref value + // end_block + // + // But because we don't support it yet, the catch instruction's dst + // register should be assigned to a local to be propagated across + // 'block' boundary now. + // + // TODO Fix this once we support the multi-value proposal. + if (Def->getOpcode() == WebAssembly::CATCH) continue; // Decide which strategy to take. Prefer to move a single-use value @@ -827,23 +854,23 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { // supports intra-block moves) and it's MachineSink's job to catch all // the sinking opportunities anyway. 
bool SameBlock = Def->getParent() == &MBB; - bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, MRI) && - !TreeWalker.IsOnStack(Reg); - if (CanMove && HasOneUse(Reg, Def, MRI, MDT, LIS)) { - Insert = MoveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI); - } else if (ShouldRematerialize(*Def, AA, TII)) { + bool CanMove = SameBlock && isSafeToMove(Def, Insert, AA, MRI) && + !TreeWalker.isOnStack(Reg); + if (CanMove && hasOneUse(Reg, Def, MRI, MDT, LIS)) { + Insert = moveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI); + } else if (shouldRematerialize(*Def, AA, TII)) { Insert = - RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(), + rematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(), LIS, MFI, MRI, TII, TRI); } else if (CanMove && - OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) { - Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI, + oneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) { + Insert = moveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI, TII); } else { // We failed to stackify the operand. If the problem was ordering // constraints, Commuting may be able to help. if (!CanMove && SameBlock) - Commuting.MaybeCommute(Insert, TreeWalker, TII); + Commuting.maybeCommute(Insert, TreeWalker, TII); // Proceed to the next operand. continue; } @@ -852,18 +879,18 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { // to a constant 0 so that the def is explicit, and the push/pop // correspondence is maintained. if (Insert->getOpcode() == TargetOpcode::IMPLICIT_DEF) - ConvertImplicitDefToConstZero(Insert, MRI, TII, MF, LIS); + convertImplicitDefToConstZero(Insert, MRI, TII, MF, LIS); // We stackified an operand. Add the defining instruction's operands to // the worklist stack now to continue to build an ever deeper tree. - Commuting.Reset(); - TreeWalker.PushOperands(Insert); + Commuting.reset(); + TreeWalker.pushOperands(Insert); } // If we stackified any operands, skip over the tree to start looking for // the next instruction we can build a tree on. if (Insert != &*MII) { - ImposeStackOrdering(&*MII); + imposeStackOrdering(&*MII); MII = MachineBasicBlock::iterator(Insert).getReverse(); Changed = true; } diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 1f0870865b06..ea9cfc00adfd 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyRegisterInfo.cpp - WebAssembly Register Information ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -67,19 +66,22 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex( assert(MFI.getObjectSize(FrameIndex) != 0 && "We assume that variable-sized objects have already been lowered, " "and don't use FrameIndex operands."); - unsigned FrameRegister = getFrameRegister(MF); + Register FrameRegister = getFrameRegister(MF); // If this is the address operand of a load or store, make it relative to SP // and fold the frame offset directly in. 
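Editorial aside before the next hunk: wasm load/store offsets are unsigned
32-bit immediate fields, so a frame offset may be folded into the existing
immediate only when the sum still fits. The guard in isolation, as a
self-contained sketch with a hypothetical helper name:

#include <cstdint>
#include <limits>

// Both values are known non-negative at this point (the real code asserts
// this), so the sum cannot wrap in int64_t before the range check.
static bool canFoldFrameOffset(int64_t ExistingImm, int64_t FrameOffset) {
  int64_t Sum = ExistingImm + FrameOffset;
  return static_cast<uint64_t>(Sum) <=
         std::numeric_limits<uint32_t>::max();
}

When the check fails, the code below falls through to materializing the
address in a register instead of folding.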
-  if ((MI.mayLoad() && FIOperandNum == WebAssembly::LoadAddressOperandNo) ||
-      (MI.mayStore() && FIOperandNum == WebAssembly::StoreAddressOperandNo)) {
-    assert(FrameOffset >= 0 && MI.getOperand(FIOperandNum - 1).getImm() >= 0);
-    int64_t Offset = MI.getOperand(FIOperandNum - 1).getImm() + FrameOffset;
+  unsigned AddrOperandNum = WebAssembly::getNamedOperandIdx(
+      MI.getOpcode(), WebAssembly::OpName::addr);
+  if (AddrOperandNum == FIOperandNum) {
+    unsigned OffsetOperandNum = WebAssembly::getNamedOperandIdx(
+        MI.getOpcode(), WebAssembly::OpName::off);
+    assert(FrameOffset >= 0 && MI.getOperand(OffsetOperandNum).getImm() >= 0);
+    int64_t Offset = MI.getOperand(OffsetOperandNum).getImm() + FrameOffset;
     if (static_cast<uint64_t>(Offset) <= std::numeric_limits<uint32_t>::max()) {
-      MI.getOperand(FIOperandNum - 1).setImm(Offset);
+      MI.getOperand(OffsetOperandNum).setImm(Offset);
       MI.getOperand(FIOperandNum)
-          .ChangeToRegister(FrameRegister, /*IsDef=*/false);
+          .ChangeToRegister(FrameRegister, /*isDef=*/false);
       return;
     }
   }
@@ -100,7 +102,7 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
       MachineOperand &ImmMO = Def->getOperand(1);
       ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset));
       MI.getOperand(FIOperandNum)
-          .ChangeToRegister(FrameRegister, /*IsDef=*/false);
+          .ChangeToRegister(FrameRegister, /*isDef=*/false);
       return;
     }
   }
@@ -125,10 +127,10 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
           .addReg(FrameRegister)
           .addReg(OffsetOp);
   }
-  MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*IsDef=*/false);
+  MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*isDef=*/false);
 }
 
-unsigned
+Register
 WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   static const unsigned Regs[2][2] = {
       /*            !isArch64Bit       isArch64Bit      */
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
index 2a73dfd4b065..7880eb217dbf 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
@@ -1,9 +1,8 @@
 // WebAssemblyRegisterInfo.h - WebAssembly Register Information Impl -*- C++ -*-
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -40,7 +39,7 @@ public:
                            RegScavenger *RS = nullptr) const override;
 
   // Debug information queries.
-  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  Register getFrameRegister(const MachineFunction &MF) const override;
 
   const TargetRegisterClass *
   getPointerRegClass(const MachineFunction &MF,
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index a7c3d177724d..6d3d6c723277 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -1,9 +1,8 @@
 //WebAssemblyRegisterInfo.td-Describe the WebAssembly Registers -*- tablegen -*-
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -44,7 +43,7 @@ def F64_0 : WebAssemblyReg<"%f64.0">; def V128_0: WebAssemblyReg<"%v128">; -def EXCEPT_REF_0 : WebAssemblyReg<"%except_ref.0">; +def EXNREF_0 : WebAssemblyReg<"%exnref.0">; // The value stack "register". This is an opaque entity which serves to order // uses and defs that must remain in LIFO order. @@ -65,4 +64,4 @@ def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>; def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>; def V128 : WebAssemblyRegClass<[v4f32, v2f64, v2i64, v4i32, v16i8, v8i16], 128, (add V128_0)>; -def EXCEPT_REF : WebAssemblyRegClass<[ExceptRef], 0, (add EXCEPT_REF_0)>; +def EXNREF : WebAssemblyRegClass<[exnref], 0, (add EXNREF_0)>; diff --git a/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp index e5a3e47a3bcd..5eafd6c54e78 100644 --- a/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp +++ b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyReplacePhysRegs.cpp - Replace phys regs with virt regs -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 6cf81a9d77b3..7b9ae90326f0 100644 --- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -1,9 +1,8 @@ // CodeGen/RuntimeLibcallSignatures.cpp - R.T. Lib. Call Signatures -*- C++ -*-- // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -52,6 +51,8 @@ enum RuntimeLibcallSignature {
   f64_func_f64_i32,
   f64_func_i64_i64,
   i16_func_f32,
+  i16_func_f64,
+  i16_func_i64_i64,
   i8_func_i8_i8,
   func_f32_iPTR_iPTR,
   func_f64_iPTR_iPTR,
@@ -85,6 +86,9 @@ enum RuntimeLibcallSignature {
   func_iPTR_i64_i64_i64_i64_i64_i64,
   i32_func_i64_i64,
   i32_func_i64_i64_i64_i64,
+  iPTR_func_f32,
+  iPTR_func_f64,
+  iPTR_func_i64_i64,
   unsupported
 };
 
@@ -215,6 +219,18 @@ struct RuntimeLibcallSignatureTable {
     Table[RTLIB::ROUND_F32] = f32_func_f32;
     Table[RTLIB::ROUND_F64] = f64_func_f64;
     Table[RTLIB::ROUND_F128] = func_iPTR_i64_i64;
+    Table[RTLIB::LROUND_F32] = iPTR_func_f32;
+    Table[RTLIB::LROUND_F64] = iPTR_func_f64;
+    Table[RTLIB::LROUND_F128] = iPTR_func_i64_i64;
+    Table[RTLIB::LLROUND_F32] = i64_func_f32;
+    Table[RTLIB::LLROUND_F64] = i64_func_f64;
+    Table[RTLIB::LLROUND_F128] = i64_func_i64_i64;
+    Table[RTLIB::LRINT_F32] = iPTR_func_f32;
+    Table[RTLIB::LRINT_F64] = iPTR_func_f64;
+    Table[RTLIB::LRINT_F128] = iPTR_func_i64_i64;
+    Table[RTLIB::LLRINT_F32] = i64_func_f32;
+    Table[RTLIB::LLRINT_F64] = i64_func_f64;
+    Table[RTLIB::LLRINT_F128] = i64_func_i64_i64;
     Table[RTLIB::FLOOR_F32] = f32_func_f32;
     Table[RTLIB::FLOOR_F64] = f64_func_f64;
     Table[RTLIB::FLOOR_F128] = func_iPTR_i64_i64;
@@ -229,13 +245,15 @@ struct RuntimeLibcallSignatureTable {
     Table[RTLIB::FMAX_F128] = func_iPTR_i64_i64_i64_i64;
 
     // Conversion
-    // All F80 and PPCF128 routines are unspported.
+    // All F80 and PPCF128 routines are unsupported.
     Table[RTLIB::FPEXT_F64_F128] = func_iPTR_f64;
     Table[RTLIB::FPEXT_F32_F128] = func_iPTR_f32;
     Table[RTLIB::FPEXT_F32_F64] = f64_func_f32;
     Table[RTLIB::FPEXT_F16_F32] = f32_func_i16;
     Table[RTLIB::FPROUND_F32_F16] = i16_func_f32;
+    Table[RTLIB::FPROUND_F64_F16] = i16_func_f64;
     Table[RTLIB::FPROUND_F64_F32] = f32_func_f64;
+    Table[RTLIB::FPROUND_F128_F16] = i16_func_i64_i64;
     Table[RTLIB::FPROUND_F128_F32] = f32_func_i64_i64;
     Table[RTLIB::FPROUND_F128_F64] = f64_func_i64_i64;
     Table[RTLIB::FPTOSINT_F32_I32] = i32_func_f32;
@@ -310,6 +328,12 @@ struct RuntimeLibcallSignatureTable {
     Table[RTLIB::MEMSET] = iPTR_func_iPTR_i32_iPTR;
     Table[RTLIB::MEMMOVE] = iPTR_func_iPTR_iPTR_iPTR;
 
+    // __stack_chk_fail
+    Table[RTLIB::STACKPROTECTOR_CHECK_FAIL] = func;
+
+    // Return address handling
+    Table[RTLIB::RETURN_ADDRESS] = i32_func_i32;
+
     // Element-wise Atomic memory
     // TODO: Fix these when we implement atomic support
     Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_1] = unsupported;
@@ -480,19 +504,25 @@ struct StaticLibcallNameMap {
       Map[NameLibcall.first] = NameLibcall.second;
     }
   }
+  // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is
+  // consistent with the f64 and f128 names.
+  Map["__extendhfsf2"] = RTLIB::FPEXT_F16_F32;
+  Map["__truncsfhf2"] = RTLIB::FPROUND_F32_F16;
+
+  Map["emscripten_return_address"] = RTLIB::RETURN_ADDRESS;
   }
 };
 
 } // end anonymous namespace
 
-void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
+void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
                                RTLIB::Libcall LC,
                                SmallVectorImpl<wasm::ValType> &Rets,
                                SmallVectorImpl<wasm::ValType> &Params) {
   assert(Rets.empty());
   assert(Params.empty());
 
-  wasm::ValType iPTR =
+  wasm::ValType PtrTy =
       Subtarget.hasAddr64() ?
wasm::ValType::I64 : wasm::ValType::I32; auto &Table = RuntimeLibcallSignatures->Table; @@ -593,6 +623,15 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, Rets.push_back(wasm::ValType::I32); Params.push_back(wasm::ValType::F32); break; + case i16_func_f64: + Rets.push_back(wasm::ValType::I32); + Params.push_back(wasm::ValType::F64); + break; + case i16_func_i64_i64: + Rets.push_back(wasm::ValType::I32); + Params.push_back(wasm::ValType::I64); + Params.push_back(wasm::ValType::I64); + break; case i8_func_i8_i8: Rets.push_back(wasm::ValType::I32); Params.push_back(wasm::ValType::I32); @@ -600,13 +639,13 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, break; case func_f32_iPTR_iPTR: Params.push_back(wasm::ValType::F32); - Params.push_back(iPTR); - Params.push_back(iPTR); + Params.push_back(PtrTy); + Params.push_back(PtrTy); break; case func_f64_iPTR_iPTR: Params.push_back(wasm::ValType::F64); - Params.push_back(iPTR); - Params.push_back(iPTR); + Params.push_back(PtrTy); + Params.push_back(PtrTy); break; case i16_func_i16_i16: Rets.push_back(wasm::ValType::I32); @@ -632,7 +671,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, Rets.push_back(wasm::ValType::I32); Params.push_back(wasm::ValType::I32); Params.push_back(wasm::ValType::I32); - Params.push_back(iPTR); + Params.push_back(PtrTy); break; case i64_func_i64_i64: Rets.push_back(wasm::ValType::I64); @@ -643,14 +682,14 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, Rets.push_back(wasm::ValType::I64); Params.push_back(wasm::ValType::I64); Params.push_back(wasm::ValType::I64); - Params.push_back(iPTR); + Params.push_back(PtrTy); break; case i64_i64_func_f32: #if 0 // TODO: Enable this when wasm gets multiple-return-value support. 
Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); #else - Params.push_back(iPTR); + Params.push_back(PtrTy); #endif Params.push_back(wasm::ValType::F32); break; @@ -659,7 +698,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); #else - Params.push_back(iPTR); + Params.push_back(PtrTy); #endif Params.push_back(wasm::ValType::F64); break; @@ -668,7 +707,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); #else - Params.push_back(iPTR); + Params.push_back(PtrTy); #endif Params.push_back(wasm::ValType::I32); Params.push_back(wasm::ValType::I32); @@ -678,7 +717,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); #else - Params.push_back(iPTR); + Params.push_back(PtrTy); #endif Params.push_back(wasm::ValType::I32); Params.push_back(wasm::ValType::I32); @@ -688,7 +727,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); #else - Params.push_back(iPTR); + Params.push_back(PtrTy); #endif Params.push_back(wasm::ValType::I64); Params.push_back(wasm::ValType::I64); @@ -698,7 +737,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); #else - Params.push_back(iPTR); + Params.push_back(PtrTy); #endif Params.push_back(wasm::ValType::I64); Params.push_back(wasm::ValType::I64); @@ -710,13 +749,13 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget, Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); #else - Params.push_back(iPTR); + Params.push_back(PtrTy); #endif Params.push_back(wasm::ValType::I64); Params.push_back(wasm::ValType::I64); Params.push_back(wasm::ValType::I64); Params.push_back(wasm::ValType::I64); - Params.push_back(iPTR); + Params.push_back(PtrTy); break; case i64_i64_i64_i64_func_i64_i64_i64_i64: #if 0 // TODO: Enable this when wasm gets multiple-return-value support. 
@@ -725,7 +764,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Rets.push_back(wasm::ValType::I64);
     Rets.push_back(wasm::ValType::I64);
 #else
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
 #endif
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
@@ -739,23 +778,23 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Rets.push_back(wasm::ValType::I64);
     Rets.push_back(wasm::ValType::I64);
 #else
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
 #endif
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I32);
     break;
   case iPTR_func_iPTR_i32_iPTR:
-    Rets.push_back(iPTR);
-    Params.push_back(iPTR);
+    Rets.push_back(PtrTy);
+    Params.push_back(PtrTy);
     Params.push_back(wasm::ValType::I32);
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
     break;
   case iPTR_func_iPTR_iPTR_iPTR:
-    Rets.push_back(iPTR);
-    Params.push_back(iPTR);
-    Params.push_back(iPTR);
-    Params.push_back(iPTR);
+    Rets.push_back(PtrTy);
+    Params.push_back(PtrTy);
+    Params.push_back(PtrTy);
+    Params.push_back(PtrTy);
     break;
   case f32_func_f32_f32_f32:
     Rets.push_back(wasm::ValType::F32);
@@ -772,39 +811,39 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
   case func_i64_i64_iPTR_iPTR:
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
-    Params.push_back(iPTR);
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
+    Params.push_back(PtrTy);
     break;
   case func_iPTR_f32:
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
     Params.push_back(wasm::ValType::F32);
     break;
   case func_iPTR_f64:
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
     Params.push_back(wasm::ValType::F64);
     break;
   case func_iPTR_i32:
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
     Params.push_back(wasm::ValType::I32);
     break;
   case func_iPTR_i64:
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
     Params.push_back(wasm::ValType::I64);
     break;
   case func_iPTR_i64_i64:
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
     break;
   case func_iPTR_i64_i64_i64_i64:
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
     break;
   case func_iPTR_i64_i64_i64_i64_i64_i64:
-    Params.push_back(iPTR);
+    Params.push_back(PtrTy);
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
@@ -824,6 +863,19 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
     break;
+  case iPTR_func_f32:
+    Rets.push_back(PtrTy);
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case iPTR_func_f64:
+    Rets.push_back(PtrTy);
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case iPTR_func_i64_i64:
+    Rets.push_back(PtrTy);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
   case unsupported:
     llvm_unreachable("unsupported runtime library signature");
   }
 }
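Editorial aside: the long switch above decodes each compact signature enum into
explicit return and parameter type lists, with iPTR entries resolving to i32 or
i64 by address width; the overload below then maps libcall names to the same
enums. The scheme in miniature, with illustrative names rather than the
backend's own:

#include <cassert>
#include <vector>

enum class ValType { I32, I64, F32, F64 };
enum class Sig { i64_func_f32, iPTR_func_f64 };

static void expandSignature(Sig S, bool HasAddr64,
                            std::vector<ValType> &Rets,
                            std::vector<ValType> &Params) {
  assert(Rets.empty() && Params.empty());
  ValType PtrTy = HasAddr64 ? ValType::I64 : ValType::I32;
  switch (S) {
  case Sig::i64_func_f32: // e.g. llroundf: returns long long, always i64
    Rets.push_back(ValType::I64);
    Params.push_back(ValType::F32);
    break;
  case Sig::iPTR_func_f64: // e.g. lround: returns pointer-sized long
    Rets.push_back(PtrTy);
    Params.push_back(ValType::F64);
    break;
  }
}

Keeping the table keyed by a small enum rather than full type lists is what
lets one entry serve both wasm32 and wasm64.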
@@ -832,12 +884,17 @@
 
 static ManagedStatic<StaticLibcallNameMap> LibcallNameMap;
 // TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unused
 // other than here, just roll its logic into this version.
-void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
+void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
                                const char *Name,
                                SmallVectorImpl<wasm::ValType> &Rets,
                                SmallVectorImpl<wasm::ValType> &Params) {
   auto &Map = LibcallNameMap->Map;
-  auto val = Map.find(Name);
-  assert(val != Map.end() && "unexpected runtime library name");
-  return GetLibcallSignature(Subtarget, val->second, Rets, Params);
+  auto Val = Map.find(Name);
+#ifndef NDEBUG
+  if (Val == Map.end()) {
+    auto message = std::string("unexpected runtime library name: ") + Name;
+    llvm_unreachable(message.c_str());
+  }
+#endif
+  return getLibcallSignature(Subtarget, Val->second, Rets, Params);
 }
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
index 7fa70bea96de..6ae8aaaba59c 100644
--- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -1,9 +1,8 @@
 // CodeGen/RuntimeLibcallSignatures.h - R.T. Lib. Call Signatures -*- C++ -*--//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -23,12 +22,12 @@ namespace llvm {
 
 class WebAssemblySubtarget;
 
-extern void GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
+extern void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
                                 RTLIB::Libcall LC,
                                 SmallVectorImpl<wasm::ValType> &Rets,
                                 SmallVectorImpl<wasm::ValType> &Params);
 
-extern void GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
+extern void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
                                 const char *Name,
                                 SmallVectorImpl<wasm::ValType> &Rets,
                                 SmallVectorImpl<wasm::ValType> &Params);
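Editorial aside before the next file: the SelectionDAG hooks added there emit
wasm's bulk-memory instructions when the feature is enabled and otherwise
return a null SDValue so the generic libcall lowering runs instead. A toy
sketch of that policy decision, a hypothetical standalone function rather than
the actual DAG code:

#include <cstdint>
#include <string>

static std::string lowerMemcpy(bool HasBulkMemory, uint64_t Length) {
  if (!HasBulkMemory)
    return "call $memcpy"; // fall back to the runtime libcall
  // memory.copy names destination and source memory indices (both 0 until
  // a multi-memory proposal lands) and takes an i32 length operand.
  return "memory.copy 0 0 ;; length = " +
         std::to_string(static_cast<uint32_t>(Length));
}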
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
index bec72049258a..890e4b8e4e2a 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
 //===-- WebAssemblySelectionDAGInfo.cpp - WebAssembly SelectionDAG Info ---===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -17,4 +16,44 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-selectiondag-info"
 
-WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() {}
+WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor
+
+SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool IsVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  if (!DAG.getMachineFunction()
+           .getSubtarget<WebAssemblySubtarget>()
+           .hasBulkMemory())
+    return SDValue();
+
+  SDValue MemIdx = DAG.getConstant(0, DL, MVT::i32);
+  return DAG.getNode(WebAssemblyISD::MEMORY_COPY, DL, MVT::Other,
+                     {Chain, MemIdx, MemIdx, Dst, Src,
+                      DAG.getZExtOrTrunc(Size, DL, MVT::i32)});
+}
+
+SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemmove(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Op1, SDValue Op2,
+    SDValue Op3, unsigned Align, bool IsVolatile,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  return EmitTargetCodeForMemcpy(DAG, DL, Chain, Op1, Op2, Op3, Align,
+                                 IsVolatile, false, DstPtrInfo,
+                                 SrcPtrInfo);
+}
+
+SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Val,
+    SDValue Size, unsigned Align, bool IsVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  if (!DAG.getMachineFunction()
+           .getSubtarget<WebAssemblySubtarget>()
+           .hasBulkMemory())
+    return SDValue();
+
+  SDValue MemIdx = DAG.getConstant(0, DL, MVT::i32);
+  // Only low byte matters for val argument, so anyext the i8
+  return DAG.getNode(WebAssemblyISD::MEMORY_FILL, DL, MVT::Other, Chain, MemIdx,
+                     Dst, DAG.getAnyExtOrTrunc(Val, DL, MVT::i32),
+                     DAG.getZExtOrTrunc(Size, DL, MVT::i32));
+}
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index 31d150eded67..0b90ece27dff 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -1,9 +1,8 @@
 //=- WebAssemblySelectionDAGInfo.h - WebAssembly SelectionDAG Info -*- C++ -*-//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -23,6 +22,21 @@ namespace llvm { class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo { public: ~WebAssemblySelectionDAGInfo() override; + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Op1, SDValue Op2, + SDValue Op3, unsigned Align, bool isVolatile, + bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const override; + SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Op1, SDValue Op2, + SDValue Op3, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const override; + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL, + SDValue Chain, SDValue Op1, SDValue Op2, + SDValue Op3, unsigned Align, bool IsVolatile, + MachinePointerInfo DstPtrInfo) const override; }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp index c95af88c6f43..a249ccf17638 100644 --- a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp +++ b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp @@ -1,9 +1,8 @@ //=- WebAssemblySetP2AlignOperands.cpp - Set alignments on loads and stores -=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -14,6 +13,7 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" +#include "WebAssemblyInstrInfo.h" #include "WebAssemblyMachineFunctionInfo.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -54,7 +54,7 @@ FunctionPass *llvm::createWebAssemblySetP2AlignOperands() { return new WebAssemblySetP2AlignOperands(); } -static void RewriteP2Align(MachineInstr &MI, unsigned OperandNo) { +static void rewriteP2Align(MachineInstr &MI, unsigned OperandNo) { assert(MI.getOperand(OperandNo).getImm() == 0 && "ISel should set p2align operands to 0"); assert(MI.hasOneMemOperand() && @@ -84,114 +84,11 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) { for (auto &MBB : MF) { for (auto &MI : MBB) { - switch (MI.getOpcode()) { - case WebAssembly::LOAD_I32: - case WebAssembly::LOAD_I64: - case WebAssembly::LOAD_F32: - case WebAssembly::LOAD_F64: - case WebAssembly::LOAD_v16i8: - case WebAssembly::LOAD_v8i16: - case WebAssembly::LOAD_v4i32: - case WebAssembly::LOAD_v2i64: - case WebAssembly::LOAD_v4f32: - case WebAssembly::LOAD_v2f64: - case WebAssembly::LOAD8_S_I32: - case WebAssembly::LOAD8_U_I32: - case WebAssembly::LOAD16_S_I32: - case WebAssembly::LOAD16_U_I32: - case WebAssembly::LOAD8_S_I64: - case WebAssembly::LOAD8_U_I64: - case WebAssembly::LOAD16_S_I64: - case WebAssembly::LOAD16_U_I64: - case WebAssembly::LOAD32_S_I64: - case WebAssembly::LOAD32_U_I64: - case WebAssembly::ATOMIC_LOAD_I32: - case WebAssembly::ATOMIC_LOAD8_U_I32: - case WebAssembly::ATOMIC_LOAD16_U_I32: - case WebAssembly::ATOMIC_LOAD_I64: - case 
WebAssembly::ATOMIC_LOAD8_U_I64: - case WebAssembly::ATOMIC_LOAD16_U_I64: - case WebAssembly::ATOMIC_LOAD32_U_I64: - case WebAssembly::ATOMIC_RMW8_U_ADD_I32: - case WebAssembly::ATOMIC_RMW8_U_ADD_I64: - case WebAssembly::ATOMIC_RMW8_U_SUB_I32: - case WebAssembly::ATOMIC_RMW8_U_SUB_I64: - case WebAssembly::ATOMIC_RMW8_U_AND_I32: - case WebAssembly::ATOMIC_RMW8_U_AND_I64: - case WebAssembly::ATOMIC_RMW8_U_OR_I32: - case WebAssembly::ATOMIC_RMW8_U_OR_I64: - case WebAssembly::ATOMIC_RMW8_U_XOR_I32: - case WebAssembly::ATOMIC_RMW8_U_XOR_I64: - case WebAssembly::ATOMIC_RMW8_U_XCHG_I32: - case WebAssembly::ATOMIC_RMW8_U_XCHG_I64: - case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I32: - case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64: - case WebAssembly::ATOMIC_RMW16_U_ADD_I32: - case WebAssembly::ATOMIC_RMW16_U_ADD_I64: - case WebAssembly::ATOMIC_RMW16_U_SUB_I32: - case WebAssembly::ATOMIC_RMW16_U_SUB_I64: - case WebAssembly::ATOMIC_RMW16_U_AND_I32: - case WebAssembly::ATOMIC_RMW16_U_AND_I64: - case WebAssembly::ATOMIC_RMW16_U_OR_I32: - case WebAssembly::ATOMIC_RMW16_U_OR_I64: - case WebAssembly::ATOMIC_RMW16_U_XOR_I32: - case WebAssembly::ATOMIC_RMW16_U_XOR_I64: - case WebAssembly::ATOMIC_RMW16_U_XCHG_I32: - case WebAssembly::ATOMIC_RMW16_U_XCHG_I64: - case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I32: - case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64: - case WebAssembly::ATOMIC_RMW_ADD_I32: - case WebAssembly::ATOMIC_RMW32_U_ADD_I64: - case WebAssembly::ATOMIC_RMW_SUB_I32: - case WebAssembly::ATOMIC_RMW32_U_SUB_I64: - case WebAssembly::ATOMIC_RMW_AND_I32: - case WebAssembly::ATOMIC_RMW32_U_AND_I64: - case WebAssembly::ATOMIC_RMW_OR_I32: - case WebAssembly::ATOMIC_RMW32_U_OR_I64: - case WebAssembly::ATOMIC_RMW_XOR_I32: - case WebAssembly::ATOMIC_RMW32_U_XOR_I64: - case WebAssembly::ATOMIC_RMW_XCHG_I32: - case WebAssembly::ATOMIC_RMW32_U_XCHG_I64: - case WebAssembly::ATOMIC_RMW_CMPXCHG_I32: - case WebAssembly::ATOMIC_RMW32_U_CMPXCHG_I64: - case WebAssembly::ATOMIC_RMW_ADD_I64: - case WebAssembly::ATOMIC_RMW_SUB_I64: - case WebAssembly::ATOMIC_RMW_AND_I64: - case WebAssembly::ATOMIC_RMW_OR_I64: - case WebAssembly::ATOMIC_RMW_XOR_I64: - case WebAssembly::ATOMIC_RMW_XCHG_I64: - case WebAssembly::ATOMIC_RMW_CMPXCHG_I64: - case WebAssembly::ATOMIC_NOTIFY: - case WebAssembly::ATOMIC_WAIT_I32: - case WebAssembly::ATOMIC_WAIT_I64: - RewriteP2Align(MI, WebAssembly::LoadP2AlignOperandNo); - break; - case WebAssembly::STORE_I32: - case WebAssembly::STORE_I64: - case WebAssembly::STORE_F32: - case WebAssembly::STORE_F64: - case WebAssembly::STORE_v16i8: - case WebAssembly::STORE_v8i16: - case WebAssembly::STORE_v4i32: - case WebAssembly::STORE_v2i64: - case WebAssembly::STORE_v4f32: - case WebAssembly::STORE_v2f64: - case WebAssembly::STORE8_I32: - case WebAssembly::STORE16_I32: - case WebAssembly::STORE8_I64: - case WebAssembly::STORE16_I64: - case WebAssembly::STORE32_I64: - case WebAssembly::ATOMIC_STORE_I32: - case WebAssembly::ATOMIC_STORE8_I32: - case WebAssembly::ATOMIC_STORE16_I32: - case WebAssembly::ATOMIC_STORE_I64: - case WebAssembly::ATOMIC_STORE8_I64: - case WebAssembly::ATOMIC_STORE16_I64: - case WebAssembly::ATOMIC_STORE32_I64: - RewriteP2Align(MI, WebAssembly::StoreP2AlignOperandNo); - break; - default: - break; + int16_t P2AlignOpNum = WebAssembly::getNamedOperandIdx( + MI.getOpcode(), WebAssembly::OpName::p2align); + if (P2AlignOpNum != -1) { + rewriteP2Align(MI, P2AlignOpNum); + Changed = true; } } } diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp 
b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 98133e2153a0..196a74565285 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -1,9 +1,8 @@
 //===-- WebAssemblySubtarget.cpp - WebAssembly Subtarget Information ------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -45,6 +44,11 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
       InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
       TLInfo(TM, *this) {}
 
+bool WebAssemblySubtarget::enableAtomicExpand() const {
+  // If atomics are disabled, atomic ops are lowered instead of expanded
+  return hasAtomics();
+}
+
 bool WebAssemblySubtarget::enableMachineScheduler() const {
   // Disable the MachineScheduler for now. Even with ShouldTrackPressure set and
   // enableMachineSchedDefaultSched overridden, it appears to have an overall
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.h b/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 0a0c04609ac4..8db2120f9834 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -1,9 +1,8 @@
 //=- WebAssemblySubtarget.h - Define Subtarget for the WebAssembly -*- C++ -*-//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -23,11 +22,16 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include <string>
 
+#define GET_SUBTARGETINFO_ENUM
 #define GET_SUBTARGETINFO_HEADER
 #include "WebAssemblyGenSubtargetInfo.inc"
 
 namespace llvm {
 
+// Defined in WebAssemblyGenSubtargetInfo.inc.
+extern const SubtargetFeatureKV
+    WebAssemblyFeatureKV[WebAssembly::NumSubtargetFeatures];
+
 class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
   enum SIMDEnum {
     NoSIMD,
@@ -39,6 +43,10 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
   bool HasNontrappingFPToInt = false;
   bool HasSignExt = false;
   bool HasExceptionHandling = false;
+  bool HasBulkMemory = false;
+  bool HasMultivalue = false;
+  bool HasMutableGlobals = false;
+  bool HasTailCall = false;
 
   /// String name of used CPU.
std::string CPUString; @@ -77,6 +85,8 @@ public: return &getInstrInfo()->getRegisterInfo(); } const Triple &getTargetTriple() const { return TargetTriple; } + bool enableAtomicExpand() const override; + bool enableIndirectBrExpand() const override { return true; } bool enableMachineScheduler() const override; bool useAA() const override; @@ -90,6 +100,10 @@ public: bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; } bool hasSignExt() const { return HasSignExt; } bool hasExceptionHandling() const { return HasExceptionHandling; } + bool hasBulkMemory() const { return HasBulkMemory; } + bool hasMultivalue() const { return HasMultivalue; } + bool hasMutableGlobals() const { return HasMutableGlobals; } + bool hasTailCall() const { return HasTailCall; } /// Parses features string setting specified subtarget options. Definition of /// function is auto generated by tblgen. diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 3bf8dd40892c..7e65368e671a 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -1,9 +1,8 @@ //===- WebAssemblyTargetMachine.cpp - Define TargetMachine for WebAssembly -==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -14,9 +13,12 @@ #include "WebAssemblyTargetMachine.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "TargetInfo/WebAssemblyTargetInfo.h" #include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblyTargetObjectFile.h" #include "WebAssemblyTargetTransformInfo.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" @@ -25,6 +27,7 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LowerAtomic.h" #include "llvm/Transforms/Utils.h" using namespace llvm; @@ -58,19 +61,18 @@ extern "C" void LLVMInitializeWebAssemblyTarget() { initializeOptimizeReturnedPass(PR); initializeWebAssemblyArgumentMovePass(PR); initializeWebAssemblySetP2AlignOperandsPass(PR); - initializeWebAssemblyEHRestoreStackPointerPass(PR); initializeWebAssemblyReplacePhysRegsPass(PR); initializeWebAssemblyPrepareForLiveIntervalsPass(PR); initializeWebAssemblyOptimizeLiveIntervalsPass(PR); initializeWebAssemblyMemIntrinsicResultsPass(PR); initializeWebAssemblyRegStackifyPass(PR); initializeWebAssemblyRegColoringPass(PR); - initializeWebAssemblyExplicitLocalsPass(PR); initializeWebAssemblyFixIrreducibleControlFlowPass(PR); initializeWebAssemblyLateEHPreparePass(PR); initializeWebAssemblyExceptionInfoPass(PR); initializeWebAssemblyCFGSortPass(PR); initializeWebAssemblyCFGStackifyPass(PR); + initializeWebAssemblyExplicitLocalsPass(PR); initializeWebAssemblyLowerBrUnlessPass(PR); initializeWebAssemblyRegNumberingPass(PR); initializeWebAssemblyPeepholePass(PR); @@ -81,13 +83,22 @@ extern "C" void LLVMInitializeWebAssemblyTarget() { // WebAssembly Lowering public interface. 
//===----------------------------------------------------------------------===//
 
-static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM,
+                                           const Triple &TT) {
   if (!RM.hasValue()) {
     // Default to static relocation model. This should always be more optimal
     // than PIC since the static linker can determine all global addresses and
     // assume direct function calls.
     return Reloc::Static;
   }
+
+  if (!TT.isOSEmscripten()) {
+    // Relocation modes other than static are currently implemented in a way
+    // that only works for Emscripten, so disable them if we aren't targeting
+    // Emscripten.
+    return Reloc::Static;
+  }
+
   return *RM;
 }
 
@@ -100,7 +111,7 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
     : LLVMTargetMachine(T,
                         TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128"
                                          : "e-m:e-p:32:32-i64:64-n32:64-S128",
-                        TT, CPU, FS, Options, getEffectiveRelocModel(RM),
+                        TT, CPU, FS, Options, getEffectiveRelocModel(RM, TT),
                         getEffectiveCodeModel(CM, CodeModel::Large), OL),
       TLOF(new WebAssemblyTargetObjectFile()) {
   // WebAssembly type-checks instructions, but a noreturn function with a return
@@ -122,7 +133,17 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
   // splitting and tail merging.
 }
 
-WebAssemblyTargetMachine::~WebAssemblyTargetMachine() {}
+WebAssemblyTargetMachine::~WebAssemblyTargetMachine() = default; // anchor.
+
+const WebAssemblySubtarget *
+WebAssemblyTargetMachine::getSubtargetImpl(std::string CPU,
+                                           std::string FS) const {
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
+  }
+  return I.get();
+}
 
 const WebAssemblySubtarget *
 WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
@@ -136,33 +157,141 @@ WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
                         ? FSAttr.getValueAsString().str()
                         : TargetFS;
 
-  auto &I = SubtargetMap[CPU + FS];
-  if (!I) {
-    // This needs to be done before we create a new subtarget since any
-    // creation will depend on the TM and the code generation flags on the
-    // function that reside in TargetOptions.
-    resetTargetOptions(F);
-    I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
-  }
-  return I.get();
+  // This needs to be done before we create a new subtarget since any
+  // creation will depend on the TM and the code generation flags on the
+  // function that reside in TargetOptions.
+  resetTargetOptions(F);
+
+  return getSubtargetImpl(CPU, FS);
 }
 
 namespace {
-class StripThreadLocal final : public ModulePass {
-  // The default thread model for wasm is single, where thread-local variables
-  // are identical to regular globals and should be treated the same. So this
-  // pass just converts all GlobalVariables to NotThreadLocal
+
+class CoalesceFeaturesAndStripAtomics final : public ModulePass {
+  // Take the union of all features used in the module and use it for each
+  // function individually, since having multiple feature sets in one module
+  // currently does not make sense for WebAssembly. If atomics are not enabled,
+  // also strip atomic operations and thread local storage.
static char ID; + WebAssemblyTargetMachine *WasmTM; public: - StripThreadLocal() : ModulePass(ID) {} + CoalesceFeaturesAndStripAtomics(WebAssemblyTargetMachine *WasmTM) + : ModulePass(ID), WasmTM(WasmTM) {} + bool runOnModule(Module &M) override { - for (auto &GV : M.globals()) - GV.setThreadLocalMode(GlobalValue::ThreadLocalMode::NotThreadLocal); + FeatureBitset Features = coalesceFeatures(M); + + std::string FeatureStr = getFeatureString(Features); + for (auto &F : M) + replaceFeatures(F, FeatureStr); + + bool StrippedAtomics = false; + bool StrippedTLS = false; + + if (!Features[WebAssembly::FeatureAtomics]) + StrippedAtomics = stripAtomics(M); + + if (!Features[WebAssembly::FeatureBulkMemory]) + StrippedTLS = stripThreadLocals(M); + + if (StrippedAtomics && !StrippedTLS) + stripThreadLocals(M); + else if (StrippedTLS && !StrippedAtomics) + stripAtomics(M); + + recordFeatures(M, Features, StrippedAtomics || StrippedTLS); + + // Conservatively assume we have made some change + return true; + } + +private: + FeatureBitset coalesceFeatures(const Module &M) { + FeatureBitset Features = + WasmTM + ->getSubtargetImpl(WasmTM->getTargetCPU(), + WasmTM->getTargetFeatureString()) + ->getFeatureBits(); + for (auto &F : M) + Features |= WasmTM->getSubtargetImpl(F)->getFeatureBits(); + return Features; + } + + std::string getFeatureString(const FeatureBitset &Features) { + std::string Ret; + for (const SubtargetFeatureKV &KV : WebAssemblyFeatureKV) { + if (Features[KV.Value]) + Ret += (StringRef("+") + KV.Key + ",").str(); + } + return Ret; + } + + void replaceFeatures(Function &F, const std::string &Features) { + F.removeFnAttr("target-features"); + F.removeFnAttr("target-cpu"); + F.addFnAttr("target-features", Features); + } + + bool stripAtomics(Module &M) { + // Detect whether any atomics will be lowered, since there is no way to tell + // whether the LowerAtomic pass lowers e.g. stores. + bool Stripped = false; + for (auto &F : M) { + for (auto &B : F) { + for (auto &I : B) { + if (I.isAtomic()) { + Stripped = true; + goto done; + } + } + } + } + + done: + if (!Stripped) + return false; + + LowerAtomicPass Lowerer; + FunctionAnalysisManager FAM; + for (auto &F : M) + Lowerer.run(F, FAM); + return true; } + + bool stripThreadLocals(Module &M) { + bool Stripped = false; + for (auto &GV : M.globals()) { + if (GV.getThreadLocalMode() != + GlobalValue::ThreadLocalMode::NotThreadLocal) { + Stripped = true; + GV.setThreadLocalMode(GlobalValue::ThreadLocalMode::NotThreadLocal); + } + } + return Stripped; + } + + void recordFeatures(Module &M, const FeatureBitset &Features, bool Stripped) { + for (const SubtargetFeatureKV &KV : WebAssemblyFeatureKV) { + std::string MDKey = (StringRef("wasm-feature-") + KV.Key).str(); + if (KV.Value == WebAssembly::FeatureAtomics && Stripped) { + // "atomics" is special: code compiled without atomics may have had its + // atomics lowered to nonatomic operations. In that case, atomics is + // disallowed to prevent unsafe linking with atomics-enabled objects. 
+ assert(!Features[WebAssembly::FeatureAtomics] || + !Features[WebAssembly::FeatureBulkMemory]); + M.addModuleFlag(Module::ModFlagBehavior::Error, MDKey, + wasm::WASM_FEATURE_PREFIX_DISALLOWED); + } else if (Features[KV.Value]) { + // Otherwise features are marked Used or not mentioned + M.addModuleFlag(Module::ModFlagBehavior::Error, MDKey, + wasm::WASM_FEATURE_PREFIX_USED); + } + } + } }; -char StripThreadLocal::ID = 0; +char CoalesceFeaturesAndStripAtomics::ID = 0; /// WebAssembly Code Generator Pass Configuration Options. class WebAssemblyPassConfig final : public TargetPassConfig { @@ -181,6 +310,12 @@ public: void addPostRegAlloc() override; bool addGCPasses() override { return false; } void addPreEmitPass() override; + + // No reg alloc + bool addRegAssignmentFast() override { return false; } + + // No reg alloc + bool addRegAssignmentOptimized() override { return false; } }; } // end anonymous namespace @@ -204,15 +339,11 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { //===----------------------------------------------------------------------===// void WebAssemblyPassConfig::addIRPasses() { - if (TM->Options.ThreadModel == ThreadModel::Single) { - // In "single" mode, atomics get lowered to non-atomics. - addPass(createLowerAtomicPass()); - addPass(new StripThreadLocal()); - } else { - // Expand some atomic operations. WebAssemblyTargetLowering has hooks which - // control specifically what gets lowered. - addPass(createAtomicExpandPass()); - } + // Runs LowerAtomicPass if necessary + addPass(new CoalesceFeaturesAndStripAtomics(&getWebAssemblyTargetMachine())); + + // This is a no-op if atomics are not used in the module + addPass(createAtomicExpandPass()); // Add signatures to prototype-less function declarations addPass(createWebAssemblyAddMissingPrototypes()); @@ -246,6 +377,9 @@ void WebAssemblyPassConfig::addIRPasses() { addPass(createWebAssemblyLowerEmscriptenEHSjLj(EnableEmException, EnableEmSjLj)); + // Expand indirectbr instructions to switches. + addPass(createIndirectBrExpandPass()); + TargetPassConfig::addIRPasses(); } @@ -279,20 +413,16 @@ void WebAssemblyPassConfig::addPostRegAlloc() { disablePass(&PatchableFunctionID); disablePass(&ShrinkWrapID); + // This pass hurts code size for wasm because it can generate irreducible + // control flow. + disablePass(&MachineBlockPlacementID); + TargetPassConfig::addPostRegAlloc(); } void WebAssemblyPassConfig::addPreEmitPass() { TargetPassConfig::addPreEmitPass(); - // Restore __stack_pointer global after an exception is thrown. - addPass(createWebAssemblyEHRestoreStackPointer()); - - // Now that we have a prologue and epilogue and all frame indices are - // rewritten, eliminate SP and FP. This allows them to be stackified, - // colored, and numbered with the rest of the registers. - addPass(createWebAssemblyReplacePhysRegs()); - // Rewrite pseudo call_indirect instructions as real instructions. // This needs to run before register stackification, because we change the // order of the arguments. @@ -302,8 +432,15 @@ void WebAssemblyPassConfig::addPreEmitPass() { addPass(createWebAssemblyFixIrreducibleControlFlow()); // Do various transformations for exception handling. + // Every CFG-changing optimizations should come before this. addPass(createWebAssemblyLateEHPrepare()); + // Now that we have a prologue and epilogue and all frame indices are + // rewritten, eliminate SP and FP. This allows them to be stackified, + // colored, and numbered with the rest of the registers. 
+  addPass(createWebAssemblyReplacePhysRegs());
+
+  // Preparations and optimizations related to register stackification.
   if (getOptLevel() != CodeGenOpt::None) {
     // LiveIntervals isn't commonly run this late. Re-establish preconditions.
     addPass(createWebAssemblyPrepareForLiveIntervals());
@@ -327,9 +464,6 @@ void WebAssemblyPassConfig::addPreEmitPass() {
     addPass(createWebAssemblyRegColoring());
   }
 
-  // Insert explicit local.get and local.set operators.
-  addPass(createWebAssemblyExplicitLocals());
-
   // Sort the blocks of the CFG into topological order, a prerequisite for
   // BLOCK and LOOP markers.
   addPass(createWebAssemblyCFGSort());
@@ -337,6 +471,9 @@ void WebAssemblyPassConfig::addPreEmitPass() {
   // Insert BLOCK and LOOP markers.
   addPass(createWebAssemblyCFGStackify());
 
+  // Insert explicit local.get and local.set operators.
+  addPass(createWebAssemblyExplicitLocals());
+
   // Lower br_unless into br_if.
   addPass(createWebAssemblyLowerBrUnless());
 
@@ -347,3 +484,24 @@ void WebAssemblyPassConfig::addPreEmitPass() {
   // Create a mapping from LLVM CodeGen virtual registers to wasm registers.
   addPass(createWebAssemblyRegNumbering());
 }
+
+yaml::MachineFunctionInfo *
+WebAssemblyTargetMachine::createDefaultFuncInfoYAML() const {
+  return new yaml::WebAssemblyFunctionInfo();
+}
+
+yaml::MachineFunctionInfo *WebAssemblyTargetMachine::convertFuncInfoToYAML(
+    const MachineFunction &MF) const {
+  const auto *MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+  return new yaml::WebAssemblyFunctionInfo(*MFI);
+}
+
+bool WebAssemblyTargetMachine::parseMachineFunctionInfo(
+    const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS,
+    SMDiagnostic &Error, SMRange &SourceRange) const {
+  const auto &YamlMFI =
+      reinterpret_cast<const yaml::WebAssemblyFunctionInfo &>(MFI);
+  MachineFunction &MF = PFS.MF;
+  MF.getInfo<WebAssemblyFunctionInfo>()->initializeBaseYamlFields(YamlMFI);
+  return false;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index 41001e7a0cc7..850e6b9a9e9e 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -1,9 +1,8 @@
 // WebAssemblyTargetMachine.h - Define TargetMachine for WebAssembly -*- C++ -*-
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -33,6 +32,9 @@ public: bool JIT); ~WebAssemblyTargetMachine() override; + + const WebAssemblySubtarget *getSubtargetImpl(std::string CPU, + std::string FS) const; const WebAssemblySubtarget * getSubtargetImpl(const Function &F) const override; @@ -46,6 +48,14 @@ public: TargetTransformInfo getTargetTransformInfo(const Function &F) override; bool usesPhysRegsForPEI() const override { return false; } + + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; + yaml::MachineFunctionInfo * + convertFuncInfoToYAML(const MachineFunction &MF) const override; + bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, + PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, + SMRange &SourceRange) const override; }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp index 0459bfca418d..ad57c600db10 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyTargetObjectFile.cpp - WebAssembly Object Info ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h index ce744ba8b8e8..f46bb2040a7d 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- WebAssemblyTargetObjectFile.h - WebAssembly Object Info -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 4a2777cc3a9f..46ef765ce0f4 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -1,9 +1,8 @@ //===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -51,7 +50,7 @@ unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
   unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
       Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);

-  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
     switch (Opcode) {
     case Instruction::LShr:
     case Instruction::AShr:
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 4300ca3defbf..1b11b4b631eb 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -1,9 +1,8 @@
 //==- WebAssemblyTargetTransformInfo.h - WebAssembly-specific TTI -*- C++ -*-=//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index ada6fb9a96d7..e9d88d4818a5 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -1,9 +1,8 @@
 //===-- WebAssemblyUtilities.cpp - WebAssembly Utility Functions ----------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -25,70 +24,6 @@ const char *const WebAssembly::StdTerminateFn = "_ZSt9terminatev"; const char *const WebAssembly::PersonalityWrapperFn = "_Unwind_Wasm_CallPersonality"; -bool WebAssembly::isArgument(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::ARGUMENT_i32: - case WebAssembly::ARGUMENT_i32_S: - case WebAssembly::ARGUMENT_i64: - case WebAssembly::ARGUMENT_i64_S: - case WebAssembly::ARGUMENT_f32: - case WebAssembly::ARGUMENT_f32_S: - case WebAssembly::ARGUMENT_f64: - case WebAssembly::ARGUMENT_f64_S: - case WebAssembly::ARGUMENT_v16i8: - case WebAssembly::ARGUMENT_v16i8_S: - case WebAssembly::ARGUMENT_v8i16: - case WebAssembly::ARGUMENT_v8i16_S: - case WebAssembly::ARGUMENT_v4i32: - case WebAssembly::ARGUMENT_v4i32_S: - case WebAssembly::ARGUMENT_v2i64: - case WebAssembly::ARGUMENT_v2i64_S: - case WebAssembly::ARGUMENT_v4f32: - case WebAssembly::ARGUMENT_v4f32_S: - case WebAssembly::ARGUMENT_v2f64: - case WebAssembly::ARGUMENT_v2f64_S: - return true; - default: - return false; - } -} - -bool WebAssembly::isCopy(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::COPY_I32: - case WebAssembly::COPY_I32_S: - case WebAssembly::COPY_I64: - case WebAssembly::COPY_I64_S: - case WebAssembly::COPY_F32: - case WebAssembly::COPY_F32_S: - case WebAssembly::COPY_F64: - case WebAssembly::COPY_F64_S: - case WebAssembly::COPY_V128: - case WebAssembly::COPY_V128_S: - return true; - default: - return false; - } -} - -bool WebAssembly::isTee(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::TEE_I32: - case WebAssembly::TEE_I32_S: - case WebAssembly::TEE_I64: - case WebAssembly::TEE_I64_S: - case WebAssembly::TEE_F32: - case WebAssembly::TEE_F32_S: - case WebAssembly::TEE_F64: - case WebAssembly::TEE_F64_S: - case WebAssembly::TEE_V128: - case WebAssembly::TEE_V128_S: - return true; - default: - return false; - } -} - /// Test whether MI is a child of some other node in an expression tree. 
bool WebAssembly::isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI) { @@ -102,201 +37,20 @@ bool WebAssembly::isChild(const MachineInstr &MI, MFI.isVRegStackified(Reg); } -bool WebAssembly::isCallDirect(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::CALL_VOID: - case WebAssembly::CALL_VOID_S: - case WebAssembly::CALL_I32: - case WebAssembly::CALL_I32_S: - case WebAssembly::CALL_I64: - case WebAssembly::CALL_I64_S: - case WebAssembly::CALL_F32: - case WebAssembly::CALL_F32_S: - case WebAssembly::CALL_F64: - case WebAssembly::CALL_F64_S: - case WebAssembly::CALL_v16i8: - case WebAssembly::CALL_v16i8_S: - case WebAssembly::CALL_v8i16: - case WebAssembly::CALL_v8i16_S: - case WebAssembly::CALL_v4i32: - case WebAssembly::CALL_v4i32_S: - case WebAssembly::CALL_v2i64: - case WebAssembly::CALL_v2i64_S: - case WebAssembly::CALL_v4f32: - case WebAssembly::CALL_v4f32_S: - case WebAssembly::CALL_v2f64: - case WebAssembly::CALL_v2f64_S: - case WebAssembly::CALL_EXCEPT_REF: - case WebAssembly::CALL_EXCEPT_REF_S: - return true; - default: - return false; - } -} - -bool WebAssembly::isCallIndirect(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::CALL_INDIRECT_VOID: - case WebAssembly::CALL_INDIRECT_VOID_S: - case WebAssembly::CALL_INDIRECT_I32: - case WebAssembly::CALL_INDIRECT_I32_S: - case WebAssembly::CALL_INDIRECT_I64: - case WebAssembly::CALL_INDIRECT_I64_S: - case WebAssembly::CALL_INDIRECT_F32: - case WebAssembly::CALL_INDIRECT_F32_S: - case WebAssembly::CALL_INDIRECT_F64: - case WebAssembly::CALL_INDIRECT_F64_S: - case WebAssembly::CALL_INDIRECT_v16i8: - case WebAssembly::CALL_INDIRECT_v16i8_S: - case WebAssembly::CALL_INDIRECT_v8i16: - case WebAssembly::CALL_INDIRECT_v8i16_S: - case WebAssembly::CALL_INDIRECT_v4i32: - case WebAssembly::CALL_INDIRECT_v4i32_S: - case WebAssembly::CALL_INDIRECT_v2i64: - case WebAssembly::CALL_INDIRECT_v2i64_S: - case WebAssembly::CALL_INDIRECT_v4f32: - case WebAssembly::CALL_INDIRECT_v4f32_S: - case WebAssembly::CALL_INDIRECT_v2f64: - case WebAssembly::CALL_INDIRECT_v2f64_S: - case WebAssembly::CALL_INDIRECT_EXCEPT_REF: - case WebAssembly::CALL_INDIRECT_EXCEPT_REF_S: - return true; - default: - return false; - } -} - -unsigned WebAssembly::getCalleeOpNo(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::CALL_VOID: - case WebAssembly::CALL_VOID_S: - case WebAssembly::CALL_INDIRECT_VOID: - case WebAssembly::CALL_INDIRECT_VOID_S: - return 0; - case WebAssembly::CALL_I32: - case WebAssembly::CALL_I32_S: - case WebAssembly::CALL_I64: - case WebAssembly::CALL_I64_S: - case WebAssembly::CALL_F32: - case WebAssembly::CALL_F32_S: - case WebAssembly::CALL_F64: - case WebAssembly::CALL_F64_S: - case WebAssembly::CALL_v16i8: - case WebAssembly::CALL_v16i8_S: - case WebAssembly::CALL_v8i16: - case WebAssembly::CALL_v8i16_S: - case WebAssembly::CALL_v4i32: - case WebAssembly::CALL_v4i32_S: - case WebAssembly::CALL_v2i64: - case WebAssembly::CALL_v2i64_S: - case WebAssembly::CALL_v4f32: - case WebAssembly::CALL_v4f32_S: - case WebAssembly::CALL_v2f64: - case WebAssembly::CALL_v2f64_S: - case WebAssembly::CALL_EXCEPT_REF: - case WebAssembly::CALL_EXCEPT_REF_S: - case WebAssembly::CALL_INDIRECT_I32: - case WebAssembly::CALL_INDIRECT_I32_S: - case WebAssembly::CALL_INDIRECT_I64: - case WebAssembly::CALL_INDIRECT_I64_S: - case WebAssembly::CALL_INDIRECT_F32: - case WebAssembly::CALL_INDIRECT_F32_S: - case WebAssembly::CALL_INDIRECT_F64: - case WebAssembly::CALL_INDIRECT_F64_S: - case 
WebAssembly::CALL_INDIRECT_v16i8: - case WebAssembly::CALL_INDIRECT_v16i8_S: - case WebAssembly::CALL_INDIRECT_v8i16: - case WebAssembly::CALL_INDIRECT_v8i16_S: - case WebAssembly::CALL_INDIRECT_v4i32: - case WebAssembly::CALL_INDIRECT_v4i32_S: - case WebAssembly::CALL_INDIRECT_v2i64: - case WebAssembly::CALL_INDIRECT_v2i64_S: - case WebAssembly::CALL_INDIRECT_v4f32: - case WebAssembly::CALL_INDIRECT_v4f32_S: - case WebAssembly::CALL_INDIRECT_v2f64: - case WebAssembly::CALL_INDIRECT_v2f64_S: - case WebAssembly::CALL_INDIRECT_EXCEPT_REF: - case WebAssembly::CALL_INDIRECT_EXCEPT_REF_S: - return 1; - default: - llvm_unreachable("Not a call instruction"); - } -} - -bool WebAssembly::isMarker(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::BLOCK: - case WebAssembly::BLOCK_S: - case WebAssembly::END_BLOCK: - case WebAssembly::END_BLOCK_S: - case WebAssembly::LOOP: - case WebAssembly::LOOP_S: - case WebAssembly::END_LOOP: - case WebAssembly::END_LOOP_S: - case WebAssembly::TRY: - case WebAssembly::TRY_S: - case WebAssembly::END_TRY: - case WebAssembly::END_TRY_S: - return true; - default: - return false; - } -} - -bool WebAssembly::isThrow(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::THROW_I32: - case WebAssembly::THROW_I32_S: - case WebAssembly::THROW_I64: - case WebAssembly::THROW_I64_S: - return true; - default: - return false; - } -} - -bool WebAssembly::isRethrow(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::RETHROW: - case WebAssembly::RETHROW_S: - case WebAssembly::RETHROW_TO_CALLER: - case WebAssembly::RETHROW_TO_CALLER_S: - return true; - default: - return false; - } -} - -bool WebAssembly::isCatch(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case WebAssembly::CATCH_I32: - case WebAssembly::CATCH_I32_S: - case WebAssembly::CATCH_I64: - case WebAssembly::CATCH_I64_S: - case WebAssembly::CATCH_ALL: - case WebAssembly::CATCH_ALL_S: - return true; - default: - return false; - } -} - bool WebAssembly::mayThrow(const MachineInstr &MI) { switch (MI.getOpcode()) { - case WebAssembly::THROW_I32: - case WebAssembly::THROW_I32_S: - case WebAssembly::THROW_I64: - case WebAssembly::THROW_I64_S: + case WebAssembly::THROW: + case WebAssembly::THROW_S: case WebAssembly::RETHROW: case WebAssembly::RETHROW_S: return true; } - if (isCallIndirect(MI)) + if (isCallIndirect(MI.getOpcode())) return true; if (!MI.isCall()) return false; - const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI)); + const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI.getOpcode())); assert(MO.isGlobal()); const auto *F = dyn_cast(MO.getGlobal()); if (!F) @@ -307,43 +61,8 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) { if (F->getName() == CxaBeginCatchFn || F->getName() == PersonalityWrapperFn || F->getName() == ClangCallTerminateFn || F->getName() == StdTerminateFn) return false; - return true; -} - -bool WebAssembly::isCatchTerminatePad(const MachineBasicBlock &MBB) { - if (!MBB.isEHPad()) - return false; - bool SeenCatch = false; - for (auto &MI : MBB) { - if (MI.getOpcode() == WebAssembly::CATCH_I32 || - MI.getOpcode() == WebAssembly::CATCH_I64 || - MI.getOpcode() == WebAssembly::CATCH_I32_S || - MI.getOpcode() == WebAssembly::CATCH_I64_S) - SeenCatch = true; - if (SeenCatch && MI.isCall()) { - const MachineOperand &CalleeOp = MI.getOperand(getCalleeOpNo(MI)); - if (CalleeOp.isGlobal() && - CalleeOp.getGlobal()->getName() == ClangCallTerminateFn) - return true; - } - } - return false; -} -bool 
WebAssembly::isCatchAllTerminatePad(const MachineBasicBlock &MBB) {
-  if (!MBB.isEHPad())
-    return false;
-  bool SeenCatchAll = false;
-  for (auto &MI : MBB) {
-    if (MI.getOpcode() == WebAssembly::CATCH_ALL ||
-        MI.getOpcode() == WebAssembly::CATCH_ALL_S)
-      SeenCatchAll = true;
-    if (SeenCatchAll && MI.isCall()) {
-      const MachineOperand &CalleeOp = MI.getOperand(getCalleeOpNo(MI));
-      if (CalleeOp.isGlobal() &&
-          CalleeOp.getGlobal()->getName() == StdTerminateFn)
-        return true;
-    }
-  }
-  return false;
+  // TODO Can we exclude call instructions that are marked as 'nounwind' in the
+  // original LLVM IR? (Even when the callee may throw)
+  return true;
 }
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.h b/lib/Target/WebAssembly/WebAssemblyUtilities.h
index cdb7873e9013..26cf84de89b9 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -1,9 +1,8 @@
 //===-- WebAssemblyUtilities - WebAssembly Utility Functions ---*- C++ -*-====//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -24,29 +23,9 @@ class WebAssemblyFunctionInfo;

 namespace WebAssembly {

-bool isArgument(const MachineInstr &MI);
-bool isCopy(const MachineInstr &MI);
-bool isTee(const MachineInstr &MI);
 bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
-bool isCallDirect(const MachineInstr &MI);
-bool isCallIndirect(const MachineInstr &MI);
-bool isMarker(const MachineInstr &MI);
-bool isThrow(const MachineInstr &MI);
-bool isRethrow(const MachineInstr &MI);
-bool isCatch(const MachineInstr &MI);
 bool mayThrow(const MachineInstr &MI);

-/// Returns the operand number of a callee, assuming the argument is a call
-/// instruction.
-unsigned getCalleeOpNo(const MachineInstr &MI);
-
-/// Returns if the given BB is a single BB terminate pad which starts with a
-/// 'catch' instruction.
-bool isCatchTerminatePad(const MachineBasicBlock &MBB);
-/// Returns if the given BB is a single BB terminate pad which starts with a
-/// 'catch_all' insrtruction.
-bool isCatchAllTerminatePad(const MachineBasicBlock &MBB);
-
 // Exception-related function names
 extern const char *const ClangCallTerminateFn;
 extern const char *const CxaBeginCatchFn;
diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 364c871f61b0..701b347bcbd7 100644
--- a/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -6,21 +6,13 @@
 # error). The format is
 #
 # <name> <attributes> # comment
-# Computed gotos are not supported (Cannot select BlockAddress/BRIND)
-20071220-1.c
+# blockaddress without an indirectbr still can't be supported
+20071220-1.c O2
 # Relocation against a BB address
 20071220-2.c
-20040302-1.c
-20041214-1.c O0
-20071210-1.c
-920501-4.c
-920501-5.c
-comp-goto-1.c
-980526-1.c
 990208-1.c

 label13.C O0
 label13a.C O0
 label3.C
-pr42462.C O0
 # WebAssembly hasn't implemented (will never?)
__builtin_return_address 20010122-1.c @@ -75,7 +67,6 @@ pr41935.c 920501-3.c 920728-1.c pr28865.c -widechar-2.c attr-alias-1.C attr-alias-2.C attr-ifunc-1.C @@ -86,7 +77,6 @@ complit12.C va-arg-pack-1.C va-arg-pack-len-1.C builtin-line1.C -builtin-location.C devirt-6.C # bad main signature devirt-13.C # bad main signature devirt-14.C # bad main signature @@ -94,11 +84,22 @@ devirt-21.C # bad main signature devirt-23.C # bad main signature lifetime2.C # violates C++ DR1696 +# WASI doesn't have stdjmp.h yet +pr56982.c +simd-2.C + +# WASI doesn't have pthread.h yet +thread_local3.C +thread_local3g.C +thread_local4.C +thread_local4g.C +thread_local5.C +thread_local5g.C + # Untriaged C++ failures spec5.C addr1.C ef_test.C -friend18.C member2.C new39.C new40.C diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp deleted file mode 100644 index 2c376fd062ca..000000000000 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ /dev/null @@ -1,1089 +0,0 @@ -//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "X86AsmInstrumentation.h" -#include "MCTargetDesc/X86MCTargetDesc.h" -#include "X86Operand.h" -#include "llvm/ADT/Triple.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDwarf.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstBuilder.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCParser/MCTargetAsmParser.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCTargetOptions.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/SMLoc.h" -#include -#include -#include -#include -#include -#include - -// Following comment describes how assembly instrumentation works. -// Currently we have only AddressSanitizer instrumentation, but we're -// planning to implement MemorySanitizer for inline assembly too. If -// you're not familiar with AddressSanitizer algorithm, please, read -// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm -// -// When inline assembly is parsed by an instance of X86AsmParser, all -// instructions are emitted via EmitInstruction method. That's the -// place where X86AsmInstrumentation analyzes an instruction and -// decides, whether the instruction should be emitted as is or -// instrumentation is required. The latter case happens when an -// instruction reads from or writes to memory. Now instruction opcode -// is explicitly checked, and if an instruction has a memory operand -// (for instance, movq (%rsi, %rcx, 8), %rax) - it should be -// instrumented. There're also exist instructions that modify -// memory but don't have an explicit memory operands, for instance, -// movs. -// -// Let's consider at first 8-byte memory accesses when an instruction -// has an explicit memory operand. In this case we need two registers - -// AddressReg to compute address of a memory cells which are accessed -// and ShadowReg to compute corresponding shadow address. 
So, we need -// to spill both registers before instrumentation code and restore them -// after instrumentation. Thus, in general, instrumentation code will -// look like this: -// PUSHF # Store flags, otherwise they will be overwritten -// PUSH AddressReg # spill AddressReg -// PUSH ShadowReg # spill ShadowReg -// LEA MemOp, AddressReg # compute address of the memory operand -// MOV AddressReg, ShadowReg -// SHR ShadowReg, 3 -// # ShadowOffset(AddressReg >> 3) contains address of a shadow -// # corresponding to MemOp. -// CMP ShadowOffset(ShadowReg), 0 # test shadow value -// JZ .Done # when shadow equals to zero, everything is fine -// MOV AddressReg, RDI -// # Call __asan_report function with AddressReg as an argument -// CALL __asan_report -// .Done: -// POP ShadowReg # Restore ShadowReg -// POP AddressReg # Restore AddressReg -// POPF # Restore flags -// -// Memory accesses with different size (1-, 2-, 4- and 16-byte) are -// handled in a similar manner, but small memory accesses (less than 8 -// byte) require an additional ScratchReg, which is used for shadow value. -// -// If, suppose, we're instrumenting an instruction like movs, only -// contents of RDI, RDI + AccessSize * RCX, RSI, RSI + AccessSize * -// RCX are checked. In this case there're no need to spill and restore -// AddressReg , ShadowReg or flags four times, they're saved on stack -// just once, before instrumentation of these four addresses, and restored -// at the end of the instrumentation. -// -// There exist several things which complicate this simple algorithm. -// * Instrumented memory operand can have RSP as a base or an index -// register. So we need to add a constant offset before computation -// of memory address, since flags, AddressReg, ShadowReg, etc. were -// already stored on stack and RSP was modified. -// * Debug info (usually, DWARF) should be adjusted, because sometimes -// RSP is used as a frame register. So, we need to select some -// register as a frame register and temprorary override current CFA -// register. 
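The comment above is the clearest surviving description of the check this deleted pass emitted, so a compact restatement may help: for an 8-byte access, ASan derives a shadow address from the application address and reports when the shadow byte is non-zero. The C++ below is a minimal illustrative sketch, not the pass itself; it assumes the 32-bit shadow offset 0x20000000 that X86AddressSanitizer32 uses further down (the 64-bit variant uses 0x7fff8000), and `__asan_report_store8` stands in for the runtime's reporting entry point.

// Hedged sketch of the shadow check described in the listing above (8-byte store).
#include <cstdint>

extern "C" void __asan_report_store8(uintptr_t Addr); // provided by the ASan runtime

inline void CheckShadowForStore8(uintptr_t Addr) {
  // One shadow byte covers eight application bytes: Shadow = (Addr >> 3) + Offset.
  uint8_t Shadow = *reinterpret_cast<uint8_t *>((Addr >> 3) + 0x20000000u);
  if (Shadow != 0)              // non-zero shadow byte: the access is invalid
    __asan_report_store8(Addr); // mirrors the CMP/JZ/CALL __asan_report sequence
}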
-
-using namespace llvm;
-
-static cl::opt<bool> ClAsanInstrumentAssembly(
-    "asan-instrument-assembly",
-    cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
-    cl::init(false));
-
-static const int64_t MinAllowedDisplacement =
-    std::numeric_limits<int32_t>::min();
-static const int64_t MaxAllowedDisplacement =
-    std::numeric_limits<int32_t>::max();
-
-static int64_t ApplyDisplacementBounds(int64_t Displacement) {
-  return std::max(std::min(MaxAllowedDisplacement, Displacement),
-                  MinAllowedDisplacement);
-}
-
-static void CheckDisplacementBounds(int64_t Displacement) {
-  assert(Displacement >= MinAllowedDisplacement &&
-         Displacement <= MaxAllowedDisplacement);
-}
-
-static bool IsStackReg(unsigned Reg) {
-  return Reg == X86::RSP || Reg == X86::ESP;
-}
-
-static bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
-
-namespace {
-
-class X86AddressSanitizer : public X86AsmInstrumentation {
-public:
-  struct RegisterContext {
-  private:
-    enum RegOffset {
-      REG_OFFSET_ADDRESS = 0,
-      REG_OFFSET_SHADOW,
-      REG_OFFSET_SCRATCH
-    };
-
-  public:
-    RegisterContext(unsigned AddressReg, unsigned ShadowReg,
-                    unsigned ScratchReg) {
-      BusyRegs.push_back(convReg(AddressReg, 64));
-      BusyRegs.push_back(convReg(ShadowReg, 64));
-      BusyRegs.push_back(convReg(ScratchReg, 64));
-    }
-
-    unsigned AddressReg(unsigned Size) const {
-      return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size);
-    }
-
-    unsigned ShadowReg(unsigned Size) const {
-      return convReg(BusyRegs[REG_OFFSET_SHADOW], Size);
-    }
-
-    unsigned ScratchReg(unsigned Size) const {
-      return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size);
-    }
-
-    void AddBusyReg(unsigned Reg) {
-      if (Reg != X86::NoRegister)
-        BusyRegs.push_back(convReg(Reg, 64));
-    }
-
-    void AddBusyRegs(const X86Operand &Op) {
-      AddBusyReg(Op.getMemBaseReg());
-      AddBusyReg(Op.getMemIndexReg());
-    }
-
-    unsigned ChooseFrameReg(unsigned Size) const {
-      static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
-                                              X86::RCX, X86::RDX, X86::RDI,
-                                              X86::RSI };
-      for (unsigned Reg : Candidates) {
-        if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg))
-          return convReg(Reg, Size);
-      }
-      return X86::NoRegister;
-    }
-
-  private:
-    unsigned convReg(unsigned Reg, unsigned Size) const {
-      return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size);
-    }
-
-    std::vector<unsigned> BusyRegs;
-  };
-
-  X86AddressSanitizer(const MCSubtargetInfo *&STI)
-      : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
-
-  ~X86AddressSanitizer() override = default;
-
-  // X86AsmInstrumentation implementation:
-  void InstrumentAndEmitInstruction(const MCInst &Inst, OperandVector &Operands,
-                                    MCContext &Ctx, const MCInstrInfo &MII,
-                                    MCStreamer &Out,
-                                    /* unused */ bool) override {
-    InstrumentMOVS(Inst, Operands, Ctx, MII, Out);
-    if (RepPrefix)
-      EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX));
-
-    InstrumentMOV(Inst, Operands, Ctx, MII, Out);
-
-    RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX);
-    if (!RepPrefix)
-      EmitInstruction(Out, Inst);
-  }
-
-  // Adjusts up stack and saves all registers used in instrumentation.
-  virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
-                                            MCContext &Ctx,
-                                            MCStreamer &Out) = 0;
-
-  // Restores all registers used in instrumentation and adjusts stack.
- virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) = 0; - - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, MCStreamer &Out) = 0; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, MCStreamer &Out) = 0; - - virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) = 0; - - void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, - MCStreamer &Out); - void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg, - unsigned AccessSize, MCContext &Ctx, MCStreamer &Out); - - void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); - void InstrumentMOV(const MCInst &Inst, OperandVector &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); - -protected: - void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } - - void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) { - assert(Size == 32 || Size == 64); - MCInst Inst; - Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r); - Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size))); - Op.addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - - void ComputeMemOperandAddress(X86Operand &Op, unsigned Size, - unsigned Reg, MCContext &Ctx, MCStreamer &Out); - - // Creates new memory operand with Displacement added to an original - // displacement. Residue will contain a residue which could happen when the - // total displacement exceeds 32-bit limitation. - std::unique_ptr AddDisplacement(X86Operand &Op, - int64_t Displacement, - MCContext &Ctx, int64_t *Residue); - - bool is64BitMode() const { - return STI->getFeatureBits()[X86::Mode64Bit]; - } - - bool is32BitMode() const { - return STI->getFeatureBits()[X86::Mode32Bit]; - } - - bool is16BitMode() const { - return STI->getFeatureBits()[X86::Mode16Bit]; - } - - unsigned getPointerWidth() { - if (is16BitMode()) return 16; - if (is32BitMode()) return 32; - if (is64BitMode()) return 64; - llvm_unreachable("invalid mode"); - } - - // True when previous instruction was actually REP prefix. - bool RepPrefix; - - // Offset from the original SP register. - int64_t OrigSPOffset; -}; - -void X86AddressSanitizer::InstrumentMemOperand( - X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - assert(Op.isMem() && "Op should be a memory operand."); - assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && - "AccessSize should be a power of two, less or equal than 16."); - // FIXME: take into account load/store alignment. - if (IsSmallMemAccess(AccessSize)) - InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); - else - InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); -} - -void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, - unsigned CntReg, - unsigned AccessSize, - MCContext &Ctx, MCStreamer &Out) { - // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)] - // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)]. - RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */, - IsSmallMemAccess(AccessSize) - ? 
X86::RBX - : X86::NoRegister /* ScratchReg */); - RegCtx.AddBusyReg(DstReg); - RegCtx.AddBusyReg(SrcReg); - RegCtx.AddBusyReg(CntReg); - - InstrumentMemOperandPrologue(RegCtx, Ctx, Out); - - // Test (%SrcReg) - { - const MCExpr *Disp = MCConstantExpr::create(0, Ctx); - std::unique_ptr Op(X86Operand::CreateMem( - getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc())); - InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, - Out); - } - - // Test -1(%SrcReg, %CntReg, AccessSize) - { - const MCExpr *Disp = MCConstantExpr::create(-1, Ctx); - std::unique_ptr Op(X86Operand::CreateMem( - getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), - SMLoc())); - InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, - Out); - } - - // Test (%DstReg) - { - const MCExpr *Disp = MCConstantExpr::create(0, Ctx); - std::unique_ptr Op(X86Operand::CreateMem( - getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc())); - InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); - } - - // Test -1(%DstReg, %CntReg, AccessSize) - { - const MCExpr *Disp = MCConstantExpr::create(-1, Ctx); - std::unique_ptr Op(X86Operand::CreateMem( - getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(), - SMLoc())); - InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); - } - - InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); -} - -void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst, - OperandVector &Operands, - MCContext &Ctx, const MCInstrInfo &MII, - MCStreamer &Out) { - // Access size in bytes. - unsigned AccessSize = 0; - - switch (Inst.getOpcode()) { - case X86::MOVSB: - AccessSize = 1; - break; - case X86::MOVSW: - AccessSize = 2; - break; - case X86::MOVSL: - AccessSize = 4; - break; - case X86::MOVSQ: - AccessSize = 8; - break; - default: - return; - } - - InstrumentMOVSImpl(AccessSize, Ctx, Out); -} - -void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, - OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) { - // Access size in bytes. - unsigned AccessSize = 0; - - switch (Inst.getOpcode()) { - case X86::MOV8mi: - case X86::MOV8mr: - case X86::MOV8rm: - AccessSize = 1; - break; - case X86::MOV16mi: - case X86::MOV16mr: - case X86::MOV16rm: - AccessSize = 2; - break; - case X86::MOV32mi: - case X86::MOV32mr: - case X86::MOV32rm: - AccessSize = 4; - break; - case X86::MOV64mi32: - case X86::MOV64mr: - case X86::MOV64rm: - AccessSize = 8; - break; - case X86::MOVAPDmr: - case X86::MOVAPSmr: - case X86::MOVAPDrm: - case X86::MOVAPSrm: - AccessSize = 16; - break; - default: - return; - } - - const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); - - for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { - assert(Operands[Ix]); - MCParsedAsmOperand &Op = *Operands[Ix]; - if (Op.isMem()) { - X86Operand &MemOp = static_cast(Op); - RegisterContext RegCtx( - X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */, - IsSmallMemAccess(AccessSize) ? 
X86::RCX - : X86::NoRegister /* ScratchReg */); - RegCtx.AddBusyRegs(MemOp); - InstrumentMemOperandPrologue(RegCtx, Ctx, Out); - InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out); - InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); - } - } -} - -void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, - unsigned Size, - unsigned Reg, MCContext &Ctx, - MCStreamer &Out) { - int64_t Displacement = 0; - if (IsStackReg(Op.getMemBaseReg())) - Displacement -= OrigSPOffset; - if (IsStackReg(Op.getMemIndexReg())) - Displacement -= OrigSPOffset * Op.getMemScale(); - - assert(Displacement >= 0); - - // Emit Op as is. - if (Displacement == 0) { - EmitLEA(Op, Size, Reg, Out); - return; - } - - int64_t Residue; - std::unique_ptr NewOp = - AddDisplacement(Op, Displacement, Ctx, &Residue); - EmitLEA(*NewOp, Size, Reg, Out); - - while (Residue != 0) { - const MCConstantExpr *Disp = - MCConstantExpr::create(ApplyDisplacementBounds(Residue), Ctx); - std::unique_ptr DispOp = - X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), - SMLoc()); - EmitLEA(*DispOp, Size, Reg, Out); - Residue -= Disp->getValue(); - } -} - -std::unique_ptr -X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement, - MCContext &Ctx, int64_t *Residue) { - assert(Displacement >= 0); - - if (Displacement == 0 || - (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) { - *Residue = Displacement; - return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), - Op.getMemDisp(), Op.getMemBaseReg(), - Op.getMemIndexReg(), Op.getMemScale(), - SMLoc(), SMLoc()); - } - - int64_t OrigDisplacement = - static_cast(Op.getMemDisp())->getValue(); - CheckDisplacementBounds(OrigDisplacement); - Displacement += OrigDisplacement; - - int64_t NewDisplacement = ApplyDisplacementBounds(Displacement); - CheckDisplacementBounds(NewDisplacement); - - *Residue = Displacement - NewDisplacement; - const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx); - return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp, - Op.getMemBaseReg(), Op.getMemIndexReg(), - Op.getMemScale(), SMLoc(), SMLoc()); -} - -class X86AddressSanitizer32 : public X86AddressSanitizer { -public: - static const long kShadowOffset = 0x20000000; - - X86AddressSanitizer32(const MCSubtargetInfo *&STI) - : X86AddressSanitizer(STI) {} - - ~X86AddressSanitizer32() override = default; - - unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { - unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); - if (FrameReg == X86::NoRegister) - return FrameReg; - return getX86SubSuperRegister(FrameReg, 32); - } - - void SpillReg(MCStreamer &Out, unsigned Reg) { - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg)); - OrigSPOffset -= 4; - } - - void RestoreReg(MCStreamer &Out, unsigned Reg) { - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg)); - OrigSPOffset += 4; - } - - void StoreFlags(MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); - OrigSPOffset -= 4; - } - - void RestoreFlags(MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::POPF32)); - OrigSPOffset += 4; - } - - void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); - assert(LocalFrameReg != X86::NoRegister); - - const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); - unsigned FrameReg = GetFrameReg(Ctx, Out); - if (MRI && FrameReg != X86::NoRegister) { - SpillReg(Out, LocalFrameReg); 
- if (FrameReg == X86::ESP) { - Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */); - Out.EmitCFIRelOffset( - MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); - } - EmitInstruction( - Out, - MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg)); - Out.EmitCFIRememberState(); - Out.EmitCFIDefCfaRegister( - MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); - } - - SpillReg(Out, RegCtx.AddressReg(32)); - SpillReg(Out, RegCtx.ShadowReg(32)); - if (RegCtx.ScratchReg(32) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(32)); - StoreFlags(Out); - } - - void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); - assert(LocalFrameReg != X86::NoRegister); - - RestoreFlags(Out); - if (RegCtx.ScratchReg(32) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(32)); - RestoreReg(Out, RegCtx.ShadowReg(32)); - RestoreReg(Out, RegCtx.AddressReg(32)); - - unsigned FrameReg = GetFrameReg(Ctx, Out); - if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { - RestoreReg(Out, LocalFrameReg); - Out.EmitCFIRestoreState(); - if (FrameReg == X86::ESP) - Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */); - } - } - - void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) override; - -private: - void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out, const RegisterContext &RegCtx) { - EmitInstruction(Out, MCInstBuilder(X86::CLD)); - EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); - - EmitInstruction(Out, MCInstBuilder(X86::AND32ri8) - .addReg(X86::ESP) - .addReg(X86::ESP) - .addImm(-16)); - EmitInstruction( - Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32))); - - MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") + - (IsWrite ? 
"store" : "load") + - Twine(AccessSize)); - const MCSymbolRefExpr *FnExpr = - MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); - } -}; - -void X86AddressSanitizer32::InstrumentMemOperandSmall( - X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - - assert(RegCtx.ScratchReg(32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - - ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); - - EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( - AddressRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) - .addReg(ShadowRegI32) - .addReg(ShadowRegI32) - .addImm(3)); - - { - MCInst Inst; - Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::createReg(ShadowRegI8)); - const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); - std::unique_ptr Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, - SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - - EmitInstruction( - Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( - AddressRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::AND32ri) - .addReg(ScratchRegI32) - .addReg(ScratchRegI32) - .addImm(7)); - - switch (AccessSize) { - default: llvm_unreachable("Incorrect access size"); - case 1: - break; - case 2: { - const MCExpr *Disp = MCConstantExpr::create(1, Ctx); - std::unique_ptr Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, - SMLoc(), SMLoc())); - EmitLEA(*Op, 32, ScratchRegI32, Out); - break; - } - case 4: - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) - .addReg(ScratchRegI32) - .addReg(ScratchRegI32) - .addImm(3)); - break; - } - - EmitInstruction( - Out, - MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); - EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( - ShadowRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); - - EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); - EmitLabel(Out, DoneSym); -} - -void X86AddressSanitizer32::InstrumentMemOperandLarge( - X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - - ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); - - EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( - AddressRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) - .addReg(ShadowRegI32) - .addReg(ShadowRegI32) - .addImm(3)); - { - MCInst Inst; - switch (AccessSize) { - default: llvm_unreachable("Incorrect access size"); - case 8: - Inst.setOpcode(X86::CMP8mi); - break; - case 16: - Inst.setOpcode(X86::CMP16mi); - break; - } - const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); - std::unique_ptr Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, - SMLoc(), 
SMLoc())); - Op->addMemOperands(Inst, 5); - Inst.addOperand(MCOperand::createImm(0)); - EmitInstruction(Out, Inst); - } - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); - EmitLabel(Out, DoneSym); -} - -void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize, - MCContext &Ctx, - MCStreamer &Out) { - StoreFlags(Out); - - // No need to test when ECX is equals to zero. - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction( - Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - // Instrument first and last elements in src and dst range. - InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */, - X86::ECX /* CntReg */, AccessSize, Ctx, Out); - - EmitLabel(Out, DoneSym); - RestoreFlags(Out); -} - -class X86AddressSanitizer64 : public X86AddressSanitizer { -public: - static const long kShadowOffset = 0x7fff8000; - - X86AddressSanitizer64(const MCSubtargetInfo *&STI) - : X86AddressSanitizer(STI) {} - - ~X86AddressSanitizer64() override = default; - - unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { - unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); - if (FrameReg == X86::NoRegister) - return FrameReg; - return getX86SubSuperRegister(FrameReg, 64); - } - - void SpillReg(MCStreamer &Out, unsigned Reg) { - EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg)); - OrigSPOffset -= 8; - } - - void RestoreReg(MCStreamer &Out, unsigned Reg) { - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg)); - OrigSPOffset += 8; - } - - void StoreFlags(MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); - OrigSPOffset -= 8; - } - - void RestoreFlags(MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::POPF64)); - OrigSPOffset += 8; - } - - void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); - assert(LocalFrameReg != X86::NoRegister); - - const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); - unsigned FrameReg = GetFrameReg(Ctx, Out); - if (MRI && FrameReg != X86::NoRegister) { - SpillReg(Out, X86::RBP); - if (FrameReg == X86::RSP) { - Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */); - Out.EmitCFIRelOffset( - MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); - } - EmitInstruction( - Out, - MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg)); - Out.EmitCFIRememberState(); - Out.EmitCFIDefCfaRegister( - MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); - } - - EmitAdjustRSP(Ctx, Out, -128); - SpillReg(Out, RegCtx.ShadowReg(64)); - SpillReg(Out, RegCtx.AddressReg(64)); - if (RegCtx.ScratchReg(64) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(64)); - StoreFlags(Out); - } - - void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); - assert(LocalFrameReg != X86::NoRegister); - - RestoreFlags(Out); - if (RegCtx.ScratchReg(64) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(64)); - RestoreReg(Out, RegCtx.AddressReg(64)); - RestoreReg(Out, RegCtx.ShadowReg(64)); - EmitAdjustRSP(Ctx, Out, 128); - - unsigned FrameReg = 
GetFrameReg(Ctx, Out); - if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { - RestoreReg(Out, LocalFrameReg); - Out.EmitCFIRestoreState(); - if (FrameReg == X86::RSP) - Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */); - } - } - - void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) override; - -private: - void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { - const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx); - std::unique_ptr Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1, - SMLoc(), SMLoc())); - EmitLEA(*Op, 64, X86::RSP, Out); - OrigSPOffset += Offset; - } - - void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out, const RegisterContext &RegCtx) { - EmitInstruction(Out, MCInstBuilder(X86::CLD)); - EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); - - EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) - .addReg(X86::RSP) - .addReg(X86::RSP) - .addImm(-16)); - - if (RegCtx.AddressReg(64) != X86::RDI) { - EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( - RegCtx.AddressReg(64))); - } - MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") + - (IsWrite ? "store" : "load") + - Twine(AccessSize)); - const MCSymbolRefExpr *FnExpr = - MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); - } -}; - -} // end anonymous namespace - -void X86AddressSanitizer64::InstrumentMemOperandSmall( - X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(64); - unsigned AddressRegI32 = RegCtx.AddressReg(32); - unsigned ShadowRegI64 = RegCtx.ShadowReg(64); - unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - - assert(RegCtx.ScratchReg(32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - - ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); - - EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( - AddressRegI64)); - EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) - .addReg(ShadowRegI64) - .addReg(ShadowRegI64) - .addImm(3)); - { - MCInst Inst; - Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::createReg(ShadowRegI8)); - const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); - std::unique_ptr Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, - SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - - EmitInstruction( - Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( - AddressRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::AND32ri) - .addReg(ScratchRegI32) - .addReg(ScratchRegI32) - .addImm(7)); - - switch (AccessSize) { - default: llvm_unreachable("Incorrect access 
size"); - case 1: - break; - case 2: { - const MCExpr *Disp = MCConstantExpr::create(1, Ctx); - std::unique_ptr Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, - SMLoc(), SMLoc())); - EmitLEA(*Op, 32, ScratchRegI32, Out); - break; - } - case 4: - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) - .addReg(ScratchRegI32) - .addReg(ScratchRegI32) - .addImm(3)); - break; - } - - EmitInstruction( - Out, - MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); - EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( - ShadowRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); - - EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); - EmitLabel(Out, DoneSym); -} - -void X86AddressSanitizer64::InstrumentMemOperandLarge( - X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(64); - unsigned ShadowRegI64 = RegCtx.ShadowReg(64); - - ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); - - EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( - AddressRegI64)); - EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) - .addReg(ShadowRegI64) - .addReg(ShadowRegI64) - .addImm(3)); - { - MCInst Inst; - switch (AccessSize) { - default: llvm_unreachable("Incorrect access size"); - case 8: - Inst.setOpcode(X86::CMP8mi); - break; - case 16: - Inst.setOpcode(X86::CMP16mi); - break; - } - const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); - std::unique_ptr Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, - SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - Inst.addOperand(MCOperand::createImm(0)); - EmitInstruction(Out, Inst); - } - - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); - EmitLabel(Out, DoneSym); -} - -void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, - MCContext &Ctx, - MCStreamer &Out) { - StoreFlags(Out); - - // No need to test when RCX is equals to zero. - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction( - Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX)); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - // Instrument first and last elements in src and dst range. 
- InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */, - X86::RCX /* CntReg */, AccessSize, Ctx, Out); - - EmitLabel(Out, DoneSym); - RestoreFlags(Out); -} - -X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI) - : STI(STI) {} - -X86AsmInstrumentation::~X86AsmInstrumentation() = default; - -void X86AsmInstrumentation::InstrumentAndEmitInstruction( - const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, MCStreamer &Out, bool PrintSchedInfoEnabled) { - EmitInstruction(Out, Inst, PrintSchedInfoEnabled); -} - -void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst, - bool PrintSchedInfoEnabled) { - Out.EmitInstruction(Inst, *STI, PrintSchedInfoEnabled); -} - -unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, - MCStreamer &Out) { - if (!Out.getNumFrameInfos()) // No active dwarf frame - return X86::NoRegister; - const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back(); - if (Frame.End) // Active dwarf frame is closed - return X86::NoRegister; - const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); - if (!MRI) // No register info - return X86::NoRegister; - - if (InitialFrameReg) { - // FrameReg is set explicitly, we're instrumenting a MachineFunction. - return InitialFrameReg; - } - - return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */); -} - -X86AsmInstrumentation * -llvm::CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, - const MCSubtargetInfo *&STI) { - Triple T(STI->getTargetTriple()); - const bool hasCompilerRTSupport = T.isOSLinux(); - if (ClAsanInstrumentAssembly && hasCompilerRTSupport && - MCOptions.SanitizeAddress) { - if (STI->getFeatureBits()[X86::Mode32Bit] != 0) - return new X86AddressSanitizer32(STI); - if (STI->getFeatureBits()[X86::Mode64Bit] != 0) - return new X86AddressSanitizer64(STI); - } - return new X86AsmInstrumentation(STI); -} diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h deleted file mode 100644 index 42a9dc3ba26a..000000000000 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ /dev/null @@ -1,68 +0,0 @@ -//===- X86AsmInstrumentation.h - Instrument X86 inline assembly -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H -#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H - -#include "llvm/ADT/SmallVector.h" -#include - -namespace llvm { - -class MCContext; -class MCInst; -class MCInstrInfo; -class MCParsedAsmOperand; -class MCStreamer; -class MCSubtargetInfo; -class MCTargetOptions; -class X86AsmInstrumentation; - -X86AsmInstrumentation * -CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, - const MCSubtargetInfo *&STI); - -class X86AsmInstrumentation { -public: - virtual ~X86AsmInstrumentation(); - - // Sets frame register corresponding to a current frame. - void SetInitialFrameRegister(unsigned RegNo) { - InitialFrameReg = RegNo; - } - - // Tries to instrument and emit instruction. 
-  virtual void InstrumentAndEmitInstruction(
-      const MCInst &Inst,
-      SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
-      MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out,
-      bool PrintSchedInfoEnabled);
-
-protected:
-  friend X86AsmInstrumentation *
-  CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
-                              const MCContext &Ctx,
-                              const MCSubtargetInfo *&STI);
-
-  X86AsmInstrumentation(const MCSubtargetInfo *&STI);
-
-  unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out);
-
-  void EmitInstruction(MCStreamer &Out, const MCInst &Inst,
-                       bool PrintSchedInfoEnabled = false);
-
-  const MCSubtargetInfo *&STI;
-
-  unsigned InitialFrameReg = 0;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 899b50d0f78f..95cbf46d37ed 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1,17 +1,16 @@
 //===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

-#include "InstPrinter/X86IntelInstPrinter.h"
 #include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86IntelInstPrinter.h"
 #include "MCTargetDesc/X86MCExpr.h"
 #include "MCTargetDesc/X86TargetStreamer.h"
-#include "X86AsmInstrumentation.h"
+#include "TargetInfo/X86TargetInfo.h"
 #include "X86AsmParserCommon.h"
 #include "X86Operand.h"
 #include "llvm/ADT/STLExtras.h"
@@ -71,9 +70,17 @@ static const char OpPrecedence[] = {

 class X86AsmParser : public MCTargetAsmParser {
   ParseInstructionInfo *InstInfo;
-  std::unique_ptr<X86AsmInstrumentation> Instrumentation;
   bool Code16GCC;

+  enum VEXEncoding {
+    VEXEncoding_Default,
+    VEXEncoding_VEX2,
+    VEXEncoding_VEX3,
+    VEXEncoding_EVEX,
+  };
+
+  VEXEncoding ForcedVEXEncoding = VEXEncoding_Default;
+
 private:
   SMLoc consumeToken() {
     MCAsmParser &Parser = getParser();
@@ -90,13 +97,14 @@ private:
   }

   unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst,
-                            uint64_t &ErrorInfo, bool matchingInlineAsm,
-                            unsigned VariantID = 0) {
+                            uint64_t &ErrorInfo, FeatureBitset &MissingFeatures,
+                            bool matchingInlineAsm, unsigned VariantID = 0) {
     // In Code16GCC mode, match as 32-bit.
if (Code16GCC) SwitchMode(X86::Mode32Bit); unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo, - matchingInlineAsm, VariantID); + MissingFeatures, matchingInlineAsm, + VariantID); if (Code16GCC) SwitchMode(X86::Mode16Bit); return rv; @@ -840,6 +848,8 @@ private: const SMLoc &StartLoc, SMLoc &EndLoc); + X86::CondCode ParseConditionCode(StringRef CCode); + bool ParseIntelMemoryOperandSize(unsigned &Size); std::unique_ptr CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, @@ -860,6 +870,8 @@ private: bool parseDirectiveFPOEndProc(SMLoc L); bool parseDirectiveFPOData(SMLoc L); + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); @@ -875,7 +887,7 @@ private: void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands, MCStreamer &Out, bool MatchingInlineAsm); - bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool ErrorMissingFeature(SMLoc IDLoc, const FeatureBitset &MissingFeatures, bool MatchingInlineAsm); bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -914,7 +926,7 @@ private: MCSubtargetInfo &STI = copySTI(); FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); FeatureBitset OldMode = STI.getFeatureBits() & AllModes; - uint64_t FB = ComputeAvailableFeatures( + FeatureBitset FB = ComputeAvailableFeatures( STI.ToggleFeature(OldMode.flip(mode))); setAvailableFeatures(FB); @@ -941,6 +953,9 @@ private: /// } public: + enum X86MatchResultTy { + Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY, + }; X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, const MCInstrInfo &mii, const MCTargetOptions &Options) @@ -951,14 +966,10 @@ public: // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); - Instrumentation.reset( - CreateX86AsmInstrumentation(Options, Parser.getContext(), STI)); } bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - void SetFrameRegister(unsigned RegNo) override; - bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override; bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -1115,8 +1126,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, } // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens. - if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) { - RegNo = X86::ST0; + if (RegNo == X86::ST0) { Parser.Lex(); // Eat 'st' // Check to see if we have '(4)' after %st. @@ -1194,10 +1204,6 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, return false; } -void X86AsmParser::SetFrameRegister(unsigned RegNo) { - Instrumentation->SetInitialFrameRegister(RegNo); -} - std::unique_ptr X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { bool Parse32 = is32BitMode() || Code16GCC; unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI); @@ -1656,6 +1662,8 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start) { const AsmToken &Tok = Parser.getTok(); // Eat "{" and mark the current place. 
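
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// The hunk above keeps the .code16gcc trick: the matcher temporarily flips
// the subtarget into 32-bit mode, matches, then restores 16-bit mode no
// matter what the result was. A self-contained sketch of that
// save/flip/restore shape; the types here are invented stand-ins, while the
// real code toggles FeatureBitset bits on the MCSubtargetInfo.
#include <cassert>

enum Mode { Mode16Bit, Mode32Bit };

struct SketchMatcher {
  Mode CurMode = Mode16Bit;
  bool Code16GCC = true;

  unsigned matchImpl() { return CurMode == Mode32Bit ? 0u : 1u; } // stand-in

  unsigned match() {
    if (Code16GCC)
      CurMode = Mode32Bit; // match .code16gcc input as 32-bit code
    unsigned rv = matchImpl();
    if (Code16GCC)
      CurMode = Mode16Bit; // restore the real mode afterwards
    return rv;
  }
};

int main() {
  SketchMatcher m;
  assert(m.match() == 0 && m.CurMode == Mode16Bit);
}
// ---- end sketch ----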
   const SMLoc consumedToken = consumeToken();
+  if (Tok.isNot(AsmToken::Identifier))
+    return ErrorOperand(Tok.getLoc(), "Expected an identifier after {");
   if (Tok.getIdentifier().startswith("r")){
     int rndMode = StringSwitch<int>(Tok.getIdentifier())
       .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
@@ -1999,6 +2007,29 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
   }
 }
 
+// X86::COND_INVALID if not a recognized condition code or alternate mnemonic,
+// otherwise the EFLAGS Condition Code enumerator.
+X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) {
+  return StringSwitch<X86::CondCode>(CC)
+      .Case("o", X86::COND_O)          // Overflow
+      .Case("no", X86::COND_NO)        // No Overflow
+      .Cases("b", "nae", X86::COND_B)  // Below/Neither Above nor Equal
+      .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below
+      .Cases("e", "z", X86::COND_E)    // Equal/Zero
+      .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero
+      .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above
+      .Cases("a", "nbe", X86::COND_A)  // Above/Neither Below nor Equal
+      .Case("s", X86::COND_S)          // Sign
+      .Case("ns", X86::COND_NS)        // No Sign
+      .Cases("p", "pe", X86::COND_P)   // Parity/Parity Even
+      .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd
+      .Cases("l", "nge", X86::COND_L)  // Less/Neither Greater nor Equal
+      .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less
+      .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater
+      .Cases("g", "nle", X86::COND_G)  // Greater/Neither Less nor Equal
+      .Default(X86::COND_INVALID);
+}
+
 // true on failure, false otherwise
 // If no {z} mark was found - Parser doesn't advance
 bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
@@ -2305,18 +2336,64 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                                     SMLoc NameLoc, OperandVector &Operands) {
   MCAsmParser &Parser = getParser();
   InstInfo = &Info;
+
+  // Reset the forced VEX encoding.
+  ForcedVEXEncoding = VEXEncoding_Default;
+
+  // Parse pseudo prefixes.
+  while (1) {
+    if (Name == "{") {
+      if (getLexer().isNot(AsmToken::Identifier))
+        return Error(Parser.getTok().getLoc(), "Unexpected token after '{'");
+      std::string Prefix = Parser.getTok().getString().lower();
+      Parser.Lex(); // Eat identifier.
+      if (getLexer().isNot(AsmToken::RCurly))
+        return Error(Parser.getTok().getLoc(), "Expected '}'");
+      Parser.Lex(); // Eat curly.
+
+      if (Prefix == "vex2")
+        ForcedVEXEncoding = VEXEncoding_VEX2;
+      else if (Prefix == "vex3")
+        ForcedVEXEncoding = VEXEncoding_VEX3;
+      else if (Prefix == "evex")
+        ForcedVEXEncoding = VEXEncoding_EVEX;
+      else
+        return Error(NameLoc, "unknown prefix");
+
+      NameLoc = Parser.getTok().getLoc();
+      if (getLexer().is(AsmToken::LCurly)) {
+        Parser.Lex();
+        Name = "{";
+      } else {
+        if (getLexer().isNot(AsmToken::Identifier))
+          return Error(Parser.getTok().getLoc(), "Expected identifier");
+        // FIXME: The mnemonic won't match correctly if its not in lower case.
+        Name = Parser.getTok().getString();
+        Parser.Lex();
+      }
+      continue;
+    }
+
+    break;
+  }
+
   StringRef PatchedName = Name;
-  if ((Name.equals("jmp") || Name.equals("jc") || Name.equals("jz")) &&
-      isParsingIntelSyntax() && isParsingInlineAsm()) {
+
+  // Hack to skip "short" following Jcc.
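
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// ParseConditionCode() above maps a Jcc/SETcc/CMOVcc suffix (or one of its
// alternate spellings) to an X86::CondCode. A standalone analogue using the
// standard library; the numeric values are the x86 4-bit "tttn" condition
// encodings, which the X86::COND_* enumerators follow after this patch
// series. -1 plays the role of X86::COND_INVALID.
#include <map>
#include <string>

int parseConditionCode(const std::string &cc) {
  static const std::map<std::string, int> table = {
      {"o", 0},   {"no", 1},
      {"b", 2},   {"nae", 2}, {"ae", 3},  {"nb", 3},
      {"e", 4},   {"z", 4},   {"ne", 5},  {"nz", 5},
      {"be", 6},  {"na", 6},  {"a", 7},   {"nbe", 7},
      {"s", 8},   {"ns", 9},
      {"p", 10},  {"pe", 10}, {"np", 11}, {"po", 11},
      {"l", 12},  {"nge", 12},{"ge", 13}, {"nl", 13},
      {"le", 14}, {"ng", 14}, {"g", 15},  {"nle", 15},
  };
  auto it = table.find(cc);
  return it == table.end() ? -1 : it->second; // -1 == invalid
}
// Example: parseConditionCode("nge") == parseConditionCode("l") == 12,
// i.e. "jnge" and "jl" are the same instruction.
// ---- end sketch ----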
+ if (isParsingIntelSyntax() && + (PatchedName == "jmp" || PatchedName == "jc" || PatchedName == "jnc" || + PatchedName == "jcxz" || PatchedName == "jexcz" || + (PatchedName.startswith("j") && + ParseConditionCode(PatchedName.substr(1)) != X86::COND_INVALID))) { StringRef NextTok = Parser.getTok().getString(); if (NextTok == "short") { SMLoc NameEndLoc = NameLoc.getFromPointer(NameLoc.getPointer() + Name.size()); - // Eat the short keyword + // Eat the short keyword. Parser.Lex(); - // MS ignores the short keyword, it determines the jmp type based - // on the distance of the label + // MS and GAS ignore the short keyword; they both determine the jmp type + // based on the distance of the label. (NASM does emit different code with + // and without "short," though.) InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc, NextTok.size() + 1); } @@ -2327,13 +2404,15 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, PatchedName != "setb" && PatchedName != "setnb") PatchedName = PatchedName.substr(0, Name.size()-1); + unsigned ComparisonPredicate = ~0U; + // FIXME: Hack to recognize cmp{ss,sd,ps,pd}. if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { bool IsVCMP = PatchedName[0] == 'v'; unsigned CCIdx = IsVCMP ? 4 : 3; - unsigned ComparisonCode = StringSwitch( + unsigned CC = StringSwitch( PatchedName.slice(CCIdx, PatchedName.size() - 2)) .Case("eq", 0x00) .Case("eq_oq", 0x00) @@ -2383,26 +2462,29 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, .Case("gt_oq", 0x1E) .Case("true_us", 0x1F) .Default(~0U); - if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) { - - Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx), - NameLoc)); - - const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, - getParser().getContext()); - Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + if (CC != ~0U && (IsVCMP || CC < 8)) { + if (PatchedName.endswith("ss")) + PatchedName = IsVCMP ? "vcmpss" : "cmpss"; + else if (PatchedName.endswith("sd")) + PatchedName = IsVCMP ? "vcmpsd" : "cmpsd"; + else if (PatchedName.endswith("ps")) + PatchedName = IsVCMP ? "vcmpps" : "cmpps"; + else if (PatchedName.endswith("pd")) + PatchedName = IsVCMP ? "vcmppd" : "cmppd"; + else + llvm_unreachable("Unexpected suffix!"); - PatchedName = PatchedName.substr(PatchedName.size() - 2); + ComparisonPredicate = CC; } } // FIXME: Hack to recognize vpcmp{ub,uw,ud,uq,b,w,d,q}. if (PatchedName.startswith("vpcmp") && - (PatchedName.endswith("b") || PatchedName.endswith("w") || - PatchedName.endswith("d") || PatchedName.endswith("q"))) { - unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; - unsigned ComparisonCode = StringSwitch( - PatchedName.slice(5, PatchedName.size() - CCIdx)) + (PatchedName.back() == 'b' || PatchedName.back() == 'w' || + PatchedName.back() == 'd' || PatchedName.back() == 'q')) { + unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned CC = StringSwitch( + PatchedName.slice(5, PatchedName.size() - SuffixSize)) .Case("eq", 0x0) // Only allowed on unsigned. Checked below. .Case("lt", 0x1) .Case("le", 0x2) @@ -2412,24 +2494,26 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, .Case("nle", 0x6) //.Case("true", 0x7) // Not a documented alias. 
.Default(~0U); - if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) { - Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc)); - - const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, - getParser().getContext()); - Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); - - PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + if (CC != ~0U && (CC != 0 || SuffixSize == 2)) { + switch (PatchedName.back()) { + default: llvm_unreachable("Unexpected character!"); + case 'b': PatchedName = SuffixSize == 2 ? "vpcmpub" : "vpcmpb"; break; + case 'w': PatchedName = SuffixSize == 2 ? "vpcmpuw" : "vpcmpw"; break; + case 'd': PatchedName = SuffixSize == 2 ? "vpcmpud" : "vpcmpd"; break; + case 'q': PatchedName = SuffixSize == 2 ? "vpcmpuq" : "vpcmpq"; break; + } + // Set up the immediate to push into the operands later. + ComparisonPredicate = CC; } } // FIXME: Hack to recognize vpcom{ub,uw,ud,uq,b,w,d,q}. if (PatchedName.startswith("vpcom") && - (PatchedName.endswith("b") || PatchedName.endswith("w") || - PatchedName.endswith("d") || PatchedName.endswith("q"))) { - unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; - unsigned ComparisonCode = StringSwitch( - PatchedName.slice(5, PatchedName.size() - CCIdx)) + (PatchedName.back() == 'b' || PatchedName.back() == 'w' || + PatchedName.back() == 'd' || PatchedName.back() == 'q')) { + unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned CC = StringSwitch( + PatchedName.slice(5, PatchedName.size() - SuffixSize)) .Case("lt", 0x0) .Case("le", 0x1) .Case("gt", 0x2) @@ -2439,14 +2523,16 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, .Case("false", 0x6) .Case("true", 0x7) .Default(~0U); - if (ComparisonCode != ~0U) { - Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc)); - - const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, - getParser().getContext()); - Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); - - PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + if (CC != ~0U) { + switch (PatchedName.back()) { + default: llvm_unreachable("Unexpected character!"); + case 'b': PatchedName = SuffixSize == 2 ? "vpcomub" : "vpcomb"; break; + case 'w': PatchedName = SuffixSize == 2 ? "vpcomuw" : "vpcomw"; break; + case 'd': PatchedName = SuffixSize == 2 ? "vpcomud" : "vpcomd"; break; + case 'q': PatchedName = SuffixSize == 2 ? "vpcomuq" : "vpcomq"; break; + } + // Set up the immediate to push into the operands later. + ComparisonPredicate = CC; } } @@ -2489,6 +2575,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Flags = X86::IP_NO_PREFIX; break; } + // FIXME: The mnemonic won't match correctly if its not in lower case. Name = Parser.getTok().getString(); Parser.Lex(); // eat the prefix // Hack: we could have something like "rep # some comment" or @@ -2496,6 +2583,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, while (Name.startswith(";") || Name.startswith("\n") || Name.startswith("#") || Name.startswith("\t") || Name.startswith("/")) { + // FIXME: The mnemonic won't match correctly if its not in lower case. Name = Parser.getTok().getString(); Parser.Lex(); // go to next prefix or instr } @@ -2519,6 +2607,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); + // Push the immediate if we extracted one from the mnemonic. 
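
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// The rewritten cmp/vpcmp/vpcom handling above no longer pushes the
// predicate immediate while slicing the mnemonic; it canonicalizes the name
// (e.g. "vcmpltps" -> "vcmpps") and remembers the predicate, which is then
// pushed right after the mnemonic token in AT&T syntax but after the parsed
// operands in Intel syntax (the two hunks that follow). A sketch of the
// decomposition, with only the first eight predicates spelled out:
#include <map>
#include <string>

struct SplitCmp { std::string Mnemonic; int Predicate; };

bool splitCmpMnemonic(const std::string &name, SplitCmp &out) {
  static const std::map<std::string, int> pred = {
      {"eq", 0x00},  {"lt", 0x01},  {"le", 0x02},  {"unord", 0x03},
      {"neq", 0x04}, {"nlt", 0x05}, {"nle", 0x06}, {"ord", 0x07},
      // AVX-only predicates continue up to "true_us" = 0x1F (see table above).
  };
  bool isV = !name.empty() && name[0] == 'v';
  size_t ccBegin = isV ? 4 : 3; // skip "vcmp" / "cmp"
  if (name.size() < ccBegin + 2)
    return false;
  std::string suffix = name.substr(name.size() - 2);       // ss/sd/ps/pd
  std::string cc = name.substr(ccBegin, name.size() - 2 - ccBegin);
  auto it = pred.find(cc);
  if (it == pred.end() || (!isV && it->second >= 8))
    return false; // non-VEX cmp* only has eight predicates
  out.Mnemonic = (isV ? std::string("vcmp") : std::string("cmp")) + suffix;
  out.Predicate = it->second;
  return true;
}
// splitCmpMnemonic("vcmpltps", s) yields {"vcmpps", 0x01}; the parser then
// emits "vcmpps $1, ..." (AT&T) or "vcmpps ..., 1" (Intel).
// ---- end sketch ----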
+ if (ComparisonPredicate != ~0U && !isParsingIntelSyntax()) { + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + } + // This does the actual operand parsing. Don't parse any more if we have a // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we // just want to parse the "lock" as the first instruction and the "incl" as @@ -2553,6 +2648,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return TokError("unexpected token in argument list"); } + // Push the immediate if we extracted one from the mnemonic. + if (ComparisonPredicate != ~0U && isParsingIntelSyntax()) { + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + } + // Consume the EndOfStatement or the prefix separator Slash if (getLexer().is(AsmToken::EndOfStatement) || (isPrefix && getLexer().is(AsmToken::Slash))) @@ -2576,13 +2678,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, static_cast(*Operands[0]).setTokenValue(Repl); } - // Moving a 32 or 16 bit value into a segment register has the same - // behavior. Modify such instructions to always take shorter form. if ((Name == "mov" || Name == "movw" || Name == "movl") && (Operands.size() == 3)) { X86Operand &Op1 = (X86Operand &)*Operands[1]; X86Operand &Op2 = (X86Operand &)*Operands[2]; SMLoc Loc = Op1.getEndLoc(); + // Moving a 32 or 16 bit value into a segment register has the same + // behavior. Modify such instructions to always take shorter form. if (Op1.isReg() && Op2.isReg() && X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains( Op2.getReg()) && @@ -2759,7 +2861,69 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { - return false; + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + + switch (Inst.getOpcode()) { + default: return false; + case X86::VMOVZPQILo2PQIrr: + case X86::VMOVAPDrr: + case X86::VMOVAPDYrr: + case X86::VMOVAPSrr: + case X86::VMOVAPSYrr: + case X86::VMOVDQArr: + case X86::VMOVDQAYrr: + case X86::VMOVDQUrr: + case X86::VMOVDQUYrr: + case X86::VMOVUPDrr: + case X86::VMOVUPDYrr: + case X86::VMOVUPSrr: + case X86::VMOVUPSYrr: { + // We can get a smaller encoding by using VEX.R instead of VEX.B if one of + // the registers is extended, but other isn't. 
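
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// The comment above is the key to the new processInstruction() cases: the
// two-byte VEX prefix can only extend the modrm.reg field (VEX.R), not
// modrm.rm (VEX.B). For a plain reg-reg move whose only extended register
// (encoding >= 8, i.e. xmm8..xmm15) sits in rm, switching to the reversed
// "_REV" opcode moves it into reg, so the short prefix suffices. A sketch of
// the predicate; register numbers model xmm0..xmm15:
bool shouldUseRevForm(unsigned DestEnc, unsigned SrcEnc, bool ForcedVex3) {
  if (ForcedVex3)
    return false;       // user explicitly asked for the 3-byte form
  return DestEnc < 8 && // destination fits without VEX.B
         SrcEnc >= 8;   // source needs extension -> encode it via VEX.R
}
// Example: "vmovaps %xmm8, %xmm1" matches VMOVAPSrr, which puts xmm8 in
// modrm.rm and therefore needs VEX.B (3-byte VEX); VMOVAPSrr_REV encodes
// xmm8 in modrm.reg via VEX.R and assembles with the 2-byte prefix.
// ---- end sketch ----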
+ if (ForcedVEXEncoding == VEXEncoding_VEX3 || + MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 || + MRI->getEncodingValue(Inst.getOperand(1).getReg()) < 8) + return false; + + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + } + Inst.setOpcode(NewOpc); + return true; + } + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + // We can get a smaller encoding by using VEX.R instead of VEX.B if one of + // the registers is extended, but other isn't. + if (ForcedVEXEncoding == VEXEncoding_VEX3 || + MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 || + MRI->getEncodingValue(Inst.getOperand(2).getReg()) < 8) + return false; + + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break; + case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break; + } + Inst.setOpcode(NewOpc); + return true; + } + } } bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { @@ -2865,9 +3029,7 @@ static const char *getSubtargetFeatureName(uint64_t Val); void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out) { - Instrumentation->InstrumentAndEmitInstruction( - Inst, Operands, getContext(), MII, Out, - getParser().shouldPrintSchedInfo()); + Out.EmitInstruction(Inst, getSTI()); } bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -2907,17 +3069,16 @@ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, } } -bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, +bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, + const FeatureBitset &MissingFeatures, bool MatchingInlineAsm) { - assert(ErrorInfo && "Unknown missing feature!"); + assert(MissingFeatures.any() && "Unknown missing feature!"); SmallString<126> Msg; raw_svector_ostream OS(Msg); OS << "instruction requires:"; - uint64_t Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { - if (ErrorInfo & Mask) - OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask); - Mask <<= 1; + for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) { + if (MissingFeatures[i]) + OS << ' ' << getSubtargetFeatureName(i); } return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm); } @@ -2932,30 +3093,70 @@ static unsigned getPrefixes(OperandVector &Operands) { return Result; } +unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) { + unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &MCID = MII.get(Opc); + + if (ForcedVEXEncoding == VEXEncoding_EVEX && + (MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX) + return Match_Unsupported; + + if ((ForcedVEXEncoding == VEXEncoding_VEX2 || + ForcedVEXEncoding == VEXEncoding_VEX3) 
&& + (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX) + return Match_Unsupported; + + // These instructions match ambiguously with their VEX encoded counterparts + // and appear first in the matching table. Reject them unless we're forcing + // EVEX encoding. + // FIXME: We really need a way to break the ambiguity. + switch (Opc) { + case X86::VCVTSD2SIZrm_Int: + case X86::VCVTSD2SI64Zrm_Int: + case X86::VCVTSS2SIZrm_Int: + case X86::VCVTSS2SI64Zrm_Int: + case X86::VCVTTSD2SIZrm: case X86::VCVTTSD2SIZrm_Int: + case X86::VCVTTSD2SI64Zrm: case X86::VCVTTSD2SI64Zrm_Int: + case X86::VCVTTSS2SIZrm: case X86::VCVTTSS2SIZrm_Int: + case X86::VCVTTSS2SI64Zrm: case X86::VCVTTSS2SI64Zrm_Int: + if (ForcedVEXEncoding != VEXEncoding_EVEX) + return Match_Unsupported; + } + + return Match_Success; +} + bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand &Op = static_cast(*Operands[0]); - assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!"); SMRange EmptyRange = None; // First, handle aliases that expand to multiple instructions. - MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); - - bool WasOriginallyInvalidOperand = false; + MatchFPUWaitAlias(IDLoc, static_cast(*Operands[0]), Operands, + Out, MatchingInlineAsm); + X86Operand &Op = static_cast(*Operands[0]); unsigned Prefixes = getPrefixes(Operands); MCInst Inst; + // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the + // encoder. + if (ForcedVEXEncoding == VEXEncoding_VEX3) + Prefixes |= X86::IP_USE_VEX3; + if (Prefixes) Inst.setFlags(Prefixes); // First, try a direct match. - switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm, - isParsingIntelSyntax())) { + FeatureBitset MissingFeatures; + unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo, + MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax()); + switch (OriginalError) { default: llvm_unreachable("Unexpected match result!"); case Match_Success: if (!MatchingInlineAsm && validateInstruction(Inst, Operands)) @@ -2973,13 +3174,17 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, Opcode = Inst.getOpcode(); return false; case Match_MissingFeature: - return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm); + return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm); case Match_InvalidOperand: - WasOriginallyInvalidOperand = true; - break; case Match_MnemonicFail: + case Match_Unsupported: break; } + if (Op.getToken().empty()) { + Error(IDLoc, "instruction must have size higher than 0", EmptyRange, + MatchingInlineAsm); + return true; + } // FIXME: Ideally, we would only attempt suffix matches for things which are // valid prefixes, and we could just infer the right unambiguous @@ -3003,16 +3208,17 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // Check for the various suffix matches. uint64_t ErrorInfoIgnore; - uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings. + FeatureBitset ErrorInfoMissingFeatures; // Init suppresses compiler warnings. 
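
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// The suffix retry that follows: when the bare AT&T mnemonic fails to match,
// the parser appends each of the b/w/l/q size suffixes in turn (mirroring
// "Tmp.back() = Suffixes[I]" below) and aggregates the four results.
// matchOnce() is an invented stand-in for MatchInstruction():
#include <array>
#include <string>

enum MatchResult { Success, MnemonicFail, MissingFeature, InvalidOperand };

MatchResult matchOnce(const std::string &) { return MnemonicFail; } // stand-in

std::array<MatchResult, 4> retryWithSuffixes(std::string Mnemonic) {
  const char Suffixes[4] = {'b', 'w', 'l', 'q'};
  std::array<MatchResult, 4> Match;
  Mnemonic += ' ';                 // reserve the suffix slot
  for (unsigned I = 0; I != 4; ++I) {
    Mnemonic.back() = Suffixes[I]; // "add" -> "addb", "addw", "addl", "addq"
    Match[I] = matchOnce(Mnemonic);
  }
  return Match;
}
// If exactly one suffixed form matches, that instruction is emitted; if all
// four fail with MnemonicFail, the original mnemonic was simply invalid.
// ---- end sketch ----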
unsigned Match[4]; for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) { Tmp.back() = Suffixes[I]; Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); + MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax()); // If this returned as a missing feature failure, remember that. if (Match[I] == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; + ErrorInfoMissingFeatures = MissingFeatures; } // Restore the old token. @@ -3062,11 +3268,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // If all of the instructions reported an invalid mnemonic, then the original // mnemonic was invalid. if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) { - if (!WasOriginallyInvalidOperand) { + if (OriginalError == Match_MnemonicFail) return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", Op.getLocRange(), MatchingInlineAsm); - } + if (OriginalError == Match_Unsupported) + return Error(IDLoc, "unsupported instruction", EmptyRange, + MatchingInlineAsm); + + assert(OriginalError == Match_InvalidOperand && "Unexpected error"); // Recover location info for the operand if we know which was the problem. if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) @@ -3085,12 +3295,19 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, MatchingInlineAsm); } + // If one instruction matched as unsupported, report this as unsupported. + if (std::count(std::begin(Match), std::end(Match), + Match_Unsupported) == 1) { + return Error(IDLoc, "unsupported instruction", EmptyRange, + MatchingInlineAsm); + } + // If one instruction matched with a missing feature, report this as a // missing feature. if (std::count(std::begin(Match), std::end(Match), Match_MissingFeature) == 1) { - ErrorInfo = ErrorInfoMissingFeature; - return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + ErrorInfo = Match_MissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures, MatchingInlineAsm); } @@ -3114,18 +3331,23 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, uint64_t &ErrorInfo, bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand &Op = static_cast(*Operands[0]); - assert(Op.isToken() && "Leading operand should always be a mnemonic!"); - StringRef Mnemonic = Op.getToken(); + assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!"); + StringRef Mnemonic = (static_cast(*Operands[0])).getToken(); SMRange EmptyRange = None; - StringRef Base = Op.getToken(); + StringRef Base = (static_cast(*Operands[0])).getToken(); unsigned Prefixes = getPrefixes(Operands); // First, handle aliases that expand to multiple instructions. - MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); + MatchFPUWaitAlias(IDLoc, static_cast(*Operands[0]), Operands, Out, MatchingInlineAsm); + X86Operand &Op = static_cast(*Operands[0]); MCInst Inst; + // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the + // encoder. 
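
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// How the forced {vex2}/{vex3}/{evex} pseudo prefix is enforced end to end:
// checkTargetMatchPredicate() (above) rejects candidate opcodes in the wrong
// encoding, and for {vex3} the matcher additionally passes X86::IP_USE_VEX3
// to the encoder as an instruction flag (the hunk right below). Enum values
// and the flag bit here are invented stand-ins for the LLVM ones:
enum Encoding { EncLegacy, EncVEX, EncEVEX };
enum Forced { ForcedNone, ForcedVEX2, ForcedVEX3, ForcedEVEX };

bool isUnsupported(Encoding InstEnc, Forced F) {
  if (F == ForcedEVEX && InstEnc != EncEVEX)
    return true; // "{evex} op" but the candidate is not EVEX-encoded
  if ((F == ForcedVEX2 || F == ForcedVEX3) && InstEnc != EncVEX)
    return true; // "{vex2}/{vex3} op" but the candidate is not VEX-encoded
  return false;
}

unsigned encoderFlags(Forced F, unsigned Prefixes) {
  const unsigned UseVex3Bit = 1u << 7; // invented bit, for illustration only
  return F == ForcedVEX3 ? (Prefixes | UseVex3Bit) : Prefixes;
}
// Example: "{evex} vaddps %xmm0, %xmm1, %xmm2" selects the EVEX form even
// though the VEX form would be shorter.
// ---- end sketch ----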
+ if (ForcedVEXEncoding == VEXEncoding_VEX3) + Prefixes |= X86::IP_USE_VEX3; + if (Prefixes) Inst.setFlags(Prefixes); @@ -3154,7 +3376,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, } SmallVector Match; - uint64_t ErrorInfoMissingFeature = 0; + FeatureBitset ErrorInfoMissingFeatures; + FeatureBitset MissingFeatures; // If unsized push has immediate operand we should default the default pointer // size for the size. @@ -3174,7 +3397,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, Op.setTokenValue(Tmp); // Do match in ATT mode to allow explicit suffix usage. Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo, - MatchingInlineAsm, + MissingFeatures, MatchingInlineAsm, false /*isParsingIntelSyntax()*/)); Op.setTokenValue(Base); } @@ -3191,13 +3414,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, uint64_t ErrorInfoIgnore; unsigned LastOpcode = Inst.getOpcode(); unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); + MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax()); if (Match.empty() || LastOpcode != Inst.getOpcode()) Match.push_back(M); // If this returned as a missing feature failure, remember that. if (Match.back() == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; + ErrorInfoMissingFeatures = MissingFeatures; } // Restore the size of the unsized memory operand if we modified it. @@ -3209,10 +3433,11 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // matching with the unsized operand. if (Match.empty()) { Match.push_back(MatchInstruction( - Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax())); + Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax())); // If this returned as a missing feature failure, remember that. if (Match.back() == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfo; + ErrorInfoMissingFeatures = MissingFeatures; } // Restore the size of the unsized memory operand if we modified it. @@ -3234,7 +3459,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, UnsizedMemOp->getMemFrontendSize()) { UnsizedMemOp->Mem.Size = UnsizedMemOp->getMemFrontendSize(); unsigned M = MatchInstruction( - Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax()); + Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax()); if (M == Match_Success) NumSuccessfulMatches = 1; @@ -3270,12 +3496,19 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, UnsizedMemOp->getLocRange()); } + // If one instruction matched as unsupported, report this as unsupported. + if (std::count(std::begin(Match), std::end(Match), + Match_Unsupported) == 1) { + return Error(IDLoc, "unsupported instruction", EmptyRange, + MatchingInlineAsm); + } + // If one instruction matched with a missing feature, report this as a // missing feature. 
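
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// The missing-feature reporting rework in this patch: instead of decoding a
// packed uint64_t mask, ErrorMissingFeature() now walks a FeatureBitset and
// names each set bit. A standalone analogue; feature names are invented
// here, whereas LLVM looks them up in a tblgen-generated table:
#include <bitset>
#include <sstream>
#include <string>
#include <vector>

std::string missingFeatureMessage(const std::bitset<64> &Missing,
                                  const std::vector<std::string> &Names) {
  std::ostringstream OS;
  OS << "instruction requires:"; // same message prefix as the patch
  for (std::size_t i = 0; i < Missing.size() && i < Names.size(); ++i)
    if (Missing[i])
      OS << ' ' << Names[i];
  return OS.str();
}
// The bitset form removes the old 64-feature ceiling that the shifted-mask
// loop imposed.
// ---- end sketch ----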
if (std::count(std::begin(Match), std::end(Match), Match_MissingFeature) == 1) { - ErrorInfo = ErrorInfoMissingFeature; - return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + ErrorInfo = Match_MissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures, MatchingInlineAsm); } diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h index c45a3f14ef11..5bc979d1f18c 100644 --- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -1,9 +1,8 @@ //===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index 4d4aae0a1c6a..a771ba366318 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -1,16 +1,15 @@ //===- X86Operand.h - Parsed X86 machine instruction ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H #define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H -#include "InstPrinter/X86IntelInstPrinter.h" +#include "MCTargetDesc/X86IntelInstPrinter.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86AsmParserCommon.h" #include "llvm/ADT/STLExtras.h" @@ -452,6 +451,31 @@ struct X86Operand final : public MCParsedAsmOperand { X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg())); } + bool isVK1Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK1RegClassID].contains(getReg()); + } + + bool isVK2Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK2RegClassID].contains(getReg()); + } + + bool isVK4Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK4RegClassID].contains(getReg()); + } + + bool isVK8Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK8RegClassID].contains(getReg()); + } + + bool isVK16Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK16RegClassID].contains(getReg()); + } + void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. 
if (const MCConstantExpr *CE = dyn_cast(Expr)) @@ -483,6 +507,30 @@ struct X86Operand final : public MCParsedAsmOperand { addExpr(Inst, getImm()); } + void addMaskPairOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned Reg = getReg(); + switch (Reg) { + case X86::K0: + case X86::K1: + Reg = X86::K0_K1; + break; + case X86::K2: + case X86::K3: + Reg = X86::K2_K3; + break; + case X86::K4: + case X86::K5: + Reg = X86::K4_K5; + break; + case X86::K6: + case X86::K7: + Reg = X86::K6_K7; + break; + } + Inst.addOperand(MCOperand::createReg(Reg)); + } + void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getMemBaseReg())); diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 62312777318e..9a635bbe5f85 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -1,9 +1,8 @@ //===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -76,6 +75,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" +#include "TargetInfo/X86TargetInfo.h" #include "X86DisassemblerDecoder.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" @@ -446,211 +446,6 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case ENCODING_IO: break; } - } else if (type == TYPE_IMM3) { - // Check for immediates that printSSECC can't handle. 
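
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// Why the large deleted tables below existed: pseudo mnemonics such as
// cmpeqps or vpcmpltd only cover part of the immediate space, so when the
// disassembler saw an out-of-range immediate it had to switch to an "_alt"
// opcode that prints the raw immediate (e.g. "cmpps $9, ...") instead of a
// predicate name. The thresholds restate the checks visible in the deleted
// code; the enum is an invented stand-in for the operand type tags:
enum ImmKind { SSECmp /*TYPE_IMM3*/, AVXCmp /*TYPE_IMM5*/, AVX512ICC };

bool needsAltOpcode(ImmKind K, unsigned Imm) {
  switch (K) {
  case SSECmp:
    return Imm >= 8; // printSSECC only knows 8 predicate names
  case AVXCmp:
    return Imm >= 32; // printAVXCC only knows 32 predicate names
  case AVX512ICC:
    return Imm >= 8 || (Imm & 0x3) == 3; // values with no printed alias
  }
  return true;
}
// After this patch the condition code becomes a real operand (ENCODING_CC
// below), so the alt-opcode fallback machinery can be deleted wholesale.
// ---- end sketch ----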
- if (immediate >= 8) { - unsigned NewOpc; - switch (mcInst.getOpcode()) { - default: llvm_unreachable("unexpected opcode"); - case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break; - case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break; - case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break; - case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break; - case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break; - case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break; - case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break; - case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break; - case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break; - case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break; - case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break; - case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break; - case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break; - case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break; - case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break; - case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break; - case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break; - case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break; - case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break; - case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break; - case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break; - case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break; - case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break; - case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break; - } - // Switch opcode to the one that doesn't get special printing. - mcInst.setOpcode(NewOpc); - } - } else if (type == TYPE_IMM5) { - // Check for immediates that printAVXCC can't handle. - if (immediate >= 32) { - unsigned NewOpc; - switch (mcInst.getOpcode()) { - default: llvm_unreachable("unexpected opcode"); - case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; - case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; - case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; - case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; - case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; - case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; - case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; - case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; - case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; - case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; - case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; - case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; - case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; - case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; - case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break; - case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; - case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; - case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break; - case X86::VCMPPDZ128rmi: NewOpc = X86::VCMPPDZ128rmi_alt; break; - case X86::VCMPPDZ128rri: NewOpc = X86::VCMPPDZ128rri_alt; break; - case X86::VCMPPSZ128rmi: NewOpc = X86::VCMPPSZ128rmi_alt; break; - case X86::VCMPPSZ128rri: NewOpc = X86::VCMPPSZ128rri_alt; break; - case X86::VCMPPDZ256rmi: NewOpc = X86::VCMPPDZ256rmi_alt; break; - case X86::VCMPPDZ256rri: NewOpc = X86::VCMPPDZ256rri_alt; break; - case X86::VCMPPSZ256rmi: NewOpc = X86::VCMPPSZ256rmi_alt; break; - case X86::VCMPPSZ256rri: NewOpc = X86::VCMPPSZ256rri_alt; break; - case X86::VCMPSDZrm_Int: NewOpc = X86::VCMPSDZrmi_alt; break; - case X86::VCMPSDZrr_Int: NewOpc = X86::VCMPSDZrri_alt; 
break; - case X86::VCMPSDZrrb_Int: NewOpc = X86::VCMPSDZrrb_alt; break; - case X86::VCMPSSZrm_Int: NewOpc = X86::VCMPSSZrmi_alt; break; - case X86::VCMPSSZrr_Int: NewOpc = X86::VCMPSSZrri_alt; break; - case X86::VCMPSSZrrb_Int: NewOpc = X86::VCMPSSZrrb_alt; break; - } - // Switch opcode to the one that doesn't get special printing. - mcInst.setOpcode(NewOpc); - } - } else if (type == TYPE_AVX512ICC) { - if (immediate >= 8 || ((immediate & 0x3) == 3)) { - unsigned NewOpc; - switch (mcInst.getOpcode()) { - default: llvm_unreachable("unexpected opcode"); - case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break; - case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break; - case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break; - case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break; - case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break; - case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break; - case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break; - case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break; - case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break; - case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break; - case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break; - case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break; - case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break; - case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break; - case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break; - case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break; - case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break; - case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break; - case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break; - case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break; - case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break; - case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break; - case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break; - case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break; - case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break; - case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break; - case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break; - case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break; - case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break; - case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break; - case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break; - case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break; - case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break; - case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break; - case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break; - case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break; - case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break; - case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break; - case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break; - case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break; - case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break; - case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break; - case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break; - case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break; - case X86::VPCMPQZrmibk: NewOpc = 
X86::VPCMPQZrmibk_alt; break; - case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break; - case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break; - case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break; - case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break; - case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; break; - case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break; - case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break; - case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break; - case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break; - case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break; - case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break; - case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break; - case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break; - case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break; - case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break; - case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break; - case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break; - case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break; - case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break; - case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break; - case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break; - case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break; - case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break; - case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break; - case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break; - case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break; - case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break; - case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break; - case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break; - case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break; - case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break; - case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break; - case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break; - case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break; - case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break; - case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break; - case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break; - case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break; - case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break; - case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break; - case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break; - case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break; - case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break; - case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break; - case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break; - case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break; - case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break; - case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break; - case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break; - case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break; - case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break; - case X86::VPCMPUWZ128rmi: NewOpc = 
X86::VPCMPUWZ128rmi_alt; break; - case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break; - case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break; - case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break; - case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break; - case X86::VPCMPUWZ256rmik: NewOpc = X86::VPCMPUWZ256rmik_alt; break; - case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break; - case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break; - case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break; - case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break; - case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break; - case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break; - case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break; - case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break; - case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break; - case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break; - case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break; - case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break; - case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break; - case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break; - case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break; - case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break; - case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break; - case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break; - } - // Switch opcode to the one that doesn't get special printing. - mcInst.setOpcode(NewOpc); - } } switch (type) { @@ -899,6 +694,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM: case TYPE_YMM: case TYPE_ZMM: + case TYPE_VK_PAIR: case TYPE_VK: case TYPE_DEBUGREG: case TYPE_CONTROLREG: @@ -987,6 +783,9 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, case ENCODING_Rv: translateRegister(mcInst, insn.opcodeRegister); return false; + case ENCODING_CC: + mcInst.addOperand(MCOperand::createImm(insn.immediates[1])); + return false; case ENCODING_FP: translateFPRegister(mcInst, insn.modRM & 7); return false; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 54d550b60652..a241362a271d 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -1,9 +1,8 @@ //===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -377,8 +376,7 @@ static int readPrefixes(struct InternalInstruction* insn) { if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 || nextByte == 0xc6 || nextByte == 0xc7)) { insn->xAcquireRelease = true; - if (nextByte != 0x90) // PAUSE instruction support - break; + break; } if (isREX(insn, nextByte)) { uint8_t nnextByte; @@ -884,7 +882,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])) attrMask |= ATTR_EVEXK; if (lFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXL; + attrMask |= ATTR_VEXL; if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) attrMask |= ATTR_EVEXL2; } else if (insn->vectorExtensionType == TYPE_VEX_3B) { @@ -1470,6 +1468,10 @@ static int readModRM(struct InternalInstruction* insn) { if (index > 7) \ *valid = 0; \ return prefix##_K0 + index; \ + case TYPE_VK_PAIR: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_K0_K1 + (index / 2); \ case TYPE_MM64: \ return prefix##_MM0 + (index & 0x7); \ case TYPE_SEGMENTREG: \ @@ -1847,6 +1849,9 @@ static int readOperands(struct InternalInstruction* insn) { if (readOpcodeRegister(insn, 0)) return -1; break; + case ENCODING_CC: + insn->immediates[1] = insn->opcode & 0xf; + break; case ENCODING_FP: break; case ENCODING_VVVV: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 3b8a4f732eed..7c0a42c019e3 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -1,9 +1,8 @@ //===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -325,6 +324,12 @@ namespace X86Disassembler { ENTRY(K6) \ ENTRY(K7) +#define REGS_MASK_PAIRS \ + ENTRY(K0_K1) \ + ENTRY(K2_K3) \ + ENTRY(K4_K5) \ + ENTRY(K6_K7) + #define REGS_SEGMENT \ ENTRY(ES) \ ENTRY(CS) \ @@ -394,6 +399,7 @@ namespace X86Disassembler { REGS_YMM \ REGS_ZMM \ REGS_MASKS \ + REGS_MASK_PAIRS \ REGS_SEGMENT \ REGS_DEBUG \ REGS_CONTROL \ diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp deleted file mode 100644 index 0e861d5ddbc9..000000000000 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ /dev/null @@ -1,202 +0,0 @@ -//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file includes code for rendering MCInst instances as AT&T-style -// assembly. 
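
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// The new ENCODING_CC above derives the condition operand straight from the
// opcode: "insn->immediates[1] = insn->opcode & 0xf". Condition-coded opcode
// families embed the 4-bit condition in the low nibble, e.g. 0F 80..8F are
// the long-form Jcc instructions. The opcode bytes below are real x86
// encodings:
#include <cassert>
#include <cstdint>

unsigned conditionFromOpcode(uint8_t Opcode) {
  return Opcode & 0xF; // 0 = o, 1 = no, 2 = b, ..., 0xF = g
}

int main() {
  assert(conditionFromOpcode(0x84) == 4);  // 0F 84 = je (equal/zero)
  assert(conditionFromOpcode(0x8F) == 15); // 0F 8F = jg
}
// ---- end sketch ----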
-// -//===----------------------------------------------------------------------===// - -#include "X86ATTInstPrinter.h" -#include "MCTargetDesc/X86BaseInfo.h" -#include "X86InstComments.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include - -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -// Include the auto-generated portion of the assembly writer. -#define PRINT_ALIAS_INSTR -#include "X86GenAsmWriter.inc" - -void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << markup(""); -} - -void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, const MCSubtargetInfo &STI) { - // If verbose assembly is enabled, we can print some informative comments. - if (CommentStream) - HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII); - - printInstFlags(MI, OS); - - // Output CALLpcrel32 as "callq" in 64-bit mode. - // In Intel annotation it's always emitted as "call". - // - // TODO: Probably this hack should be redesigned via InstAlias in - // InstrInfo.td as soon as Requires clause is supported properly - // for InstAlias. - if (MI->getOpcode() == X86::CALLpcrel32 && - (STI.getFeatureBits()[X86::Mode64Bit])) { - OS << "\tcallq\t"; - printPCRelImm(MI, 0, OS); - } - // data16 and data32 both have the same encoding of 0x66. While data32 is - // valid only in 16 bit systems, data16 is valid in the rest. - // There seems to be some lack of support of the Requires clause that causes - // 0x66 to be interpreted as "data16" by the asm printer. - // Thus we add an adjustment here in order to print the "right" instruction. - else if (MI->getOpcode() == X86::DATA16_PREFIX && - STI.getFeatureBits()[X86::Mode16Bit]) { - OS << "\tdata32"; - } - // Try to print any aliases first. - else if (!printAliasInstr(MI, OS)) - printInstruction(MI, OS); - - // Next always print the annotation. - printAnnotation(OS, Annot); -} - -void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - printRegName(O, Op.getReg()); - } else if (Op.isImm()) { - // Print immediates as signed values. - int64_t Imm = Op.getImm(); - O << markup(""); - - // TODO: This should be in a helper function in the base class, so it can - // be used by other printers. - - // If there are no instruction-specific comments, add a comment clarifying - // the hex value of the immediate operand when it isn't in the range - // [-256,255]. - if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) { - // Don't print unnecessary hex sign bits. 
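
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// The markup("") calls in the deleted printer lost their string literals in
// extraction; upstream they appear to wrap each operand in a tag such as
// "<reg:" ... ">" or "<imm:" ... ">" so that tools consuming the assembly
// can identify operand kinds. A best-effort standalone sketch of the idea
// (tag spellings are a reconstruction, not quoted from the patch):
#include <iostream>
#include <string>

std::string markup(const std::string &Tag, const std::string &Body,
                   bool Enabled) {
  return Enabled ? Tag + Body + ">" : Body; // e.g. "<reg:%rax>"
}

int main() {
  std::cout << markup("<reg:", "%rax", true) << ' '
            << markup("<imm:", "$42", true) << '\n';
}
// With markup disabled the helper degenerates to plain operand text, which
// is why the deleted code funnels every operand through it unconditionally.
// ---- end sketch ----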
- if (Imm == (int16_t)(Imm)) - *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm); - else if (Imm == (int32_t)(Imm)) - *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm); - else - *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm); - } - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << markup("print(O, &MAI); - O << markup(">"); - } -} - -void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, - raw_ostream &O) { - const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg); - const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg); - const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp); - - O << markup("print(O, &MAI); - } - - if (IndexReg.getReg() || BaseReg.getReg()) { - O << '('; - if (BaseReg.getReg()) - printOperand(MI, Op + X86::AddrBaseReg, O); - - if (IndexReg.getReg()) { - O << ','; - printOperand(MI, Op + X86::AddrIndexReg, O); - unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm(); - if (ScaleVal != 1) { - O << ',' << markup(""); - } - } - O << ')'; - } - - O << markup(">"); -} - -void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, - raw_ostream &O) { - O << markup(""); -} - -void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, - raw_ostream &O) { - O << markup(""); -} - -void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, - raw_ostream &O) { - const MCOperand &DispSpec = MI->getOperand(Op); - - O << markup("print(O, &MAI); - } - - O << markup(">"); -} - -void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, - raw_ostream &O) { - if (MI->getOperand(Op).isExpr()) - return printOperand(MI, Op, O); - - O << markup("getOperand(Op).getImm() & 0xff) - << markup(">"); -} diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h deleted file mode 100644 index 57422bc9a0b2..000000000000 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ /dev/null @@ -1,138 +0,0 @@ -//=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an X86 MCInst to AT&T style .s file syntax. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H -#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H - -#include "X86InstPrinterCommon.h" - -namespace llvm { - -class X86ATTInstPrinter final : public X86InstPrinterCommon { -public: - X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : X86InstPrinterCommon(MAI, MII, MRI) {} - - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, - const MCSubtargetInfo &STI) override; - - // Autogenerated by tblgen, returns true if we successfully printed an - // alias. - bool printAliasInstr(const MCInst *MI, raw_ostream &OS); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - - // Autogenerated by tblgen. 
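
// ---- illustrative sketch (editorial aside, not from the LLVM sources) ----
// Shape of the deleted printMemReference() that the header below declares
// trampolines for: AT&T memory operands render as disp(base,index,scale),
// omitting pieces that are absent, and printing the displacement alone when
// there are no registers. Standalone sketch with registers as plain strings:
#include <iostream>
#include <string>

std::string attMem(long Disp, const std::string &Base,
                   const std::string &Index, unsigned Scale) {
  std::string S;
  if (Disp || (Base.empty() && Index.empty()))
    S += std::to_string(Disp);
  if (!Base.empty() || !Index.empty()) {
    S += '(' + Base;
    if (!Index.empty()) {
      S += ',' + Index;
      if (Scale != 1) // scale of 1 is implicit, as in the deleted code
        S += ',' + std::to_string(Scale);
    }
    S += ')';
  }
  return S;
}

int main() {
  std::cout << attMem(8, "%rbp", "%rcx", 4) << '\n'; // prints 8(%rbp,%rcx,4)
}
// ---- end sketch ----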
- void printInstruction(const MCInst *MI, raw_ostream &OS); - static const char *getRegisterName(unsigned RegNo); - - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS) override; - void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); - void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - void printSrcIdx(const MCInst *MI, unsigned Op, raw_ostream &O); - void printDstIdx(const MCInst *MI, unsigned Op, raw_ostream &O); - void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS); - - void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - - void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - - void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printSrcIdx(MI, OpNo, O); - } - void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printSrcIdx(MI, OpNo, O); - } - void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printSrcIdx(MI, OpNo, O); - } - void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printSrcIdx(MI, OpNo, O); - } - void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printDstIdx(MI, OpNo, O); - } - void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printDstIdx(MI, OpNo, O); - } - void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printDstIdx(MI, OpNo, O); - } - void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printDstIdx(MI, OpNo, O); - } - void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemOffset(MI, OpNo, O); - } - void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemOffset(MI, OpNo, O); - } - void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemOffset(MI, OpNo, O); - } - void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemOffset(MI, OpNo, O); - } - -private: - bool HasCustomInstComment; -}; - -} // end namespace llvm - -#endif // 
LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp deleted file mode 100644 index 37bed37b0994..000000000000 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ /dev/null @@ -1,1310 +0,0 @@ -//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This defines functionality used to emit comments about X86 instructions to -// an output stream for -fverbose-asm. -// -//===----------------------------------------------------------------------===// - -#include "X86InstComments.h" -#include "X86ATTInstPrinter.h" -#include "MCTargetDesc/X86BaseInfo.h" -#include "MCTargetDesc/X86MCTargetDesc.h" -#include "Utils/X86ShuffleDecode.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define CASE_SSE_INS_COMMON(Inst, src) \ - case X86::Inst##src: - -#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \ - case X86::V##Inst##Suffix##src: - -#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \ - case X86::V##Inst##Suffix##src##k: - -#define CASE_MASKZ_INS_COMMON(Inst, Suffix, src) \ - case X86::V##Inst##Suffix##src##kz: - -#define CASE_AVX512_INS_COMMON(Inst, Suffix, src) \ - CASE_AVX_INS_COMMON(Inst, Suffix, src) \ - CASE_MASK_INS_COMMON(Inst, Suffix, src) \ - CASE_MASKZ_INS_COMMON(Inst, Suffix, src) - -#define CASE_MOVDUP(Inst, src) \ - CASE_AVX512_INS_COMMON(Inst, Z, r##src) \ - CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \ - CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \ - CASE_AVX_INS_COMMON(Inst, , r##src) \ - CASE_AVX_INS_COMMON(Inst, Y, r##src) \ - CASE_SSE_INS_COMMON(Inst, r##src) - -#define CASE_MASK_MOVDUP(Inst, src) \ - CASE_MASK_INS_COMMON(Inst, Z, r##src) \ - CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ - CASE_MASK_INS_COMMON(Inst, Z128, r##src) - -#define CASE_MASKZ_MOVDUP(Inst, src) \ - CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \ - CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \ - CASE_MASKZ_INS_COMMON(Inst, Z128, r##src) - -#define CASE_PMOVZX(Inst, src) \ - CASE_AVX512_INS_COMMON(Inst, Z, r##src) \ - CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \ - CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \ - CASE_AVX_INS_COMMON(Inst, , r##src) \ - CASE_AVX_INS_COMMON(Inst, Y, r##src) \ - CASE_SSE_INS_COMMON(Inst, r##src) - -#define CASE_MASK_PMOVZX(Inst, src) \ - CASE_MASK_INS_COMMON(Inst, Z, r##src) \ - CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ - CASE_MASK_INS_COMMON(Inst, Z128, r##src) - -#define CASE_MASKZ_PMOVZX(Inst, src) \ - CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \ - CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \ - CASE_MASKZ_INS_COMMON(Inst, Z128, r##src) - -#define CASE_UNPCK(Inst, src) \ - CASE_AVX512_INS_COMMON(Inst, Z, r##src) \ - CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \ - CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \ - CASE_AVX_INS_COMMON(Inst, , r##src) \ - CASE_AVX_INS_COMMON(Inst, Y, r##src) \ - CASE_SSE_INS_COMMON(Inst, r##src) - -#define CASE_MASK_UNPCK(Inst, src) \ - CASE_MASK_INS_COMMON(Inst, Z, r##src) \ - CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ - CASE_MASK_INS_COMMON(Inst, Z128, r##src) - -#define CASE_MASKZ_UNPCK(Inst, src) \ - CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \ - CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \ - 
CASE_MASKZ_INS_COMMON(Inst, Z128, r##src) - -#define CASE_SHUF(Inst, suf) \ - CASE_AVX512_INS_COMMON(Inst, Z, suf) \ - CASE_AVX512_INS_COMMON(Inst, Z256, suf) \ - CASE_AVX512_INS_COMMON(Inst, Z128, suf) \ - CASE_AVX_INS_COMMON(Inst, , suf) \ - CASE_AVX_INS_COMMON(Inst, Y, suf) \ - CASE_SSE_INS_COMMON(Inst, suf) - -#define CASE_MASK_SHUF(Inst, src) \ - CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \ - CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \ - CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) - -#define CASE_MASKZ_SHUF(Inst, src) \ - CASE_MASKZ_INS_COMMON(Inst, Z, r##src##i) \ - CASE_MASKZ_INS_COMMON(Inst, Z256, r##src##i) \ - CASE_MASKZ_INS_COMMON(Inst, Z128, r##src##i) - -#define CASE_VPERMILPI(Inst, src) \ - CASE_AVX512_INS_COMMON(Inst, Z, src##i) \ - CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \ - CASE_AVX512_INS_COMMON(Inst, Z128, src##i) \ - CASE_AVX_INS_COMMON(Inst, , src##i) \ - CASE_AVX_INS_COMMON(Inst, Y, src##i) - -#define CASE_MASK_VPERMILPI(Inst, src) \ - CASE_MASK_INS_COMMON(Inst, Z, src##i) \ - CASE_MASK_INS_COMMON(Inst, Z256, src##i) \ - CASE_MASK_INS_COMMON(Inst, Z128, src##i) - -#define CASE_MASKZ_VPERMILPI(Inst, src) \ - CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \ - CASE_MASKZ_INS_COMMON(Inst, Z256, src##i) \ - CASE_MASKZ_INS_COMMON(Inst, Z128, src##i) - -#define CASE_VPERM(Inst, src) \ - CASE_AVX512_INS_COMMON(Inst, Z, src##i) \ - CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \ - CASE_AVX_INS_COMMON(Inst, Y, src##i) - -#define CASE_MASK_VPERM(Inst, src) \ - CASE_MASK_INS_COMMON(Inst, Z, src##i) \ - CASE_MASK_INS_COMMON(Inst, Z256, src##i) - -#define CASE_MASKZ_VPERM(Inst, src) \ - CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \ - CASE_MASKZ_INS_COMMON(Inst, Z256, src##i) - -#define CASE_VSHUF(Inst, src) \ - CASE_AVX512_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ - CASE_AVX512_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ - CASE_AVX512_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ - CASE_AVX512_INS_COMMON(SHUFI##Inst, Z256, r##src##i) - -#define CASE_MASK_VSHUF(Inst, src) \ - CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ - CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ - CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ - CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) - -#define CASE_MASKZ_VSHUF(Inst, src) \ - CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ - CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ - CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ - CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z256, r##src##i) - -#define CASE_AVX512_FMA(Inst, suf) \ - CASE_AVX512_INS_COMMON(Inst, Z, suf) \ - CASE_AVX512_INS_COMMON(Inst, Z256, suf) \ - CASE_AVX512_INS_COMMON(Inst, Z128, suf) - -#define CASE_FMA(Inst, suf) \ - CASE_AVX512_FMA(Inst, suf) \ - CASE_AVX_INS_COMMON(Inst, , suf) \ - CASE_AVX_INS_COMMON(Inst, Y, suf) - -#define CASE_FMA_PACKED_REG(Inst) \ - CASE_FMA(Inst##PD, r) \ - CASE_FMA(Inst##PS, r) - -#define CASE_FMA_PACKED_MEM(Inst) \ - CASE_FMA(Inst##PD, m) \ - CASE_FMA(Inst##PS, m) \ - CASE_AVX512_FMA(Inst##PD, mb) \ - CASE_AVX512_FMA(Inst##PS, mb) - -#define CASE_FMA_SCALAR_REG(Inst) \ - CASE_AVX_INS_COMMON(Inst##SD, , r) \ - CASE_AVX_INS_COMMON(Inst##SS, , r) \ - CASE_AVX_INS_COMMON(Inst##SD, , r_Int) \ - CASE_AVX_INS_COMMON(Inst##SS, , r_Int) \ - CASE_AVX_INS_COMMON(Inst##SD, Z, r) \ - CASE_AVX_INS_COMMON(Inst##SS, Z, r) \ - CASE_AVX512_INS_COMMON(Inst##SD, Z, r_Int) \ - CASE_AVX512_INS_COMMON(Inst##SS, Z, r_Int) - -#define CASE_FMA_SCALAR_MEM(Inst) \ - CASE_AVX_INS_COMMON(Inst##SD, , m) \ - CASE_AVX_INS_COMMON(Inst##SS, , m) \ - CASE_AVX_INS_COMMON(Inst##SD, , m_Int) \ - 
CASE_AVX_INS_COMMON(Inst##SS, , m_Int) \ - CASE_AVX_INS_COMMON(Inst##SD, Z, m) \ - CASE_AVX_INS_COMMON(Inst##SS, Z, m) \ - CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \ - CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int) - -static unsigned getVectorRegSize(unsigned RegNo) { - if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) - return 512; - if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31) - return 256; - if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31) - return 128; - if (X86::MM0 <= RegNo && RegNo <= X86::MM7) - return 64; - - llvm_unreachable("Unknown vector reg!"); -} - -static unsigned getRegOperandNumElts(const MCInst *MI, unsigned ScalarSize, - unsigned OperandIndex) { - unsigned OpReg = MI->getOperand(OperandIndex).getReg(); - return getVectorRegSize(OpReg) / ScalarSize; -} - -static const char *getRegName(unsigned Reg) { - return X86ATTInstPrinter::getRegisterName(Reg); -} - -/// Wraps the destination register name with AVX512 mask/maskz filtering. -static void printMasking(raw_ostream &OS, const MCInst *MI, - const MCInstrInfo &MCII) { - const MCInstrDesc &Desc = MCII.get(MI->getOpcode()); - uint64_t TSFlags = Desc.TSFlags; - - if (!(TSFlags & X86II::EVEX_K)) - return; - - bool MaskWithZero = (TSFlags & X86II::EVEX_Z); - unsigned MaskOp = Desc.getNumDefs(); - - if (Desc.getOperandConstraint(MaskOp, MCOI::TIED_TO) != -1) - ++MaskOp; - - const char *MaskRegName = getRegName(MI->getOperand(MaskOp).getReg()); - - // MASK: zmmX {%kY} - OS << " {%" << MaskRegName << "}"; - - // MASKZ: zmmX {%kY} {z} - if (MaskWithZero) - OS << " {z}"; -} - -static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) { - const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr; - unsigned NumOperands = MI->getNumOperands(); - bool RegForm = false; - bool Negate = false; - StringRef AccStr = "+"; - - // The operands for FMA instructions without rounding fall into two forms. - // dest, src1, src2, src3 - // dest, src1, mask, src2, src3 - // Where src3 is either a register or 5 memory address operands. So to find - // dest and src1 we can index from the front. To find src2 and src3 we can - // index from the end by taking into account memory vs register form when - // finding src2. 
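// (Illustrative index arithmetic for the layouts described above, with
// assumed operand counts: a masked register-form instruction has operands
//   0:dest, 1:src1(tied), 2:mask, 3:src2, 4:src3         -> NumOperands = 5,
// so src3 = NumOperands-1 and src2 = NumOperands-2; an unmasked memory form
//   0:dest, 1:src1, 2:src2, 3..7: five address operands  -> NumOperands = 8,
// so src2 = NumOperands-6. That is the NumOperands-(RegForm?2:6) expression
// used throughout the cases below.)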
- - switch (MI->getOpcode()) { - default: - return false; - CASE_FMA_PACKED_REG(FMADD132) - CASE_FMA_SCALAR_REG(FMADD132) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMADD132) - CASE_FMA_SCALAR_MEM(FMADD132) - AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul1Name = getRegName(MI->getOperand(1).getReg()); - break; - - CASE_FMA_PACKED_REG(FMADD213) - CASE_FMA_SCALAR_REG(FMADD213) - AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMADD213) - CASE_FMA_SCALAR_MEM(FMADD213) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul2Name = getRegName(MI->getOperand(1).getReg()); - break; - - CASE_FMA_PACKED_REG(FMADD231) - CASE_FMA_SCALAR_REG(FMADD231) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMADD231) - CASE_FMA_SCALAR_MEM(FMADD231) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - AccName = getRegName(MI->getOperand(1).getReg()); - break; - - CASE_FMA_PACKED_REG(FMSUB132) - CASE_FMA_SCALAR_REG(FMSUB132) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMSUB132) - CASE_FMA_SCALAR_MEM(FMSUB132) - AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul1Name = getRegName(MI->getOperand(1).getReg()); - AccStr = "-"; - break; - - CASE_FMA_PACKED_REG(FMSUB213) - CASE_FMA_SCALAR_REG(FMSUB213) - AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMSUB213) - CASE_FMA_SCALAR_MEM(FMSUB213) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul2Name = getRegName(MI->getOperand(1).getReg()); - AccStr = "-"; - break; - - CASE_FMA_PACKED_REG(FMSUB231) - CASE_FMA_SCALAR_REG(FMSUB231) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMSUB231) - CASE_FMA_SCALAR_MEM(FMSUB231) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - AccName = getRegName(MI->getOperand(1).getReg()); - AccStr = "-"; - break; - - CASE_FMA_PACKED_REG(FNMADD132) - CASE_FMA_SCALAR_REG(FNMADD132) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FNMADD132) - CASE_FMA_SCALAR_MEM(FNMADD132) - AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul1Name = getRegName(MI->getOperand(1).getReg()); - Negate = true; - break; - - CASE_FMA_PACKED_REG(FNMADD213) - CASE_FMA_SCALAR_REG(FNMADD213) - AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FNMADD213) - CASE_FMA_SCALAR_MEM(FNMADD213) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul2Name = getRegName(MI->getOperand(1).getReg()); - Negate = true; - break; - - CASE_FMA_PACKED_REG(FNMADD231) - CASE_FMA_SCALAR_REG(FNMADD231) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FNMADD231) - CASE_FMA_SCALAR_MEM(FNMADD231) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - AccName = getRegName(MI->getOperand(1).getReg()); - Negate = true; - break; - - 
CASE_FMA_PACKED_REG(FNMSUB132) - CASE_FMA_SCALAR_REG(FNMSUB132) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FNMSUB132) - CASE_FMA_SCALAR_MEM(FNMSUB132) - AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul1Name = getRegName(MI->getOperand(1).getReg()); - AccStr = "-"; - Negate = true; - break; - - CASE_FMA_PACKED_REG(FNMSUB213) - CASE_FMA_SCALAR_REG(FNMSUB213) - AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FNMSUB213) - CASE_FMA_SCALAR_MEM(FNMSUB213) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul2Name = getRegName(MI->getOperand(1).getReg()); - AccStr = "-"; - Negate = true; - break; - - CASE_FMA_PACKED_REG(FNMSUB231) - CASE_FMA_SCALAR_REG(FNMSUB231) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FNMSUB231) - CASE_FMA_SCALAR_MEM(FNMSUB231) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - AccName = getRegName(MI->getOperand(1).getReg()); - AccStr = "-"; - Negate = true; - break; - - CASE_FMA_PACKED_REG(FMADDSUB132) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMADDSUB132) - AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul1Name = getRegName(MI->getOperand(1).getReg()); - AccStr = "+/-"; - break; - - CASE_FMA_PACKED_REG(FMADDSUB213) - AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMADDSUB213) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul2Name = getRegName(MI->getOperand(1).getReg()); - AccStr = "+/-"; - break; - - CASE_FMA_PACKED_REG(FMADDSUB231) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMADDSUB231) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - AccName = getRegName(MI->getOperand(1).getReg()); - AccStr = "+/-"; - break; - - CASE_FMA_PACKED_REG(FMSUBADD132) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMSUBADD132) - AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul1Name = getRegName(MI->getOperand(1).getReg()); - AccStr = "-/+"; - break; - - CASE_FMA_PACKED_REG(FMSUBADD213) - AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMSUBADD213) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - Mul2Name = getRegName(MI->getOperand(1).getReg()); - AccStr = "-/+"; - break; - - CASE_FMA_PACKED_REG(FMSUBADD231) - Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - CASE_FMA_PACKED_MEM(FMSUBADD231) - Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - AccName = getRegName(MI->getOperand(1).getReg()); - AccStr = "-/+"; - break; - } - - const char *DestName = getRegName(MI->getOperand(0).getReg()); - - if (!Mul1Name) Mul1Name = "mem"; - if (!Mul2Name) Mul2Name = "mem"; - if (!AccName) AccName = "mem"; - - OS << DestName << " = "; - // TODO: Print masking information? 
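// (Illustrative output, derived from the operand selection above: a
// register-form vfnmadd213ps %xmm2, %xmm1, %xmm0 produces the comment
// "xmm0 = -(xmm1 * xmm0) + xmm2"; the FMADDSUB and FMSUBADD variants print
// "+/-" and "-/+" as the accumulation operator.)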
-
-  if (Negate)
-    OS << '-';
-
-  OS << '(' << Mul1Name << " * " << Mul2Name << ") " << AccStr << ' '
-     << AccName;
-
-  return true;
-}
-
-
-//===----------------------------------------------------------------------===//
-// Top Level Entrypoint
-//===----------------------------------------------------------------------===//
-
-/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
-/// newline terminated strings to the specified string if desired. This
-/// information is shown in disassembly dumps when verbose assembly is enabled.
-bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
-                                  const MCInstrInfo &MCII) {
-  // If this is a shuffle operation, the switch should fill in this state.
-  SmallVector<int, 8> ShuffleMask;
-  const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
-  unsigned NumOperands = MI->getNumOperands();
-  bool RegForm = false;
-
-  if (printFMA3Comments(MI, OS))
-    return true;
-
-  switch (MI->getOpcode()) {
-  default:
-    // Not an instruction for which we can decode comments.
-    return false;
-
-  case X86::BLENDPDrri:
-  case X86::VBLENDPDrri:
-  case X86::VBLENDPDYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    LLVM_FALLTHROUGH;
-  case X86::BLENDPDrmi:
-  case X86::VBLENDPDrmi:
-  case X86::VBLENDPDYrmi:
-    if (MI->getOperand(NumOperands - 1).isImm())
-      DecodeBLENDMask(getRegOperandNumElts(MI, 64, 0),
-                      MI->getOperand(NumOperands - 1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::BLENDPSrri:
-  case X86::VBLENDPSrri:
-  case X86::VBLENDPSYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    LLVM_FALLTHROUGH;
-  case X86::BLENDPSrmi:
-  case X86::VBLENDPSrmi:
-  case X86::VBLENDPSYrmi:
-    if (MI->getOperand(NumOperands - 1).isImm())
-      DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
-                      MI->getOperand(NumOperands - 1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::PBLENDWrri:
-  case X86::VPBLENDWrri:
-  case X86::VPBLENDWYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    LLVM_FALLTHROUGH;
-  case X86::PBLENDWrmi:
-  case X86::VPBLENDWrmi:
-  case X86::VPBLENDWYrmi:
-    if (MI->getOperand(NumOperands - 1).isImm())
-      DecodeBLENDMask(getRegOperandNumElts(MI, 16, 0),
-                      MI->getOperand(NumOperands - 1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::VPBLENDDrri:
-  case X86::VPBLENDDYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    LLVM_FALLTHROUGH;
-  case X86::VPBLENDDrmi:
-  case X86::VPBLENDDYrmi:
-    if (MI->getOperand(NumOperands - 1).isImm())
-      DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
-                      MI->getOperand(NumOperands - 1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::INSERTPSrr:
-  case X86::VINSERTPSrr:
-  case X86::VINSERTPSZrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    LLVM_FALLTHROUGH;
-  case X86::INSERTPSrm:
-  case X86::VINSERTPSrm:
-  case X86::VINSERTPSZrm:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    if (MI->getOperand(NumOperands - 1).isImm())
-      DecodeINSERTPSMask(MI->getOperand(NumOperands - 1).getImm(),
-                         ShuffleMask);
-    break;
-
-  case X86::MOVLHPSrr:
-  case X86::VMOVLHPSrr:
-  case
X86::VMOVLHPSZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVLHPSMask(2, ShuffleMask); - break; - - case X86::MOVHLPSrr: - case X86::VMOVHLPSrr: - case X86::VMOVHLPSZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVHLPSMask(2, ShuffleMask); - break; - - case X86::MOVHPDrm: - case X86::VMOVHPDrm: - case X86::VMOVHPDZ128rm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeInsertElementMask(2, 1, 1, ShuffleMask); - break; - - case X86::MOVHPSrm: - case X86::VMOVHPSrm: - case X86::VMOVHPSZ128rm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeInsertElementMask(4, 2, 2, ShuffleMask); - break; - - case X86::MOVLPDrm: - case X86::VMOVLPDrm: - case X86::VMOVLPDZ128rm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeInsertElementMask(2, 0, 1, ShuffleMask); - break; - - case X86::MOVLPSrm: - case X86::VMOVLPSrm: - case X86::VMOVLPSZ128rm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeInsertElementMask(4, 0, 2, ShuffleMask); - break; - - CASE_MOVDUP(MOVSLDUP, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - - CASE_MOVDUP(MOVSLDUP, m) - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask); - break; - - CASE_MOVDUP(MOVSHDUP, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - - CASE_MOVDUP(MOVSHDUP, m) - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask); - break; - - CASE_MOVDUP(MOVDDUP, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - - CASE_MOVDUP(MOVDDUP, m) - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVDDUPMask(getRegOperandNumElts(MI, 64, 0), ShuffleMask); - break; - - case X86::PSLLDQri: - case X86::VPSLLDQri: - case X86::VPSLLDQYri: - case X86::VPSLLDQZ128rr: - case X86::VPSLLDQZ256rr: - case X86::VPSLLDQZrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - LLVM_FALLTHROUGH; - case X86::VPSLLDQZ128rm: - case X86::VPSLLDQZ256rm: - case X86::VPSLLDQZrm: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(NumOperands - 1).isImm()) - DecodePSLLDQMask(getRegOperandNumElts(MI, 8, 0), - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - break; - - case X86::PSRLDQri: - case X86::VPSRLDQri: - case X86::VPSRLDQYri: - case X86::VPSRLDQZ128rr: - case X86::VPSRLDQZ256rr: - case X86::VPSRLDQZrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - LLVM_FALLTHROUGH; - case X86::VPSRLDQZ128rm: - case X86::VPSRLDQZ256rm: - case X86::VPSRLDQZrm: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(NumOperands - 1).isImm()) - DecodePSRLDQMask(getRegOperandNumElts(MI, 8, 0), - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - break; - - CASE_SHUF(PALIGNR, rri) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_SHUF(PALIGNR, rmi) - Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); 
- DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(NumOperands - 1).isImm()) - DecodePALIGNRMask(getRegOperandNumElts(MI, 8, 0), - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - break; - - CASE_AVX512_INS_COMMON(ALIGNQ, Z, rri) - CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rri) - CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rri) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_AVX512_INS_COMMON(ALIGNQ, Z, rmi) - CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rmi) - CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rmi) - Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(NumOperands - 1).isImm()) - DecodeVALIGNMask(getRegOperandNumElts(MI, 64, 0), - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - break; - - CASE_AVX512_INS_COMMON(ALIGND, Z, rri) - CASE_AVX512_INS_COMMON(ALIGND, Z256, rri) - CASE_AVX512_INS_COMMON(ALIGND, Z128, rri) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_AVX512_INS_COMMON(ALIGND, Z, rmi) - CASE_AVX512_INS_COMMON(ALIGND, Z256, rmi) - CASE_AVX512_INS_COMMON(ALIGND, Z128, rmi) - Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(NumOperands - 1).isImm()) - DecodeVALIGNMask(getRegOperandNumElts(MI, 32, 0), - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - break; - - CASE_SHUF(PSHUFD, ri) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - LLVM_FALLTHROUGH; - - CASE_SHUF(PSHUFD, mi) - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(NumOperands - 1).isImm()) - DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32, - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - break; - - CASE_SHUF(PSHUFHW, ri) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - LLVM_FALLTHROUGH; - - CASE_SHUF(PSHUFHW, mi) - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(NumOperands - 1).isImm()) - DecodePSHUFHWMask(getRegOperandNumElts(MI, 16, 0), - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - break; - - CASE_SHUF(PSHUFLW, ri) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - LLVM_FALLTHROUGH; - - CASE_SHUF(PSHUFLW, mi) - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(NumOperands - 1).isImm()) - DecodePSHUFLWMask(getRegOperandNumElts(MI, 16, 0), - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - break; - - case X86::MMX_PSHUFWri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - LLVM_FALLTHROUGH; - - case X86::MMX_PSHUFWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(NumOperands - 1).isImm()) - DecodePSHUFMask(4, 16, MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - break; - - case X86::PSWAPDrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - LLVM_FALLTHROUGH; - - case X86::PSWAPDrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodePSWAPMask(2, ShuffleMask); - break; - - CASE_UNPCK(PUNPCKHBW, r) - case X86::MMX_PUNPCKHBWirr: - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(PUNPCKHBW, m) - case X86::MMX_PUNPCKHBWirm: - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - 
DecodeUNPCKHMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask); - break; - - CASE_UNPCK(PUNPCKHWD, r) - case X86::MMX_PUNPCKHWDirr: - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(PUNPCKHWD, m) - case X86::MMX_PUNPCKHWDirm: - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask); - break; - - CASE_UNPCK(PUNPCKHDQ, r) - case X86::MMX_PUNPCKHDQirr: - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(PUNPCKHDQ, m) - case X86::MMX_PUNPCKHDQirm: - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); - break; - - CASE_UNPCK(PUNPCKHQDQ, r) - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(PUNPCKHQDQ, m) - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask); - break; - - CASE_UNPCK(PUNPCKLBW, r) - case X86::MMX_PUNPCKLBWirr: - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(PUNPCKLBW, m) - case X86::MMX_PUNPCKLBWirm: - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask); - break; - - CASE_UNPCK(PUNPCKLWD, r) - case X86::MMX_PUNPCKLWDirr: - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(PUNPCKLWD, m) - case X86::MMX_PUNPCKLWDirm: - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask); - break; - - CASE_UNPCK(PUNPCKLDQ, r) - case X86::MMX_PUNPCKLDQirr: - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(PUNPCKLDQ, m) - case X86::MMX_PUNPCKLDQirm: - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); - break; - - CASE_UNPCK(PUNPCKLQDQ, r) - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(PUNPCKLQDQ, m) - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask); - break; - - CASE_SHUF(SHUFPD, rri) - Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_SHUF(SHUFPD, rmi) - if (MI->getOperand(NumOperands - 1).isImm()) - DecodeSHUFPMask(getRegOperandNumElts(MI, 64, 0), 64, - MI->getOperand(NumOperands - 1).getImm(), ShuffleMask); - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_SHUF(SHUFPS, rri) - Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - RegForm = true; - 
LLVM_FALLTHROUGH; - - CASE_SHUF(SHUFPS, rmi) - if (MI->getOperand(NumOperands - 1).isImm()) - DecodeSHUFPMask(getRegOperandNumElts(MI, 32, 0), 32, - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_VSHUF(64X2, r) - Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_VSHUF(64X2, m) - decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 64, 0), 64, - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_VSHUF(32X4, r) - Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_VSHUF(32X4, m) - decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 32, 0), 32, - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_UNPCK(UNPCKLPD, r) - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(UNPCKLPD, m) - DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_UNPCK(UNPCKLPS, r) - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(UNPCKLPS, m) - DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_UNPCK(UNPCKHPD, r) - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(UNPCKHPD, m) - DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_UNPCK(UNPCKHPS, r) - Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - RegForm = true; - LLVM_FALLTHROUGH; - - CASE_UNPCK(UNPCKHPS, m) - DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_VPERMILPI(PERMILPS, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - LLVM_FALLTHROUGH; - - CASE_VPERMILPI(PERMILPS, m) - if (MI->getOperand(NumOperands - 1).isImm()) - DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32, - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_VPERMILPI(PERMILPD, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - LLVM_FALLTHROUGH; - - CASE_VPERMILPI(PERMILPD, m) - if (MI->getOperand(NumOperands - 1).isImm()) - DecodePSHUFMask(getRegOperandNumElts(MI, 64, 0), 64, - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::VPERM2F128rr: - case X86::VPERM2I128rr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - LLVM_FALLTHROUGH; - - case 
X86::VPERM2F128rm: - case X86::VPERM2I128rm: - // For instruction comments purpose, assume the 256-bit vector is v4i64. - if (MI->getOperand(NumOperands - 1).isImm()) - DecodeVPERM2X128Mask(4, MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_VPERM(PERMPD, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - LLVM_FALLTHROUGH; - - CASE_VPERM(PERMPD, m) - if (MI->getOperand(NumOperands - 1).isImm()) - DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0), - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_VPERM(PERMQ, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - LLVM_FALLTHROUGH; - - CASE_VPERM(PERMQ, m) - if (MI->getOperand(NumOperands - 1).isImm()) - DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0), - MI->getOperand(NumOperands - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::MOVSDrr: - case X86::VMOVSDrr: - case X86::VMOVSDZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - Src1Name = getRegName(MI->getOperand(1).getReg()); - LLVM_FALLTHROUGH; - - case X86::MOVSDrm: - case X86::VMOVSDrm: - case X86::VMOVSDZrm: - DecodeScalarMoveMask(2, nullptr == Src2Name, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::MOVSSrr: - case X86::VMOVSSrr: - case X86::VMOVSSZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - Src1Name = getRegName(MI->getOperand(1).getReg()); - LLVM_FALLTHROUGH; - - case X86::MOVSSrm: - case X86::VMOVSSrm: - case X86::VMOVSSZrm: - DecodeScalarMoveMask(4, nullptr == Src2Name, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::MOVPQI2QIrr: - case X86::MOVZPQILo2PQIrr: - case X86::VMOVPQI2QIrr: - case X86::VMOVPQI2QIZrr: - case X86::VMOVZPQILo2PQIrr: - case X86::VMOVZPQILo2PQIZrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - LLVM_FALLTHROUGH; - - case X86::MOVQI2PQIrm: - case X86::VMOVQI2PQIrm: - case X86::VMOVQI2PQIZrm: - DecodeZeroMoveLowMask(2, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::MOVDI2PDIrm: - case X86::VMOVDI2PDIrm: - case X86::VMOVDI2PDIZrm: - DecodeZeroMoveLowMask(4, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::EXTRQI: - if (MI->getOperand(2).isImm() && - MI->getOperand(3).isImm()) - DecodeEXTRQIMask(16, 8, MI->getOperand(2).getImm(), - MI->getOperand(3).getImm(), ShuffleMask); - - DestName = getRegName(MI->getOperand(0).getReg()); - Src1Name = getRegName(MI->getOperand(1).getReg()); - break; - - case X86::INSERTQI: - if (MI->getOperand(3).isImm() && - MI->getOperand(4).isImm()) - DecodeINSERTQIMask(16, 8, MI->getOperand(3).getImm(), - MI->getOperand(4).getImm(), ShuffleMask); - - DestName = getRegName(MI->getOperand(0).getReg()); - Src1Name = getRegName(MI->getOperand(1).getReg()); - Src2Name = getRegName(MI->getOperand(2).getReg()); - break; - - case X86::VBROADCASTF128: - case X86::VBROADCASTI128: - CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm) - CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm) - DecodeSubVectorBroadcast(4, 2, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm) - CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm) - DecodeSubVectorBroadcast(8, 2, ShuffleMask); - DestName = 
getRegName(MI->getOperand(0).getReg()); - break; - CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm) - CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm) - DecodeSubVectorBroadcast(8, 4, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z256, rm) - CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z256, rm) - DecodeSubVectorBroadcast(8, 4, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm) - CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm) - DecodeSubVectorBroadcast(16, 4, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm) - CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm) - DecodeSubVectorBroadcast(16, 8, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m) - DecodeSubVectorBroadcast(4, 2, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m) - DecodeSubVectorBroadcast(8, 2, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m) - DecodeSubVectorBroadcast(16, 2, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_PMOVZX(PMOVZXBW, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - CASE_PMOVZX(PMOVZXBW, m) - DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_PMOVZX(PMOVZXBD, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - CASE_PMOVZX(PMOVZXBD, m) - DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_PMOVZX(PMOVZXBQ, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - CASE_PMOVZX(PMOVZXBQ, m) - DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_PMOVZX(PMOVZXWD, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - CASE_PMOVZX(PMOVZXWD, m) - DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_PMOVZX(PMOVZXWQ, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - CASE_PMOVZX(PMOVZXWQ, m) - DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - CASE_PMOVZX(PMOVZXDQ, r) - Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - LLVM_FALLTHROUGH; - CASE_PMOVZX(PMOVZXDQ, m) - DecodeZeroExtendMask(32, 64, 
getRegOperandNumElts(MI, 64, 0), ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - } - - // The only comments we decode are shuffles, so give up if we were unable to - // decode a shuffle mask. - if (ShuffleMask.empty()) - return false; - - if (!DestName) DestName = Src1Name; - if (DestName) { - OS << DestName; - printMasking(OS, MI, MCII); - } else - OS << "mem"; - - OS << " = "; - - // If the two sources are the same, canonicalize the input elements to be - // from the first src so that we get larger element spans. - if (Src1Name == Src2Name) { - for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { - if ((int)ShuffleMask[i] >= 0 && // Not sentinel. - ShuffleMask[i] >= (int)e) // From second mask. - ShuffleMask[i] -= e; - } - } - - // The shuffle mask specifies which elements of the src1/src2 fill in the - // destination, with a few sentinel values. Loop through and print them - // out. - for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { - if (i != 0) - OS << ','; - if (ShuffleMask[i] == SM_SentinelZero) { - OS << "zero"; - continue; - } - - // Otherwise, it must come from src1 or src2. Print the span of elements - // that comes from this src. - bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); - const char *SrcName = isSrc1 ? Src1Name : Src2Name; - OS << (SrcName ? SrcName : "mem") << '['; - bool IsFirst = true; - while (i != e && (int)ShuffleMask[i] != SM_SentinelZero && - (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { - if (!IsFirst) - OS << ','; - else - IsFirst = false; - if (ShuffleMask[i] == SM_SentinelUndef) - OS << "u"; - else - OS << ShuffleMask[i] % ShuffleMask.size(); - ++i; - } - OS << ']'; - --i; // For loop increments element #. - } - - // We successfully added a comment to this instruction. - return true; -} diff --git a/lib/Target/X86/InstPrinter/X86InstComments.h b/lib/Target/X86/InstPrinter/X86InstComments.h deleted file mode 100644 index 40dffa5fbb8a..000000000000 --- a/lib/Target/X86/InstPrinter/X86InstComments.h +++ /dev/null @@ -1,27 +0,0 @@ -//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This defines functionality used to emit comments about X86 instructions to -// an output stream for -fverbose-asm. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H -#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H - -namespace llvm { - - class MCInst; - class MCInstrInfo; - class raw_ostream; - bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, - const MCInstrInfo &MCII); -} - -#endif diff --git a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp b/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp deleted file mode 100644 index 432cd47ae499..000000000000 --- a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp +++ /dev/null @@ -1,142 +0,0 @@ -//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-//
-//===----------------------------------------------------------------------===//
-//
-// This file includes common code for rendering MCInst instances as AT&T-style
-// and Intel-style assembly.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86InstPrinterCommon.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Casting.h"
-#include <cassert>
-#include <cstdint>
-
-using namespace llvm;
-
-void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op,
                                          raw_ostream &O) {
-  int64_t Imm = MI->getOperand(Op).getImm();
-  switch (Imm) {
-  default: llvm_unreachable("Invalid ssecc/avxcc argument!");
-  case 0: O << "eq"; break;
-  case 1: O << "lt"; break;
-  case 2: O << "le"; break;
-  case 3: O << "unord"; break;
-  case 4: O << "neq"; break;
-  case 5: O << "nlt"; break;
-  case 6: O << "nle"; break;
-  case 7: O << "ord"; break;
-  case 8: O << "eq_uq"; break;
-  case 9: O << "nge"; break;
-  case 0xa: O << "ngt"; break;
-  case 0xb: O << "false"; break;
-  case 0xc: O << "neq_oq"; break;
-  case 0xd: O << "ge"; break;
-  case 0xe: O << "gt"; break;
-  case 0xf: O << "true"; break;
-  case 0x10: O << "eq_os"; break;
-  case 0x11: O << "lt_oq"; break;
-  case 0x12: O << "le_oq"; break;
-  case 0x13: O << "unord_s"; break;
-  case 0x14: O << "neq_us"; break;
-  case 0x15: O << "nlt_uq"; break;
-  case 0x16: O << "nle_uq"; break;
-  case 0x17: O << "ord_s"; break;
-  case 0x18: O << "eq_us"; break;
-  case 0x19: O << "nge_uq"; break;
-  case 0x1a: O << "ngt_uq"; break;
-  case 0x1b: O << "false_os"; break;
-  case 0x1c: O << "neq_os"; break;
-  case 0x1d: O << "ge_oq"; break;
-  case 0x1e: O << "gt_oq"; break;
-  case 0x1f: O << "true_us"; break;
-  }
-}
-
-void X86InstPrinterCommon::printXOPCC(const MCInst *MI, unsigned Op,
-                                      raw_ostream &O) {
-  int64_t Imm = MI->getOperand(Op).getImm();
-  switch (Imm) {
-  default: llvm_unreachable("Invalid xopcc argument!");
-  case 0: O << "lt"; break;
-  case 1: O << "le"; break;
-  case 2: O << "gt"; break;
-  case 3: O << "ge"; break;
-  case 4: O << "eq"; break;
-  case 5: O << "neq"; break;
-  case 6: O << "false"; break;
-  case 7: O << "true"; break;
-  }
-}
-
-void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
-                                                raw_ostream &O) {
-  int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
-  switch (Imm) {
-  case 0: O << "{rn-sae}"; break;
-  case 1: O << "{rd-sae}"; break;
-  case 2: O << "{ru-sae}"; break;
-  case 3: O << "{rz-sae}"; break;
-  }
-}
-
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value (e.g. for jumps and calls). In
-/// Intel-style these print slightly differently than normal immediates.
-/// For example, a $ is not emitted.
-void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
-                                         raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isImm())
-    O << formatImm(Op.getImm());
-  else {
-    assert(Op.isExpr() && "unknown pcrel immediate operand");
-    // If a symbolic branch target was added as a constant expression then print
-    // that address in hex.
-    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
-    int64_t Address;
-    if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
-      O << formatHex((uint64_t)Address);
-    } else {
-      // Otherwise, just print the expression.
- Op.getExpr()->print(O, &MAI); - } - } -} - -void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getReg()) { - printOperand(MI, OpNo, O); - O << ':'; - } -} - -void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - uint64_t TSFlags = Desc.TSFlags; - unsigned Flags = MI->getFlags(); - - if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK)) - O << "\tlock\t"; - - if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK)) - O << "\tnotrack\t"; - - if (Flags & X86::IP_HAS_REPEAT_NE) - O << "\trepne\t"; - else if (Flags & X86::IP_HAS_REPEAT) - O << "\trep\t"; -} diff --git a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h b/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h deleted file mode 100644 index f2875e71f22c..000000000000 --- a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h +++ /dev/null @@ -1,38 +0,0 @@ -//===-- X86InstPrinterCommon.cpp - X86 assembly instruction printing ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file includes code common for rendering MCInst instances as AT&T-style -// and Intel-style assembly. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H -#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H - -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { - -class X86InstPrinterCommon : public MCInstPrinter { -public: - using MCInstPrinter::MCInstPrinter; - - virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0; - void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); - void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS); - void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O); - void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); -protected: - void printInstFlags(const MCInst *MI, raw_ostream &O); - void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp deleted file mode 100644 index 044b71564152..000000000000 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ /dev/null @@ -1,162 +0,0 @@ -//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file includes code for rendering MCInst instances as Intel-style -// assembly. 
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86IntelInstPrinter.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "X86InstComments.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "asm-printer"
-
-#include "X86GenAsmWriter1.inc"
-
-void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
-  OS << getRegisterName(RegNo);
-}
-
-void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
-                                    StringRef Annot,
-                                    const MCSubtargetInfo &STI) {
-  printInstFlags(MI, OS);
-
-  // In 16-bit mode, print data16 as data32.
-  if (MI->getOpcode() == X86::DATA16_PREFIX &&
-      STI.getFeatureBits()[X86::Mode16Bit]) {
-    OS << "\tdata32";
-  } else
-    printInstruction(MI, OS);
-
-  // Next always print the annotation.
-  printAnnotation(OS, Annot);
-
-  // If verbose assembly is enabled, we can print some informative comments.
-  if (CommentStream)
-    EmitAnyX86InstComments(MI, *CommentStream, MII);
-}
-
-void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                       raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isReg()) {
-    printRegName(O, Op.getReg());
-  } else if (Op.isImm()) {
-    O << formatImm((int64_t)Op.getImm());
-  } else {
-    assert(Op.isExpr() && "unknown operand kind in printOperand");
-    O << "offset ";
-    Op.getExpr()->print(O, &MAI);
-  }
-}
-
-void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
-                                            raw_ostream &O) {
-  const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
-  unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
-  const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
-  const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
-
-  // If this has a segment register, print it.
-  printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
-
-  O << '[';
-
-  bool NeedPlus = false;
-  if (BaseReg.getReg()) {
-    printOperand(MI, Op+X86::AddrBaseReg, O);
-    NeedPlus = true;
-  }
-
-  if (IndexReg.getReg()) {
-    if (NeedPlus) O << " + ";
-    if (ScaleVal != 1)
-      O << ScaleVal << '*';
-    printOperand(MI, Op+X86::AddrIndexReg, O);
-    NeedPlus = true;
-  }
-
-  if (!DispSpec.isImm()) {
-    if (NeedPlus) O << " + ";
-    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
-    DispSpec.getExpr()->print(O, &MAI);
-  } else {
-    int64_t DispVal = DispSpec.getImm();
-    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
-      if (NeedPlus) {
-        if (DispVal > 0)
-          O << " + ";
-        else {
-          O << " - ";
-          DispVal = -DispVal;
-        }
-      }
-      O << formatImm(DispVal);
-    }
-  }
-
-  O << ']';
-}
-
-void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
-                                      raw_ostream &O) {
-  // If this has a segment register, print it.
-  printOptionalSegReg(MI, Op + 1, O);
-  O << '[';
-  printOperand(MI, Op, O);
-  O << ']';
-}
-
-void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
-                                      raw_ostream &O) {
-  // DI accesses are always ES-based.
-  O << "es:[";
-  printOperand(MI, Op, O);
-  O << ']';
-}
-
-void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
-                                         raw_ostream &O) {
-  const MCOperand &DispSpec = MI->getOperand(Op);
-
-  // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + 1, O); - - O << '['; - - if (DispSpec.isImm()) { - O << formatImm(DispSpec.getImm()); - } else { - assert(DispSpec.isExpr() && "non-immediate displacement?"); - DispSpec.getExpr()->print(O, &MAI); - } - - O << ']'; -} - -void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, - raw_ostream &O) { - if (MI->getOperand(Op).isExpr()) - return MI->getOperand(Op).getExpr()->print(O, &MAI); - - O << formatImm(MI->getOperand(Op).getImm() & 0xff); -} diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h deleted file mode 100644 index 3b34a8052bec..000000000000 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ /dev/null @@ -1,157 +0,0 @@ -//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an X86 MCInst to Intel style .s file syntax. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H -#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H - -#include "X86InstPrinterCommon.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - -class X86IntelInstPrinter final : public X86InstPrinterCommon { -public: - X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : X86InstPrinterCommon(MAI, MII, MRI) {} - - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, - const MCSubtargetInfo &STI) override; - - // Autogenerated by tblgen. 
- void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) override; - void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); - void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O); - - void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - - void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - - void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "byte ptr "; - printMemReference(MI, OpNo, O); - } - void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "word ptr "; - printMemReference(MI, OpNo, O); - } - void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "dword ptr "; - printMemReference(MI, OpNo, O); - } - void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "qword ptr "; - printMemReference(MI, OpNo, O); - } - void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "xmmword ptr "; - printMemReference(MI, OpNo, O); - } - void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "ymmword ptr "; - printMemReference(MI, OpNo, O); - } - void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "zmmword ptr "; - printMemReference(MI, OpNo, O); - } - void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "dword ptr "; - printMemReference(MI, OpNo, O); - } - void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "qword ptr "; - printMemReference(MI, OpNo, O); - } - void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "tbyte ptr "; - printMemReference(MI, OpNo, O); - } - void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "xmmword ptr "; - printMemReference(MI, OpNo, O); - } - void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "ymmword ptr "; - printMemReference(MI, OpNo, O); - } - void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "zmmword ptr "; - printMemReference(MI, OpNo, O); - } - - - void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "byte ptr "; - printSrcIdx(MI, OpNo, O); - } - void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "word ptr "; - printSrcIdx(MI, OpNo, O); - } - void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "dword ptr "; - printSrcIdx(MI, OpNo, O); - } - void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "qword ptr "; - printSrcIdx(MI, OpNo, O); - } - void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "byte ptr "; - printDstIdx(MI, OpNo, O); - } - void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "word ptr "; - printDstIdx(MI, OpNo, O); - } - void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "dword ptr "; - printDstIdx(MI, OpNo, O); - } - void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "qword ptr "; - printDstIdx(MI, OpNo, O); - } - void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "byte ptr "; - 
printMemOffset(MI, OpNo, O);
-  }
-  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "word ptr ";
-    printMemOffset(MI, OpNo, O);
-  }
-  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "dword ptr ";
-    printMemOffset(MI, OpNo, O);
-  }
-  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "qword ptr ";
-    printMemOffset(MI, OpNo, O);
-  }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
diff --git a/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
new file mode 100644
index 000000000000..ed2ee55ff2a5
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -0,0 +1,487 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot, const MCSubtargetInfo &STI) {
+  // If verbose assembly is enabled, we can print some informative comments.
+  if (CommentStream)
+    HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
+
+  printInstFlags(MI, OS);
+
+  // Output CALLpcrel32 as "callq" in 64-bit mode.
+  // In Intel annotation it's always emitted as "call".
+  //
+  // TODO: Probably this hack should be redesigned via InstAlias in
+  // InstrInfo.td as soon as Requires clause is supported properly
+  // for InstAlias.
+  if (MI->getOpcode() == X86::CALLpcrel32 &&
+      (STI.getFeatureBits()[X86::Mode64Bit])) {
+    OS << "\tcallq\t";
+    printPCRelImm(MI, 0, OS);
+  }
+  // data16 and data32 both have the same encoding of 0x66. While data32 is
+  // valid only in 16 bit systems, data16 is valid in the rest.
+  // There seems to be some lack of support of the Requires clause that causes
+  // 0x66 to be interpreted as "data16" by the asm printer.
+  // Thus we add an adjustment here in order to print the "right" instruction.
+  else if (MI->getOpcode() == X86::DATA16_PREFIX &&
+           STI.getFeatureBits()[X86::Mode16Bit]) {
+    OS << "\tdata32";
+  }
+  // Try to print any aliases first.
+  else if (!printAliasInstr(MI, OS) &&
+           !printVecCompareInstr(MI, OS))
+    printInstruction(MI, OS);
+
+  // Next always print the annotation.
+ printAnnotation(OS, Annot); +} + +bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, + raw_ostream &OS) { + if (MI->getNumOperands() == 0 || + !MI->getOperand(MI->getNumOperands() - 1).isImm()) + return false; + + int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + + // Custom print the vector compare instructions to get the immediate + // translated into the mnemonic. + switch (MI->getOpcode()) { + case X86::CMPPDrmi: case X86::CMPPDrri: + case X86::CMPPSrmi: case X86::CMPPSrri: + case X86::CMPSDrm: case X86::CMPSDrr: + case X86::CMPSDrm_Int: case X86::CMPSDrr_Int: + case X86::CMPSSrm: case X86::CMPSSrr: + case X86::CMPSSrm_Int: case X86::CMPSSrr_Int: + if (Imm >= 0 && Imm <= 7) { + OS << '\t'; + printCMPMnemonic(MI, /*IsVCMP*/false, OS); + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) + printdwordmem(MI, 2, OS); + else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + printqwordmem(MI, 2, OS); + else + printxmmwordmem(MI, 2, OS); + } else + printOperand(MI, 2, OS); + + // Skip operand 1 as its tied to the dest. + + OS << ", "; + printOperand(MI, 0, OS); + return true; + } + break; + + case X86::VCMPPDrmi: case X86::VCMPPDrri: + case X86::VCMPPDYrmi: case X86::VCMPPDYrri: + case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri: + case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri: + case X86::VCMPPDZrmi: case X86::VCMPPDZrri: + case X86::VCMPPSrmi: case X86::VCMPPSrri: + case X86::VCMPPSYrmi: case X86::VCMPPSYrri: + case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri: + case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri: + case X86::VCMPPSZrmi: case X86::VCMPPSZrri: + case X86::VCMPSDrm: case X86::VCMPSDrr: + case X86::VCMPSDZrm: case X86::VCMPSDZrr: + case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int: + case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int: + case X86::VCMPSSrm: case X86::VCMPSSrr: + case X86::VCMPSSZrm: case X86::VCMPSSZrr: + case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int: + case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int: + case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik: + case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik: + case X86::VCMPPDZrmik: case X86::VCMPPDZrrik: + case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik: + case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik: + case X86::VCMPPSZrmik: case X86::VCMPPSZrrik: + case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk: + case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk: + case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik: + case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik: + case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik: + case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik: + case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik: + case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik: + case X86::VCMPPDZrrib: case X86::VCMPPDZrribk: + case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: + case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk: + case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: + if (Imm >= 0 && Imm <= 31) { + OS << '\t'; + printCMPMnemonic(MI, /*IsVCMP*/true, OS); + + unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2; + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if (Desc.TSFlags & X86II::EVEX_B) { + // Broadcast form. + // Load size is based on W-bit. + if (Desc.TSFlags & X86II::VEX_W) + printqwordmem(MI, CurOp--, OS); + else + printdwordmem(MI, CurOp--, OS); + + // Print the number of elements broadcasted. 
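+          // For example, assuming a 512-bit compare (EVEX_L2 set) with
+          // 64-bit elements (VEX_W set), this prints "{1to8}" after the
+          // memory operand; a 128-bit compare with 32-bit elements would
+          // print "{1to4}".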
+ unsigned NumElts; + if (Desc.TSFlags & X86II::EVEX_L2) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16; + else if (Desc.TSFlags & X86II::VEX_L) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; + else + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4; + OS << "{1to" << NumElts << "}"; + } else { + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) + printdwordmem(MI, CurOp--, OS); + else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + printqwordmem(MI, CurOp--, OS); + else if (Desc.TSFlags & X86II::EVEX_L2) + printzmmwordmem(MI, CurOp--, OS); + else if (Desc.TSFlags & X86II::VEX_L) + printymmwordmem(MI, CurOp--, OS); + else + printxmmwordmem(MI, CurOp--, OS); + } + } else { + if (Desc.TSFlags & X86II::EVEX_B) + OS << "{sae}, "; + printOperand(MI, CurOp--, OS); + } + + OS << ", "; + printOperand(MI, CurOp--, OS); + OS << ", "; + printOperand(MI, 0, OS); + if (CurOp > 0) { + // Print mask operand. + OS << " {"; + printOperand(MI, CurOp--, OS); + OS << "}"; + } + + return true; + } + break; + + case X86::VPCOMBmi: case X86::VPCOMBri: + case X86::VPCOMDmi: case X86::VPCOMDri: + case X86::VPCOMQmi: case X86::VPCOMQri: + case X86::VPCOMUBmi: case X86::VPCOMUBri: + case X86::VPCOMUDmi: case X86::VPCOMUDri: + case X86::VPCOMUQmi: case X86::VPCOMUQri: + case X86::VPCOMUWmi: case X86::VPCOMUWri: + case X86::VPCOMWmi: case X86::VPCOMWri: + if (Imm >= 0 && Imm <= 7) { + OS << '\t'; + printVPCOMMnemonic(MI, OS); + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) + printxmmwordmem(MI, 2, OS); + else + printOperand(MI, 2, OS); + + OS << ", "; + printOperand(MI, 1, OS); + OS << ", "; + printOperand(MI, 0, OS); + return true; + } + break; + + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri: + case X86::VPCMPBZrmi: case X86::VPCMPBZrri: + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri: + case X86::VPCMPDZrmi: case X86::VPCMPDZrri: + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri: + case X86::VPCMPQZrmi: case X86::VPCMPQZrri: + case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri: + case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri: + case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri: + case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri: + case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri: + case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri: + case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri: + case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri: + case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri: + case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri: + case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri: + case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri: + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri: + case X86::VPCMPWZrmi: case X86::VPCMPWZrri: + case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmik: case X86::VPCMPBZrrik: + case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmik: case X86::VPCMPDZrrik: + case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmik: case X86::VPCMPQZrrik: + case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik: + case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik: + case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik: + case 
X86::VPCMPUDZ128rmik:  case X86::VPCMPUDZ128rrik:
+  case X86::VPCMPUDZ256rmik:  case X86::VPCMPUDZ256rrik:
+  case X86::VPCMPUDZrmik:     case X86::VPCMPUDZrrik:
+  case X86::VPCMPUQZ128rmik:  case X86::VPCMPUQZ128rrik:
+  case X86::VPCMPUQZ256rmik:  case X86::VPCMPUQZ256rrik:
+  case X86::VPCMPUQZrmik:     case X86::VPCMPUQZrrik:
+  case X86::VPCMPUWZ128rmik:  case X86::VPCMPUWZ128rrik:
+  case X86::VPCMPUWZ256rmik:  case X86::VPCMPUWZ256rrik:
+  case X86::VPCMPUWZrmik:     case X86::VPCMPUWZrrik:
+  case X86::VPCMPWZ128rmik:   case X86::VPCMPWZ128rrik:
+  case X86::VPCMPWZ256rmik:   case X86::VPCMPWZ256rrik:
+  case X86::VPCMPWZrmik:      case X86::VPCMPWZrrik:
+  case X86::VPCMPDZ128rmib:   case X86::VPCMPDZ128rmibk:
+  case X86::VPCMPDZ256rmib:   case X86::VPCMPDZ256rmibk:
+  case X86::VPCMPDZrmib:      case X86::VPCMPDZrmibk:
+  case X86::VPCMPQZ128rmib:   case X86::VPCMPQZ128rmibk:
+  case X86::VPCMPQZ256rmib:   case X86::VPCMPQZ256rmibk:
+  case X86::VPCMPQZrmib:      case X86::VPCMPQZrmibk:
+  case X86::VPCMPUDZ128rmib:  case X86::VPCMPUDZ128rmibk:
+  case X86::VPCMPUDZ256rmib:  case X86::VPCMPUDZ256rmibk:
+  case X86::VPCMPUDZrmib:     case X86::VPCMPUDZrmibk:
+  case X86::VPCMPUQZ128rmib:  case X86::VPCMPUQZ128rmibk:
+  case X86::VPCMPUQZ256rmib:  case X86::VPCMPUQZ256rmibk:
+  case X86::VPCMPUQZrmib:     case X86::VPCMPUQZrmibk:
+    if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) {
+      OS << '\t';
+      printVPCMPMnemonic(MI, OS);
+
+      unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2;
+
+      if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+        if (Desc.TSFlags & X86II::EVEX_B) {
+          // Broadcast form.
+          // Load size is based on W-bit as only D and Q are supported.
+          if (Desc.TSFlags & X86II::VEX_W)
+            printqwordmem(MI, CurOp--, OS);
+          else
+            printdwordmem(MI, CurOp--, OS);
+
+          // Print the number of elements broadcasted.
+          unsigned NumElts;
+          if (Desc.TSFlags & X86II::EVEX_L2)
+            NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+          else if (Desc.TSFlags & X86II::VEX_L)
+            NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+          else
+            NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+          OS << "{1to" << NumElts << "}";
+        } else {
+          if (Desc.TSFlags & X86II::EVEX_L2)
+            printzmmwordmem(MI, CurOp--, OS);
+          else if (Desc.TSFlags & X86II::VEX_L)
+            printymmwordmem(MI, CurOp--, OS);
+          else
+            printxmmwordmem(MI, CurOp--, OS);
+        }
+      } else {
+        printOperand(MI, CurOp--, OS);
+      }
+
+      OS << ", ";
+      printOperand(MI, CurOp--, OS);
+      OS << ", ";
+      printOperand(MI, 0, OS);
+      if (CurOp > 0) {
+        // Print mask operand.
+        OS << " {";
+        printOperand(MI, CurOp--, OS);
+        OS << "}";
+      }
+
+      return true;
+    }
+    break;
+  }
+
+  return false;
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+  } else if (Op.isImm()) {
+    // Print immediates as signed values.
+    int64_t Imm = Op.getImm();
+    O << markup("<imm:") << '$' << formatImm(Imm) << markup(">");
+
+    // TODO: This should be in a helper function in the base class, so it can
+    // be used by other printers.
+
+    // If there are no instruction-specific comments, add a comment clarifying
+    // the hex value of the immediate operand when it isn't in the range
+    // [-256,255].
+    if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) {
+      // Don't print unnecessary hex sign bits.
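+      // For example, an immediate of -260 fits in 16 bits, so the comment
+      // reads "imm = 0xFEFC" rather than the sign-extended
+      // 0xFFFFFFFFFFFFFEFC.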
+      if (Imm == (int16_t)(Imm))
+        *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
+      else if (Imm == (int32_t)(Imm))
+        *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
+      else
+        *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
+    }
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << markup("<imm:") << '$';
+    Op.getExpr()->print(O, &MAI);
+    O << markup(">");
+  }
+}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+                                          raw_ostream &O) {
+  const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
+  const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
+
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
+
+  if (DispSpec.isImm()) {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+      O << formatImm(DispVal);
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+    DispSpec.getExpr()->print(O, &MAI);
+  }
+
+  if (IndexReg.getReg() || BaseReg.getReg()) {
+    O << '(';
+    if (BaseReg.getReg())
+      printOperand(MI, Op + X86::AddrBaseReg, O);
+
+    if (IndexReg.getReg()) {
+      O << ',';
+      printOperand(MI, Op + X86::AddrIndexReg, O);
+      unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
+      if (ScaleVal != 1) {
+        O << ',' << markup("<imm:") << ScaleVal << markup(">");
+      }
+    }
+    O << ')';
+  }
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+                                    raw_ostream &O) {
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  printOptionalSegReg(MI, Op + 1, O);
+
+  O << '(';
+  printOperand(MI, Op, O);
+  O << ')';
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+                                    raw_ostream &O) {
+  O << markup("<mem:");
+
+  O << "%es:(";
+  printOperand(MI, Op, O);
+  O << ')';
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                       raw_ostream &O) {
+  const MCOperand &DispSpec = MI->getOperand(Op);
+
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  printOptionalSegReg(MI, Op + 1, O);
+
+  if (DispSpec.isImm()) {
+    O << formatImm(DispSpec.getImm());
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement?");
+    DispSpec.getExpr()->print(O, &MAI);
+  }
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  if (MI->getOperand(Op).isExpr())
+    return printOperand(MI, Op, O);
+
+  O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
+    << markup(">");
+}
+
+void X86ATTInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &OS) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  unsigned Reg = Op.getReg();
+  // Override the default printing to print st(0) instead st.
+  if (Reg == X86::ST0)
+    OS << markup("<reg:") << "%st(0)" << markup(">");
+  else
+    printRegName(OS, Reg);
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
new file mode 100644
index 000000000000..747ddd30a2d9
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -0,0 +1,124 @@
+//=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
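+// For example, it renders memory operands in the AT&T order
+// displacement(base,index,scale), as in "movl 16(%rax,%rcx,4), %edx".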
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H + +#include "X86InstPrinterCommon.h" + +namespace llvm { + +class X86ATTInstPrinter final : public X86InstPrinterCommon { +public: + X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : X86InstPrinterCommon(MAI, MII, MRI), HasCustomInstComment(false) {} + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, + const MCSubtargetInfo &STI) override; + bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS); + + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &OS); + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS) override; + void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printSrcIdx(const MCInst *MI, unsigned Op, raw_ostream &O); + void printDstIdx(const MCInst *MI, unsigned Op, raw_ostream &O); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printDstIdx64(const MCInst *MI, unsigned OpNo, 
raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+
+private:
+  bool HasCustomInstComment;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 64e6fb9f0375..54413fa1a02f 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -1,9 +1,8 @@
 //===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -13,6 +12,7 @@
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixupKindInfo.h"
@@ -26,18 +26,20 @@
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
-static unsigned getFixupKindLog2Size(unsigned Kind) {
+static unsigned getFixupKindSize(unsigned Kind) {
   switch (Kind) {
   default:
     llvm_unreachable("invalid fixup kind!");
+  case FK_NONE:
+    return 0;
   case FK_PCRel_1:
   case FK_SecRel_1:
   case FK_Data_1:
-    return 0;
+    return 1;
   case FK_PCRel_2:
   case FK_SecRel_2:
   case FK_Data_2:
-    return 1;
+    return 2;
   case FK_PCRel_4:
   case X86::reloc_riprel_4byte:
   case X86::reloc_riprel_4byte_relax:
@@ -49,12 +51,12 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
   case X86::reloc_branch_4byte_pcrel:
   case FK_SecRel_4:
   case FK_Data_4:
-    return 2;
+    return 4;
   case FK_PCRel_8:
   case FK_SecRel_8:
   case FK_Data_8:
   case X86::reloc_global_offset_table8:
-    return 3;
+    return 8;
   }
 }
@@ -77,6 +79,8 @@ public:
     return X86::NumTargetFixupKinds;
   }
 
+  Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+
   const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
     const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
         {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
@@ -99,11 +103,14 @@ public:
     return Infos[Kind - FirstTargetFixupKind];
   }
 
+  bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+                             const MCValue &Target) override;
+
   void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
                   const MCValue &Target, MutableArrayRef<char> Data,
                   uint64_t Value, bool IsResolved,
                   const MCSubtargetInfo *STI) const override {
-    unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+    unsigned Size = getFixupKindSize(Fixup.getKind());
 
     assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
@@ -111,7 +118,7 @@ public:
     // Specifically ignore overflow/underflow as long as the leakage is
     // limited to the lower bits. This is to remain compatible with
     // other assemblers.
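+    // For example, a one-byte fixup whose resolved Value is 0x1FF still
+    // writes 0xFF into Data, and an FK_NONE fixup has Size == 0, so the
+    // loop below writes nothing at all.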
-    assert(isIntN(Size * 8 + 1, Value) &&
+    assert((Size == 0 || isIntN(Size * 8 + 1, Value)) &&
            "Value does not fit in the Fixup field");
 
     for (unsigned i = 0; i != Size; ++i)
@@ -137,40 +144,10 @@ static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) {
   switch (Op) {
   default:
     return Op;
-  case X86::JAE_1:
-    return (is16BitMode) ? X86::JAE_2 : X86::JAE_4;
-  case X86::JA_1:
-    return (is16BitMode) ? X86::JA_2 : X86::JA_4;
-  case X86::JBE_1:
-    return (is16BitMode) ? X86::JBE_2 : X86::JBE_4;
-  case X86::JB_1:
-    return (is16BitMode) ? X86::JB_2 : X86::JB_4;
-  case X86::JE_1:
-    return (is16BitMode) ? X86::JE_2 : X86::JE_4;
-  case X86::JGE_1:
-    return (is16BitMode) ? X86::JGE_2 : X86::JGE_4;
-  case X86::JG_1:
-    return (is16BitMode) ? X86::JG_2 : X86::JG_4;
-  case X86::JLE_1:
-    return (is16BitMode) ? X86::JLE_2 : X86::JLE_4;
-  case X86::JL_1:
-    return (is16BitMode) ? X86::JL_2 : X86::JL_4;
+  case X86::JCC_1:
+    return (is16BitMode) ? X86::JCC_2 : X86::JCC_4;
   case X86::JMP_1:
     return (is16BitMode) ? X86::JMP_2 : X86::JMP_4;
-  case X86::JNE_1:
-    return (is16BitMode) ? X86::JNE_2 : X86::JNE_4;
-  case X86::JNO_1:
-    return (is16BitMode) ? X86::JNO_2 : X86::JNO_4;
-  case X86::JNP_1:
-    return (is16BitMode) ? X86::JNP_2 : X86::JNP_4;
-  case X86::JNS_1:
-    return (is16BitMode) ? X86::JNS_2 : X86::JNS_4;
-  case X86::JO_1:
-    return (is16BitMode) ? X86::JO_2 : X86::JO_4;
-  case X86::JP_1:
-    return (is16BitMode) ? X86::JP_2 : X86::JP_4;
-  case X86::JS_1:
-    return (is16BitMode) ? X86::JS_2 : X86::JS_4;
   }
 }
@@ -266,6 +243,25 @@ static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
   return getRelaxedOpcodeBranch(Inst, is16BitMode);
 }
 
+Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
+  if (STI.getTargetTriple().isOSBinFormatELF()) {
+    if (STI.getTargetTriple().getArch() == Triple::x86_64) {
+      if (Name == "R_X86_64_NONE")
+        return FK_NONE;
+    } else {
+      if (Name == "R_386_NONE")
+        return FK_NONE;
+    }
+  }
+  return MCAsmBackend::getFixupKind(Name);
+}
+
+bool X86AsmBackend::shouldForceRelocation(const MCAssembler &,
+                                          const MCFixup &Fixup,
+                                          const MCValue &) {
+  return Fixup.getKind() == FK_NONE;
+}
+
 bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst,
                                       const MCSubtargetInfo &STI) const {
   // Branches can always be relaxed in either mode.
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index c85ce9bbd5a4..6bd6c6cac7df 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -1,9 +1,8 @@
 //===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -49,7 +48,8 @@ namespace X86 { TO_NEG_INF = 1, TO_POS_INF = 2, TO_ZERO = 3, - CUR_DIRECTION = 4 + CUR_DIRECTION = 4, + NO_EXC = 8 }; /// The constants to describe instr prefixes if there are @@ -60,9 +60,46 @@ namespace X86 { IP_HAS_REPEAT_NE = 4, IP_HAS_REPEAT = 8, IP_HAS_LOCK = 16, - NO_SCHED_INFO = 32, // Don't add sched comment to the current instr because - // it was already added - IP_HAS_NOTRACK = 64 + IP_HAS_NOTRACK = 32, + IP_USE_VEX3 = 64, + }; + + enum OperandType : unsigned { + /// AVX512 embedded rounding control. This should only have values 0-3. + OPERAND_ROUNDING_CONTROL = MCOI::OPERAND_FIRST_TARGET, + OPERAND_COND_CODE, + }; + + // X86 specific condition code. These correspond to X86_*_COND in + // X86InstrInfo.td. They must be kept in synch. + enum CondCode { + COND_O = 0, + COND_NO = 1, + COND_B = 2, + COND_AE = 3, + COND_E = 4, + COND_NE = 5, + COND_BE = 6, + COND_A = 7, + COND_S = 8, + COND_NS = 9, + COND_P = 10, + COND_NP = 11, + COND_L = 12, + COND_GE = 13, + COND_LE = 14, + COND_G = 15, + LAST_VALID_COND = COND_G, + + // Artificial condition codes. These are used by AnalyzeBranch + // to indicate a block terminated with two conditional branches that together + // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE, + // which can't be represented on x86 with a single condition. These + // are never used in MachineInstrs and are inverses of one another. + COND_NE_OR_P, + COND_E_AND_NP, + + COND_INVALID }; } // end namespace X86; @@ -285,6 +322,10 @@ namespace X86II { /// manual, this operand is described as pntr16:32 and pntr16:16 RawFrmImm16 = 8, + /// AddCCFrm - This form is used for Jcc that encode the condition code + /// in the lower 4 bits of the opcode. + AddCCFrm = 9, + /// MRM[0-7][rm] - These forms are used to represent instructions that use /// a Mod/RM byte, and use the middle field to hold extended opcode /// information. In the intel manual these are represented as /0, /1, ... @@ -310,10 +351,21 @@ namespace X86II { /// MRMSrcMemOp4 = 35, + /// MRMSrcMemCC - This form is used for instructions that use the Mod/RM + /// byte to specify the operands and also encodes a condition code. + /// + MRMSrcMemCC = 36, + + /// MRMXm - This form is used for instructions that use the Mod/RM byte + /// to specify a memory source, but doesn't use the middle field. And has + /// a condition code. + /// + MRMXmCC = 38, + /// MRMXm - This form is used for instructions that use the Mod/RM byte /// to specify a memory source, but doesn't use the middle field. /// - MRMXm = 39, // Instruction that uses Mod/RM but not the middle field. + MRMXm = 39, // Next, instructions that operate on a memory r/m operand... MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3 @@ -339,10 +391,21 @@ namespace X86II { /// MRMSrcRegOp4 = 51, + /// MRMSrcRegCC - This form is used for instructions that use the Mod/RM + /// byte to specify the operands and also encodes a condition code + /// + MRMSrcRegCC = 52, + + /// MRMXCCr - This form is used for instructions that use the Mod/RM byte + /// to specify a register source, but doesn't use the middle field. And has + /// a condition code. + /// + MRMXrCC = 54, + /// MRMXr - This form is used for instructions that use the Mod/RM byte /// to specify a register source, but doesn't use the middle field. 
/// - MRMXr = 55, // Instruction that uses Mod/RM but not the middle field. + MRMXr = 55, // Instructions that operate on a register r/m operand... MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3 @@ -681,8 +744,7 @@ namespace X86II { // has it as the last op. if (NumOps == 9 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && (Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1 || - Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1) && - "Instruction with 2 defs isn't gather?") + Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1)) return 2; return 0; } @@ -711,6 +773,7 @@ namespace X86II { case X86II::RawFrmSrc: case X86II::RawFrmDst: case X86II::RawFrmDstSrc: + case X86II::AddCCFrm: return -1; case X86II::MRMDestMem: return 0; @@ -724,16 +787,23 @@ namespace X86II { case X86II::MRMSrcMemOp4: // Skip registers encoded in reg, VEX_VVVV, and I8IMM. return 3; + case X86II::MRMSrcMemCC: + // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a + // mask register. + return 1; case X86II::MRMDestReg: case X86II::MRMSrcReg: case X86II::MRMSrcReg4VOp3: case X86II::MRMSrcRegOp4: + case X86II::MRMSrcRegCC: + case X86II::MRMXrCC: case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: return -1; + case X86II::MRMXmCC: case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index b724a89f81d2..232a06593238 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -45,7 +44,7 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, (EMachine != ELF::EM_386) && (EMachine != ELF::EM_IAMCU)) {} -enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; +enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; static X86_64RelType getType64(unsigned Kind, MCSymbolRefExpr::VariantKind &Modifier, @@ -53,6 +52,8 @@ static X86_64RelType getType64(unsigned Kind, switch (Kind) { default: llvm_unreachable("Unimplemented"); + case FK_NONE: + return RT64_NONE; case X86::reloc_global_offset_table8: Modifier = MCSymbolRefExpr::VK_GOT; IsPCRel = true; @@ -103,6 +104,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case MCSymbolRefExpr::VK_None: case MCSymbolRefExpr::VK_X86_ABS8: switch (Type) { + case RT64_NONE: + if (Modifier == MCSymbolRefExpr::VK_None) + return ELF::R_X86_64_NONE; + llvm_unreachable("Unimplemented"); case RT64_64: return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64; case RT64_32: @@ -114,6 +119,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_8: return IsPCRel ? 
ELF::R_X86_64_PC8 : ELF::R_X86_64_8; } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOT: switch (Type) { case RT64_64: @@ -123,8 +129,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_32S: case RT64_16: case RT64_8: + case RT64_NONE: llvm_unreachable("Unimplemented"); } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOTOFF: assert(Type == RT64_64); assert(!IsPCRel); @@ -139,8 +147,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_32S: case RT64_16: case RT64_8: + case RT64_NONE: llvm_unreachable("Unimplemented"); } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_DTPOFF: assert(!IsPCRel); switch (Type) { @@ -151,8 +161,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_32S: case RT64_16: case RT64_8: + case RT64_NONE: llvm_unreachable("Unimplemented"); } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_SIZE: assert(!IsPCRel); switch (Type) { @@ -163,8 +175,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_32S: case RT64_16: case RT64_8: + case RT64_NONE: llvm_unreachable("Unimplemented"); } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_TLSCALL: return ELF::R_X86_64_TLSDESC_CALL; case MCSymbolRefExpr::VK_TLSDESC: @@ -197,13 +211,16 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case X86::reloc_riprel_4byte_movq_load: return ELF::R_X86_64_REX_GOTPCRELX; } + llvm_unreachable("unexpected relocation type!"); } } -enum X86_32RelType { RT32_32, RT32_16, RT32_8 }; +enum X86_32RelType { RT32_NONE, RT32_32, RT32_16, RT32_8 }; static X86_32RelType getType32(X86_64RelType T) { switch (T) { + case RT64_NONE: + return RT32_NONE; case RT64_64: llvm_unreachable("Unimplemented"); case RT64_32: @@ -227,6 +244,10 @@ static unsigned getRelocType32(MCContext &Ctx, case MCSymbolRefExpr::VK_None: case MCSymbolRefExpr::VK_X86_ABS8: switch (Type) { + case RT32_NONE: + if (Modifier == MCSymbolRefExpr::VK_None) + return ELF::R_386_NONE; + llvm_unreachable("Unimplemented"); case RT32_32: return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32; case RT32_16: @@ -234,6 +255,7 @@ static unsigned getRelocType32(MCContext &Ctx, case RT32_8: return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8; } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOT: assert(Type == RT32_32); if (IsPCRel) @@ -249,6 +271,10 @@ static unsigned getRelocType32(MCContext &Ctx, assert(Type == RT32_32); assert(!IsPCRel); return ELF::R_386_GOTOFF; + case MCSymbolRefExpr::VK_TLSCALL: + return ELF::R_386_TLS_DESC_CALL; + case MCSymbolRefExpr::VK_TLSDESC: + return ELF::R_386_TLS_GOTDESC; case MCSymbolRefExpr::VK_TPOFF: assert(Type == RT32_32); assert(!IsPCRel); diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index 3c04b13e002e..2d5217115d07 100644 --- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -1,9 +1,8 @@ //===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/lib/Target/X86/MCTargetDesc/X86InstComments.cpp new file mode 100644 index 000000000000..73b1969b4e82 --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -0,0 +1,1322 @@ +//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. +// +//===----------------------------------------------------------------------===// + +#include "X86InstComments.h" +#include "X86ATTInstPrinter.h" +#include "X86BaseInfo.h" +#include "X86MCTargetDesc.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define CASE_SSE_INS_COMMON(Inst, src) \ + case X86::Inst##src: + +#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: + +#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src##k: + +#define CASE_MASKZ_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src##kz: + +#define CASE_AVX512_INS_COMMON(Inst, Suffix, src) \ + CASE_AVX_INS_COMMON(Inst, Suffix, src) \ + CASE_MASK_INS_COMMON(Inst, Suffix, src) \ + CASE_MASKZ_INS_COMMON(Inst, Suffix, src) + +#define CASE_MOVDUP(Inst, src) \ + CASE_AVX512_INS_COMMON(Inst, Z, r##src) \ + CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \ + CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) + +#define CASE_MASK_MOVDUP(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) + +#define CASE_MASKZ_MOVDUP(Inst, src) \ + CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \ + CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASKZ_INS_COMMON(Inst, Z128, r##src) + +#define CASE_PMOVZX(Inst, src) \ + CASE_AVX512_INS_COMMON(Inst, Z, r##src) \ + CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \ + CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) + +#define CASE_MASK_PMOVZX(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) + +#define CASE_MASKZ_PMOVZX(Inst, src) \ + CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \ + CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASKZ_INS_COMMON(Inst, Z128, r##src) + +#define CASE_UNPCK(Inst, src) \ + CASE_AVX512_INS_COMMON(Inst, Z, r##src) \ + CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \ + CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) + +#define CASE_MASK_UNPCK(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) + +#define CASE_MASKZ_UNPCK(Inst, 
src) \ + CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \ + CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASKZ_INS_COMMON(Inst, Z128, r##src) + +#define CASE_SHUF(Inst, suf) \ + CASE_AVX512_INS_COMMON(Inst, Z, suf) \ + CASE_AVX512_INS_COMMON(Inst, Z256, suf) \ + CASE_AVX512_INS_COMMON(Inst, Z128, suf) \ + CASE_AVX_INS_COMMON(Inst, , suf) \ + CASE_AVX_INS_COMMON(Inst, Y, suf) \ + CASE_SSE_INS_COMMON(Inst, suf) + +#define CASE_MASK_SHUF(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) + +#define CASE_MASKZ_SHUF(Inst, src) \ + CASE_MASKZ_INS_COMMON(Inst, Z, r##src##i) \ + CASE_MASKZ_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_MASKZ_INS_COMMON(Inst, Z128, r##src##i) + +#define CASE_VPERMILPI(Inst, src) \ + CASE_AVX512_INS_COMMON(Inst, Z, src##i) \ + CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \ + CASE_AVX512_INS_COMMON(Inst, Z128, src##i) \ + CASE_AVX_INS_COMMON(Inst, , src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, src##i) + +#define CASE_MASK_VPERMILPI(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, src##i) + +#define CASE_MASKZ_VPERMILPI(Inst, src) \ + CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \ + CASE_MASKZ_INS_COMMON(Inst, Z256, src##i) \ + CASE_MASKZ_INS_COMMON(Inst, Z128, src##i) + +#define CASE_VPERM(Inst, src) \ + CASE_AVX512_INS_COMMON(Inst, Z, src##i) \ + CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, src##i) + +#define CASE_MASK_VPERM(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, src##i) + +#define CASE_MASKZ_VPERM(Inst, src) \ + CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \ + CASE_MASKZ_INS_COMMON(Inst, Z256, src##i) + +#define CASE_VSHUF(Inst, src) \ + CASE_AVX512_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_AVX512_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_AVX512_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_AVX512_INS_COMMON(SHUFI##Inst, Z256, r##src##i) + +#define CASE_MASK_VSHUF(Inst, src) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) + +#define CASE_MASKZ_VSHUF(Inst, src) \ + CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z256, r##src##i) + +#define CASE_AVX512_FMA(Inst, suf) \ + CASE_AVX512_INS_COMMON(Inst, Z, suf) \ + CASE_AVX512_INS_COMMON(Inst, Z256, suf) \ + CASE_AVX512_INS_COMMON(Inst, Z128, suf) + +#define CASE_FMA(Inst, suf) \ + CASE_AVX512_FMA(Inst, suf) \ + CASE_AVX_INS_COMMON(Inst, , suf) \ + CASE_AVX_INS_COMMON(Inst, Y, suf) + +#define CASE_FMA_PACKED_REG(Inst) \ + CASE_FMA(Inst##PD, r) \ + CASE_FMA(Inst##PS, r) + +#define CASE_FMA_PACKED_MEM(Inst) \ + CASE_FMA(Inst##PD, m) \ + CASE_FMA(Inst##PS, m) \ + CASE_AVX512_FMA(Inst##PD, mb) \ + CASE_AVX512_FMA(Inst##PS, mb) + +#define CASE_FMA_SCALAR_REG(Inst) \ + CASE_AVX_INS_COMMON(Inst##SD, , r) \ + CASE_AVX_INS_COMMON(Inst##SS, , r) \ + CASE_AVX_INS_COMMON(Inst##SD, , r_Int) \ + CASE_AVX_INS_COMMON(Inst##SS, , r_Int) \ + CASE_AVX_INS_COMMON(Inst##SD, Z, r) \ + CASE_AVX_INS_COMMON(Inst##SS, Z, r) \ + CASE_AVX512_INS_COMMON(Inst##SD, Z, r_Int) \ + CASE_AVX512_INS_COMMON(Inst##SS, Z, r_Int) + +#define CASE_FMA_SCALAR_MEM(Inst) \ + 
CASE_AVX_INS_COMMON(Inst##SD, , m) \ + CASE_AVX_INS_COMMON(Inst##SS, , m) \ + CASE_AVX_INS_COMMON(Inst##SD, , m_Int) \ + CASE_AVX_INS_COMMON(Inst##SS, , m_Int) \ + CASE_AVX_INS_COMMON(Inst##SD, Z, m) \ + CASE_AVX_INS_COMMON(Inst##SS, Z, m) \ + CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \ + CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int) + +static unsigned getVectorRegSize(unsigned RegNo) { + if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) + return 512; + if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31) + return 256; + if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31) + return 128; + if (X86::MM0 <= RegNo && RegNo <= X86::MM7) + return 64; + + llvm_unreachable("Unknown vector reg!"); +} + +static unsigned getRegOperandNumElts(const MCInst *MI, unsigned ScalarSize, + unsigned OperandIndex) { + unsigned OpReg = MI->getOperand(OperandIndex).getReg(); + return getVectorRegSize(OpReg) / ScalarSize; +} + +static const char *getRegName(unsigned Reg) { + return X86ATTInstPrinter::getRegisterName(Reg); +} + +/// Wraps the destination register name with AVX512 mask/maskz filtering. +static void printMasking(raw_ostream &OS, const MCInst *MI, + const MCInstrInfo &MCII) { + const MCInstrDesc &Desc = MCII.get(MI->getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + if (!(TSFlags & X86II::EVEX_K)) + return; + + bool MaskWithZero = (TSFlags & X86II::EVEX_Z); + unsigned MaskOp = Desc.getNumDefs(); + + if (Desc.getOperandConstraint(MaskOp, MCOI::TIED_TO) != -1) + ++MaskOp; + + const char *MaskRegName = getRegName(MI->getOperand(MaskOp).getReg()); + + // MASK: zmmX {%kY} + OS << " {%" << MaskRegName << "}"; + + // MASKZ: zmmX {%kY} {z} + if (MaskWithZero) + OS << " {z}"; +} + +static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) { + const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr; + unsigned NumOperands = MI->getNumOperands(); + bool RegForm = false; + bool Negate = false; + StringRef AccStr = "+"; + + // The operands for FMA instructions without rounding fall into two forms. + // dest, src1, src2, src3 + // dest, src1, mask, src2, src3 + // Where src3 is either a register or 5 memory address operands. So to find + // dest and src1 we can index from the front. To find src2 and src3 we can + // index from the end by taking into account memory vs register form when + // finding src2. 
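+  // For example, assuming a register-form FMADD213 with operands
+  // (dest, src1, src2, src3): src3 is NumOperands - 1 and src2 is
+  // NumOperands - 2; in the memory form, src3 expands into five address
+  // operands, so src2 is instead found at NumOperands - 6.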
+ + switch (MI->getOpcode()) { + default: + return false; + CASE_FMA_PACKED_REG(FMADD132) + CASE_FMA_SCALAR_REG(FMADD132) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMADD132) + CASE_FMA_SCALAR_MEM(FMADD132) + AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + break; + + CASE_FMA_PACKED_REG(FMADD213) + CASE_FMA_SCALAR_REG(FMADD213) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMADD213) + CASE_FMA_SCALAR_MEM(FMADD213) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul2Name = getRegName(MI->getOperand(1).getReg()); + break; + + CASE_FMA_PACKED_REG(FMADD231) + CASE_FMA_SCALAR_REG(FMADD231) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMADD231) + CASE_FMA_SCALAR_MEM(FMADD231) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + AccName = getRegName(MI->getOperand(1).getReg()); + break; + + CASE_FMA_PACKED_REG(FMSUB132) + CASE_FMA_SCALAR_REG(FMSUB132) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMSUB132) + CASE_FMA_SCALAR_MEM(FMSUB132) + AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + break; + + CASE_FMA_PACKED_REG(FMSUB213) + CASE_FMA_SCALAR_REG(FMSUB213) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMSUB213) + CASE_FMA_SCALAR_MEM(FMSUB213) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul2Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + break; + + CASE_FMA_PACKED_REG(FMSUB231) + CASE_FMA_SCALAR_REG(FMSUB231) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMSUB231) + CASE_FMA_SCALAR_MEM(FMSUB231) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + AccName = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + break; + + CASE_FMA_PACKED_REG(FNMADD132) + CASE_FMA_SCALAR_REG(FNMADD132) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FNMADD132) + CASE_FMA_SCALAR_MEM(FNMADD132) + AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + Negate = true; + break; + + CASE_FMA_PACKED_REG(FNMADD213) + CASE_FMA_SCALAR_REG(FNMADD213) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FNMADD213) + CASE_FMA_SCALAR_MEM(FNMADD213) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul2Name = getRegName(MI->getOperand(1).getReg()); + Negate = true; + break; + + CASE_FMA_PACKED_REG(FNMADD231) + CASE_FMA_SCALAR_REG(FNMADD231) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FNMADD231) + CASE_FMA_SCALAR_MEM(FNMADD231) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + AccName = getRegName(MI->getOperand(1).getReg()); + Negate = true; + break; + + 
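+  // The FNMSUB cases below combine both adjustments: the product is negated
+  // and the accumulator is subtracted, e.g. "xmm0 = -(xmm1 * xmm2) - xmm3".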
CASE_FMA_PACKED_REG(FNMSUB132) + CASE_FMA_SCALAR_REG(FNMSUB132) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FNMSUB132) + CASE_FMA_SCALAR_MEM(FNMSUB132) + AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + Negate = true; + break; + + CASE_FMA_PACKED_REG(FNMSUB213) + CASE_FMA_SCALAR_REG(FNMSUB213) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FNMSUB213) + CASE_FMA_SCALAR_MEM(FNMSUB213) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul2Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + Negate = true; + break; + + CASE_FMA_PACKED_REG(FNMSUB231) + CASE_FMA_SCALAR_REG(FNMSUB231) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FNMSUB231) + CASE_FMA_SCALAR_MEM(FNMSUB231) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + AccName = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + Negate = true; + break; + + CASE_FMA_PACKED_REG(FMADDSUB132) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMADDSUB132) + AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "+/-"; + break; + + CASE_FMA_PACKED_REG(FMADDSUB213) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMADDSUB213) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul2Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "+/-"; + break; + + CASE_FMA_PACKED_REG(FMADDSUB231) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMADDSUB231) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + AccName = getRegName(MI->getOperand(1).getReg()); + AccStr = "+/-"; + break; + + CASE_FMA_PACKED_REG(FMSUBADD132) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMSUBADD132) + AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-/+"; + break; + + CASE_FMA_PACKED_REG(FMSUBADD213) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMSUBADD213) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + Mul2Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-/+"; + break; + + CASE_FMA_PACKED_REG(FMSUBADD231) + Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + CASE_FMA_PACKED_MEM(FMSUBADD231) + Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + AccName = getRegName(MI->getOperand(1).getReg()); + AccStr = "-/+"; + break; + } + + const char *DestName = getRegName(MI->getOperand(0).getReg()); + + if (!Mul1Name) Mul1Name = "mem"; + if (!Mul2Name) Mul2Name = "mem"; + if (!AccName) AccName = "mem"; + + OS << DestName << " = "; + // TODO: Print masking information? 
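+  // The emitted comment has the shape "dest = (mul1 * mul2) <op> acc",
+  // e.g. "zmm0 = -(zmm1 * mem) + zmm2" for a memory-form FNMADD.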
+
+  if (Negate)
+    OS << '-';
+
+  OS << '(' << Mul1Name << " * " << Mul2Name << ") " << AccStr << ' '
+     << AccName;
+
+  return true;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline-terminated strings to the specified stream if desired. This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
+bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                                  const MCInstrInfo &MCII) {
+  // If this is a shuffle operation, the switch should fill in this state.
+  SmallVector<int, 16> ShuffleMask;
+  const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
+  unsigned NumOperands = MI->getNumOperands();
+  bool RegForm = false;
+
+  if (printFMA3Comments(MI, OS))
+    return true;
+
+  switch (MI->getOpcode()) {
+  default:
+    // Not an instruction for which we can decode comments.
+    return false;
+
+  case X86::BLENDPDrri:
+  case X86::VBLENDPDrri:
+  case X86::VBLENDPDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::BLENDPDrmi:
+  case X86::VBLENDPDrmi:
+  case X86::VBLENDPDYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandNumElts(MI, 64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::BLENDPSrri:
+  case X86::VBLENDPSrri:
+  case X86::VBLENDPSYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::BLENDPSrmi:
+  case X86::VBLENDPSrmi:
+  case X86::VBLENDPSYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::PBLENDWrri:
+  case X86::VPBLENDWrri:
+  case X86::VPBLENDWYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::PBLENDWrmi:
+  case X86::VPBLENDWrmi:
+  case X86::VPBLENDWYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandNumElts(MI, 16, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::VPBLENDDrri:
+  case X86::VPBLENDDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::VPBLENDDrmi:
+  case X86::VPBLENDDYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+  case X86::VINSERTPSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::INSERTPSrm:
+  case X86::VINSERTPSrm:
+  case X86::VINSERTPSZrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeINSERTPSMask(MI->getOperand(NumOperands - 1).getImm(),
+                         ShuffleMask);
+    break;
+
+  case X86::MOVLHPSrr:
+  case X86::VMOVLHPSrr:
+  case
X86::VMOVLHPSZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVLHPSMask(2, ShuffleMask); + break; + + case X86::MOVHLPSrr: + case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVHLPSMask(2, ShuffleMask); + break; + + case X86::MOVHPDrm: + case X86::VMOVHPDrm: + case X86::VMOVHPDZ128rm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeInsertElementMask(2, 1, 1, ShuffleMask); + break; + + case X86::MOVHPSrm: + case X86::VMOVHPSrm: + case X86::VMOVHPSZ128rm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeInsertElementMask(4, 2, 2, ShuffleMask); + break; + + case X86::MOVLPDrm: + case X86::VMOVLPDrm: + case X86::VMOVLPDZ128rm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeInsertElementMask(2, 0, 1, ShuffleMask); + break; + + case X86::MOVLPSrm: + case X86::VMOVLPSrm: + case X86::VMOVLPSZ128rm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeInsertElementMask(4, 0, 2, ShuffleMask); + break; + + CASE_MOVDUP(MOVSLDUP, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + + CASE_MOVDUP(MOVSLDUP, m) + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSLDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask); + break; + + CASE_MOVDUP(MOVSHDUP, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + + CASE_MOVDUP(MOVSHDUP, m) + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSHDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask); + break; + + CASE_MOVDUP(MOVDDUP, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + + CASE_MOVDUP(MOVDDUP, m) + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVDDUPMask(getRegOperandNumElts(MI, 64, 0), ShuffleMask); + break; + + case X86::PSLLDQri: + case X86::VPSLLDQri: + case X86::VPSLLDQYri: + case X86::VPSLLDQZ128rr: + case X86::VPSLLDQZ256rr: + case X86::VPSLLDQZrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; + case X86::VPSLLDQZ128rm: + case X86::VPSLLDQZ256rm: + case X86::VPSLLDQZrm: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodePSLLDQMask(getRegOperandNumElts(MI, 8, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + case X86::PSRLDQri: + case X86::VPSRLDQri: + case X86::VPSRLDQYri: + case X86::VPSRLDQZ128rr: + case X86::VPSRLDQZ256rr: + case X86::VPSRLDQZrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; + case X86::VPSRLDQZ128rm: + case X86::VPSRLDQZ256rm: + case X86::VPSRLDQZrm: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodePSRLDQMask(getRegOperandNumElts(MI, 8, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + CASE_SHUF(PALIGNR, rri) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_SHUF(PALIGNR, rmi) + Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); 
+ DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodePALIGNRMask(getRegOperandNumElts(MI, 8, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + CASE_AVX512_INS_COMMON(ALIGNQ, Z, rri) + CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rri) + CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rri) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_AVX512_INS_COMMON(ALIGNQ, Z, rmi) + CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rmi) + CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rmi) + Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodeVALIGNMask(getRegOperandNumElts(MI, 64, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + CASE_AVX512_INS_COMMON(ALIGND, Z, rri) + CASE_AVX512_INS_COMMON(ALIGND, Z256, rri) + CASE_AVX512_INS_COMMON(ALIGND, Z128, rri) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_AVX512_INS_COMMON(ALIGND, Z, rmi) + CASE_AVX512_INS_COMMON(ALIGND, Z256, rmi) + CASE_AVX512_INS_COMMON(ALIGND, Z128, rmi) + Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodeVALIGNMask(getRegOperandNumElts(MI, 32, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + CASE_SHUF(PSHUFD, ri) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + LLVM_FALLTHROUGH; + + CASE_SHUF(PSHUFD, mi) + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32, + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + CASE_SHUF(PSHUFHW, ri) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + LLVM_FALLTHROUGH; + + CASE_SHUF(PSHUFHW, mi) + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodePSHUFHWMask(getRegOperandNumElts(MI, 16, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + CASE_SHUF(PSHUFLW, ri) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + LLVM_FALLTHROUGH; + + CASE_SHUF(PSHUFLW, mi) + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodePSHUFLWMask(getRegOperandNumElts(MI, 16, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + case X86::MMX_PSHUFWri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; + + case X86::MMX_PSHUFWmi: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodePSHUFMask(4, 16, MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + case X86::PSWAPDrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; + + case X86::PSWAPDrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSWAPMask(2, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHBW, r) + case X86::MMX_PUNPCKHBWirr: + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(PUNPCKHBW, m) + case X86::MMX_PUNPCKHBWirm: + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + 
DecodeUNPCKHMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHWD, r) + case X86::MMX_PUNPCKHWDirr: + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(PUNPCKHWD, m) + case X86::MMX_PUNPCKHWDirm: + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHDQ, r) + case X86::MMX_PUNPCKHDQirr: + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(PUNPCKHDQ, m) + case X86::MMX_PUNPCKHDQirm: + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHQDQ, r) + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(PUNPCKHQDQ, m) + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLBW, r) + case X86::MMX_PUNPCKLBWirr: + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(PUNPCKLBW, m) + case X86::MMX_PUNPCKLBWirm: + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLWD, r) + case X86::MMX_PUNPCKLWDirr: + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(PUNPCKLWD, m) + case X86::MMX_PUNPCKLWDirm: + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLDQ, r) + case X86::MMX_PUNPCKLDQirr: + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(PUNPCKLDQ, m) + case X86::MMX_PUNPCKLDQirm: + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLQDQ, r) + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(PUNPCKLQDQ, m) + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask); + break; + + CASE_SHUF(SHUFPD, rri) + Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_SHUF(SHUFPD, rmi) + if (MI->getOperand(NumOperands - 1).isImm()) + DecodeSHUFPMask(getRegOperandNumElts(MI, 64, 0), 64, + MI->getOperand(NumOperands - 1).getImm(), ShuffleMask); + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_SHUF(SHUFPS, rri) + Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + RegForm = true; + 
LLVM_FALLTHROUGH; + + CASE_SHUF(SHUFPS, rmi) + if (MI->getOperand(NumOperands - 1).isImm()) + DecodeSHUFPMask(getRegOperandNumElts(MI, 32, 0), 32, + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VSHUF(64X2, r) + Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_VSHUF(64X2, m) + decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 64, 0), 64, + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VSHUF(32X4, r) + Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_VSHUF(32X4, m) + decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 32, 0), 32, + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKLPD, r) + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(UNPCKLPD, m) + DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKLPS, r) + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(UNPCKLPS, m) + DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKHPD, r) + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(UNPCKHPD, m) + DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKHPS, r) + Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_UNPCK(UNPCKHPS, m) + DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VPERMILPI(PERMILPS, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + LLVM_FALLTHROUGH; + + CASE_VPERMILPI(PERMILPS, m) + if (MI->getOperand(NumOperands - 1).isImm()) + DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32, + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VPERMILPI(PERMILPD, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + LLVM_FALLTHROUGH; + + CASE_VPERMILPI(PERMILPD, m) + if (MI->getOperand(NumOperands - 1).isImm()) + DecodePSHUFMask(getRegOperandNumElts(MI, 64, 0), 64, + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPERM2F128rr: + case X86::VPERM2I128rr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + LLVM_FALLTHROUGH; + + case 
X86::VPERM2F128rm: + case X86::VPERM2I128rm: + // For instruction comments purpose, assume the 256-bit vector is v4i64. + if (MI->getOperand(NumOperands - 1).isImm()) + DecodeVPERM2X128Mask(4, MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VPERM(PERMPD, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + LLVM_FALLTHROUGH; + + CASE_VPERM(PERMPD, m) + if (MI->getOperand(NumOperands - 1).isImm()) + DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VPERM(PERMQ, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + LLVM_FALLTHROUGH; + + CASE_VPERM(PERMQ, m) + if (MI->getOperand(NumOperands - 1).isImm()) + DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVSDrr: + case X86::VMOVSDrr: + case X86::VMOVSDZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; + + case X86::MOVSDrm_alt: + case X86::MOVSDrm: + case X86::VMOVSDrm_alt: + case X86::VMOVSDrm: + case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: + DecodeScalarMoveMask(2, nullptr == Src2Name, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVSSrr: + case X86::VMOVSSrr: + case X86::VMOVSSZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; + + case X86::MOVSSrm: + case X86::MOVSSrm_alt: + case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: + case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: + DecodeScalarMoveMask(4, nullptr == Src2Name, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVPQI2QIrr: + case X86::MOVZPQILo2PQIrr: + case X86::VMOVPQI2QIrr: + case X86::VMOVPQI2QIZrr: + case X86::VMOVZPQILo2PQIrr: + case X86::VMOVZPQILo2PQIZrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; + + case X86::MOVQI2PQIrm: + case X86::VMOVQI2PQIrm: + case X86::VMOVQI2PQIZrm: + DecodeZeroMoveLowMask(2, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVDI2PDIrm: + case X86::VMOVDI2PDIrm: + case X86::VMOVDI2PDIZrm: + DecodeZeroMoveLowMask(4, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::EXTRQI: + if (MI->getOperand(2).isImm() && + MI->getOperand(3).isImm()) + DecodeEXTRQIMask(16, 8, MI->getOperand(2).getImm(), + MI->getOperand(3).getImm(), ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + + case X86::INSERTQI: + if (MI->getOperand(3).isImm() && + MI->getOperand(4).isImm()) + DecodeINSERTQIMask(16, 8, MI->getOperand(3).getImm(), + MI->getOperand(4).getImm(), ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + break; + + case X86::VBROADCASTF128: + case X86::VBROADCASTI128: + CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm) + CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm) + DecodeSubVectorBroadcast(4, 2, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + 
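The (DstElts, SubElts) pairs passed to DecodeSubVectorBroadcast here and in the cases that follow describe a small subvector repeated across the destination; the resulting mask just cycles through the subvector's indices. A standalone sketch, illustrative only and not part of the patch:

    #include <cstdio>
    #include <vector>

    // Standalone sketch: the mask DecodeSubVectorBroadcast(DstElts, SubElts,
    // ...) produces simply repeats the subvector's element indices across
    // the destination.
    int main() {
      unsigned DstElts = 4, SubElts = 2; // the (4, 2) case above
      std::vector<int> Mask;
      for (unsigned I = 0; I != DstElts; ++I)
        Mask.push_back(I % SubElts);
      for (int M : Mask)
        std::printf("%d ", M); // prints "0 1 0 1"
      std::printf("\n");
    }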
CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm) + CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm) + DecodeSubVectorBroadcast(8, 2, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm) + CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm) + DecodeSubVectorBroadcast(8, 4, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z256, rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z256, rm) + DecodeSubVectorBroadcast(8, 4, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm) + DecodeSubVectorBroadcast(16, 4, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm) + DecodeSubVectorBroadcast(16, 8, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m) + DecodeSubVectorBroadcast(4, 2, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m) + DecodeSubVectorBroadcast(8, 2, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m) + DecodeSubVectorBroadcast(16, 2, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_PMOVZX(PMOVZXBW, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_PMOVZX(PMOVZXBW, m) + DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), false, + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_PMOVZX(PMOVZXBD, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_PMOVZX(PMOVZXBD, m) + DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), false, + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_PMOVZX(PMOVZXBQ, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_PMOVZX(PMOVZXBQ, m) + DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), false, + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_PMOVZX(PMOVZXWD, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_PMOVZX(PMOVZXWD, m) + DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), false, + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_PMOVZX(PMOVZXWQ, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_PMOVZX(PMOVZXWQ, m) + DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), false, + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + 
break; + + CASE_PMOVZX(PMOVZXDQ, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_PMOVZX(PMOVZXDQ, m) + DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), false, + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + } + + // The only comments we decode are shuffles, so give up if we were unable to + // decode a shuffle mask. + if (ShuffleMask.empty()) + return false; + + if (!DestName) DestName = Src1Name; + if (DestName) { + OS << DestName; + printMasking(OS, MI, MCII); + } else + OS << "mem"; + + OS << " = "; + + // If the two sources are the same, canonicalize the input elements to be + // from the first src so that we get larger element spans. + if (Src1Name == Src2Name) { + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if ((int)ShuffleMask[i] >= 0 && // Not sentinel. + ShuffleMask[i] >= (int)e) // From second mask. + ShuffleMask[i] -= e; + } + } + + // The shuffle mask specifies which elements of the src1/src2 fill in the + // destination, with a few sentinel values. Loop through and print them + // out. + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if (i != 0) + OS << ','; + if (ShuffleMask[i] == SM_SentinelZero) { + OS << "zero"; + continue; + } + + // Otherwise, it must come from src1 or src2. Print the span of elements + // that comes from this src. + bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); + const char *SrcName = isSrc1 ? Src1Name : Src2Name; + OS << (SrcName ? SrcName : "mem") << '['; + bool IsFirst = true; + while (i != e && (int)ShuffleMask[i] != SM_SentinelZero && + (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { + if (!IsFirst) + OS << ','; + else + IsFirst = false; + if (ShuffleMask[i] == SM_SentinelUndef) + OS << "u"; + else + OS << ShuffleMask[i] % ShuffleMask.size(); + ++i; + } + OS << ']'; + --i; // For loop increments element #. + } + OS << '\n'; + + // We successfully added a comment to this instruction. + return true; +} diff --git a/lib/Target/X86/MCTargetDesc/X86InstComments.h b/lib/Target/X86/MCTargetDesc/X86InstComments.h new file mode 100644 index 000000000000..96760664012a --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86InstComments.h @@ -0,0 +1,26 @@ +//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. 
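Stepping back: the element-span loop at the end of EmitAnyX86InstComments above is what turns a decoded mask into comments like "xmm2 = xmm0[0,1],zero,xmm1[3]". A toy re-implementation of that loop under the same conventions (illustrative only, not part of the patch; the sentinel encodings are assumed):

    #include <cstdio>
    #include <vector>

    int main() {
      const int SentinelZero = -2, SentinelUndef = -1; // assumed encodings
      std::vector<int> Mask = {0, 1, SentinelZero, 7}; // 4 elts per source
      for (size_t I = 0; I != Mask.size(); ++I) {
        if (I != 0)
          std::printf(",");
        if (Mask[I] == SentinelZero) {
          std::printf("zero");
          continue;
        }
        bool IsSrc1 = Mask[I] < (int)Mask.size();
        std::printf("%s[", IsSrc1 ? "src1" : "src2");
        bool First = true;
        // Greedily consume the run of elements taken from the same source.
        while (I != Mask.size() && Mask[I] != SentinelZero &&
               (Mask[I] < (int)Mask.size()) == IsSrc1) {
          if (!First)
            std::printf(",");
          First = false;
          if (Mask[I] == SentinelUndef)
            std::printf("u");
          else
            std::printf("%d", Mask[I] % (int)Mask.size());
          ++I;
        }
        std::printf("]");
        --I; // the for-loop increment advances past the run otherwise
      }
      std::printf("\n"); // prints "src1[0,1],zero,src2[3]"
    }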
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H
+
+namespace llvm {
+
+  class MCInst;
+  class MCInstrInfo;
+  class raw_ostream;
+  bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                              const MCInstrInfo &MCII);
+}
+
+#endif
diff --git a/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
new file mode 100644
index 000000000000..a21555076976
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -0,0 +1,362 @@
+//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes common code for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstPrinterCommon.h"
+#include "X86BaseInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op,
+                                         raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
+  switch (Imm) {
+  default: llvm_unreachable("Invalid condcode argument!");
+  case 0: O << "o"; break;
+  case 1: O << "no"; break;
+  case 2: O << "b"; break;
+  case 3: O << "ae"; break;
+  case 4: O << "e"; break;
+  case 5: O << "ne"; break;
+  case 6: O << "be"; break;
+  case 7: O << "a"; break;
+  case 8: O << "s"; break;
+  case 9: O << "ns"; break;
+  case 0xa: O << "p"; break;
+  case 0xb: O << "np"; break;
+  case 0xc: O << "l"; break;
+  case 0xd: O << "ge"; break;
+  case 0xe: O << "le"; break;
+  case 0xf: O << "g"; break;
+  }
+}
+
+void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op,
+                                         raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
+  switch (Imm) {
+  default: llvm_unreachable("Invalid ssecc/avxcc argument!");
+  case 0: O << "eq"; break;
+  case 1: O << "lt"; break;
+  case 2: O << "le"; break;
+  case 3: O << "unord"; break;
+  case 4: O << "neq"; break;
+  case 5: O << "nlt"; break;
+  case 6: O << "nle"; break;
+  case 7: O << "ord"; break;
+  case 8: O << "eq_uq"; break;
+  case 9: O << "nge"; break;
+  case 0xa: O << "ngt"; break;
+  case 0xb: O << "false"; break;
+  case 0xc: O << "neq_oq"; break;
+  case 0xd: O << "ge"; break;
+  case 0xe: O << "gt"; break;
+  case 0xf: O << "true"; break;
+  case 0x10: O << "eq_os"; break;
+  case 0x11: O << "lt_oq"; break;
+  case 0x12: O << "le_oq"; break;
+  case 0x13: O << "unord_s"; break;
+  case 0x14: O << "neq_us"; break;
+  case 0x15: O << "nlt_uq"; break;
+  case 0x16: O << "nle_uq"; break;
+  case 0x17: O << "ord_s"; break;
+  case 0x18: O << "eq_us"; break;
+  case 0x19: O << "nge_uq"; break;
+  case 0x1a: O << "ngt_uq"; break;
+  case 0x1b: O << "false_os"; break;
+  case 0x1c: O << "neq_os"; break;
+  case 0x1d: O << "ge_oq"; break;
+  case 0x1e: O << "gt_oq"; break;
+  case 0x1f: O << "true_us"; break;
+  }
+}
+
+void X86InstPrinterCommon::printVPCOMMnemonic(const MCInst *MI,
+
raw_ostream &OS) { + OS << "vpcom"; + + int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid vpcom argument!"); + case 0: OS << "lt"; break; + case 1: OS << "le"; break; + case 2: OS << "gt"; break; + case 3: OS << "ge"; break; + case 4: OS << "eq"; break; + case 5: OS << "neq"; break; + case 6: OS << "false"; break; + case 7: OS << "true"; break; + } + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86::VPCOMBmi: case X86::VPCOMBri: OS << "b\t"; break; + case X86::VPCOMDmi: case X86::VPCOMDri: OS << "d\t"; break; + case X86::VPCOMQmi: case X86::VPCOMQri: OS << "q\t"; break; + case X86::VPCOMUBmi: case X86::VPCOMUBri: OS << "ub\t"; break; + case X86::VPCOMUDmi: case X86::VPCOMUDri: OS << "ud\t"; break; + case X86::VPCOMUQmi: case X86::VPCOMUQri: OS << "uq\t"; break; + case X86::VPCOMUWmi: case X86::VPCOMUWri: OS << "uw\t"; break; + case X86::VPCOMWmi: case X86::VPCOMWri: OS << "w\t"; break; + } +} + +void X86InstPrinterCommon::printVPCMPMnemonic(const MCInst *MI, + raw_ostream &OS) { + OS << "vpcmp"; + + printSSEAVXCC(MI, MI->getNumOperands() - 1, OS); + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri: + case X86::VPCMPBZrmi: case X86::VPCMPBZrri: + case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmik: case X86::VPCMPBZrrik: + OS << "b\t"; + break; + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri: + case X86::VPCMPDZrmi: case X86::VPCMPDZrri: + case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmik: case X86::VPCMPDZrrik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + OS << "d\t"; + break; + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri: + case X86::VPCMPQZrmi: case X86::VPCMPQZrri: + case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmik: case X86::VPCMPQZrrik: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + OS << "q\t"; + break; + case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri: + case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri: + case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri: + case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik: + case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik: + case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik: + OS << "ub\t"; + break; + case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri: + case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri: + case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri: + case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik: + case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik: + case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik: + case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk: + case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk: + case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk: + OS << "ud\t"; + break; + case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri: + case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri: + case X86::VPCMPUQZrmi: case 
X86::VPCMPUQZrri: + case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik: + case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik: + case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik: + case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk: + case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk: + case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk: + OS << "uq\t"; + break; + case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri: + case X86::VPCMPUWZ256rri: case X86::VPCMPUWZ256rmi: + case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri: + case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik: + case X86::VPCMPUWZ256rrik: case X86::VPCMPUWZ256rmik: + case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik: + OS << "uw\t"; + break; + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri: + case X86::VPCMPWZrmi: case X86::VPCMPWZrri: + case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmik: case X86::VPCMPWZrrik: + OS << "w\t"; + break; + } +} + +void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp, + raw_ostream &OS) { + OS << (IsVCmp ? "vcmp" : "cmp"); + + printSSEAVXCC(MI, MI->getNumOperands() - 1, OS); + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86::CMPPDrmi: case X86::CMPPDrri: + case X86::VCMPPDrmi: case X86::VCMPPDrri: + case X86::VCMPPDYrmi: case X86::VCMPPDYrri: + case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri: + case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri: + case X86::VCMPPDZrmi: case X86::VCMPPDZrri: + case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik: + case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik: + case X86::VCMPPDZrmik: case X86::VCMPPDZrrik: + case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik: + case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik: + case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik: + case X86::VCMPPDZrrib: case X86::VCMPPDZrribk: + OS << "pd\t"; + break; + case X86::CMPPSrmi: case X86::CMPPSrri: + case X86::VCMPPSrmi: case X86::VCMPPSrri: + case X86::VCMPPSYrmi: case X86::VCMPPSYrri: + case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri: + case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri: + case X86::VCMPPSZrmi: case X86::VCMPPSZrri: + case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik: + case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik: + case X86::VCMPPSZrmik: case X86::VCMPPSZrrik: + case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik: + case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik: + case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik: + case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: + OS << "ps\t"; + break; + case X86::CMPSDrm: case X86::CMPSDrr: + case X86::CMPSDrm_Int: case X86::CMPSDrr_Int: + case X86::VCMPSDrm: case X86::VCMPSDrr: + case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int: + case X86::VCMPSDZrm: case X86::VCMPSDZrr: + case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int: + case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk: + case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk: + OS << "sd\t"; + break; + case X86::CMPSSrm: case X86::CMPSSrr: + case X86::CMPSSrm_Int: case X86::CMPSSrr_Int: + case X86::VCMPSSrm: case X86::VCMPSSrr: + case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int: + case X86::VCMPSSZrm: case X86::VCMPSSZrr: + case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int: + case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk: + case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: + OS << "ss\t"; + break; + } +} + +void X86InstPrinterCommon::printRoundingControl(const MCInst 
*MI, unsigned Op,
+                                                raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
+  switch (Imm) {
+  default:
+    llvm_unreachable("Invalid rounding control!");
+  case X86::TO_NEAREST_INT:
+    O << "{rn-sae}";
+    break;
+  case X86::TO_NEG_INF:
+    O << "{rd-sae}";
+    break;
+  case X86::TO_POS_INF:
+    O << "{ru-sae}";
+    break;
+  case X86::TO_ZERO:
+    O << "{rz-sae}";
+    break;
+  }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls). In
+/// Intel-style these print slightly differently than normal immediates.
+/// For example, a $ is not emitted.
+void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << formatImm(Op.getImm());
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    // If a symbolic branch target was added as a constant expression then
+    // print that address in hex.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+      O << formatHex((uint64_t)Address);
+    } else {
+      // Otherwise, just print the expression.
+      Op.getExpr()->print(O, &MAI);
+    }
+  }
+}
+
+void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo,
+                                               raw_ostream &O) {
+  if (MI->getOperand(OpNo).getReg()) {
+    printOperand(MI, OpNo, O);
+    O << ':';
+  }
+}
+
+void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+  unsigned Flags = MI->getFlags();
+
+  if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
+    O << "\tlock\t";
+
+  if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK))
+    O << "\tnotrack\t";
+
+  if (Flags & X86::IP_HAS_REPEAT_NE)
+    O << "\trepne\t";
+  else if (Flags & X86::IP_HAS_REPEAT)
+    O << "\trep\t";
+}
+
+void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &OS) {
+  // In assembly listings, a pair is represented by one of its members, either
+  // of the two. Here, we pick k0, k2, k4, k6, but we could as well
+  // print K2_K3 as "k3". It would probably make more sense if the
+  // assembly looked something like:
+  // "vp2intersect %zmm5, %zmm7, {%k2, %k3}"
+  // but this can work too.
+  switch (MI->getOperand(OpNo).getReg()) {
+  case X86::K0_K1:
+    printRegName(OS, X86::K0);
+    return;
+  case X86::K2_K3:
+    printRegName(OS, X86::K2);
+    return;
+  case X86::K4_K5:
+    printRegName(OS, X86::K4);
+    return;
+  case X86::K6_K7:
+    printRegName(OS, X86::K6);
+    return;
+  }
+  llvm_unreachable("Unknown mask pair register name");
+}
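A quick illustration of how the mnemonic printers above compose their output: printVPCMPMnemonic glues "vpcmp", the condition from the printSSEAVXCC table, and an element-width suffix chosen from the opcode. A standalone sketch, illustrative only and not part of the patch:

    #include <cstdio>
    #include <string>

    // Standalone sketch of the composition done by printVPCMPMnemonic:
    // "vpcmp" + condition code + element suffix.
    int main() {
      const char *CondCode[8] = {"eq", "lt", "le", "unord",
                                 "neq", "nlt", "nle", "ord"};
      int Imm = 2;              // condition immediate from the instruction
      const char *Suffix = "b"; // byte variant; w/d/q/ub/... also exist
      std::string Mnemonic = std::string("vpcmp") + CondCode[Imm] + Suffix;
      std::printf("%s\n", Mnemonic.c_str()); // prints "vpcmpleb"
    }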
diff --git a/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
new file mode 100644
index 000000000000..8e28f24b619a
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -0,0 +1,41 @@
+//===-- X86InstPrinterCommon.h - X86 assembly instruction printing --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code common for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class X86InstPrinterCommon : public MCInstPrinter {
+public:
+  using MCInstPrinter::MCInstPrinter;
+
+  virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0;
+  void printCondCode(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printVPCOMMnemonic(const MCInst *MI, raw_ostream &OS);
+  void printVPCMPMnemonic(const MCInst *MI, raw_ostream &OS);
+  void printCMPMnemonic(const MCInst *MI, bool IsVCmp, raw_ostream &OS);
+  void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+protected:
+  void printInstFlags(const MCInst *MI, raw_ostream &O);
+  void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
diff --git a/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
new file mode 100644
index 000000000000..ea28bef42569
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -0,0 +1,445 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86IntelInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter1.inc"
+
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                    StringRef Annot,
+                                    const MCSubtargetInfo &STI) {
+  printInstFlags(MI, OS);
+
+  // In 16-bit mode, print data16 as data32.
+  if (MI->getOpcode() == X86::DATA16_PREFIX &&
+      STI.getFeatureBits()[X86::Mode16Bit]) {
+    OS << "\tdata32";
+  } else if (!printAliasInstr(MI, OS) &&
+             !printVecCompareInstr(MI, OS))
+    printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+
+  // If verbose assembly is enabled, we can print some informative comments.
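For example, with verbose assembly enabled the shuffle decoding above yields Intel-syntax annotations along these lines (the exact output shape is illustrative):

    vunpcklps xmm2, xmm0, xmm1      # xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]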
+ if (CommentStream) + EmitAnyX86InstComments(MI, *CommentStream, MII); +} + +bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS) { + if (MI->getNumOperands() == 0 || + !MI->getOperand(MI->getNumOperands() - 1).isImm()) + return false; + + int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + + // Custom print the vector compare instructions to get the immediate + // translated into the mnemonic. + switch (MI->getOpcode()) { + case X86::CMPPDrmi: case X86::CMPPDrri: + case X86::CMPPSrmi: case X86::CMPPSrri: + case X86::CMPSDrm: case X86::CMPSDrr: + case X86::CMPSDrm_Int: case X86::CMPSDrr_Int: + case X86::CMPSSrm: case X86::CMPSSrr: + case X86::CMPSSrm_Int: case X86::CMPSSrr_Int: + if (Imm >= 0 && Imm <= 7) { + OS << '\t'; + printCMPMnemonic(MI, /*IsVCMP*/false, OS); + printOperand(MI, 0, OS); + OS << ", "; + // Skip operand 1 as its tied to the dest. + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) + printdwordmem(MI, 2, OS); + else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + printqwordmem(MI, 2, OS); + else + printxmmwordmem(MI, 2, OS); + } else + printOperand(MI, 2, OS); + + return true; + } + break; + + case X86::VCMPPDrmi: case X86::VCMPPDrri: + case X86::VCMPPDYrmi: case X86::VCMPPDYrri: + case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri: + case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri: + case X86::VCMPPDZrmi: case X86::VCMPPDZrri: + case X86::VCMPPSrmi: case X86::VCMPPSrri: + case X86::VCMPPSYrmi: case X86::VCMPPSYrri: + case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri: + case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri: + case X86::VCMPPSZrmi: case X86::VCMPPSZrri: + case X86::VCMPSDrm: case X86::VCMPSDrr: + case X86::VCMPSDZrm: case X86::VCMPSDZrr: + case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int: + case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int: + case X86::VCMPSSrm: case X86::VCMPSSrr: + case X86::VCMPSSZrm: case X86::VCMPSSZrr: + case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int: + case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int: + case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik: + case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik: + case X86::VCMPPDZrmik: case X86::VCMPPDZrrik: + case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik: + case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik: + case X86::VCMPPSZrmik: case X86::VCMPPSZrrik: + case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk: + case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk: + case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik: + case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik: + case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik: + case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik: + case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik: + case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik: + case X86::VCMPPDZrrib: case X86::VCMPPDZrribk: + case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: + case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk: + case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: + if (Imm >= 0 && Imm <= 31) { + OS << '\t'; + printCMPMnemonic(MI, /*IsVCMP*/true, OS); + + unsigned CurOp = 0; + printOperand(MI, CurOp++, OS); + + if (Desc.TSFlags & X86II::EVEX_K) { + // Print mask operand. 
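// (Illustrative note, not part of the patch: the braces emitted below make a
// masked compare render along the lines of "vcmplepd k2 {k3}, zmm0, zmm1".)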
+ OS << " {"; + printOperand(MI, CurOp++, OS); + OS << "}"; + } + OS << ", "; + printOperand(MI, CurOp++, OS); + OS << ", "; + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if (Desc.TSFlags & X86II::EVEX_B) { + // Broadcast form. + // Load size is based on W-bit. + if (Desc.TSFlags & X86II::VEX_W) + printqwordmem(MI, CurOp++, OS); + else + printdwordmem(MI, CurOp++, OS); + + // Print the number of elements broadcasted. + unsigned NumElts; + if (Desc.TSFlags & X86II::EVEX_L2) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16; + else if (Desc.TSFlags & X86II::VEX_L) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; + else + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4; + OS << "{1to" << NumElts << "}"; + } else { + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) + printdwordmem(MI, CurOp++, OS); + else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + printqwordmem(MI, CurOp++, OS); + else if (Desc.TSFlags & X86II::EVEX_L2) + printzmmwordmem(MI, CurOp++, OS); + else if (Desc.TSFlags & X86II::VEX_L) + printymmwordmem(MI, CurOp++, OS); + else + printxmmwordmem(MI, CurOp++, OS); + } + } else { + printOperand(MI, CurOp++, OS); + if (Desc.TSFlags & X86II::EVEX_B) + OS << ", {sae}"; + } + + return true; + } + break; + + case X86::VPCOMBmi: case X86::VPCOMBri: + case X86::VPCOMDmi: case X86::VPCOMDri: + case X86::VPCOMQmi: case X86::VPCOMQri: + case X86::VPCOMUBmi: case X86::VPCOMUBri: + case X86::VPCOMUDmi: case X86::VPCOMUDri: + case X86::VPCOMUQmi: case X86::VPCOMUQri: + case X86::VPCOMUWmi: case X86::VPCOMUWri: + case X86::VPCOMWmi: case X86::VPCOMWri: + if (Imm >= 0 && Imm <= 7) { + OS << '\t'; + printVPCOMMnemonic(MI, OS); + printOperand(MI, 0, OS); + OS << ", "; + printOperand(MI, 1, OS); + OS << ", "; + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) + printxmmwordmem(MI, 2, OS); + else + printOperand(MI, 2, OS); + return true; + } + break; + + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri: + case X86::VPCMPBZrmi: case X86::VPCMPBZrri: + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri: + case X86::VPCMPDZrmi: case X86::VPCMPDZrri: + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri: + case X86::VPCMPQZrmi: case X86::VPCMPQZrri: + case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri: + case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri: + case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri: + case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri: + case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri: + case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri: + case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri: + case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri: + case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri: + case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri: + case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri: + case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri: + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri: + case X86::VPCMPWZrmi: case X86::VPCMPWZrri: + case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmik: case X86::VPCMPBZrrik: + case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmik: case X86::VPCMPDZrrik: + case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmik: case 
X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmik: case X86::VPCMPQZrrik: + case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik: + case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik: + case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik: + case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik: + case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik: + case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik: + case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik: + case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik: + case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik: + case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik: + case X86::VPCMPUWZ256rmik: case X86::VPCMPUWZ256rrik: + case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik: + case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmik: case X86::VPCMPWZrrik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk: + case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk: + case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk: + case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk: + case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk: + case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk: + if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) { + OS << '\t'; + printVPCMPMnemonic(MI, OS); + + unsigned CurOp = 0; + printOperand(MI, CurOp++, OS); + + if (Desc.TSFlags & X86II::EVEX_K) { + // Print mask operand. + OS << " {"; + printOperand(MI, CurOp++, OS); + OS << "}"; + } + OS << ", "; + printOperand(MI, CurOp++, OS); + OS << ", "; + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if (Desc.TSFlags & X86II::EVEX_B) { + // Broadcast form. + // Load size is based on W-bit as only D and Q are supported. + if (Desc.TSFlags & X86II::VEX_W) + printqwordmem(MI, CurOp++, OS); + else + printdwordmem(MI, CurOp++, OS); + + // Print the number of elements broadcasted. + unsigned NumElts; + if (Desc.TSFlags & X86II::EVEX_L2) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16; + else if (Desc.TSFlags & X86II::VEX_L) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; + else + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 
2 : 4; + OS << "{1to" << NumElts << "}"; + } else { + if (Desc.TSFlags & X86II::EVEX_L2) + printzmmwordmem(MI, CurOp++, OS); + else if (Desc.TSFlags & X86II::VEX_L) + printymmwordmem(MI, CurOp++, OS); + else + printxmmwordmem(MI, CurOp++, OS); + } + } else { + printOperand(MI, CurOp++, OS); + } + + return true; + } + break; + } + + return false; +} + +void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + } else if (Op.isImm()) { + O << formatImm((int64_t)Op.getImm()); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << "offset "; + Op.getExpr()->print(O, &MAI); + } +} + +void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); + unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); + const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); + + // If this has a segment register, print it. + printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O); + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(MI, Op+X86::AddrBaseReg, O); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(MI, Op+X86::AddrIndexReg, O); + NeedPlus = true; + } + + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + DispSpec.getExpr()->print(O, &MAI); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << formatImm(DispVal); + } + } + + O << ']'; +} + +void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + // If this has a segment register, print it. + printOptionalSegReg(MI, Op + 1, O); + O << '['; + printOperand(MI, Op, O); + O << ']'; +} + +void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + // DI accesses are always ES-based. + O << "es:["; + printOperand(MI, Op, O); + O << ']'; +} + +void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &DispSpec = MI->getOperand(Op); + + // If this has a segment register, print it. + printOptionalSegReg(MI, Op + 1, O); + + O << '['; + + if (DispSpec.isImm()) { + O << formatImm(DispSpec.getImm()); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement?"); + DispSpec.getExpr()->print(O, &MAI); + } + + O << ']'; +} + +void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + if (MI->getOperand(Op).isExpr()) + return MI->getOperand(Op).getExpr()->print(O, &MAI); + + O << formatImm(MI->getOperand(Op).getImm() & 0xff); +} + +void X86IntelInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + const MCOperand &Op = MI->getOperand(OpNo); + unsigned Reg = Op.getReg(); + // Override the default printing to print st(0) instead st. 
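// (Illustrative note, not part of the patch: the register table presumably
// names X86::ST0 plain "st" while st(1)..st(7) already carry their index,
// which is why only ST0 needs the special case below.)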
+ if (Reg == X86::ST0) + OS << "st(0)"; + else + printRegName(OS, Reg); +} diff --git a/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h new file mode 100644 index 000000000000..f32f49f7c417 --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h @@ -0,0 +1,144 @@ +//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an X86 MCInst to Intel style .s file syntax. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H + +#include "X86InstPrinterCommon.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class X86IntelInstPrinter final : public X86InstPrinterCommon { +public: + X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : X86InstPrinterCommon(MAI, MII, MRI) {} + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, + const MCSubtargetInfo &STI) override; + bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS); + + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + + // Autogenerated by tblgen. 
+ void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) override; + void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); + void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O); + void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printMemReference(MI, OpNo, O); + } + void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printMemReference(MI, OpNo, O); + } + void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printMemReference(MI, OpNo, O); + } + void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printMemReference(MI, OpNo, O); + } + void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "xmmword ptr "; + printMemReference(MI, OpNo, O); + } + void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "zmmword ptr "; + printMemReference(MI, OpNo, O); + } + void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "tbyte ptr "; + printMemReference(MI, OpNo, O); + } + + + void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printSrcIdx(MI, OpNo, O); + } + void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printDstIdx(MI, OpNo, O); + } + void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printDstIdx(MI, OpNo, O); + } + void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printDstIdx(MI, OpNo, O); + } + void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printDstIdx(MI, OpNo, O); + } + void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printMemOffset(MI, OpNo, O); + } +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
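The header above gives the Intel-syntax printer one thin wrapper per operand width, each prefixing printMemReference with the matching size keyword. A standalone sketch of that width-to-keyword mapping (illustrative only; the helper name is invented, and Intel syntax needs the keyword because, unlike AT&T, the mnemonic carries no size suffix):

#include <cstdio>

// Maps an access width in bits to the Intel-syntax size keyword that the
// printbytemem/printwordmem/... wrappers above hard-code.
static const char *sizeKeyword(unsigned Bits) {
  switch (Bits) {
  case 8:   return "byte ptr";
  case 16:  return "word ptr";
  case 32:  return "dword ptr";
  case 64:  return "qword ptr";
  case 80:  return "tbyte ptr";
  case 128: return "xmmword ptr";
  case 256: return "ymmword ptr";
  case 512: return "zmmword ptr";
  default:  return "";
  }
}

int main() {
  // Prints: mov dword ptr [rax], 1
  std::printf("mov %s [rax], 1\n", sizeKeyword(32));
}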
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index fa7c352a1b63..e1125c176b25 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index 30d5c802d1ed..b2369647a40f 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -1,9 +1,8 @@ //===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index f5371db9e77a..31d26d08a63f 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -525,9 +524,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // indirect register encoding, this handles addresses like [EAX]. The // encoding for [EBP] with no displacement means [disp32] so we handle it // by emitting a displacement of 0 below. - if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) { - EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); - return; + if (BaseRegNo != N86::EBP) { + if (Disp.isImm() && Disp.getImm() == 0) { + EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + return; + } + + // If the displacement is @tlscall, treat it as a zero. + if (Disp.isExpr()) { + auto *Sym = dyn_cast<MCSymbolRefExpr>(Disp.getExpr()); + if (Sym && Sym->getKind() == MCSymbolRefExpr::VK_TLSCALL) { + // This is exclusively used by call *a@tlscall(base). The relocation + // (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning. + Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc())); + EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + return; + } + } } // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
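The emitMemModRMByte hunk above works around a ModRM quirk: mod=00 with base EBP (or R13, which shares the low three encoding bits) does not mean [ebp] but [disp32] with no base, so a zero displacement must still be materialized as a disp8. A minimal sketch of that selection rule (a sketch only, assuming the low-3-bit ModRM register numbers and ignoring the ESP/R12 case, which additionally needs a SIB byte):

#include <cstdint>
#include <cstdio>

struct ModDisp { int Mod; int DispBytes; };

// Chooses the ModRM mod field and displacement size for [base + disp]
// addressing, mirroring the rule in the hunk above.
static ModDisp pickModDisp(unsigned BaseRegNo3, int32_t Disp) {
  if (Disp == 0 && BaseRegNo3 != 5)  // base 5 with mod=00 would mean [disp32]
    return {0, 0};                   // plain indirect, no displacement
  if (Disp >= -128 && Disp <= 127)
    return {1, 1};                   // [reg + disp8]
  return {2, 4};                     // [reg + disp32]
}

int main() {
  ModDisp A = pickModDisp(/*EAX=*/0, 0); // [eax]: mod=00, no displacement
  ModDisp B = pickModDisp(/*EBP=*/5, 0); // [ebp]: still needs a disp8 of 0
  std::printf("eax: mod=%d disp%d; ebp: mod=%d disp%d\n",
              A.Mod, A.DispBytes, B.Mod, B.DispBytes);
}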
@@ -880,7 +893,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if (HasEVEX_RC) { unsigned RcOperand = NumOps-1; assert(RcOperand >= CurOp); - EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3; + EVEX_rc = MI.getOperand(RcOperand).getImm(); + assert(EVEX_rc <= 3 && "Invalid rounding control!"); } EncodeRC = true; } @@ -979,7 +993,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); // Can we use the 2 byte VEX prefix? - if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { + if (!(MI.getFlags() & X86::IP_USE_VEX3) && + Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { EmitByte(0xC5, CurByte, OS); EmitByte(LastByte | (VEX_R << 7), CurByte, OS); return; @@ -1060,16 +1075,17 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B break; case X86II::MRMSrcReg: + case X86II::MRMSrcRegCC: REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B break; - case X86II::MRMSrcMem: { + case X86II::MRMSrcMem: + case X86II::MRMSrcMemCC: REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X CurOp += X86::AddrNumOperands; break; - } case X86II::MRMDestReg: REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R @@ -1080,7 +1096,7 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, CurOp += X86::AddrNumOperands; REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R break; - case X86II::MRMXm: + case X86II::MRMXmCC: case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: @@ -1088,7 +1104,7 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X break; - case X86II::MRMXr: + case X86II::MRMXrCC: case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: @@ -1272,6 +1288,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow) BaseOpcode = 0x0F; // Weird 3DNow! encoding. + unsigned OpcodeOffset = 0; + uint64_t Form = TSFlags & X86II::FormMask; switch (Form) { default: errs() << "FORM: " << Form << "\n"; @@ -1318,8 +1336,14 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); break; } - case X86II::RawFrm: { - EmitByte(BaseOpcode, CurByte, OS); + case X86II::AddCCFrm: { + // This will be added to the opcode in the fallthrough. + OpcodeOffset = MI.getOperand(NumOps - 1).getImm(); + assert(OpcodeOffset < 16 && "Unexpected opcode offset!"); + --NumOps; // Drop the operand from the end. 
+ LLVM_FALLTHROUGH; + case X86II::RawFrm: + EmitByte(BaseOpcode + OpcodeOffset, CurByte, OS); if (!is64BitMode(STI) || !isPCRel32Branch(MI)) break; @@ -1436,6 +1460,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = SrcRegNum + 1; break; } + case X86II::MRMSrcRegCC: { + unsigned FirstOp = CurOp++; + unsigned SecondOp = CurOp++; + + unsigned CC = MI.getOperand(CurOp++).getImm(); + EmitByte(BaseOpcode + CC, CurByte, OS); + + EmitRegModRMByte(MI.getOperand(SecondOp), + GetX86RegNum(MI.getOperand(FirstOp)), CurByte, OS); + break; + } case X86II::MRMSrcMem: { unsigned FirstMemOp = CurOp+1; @@ -1481,6 +1516,27 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = FirstMemOp + X86::AddrNumOperands; break; } + case X86II::MRMSrcMemCC: { + unsigned RegOp = CurOp++; + unsigned FirstMemOp = CurOp; + CurOp = FirstMemOp + X86::AddrNumOperands; + + unsigned CC = MI.getOperand(CurOp++).getImm(); + EmitByte(BaseOpcode + CC, CurByte, OS); + + emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(RegOp)), + TSFlags, Rex, CurByte, OS, Fixups, STI); + break; + } + + case X86II::MRMXrCC: { + unsigned RegOp = CurOp++; + + unsigned CC = MI.getOperand(CurOp++).getImm(); + EmitByte(BaseOpcode + CC, CurByte, OS); + EmitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS); + break; + } case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: @@ -1497,6 +1553,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, CurByte, OS); break; + case X86II::MRMXmCC: { + unsigned FirstMemOp = CurOp; + CurOp = FirstMemOp + X86::AddrNumOperands; + + unsigned CC = MI.getOperand(CurOp++).getImm(); + EmitByte(BaseOpcode + CC, CurByte, OS); + + emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI); + break; + } + case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: diff --git a/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/lib/Target/X86/MCTargetDesc/X86MCExpr.h index 1070f70468fa..532fecd9951b 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCExpr.h +++ b/lib/Target/X86/MCTargetDesc/X86MCExpr.h @@ -1,9 +1,8 @@ //=--- X86MCExpr.h - X86 specific MC expression classes ---*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H -#include "InstPrinter/X86ATTInstPrinter.h" +#include "X86ATTInstPrinter.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index ea4aaf14223d..ce05ad974507 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,13 +11,15 @@ //===----------------------------------------------------------------------===// #include "X86MCTargetDesc.h" -#include "InstPrinter/X86ATTInstPrinter.h" -#include "InstPrinter/X86IntelInstPrinter.h" +#include "TargetInfo/X86TargetInfo.h" +#include "X86ATTInstPrinter.h" #include "X86BaseInfo.h" +#include "X86IntelInstPrinter.h" #include "X86MCAsmInfo.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -117,6 +118,15 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { {codeview::RegisterId::ST6, X86::FP6}, {codeview::RegisterId::ST7, X86::FP7}, + {codeview::RegisterId::MM0, X86::MM0}, + {codeview::RegisterId::MM1, X86::MM1}, + {codeview::RegisterId::MM2, X86::MM2}, + {codeview::RegisterId::MM3, X86::MM3}, + {codeview::RegisterId::MM4, X86::MM4}, + {codeview::RegisterId::MM5, X86::MM5}, + {codeview::RegisterId::MM6, X86::MM6}, + {codeview::RegisterId::MM7, X86::MM7}, + {codeview::RegisterId::XMM0, X86::XMM0}, {codeview::RegisterId::XMM1, X86::XMM1}, {codeview::RegisterId::XMM2, X86::XMM2}, diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 4e9f5ba60d2e..00dd5908cbf5 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -1,9 +1,8 @@ //===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -35,9 +34,6 @@ class StringRef; class raw_ostream; class raw_pwrite_stream; -Target &getTheX86_32Target(); -Target &getTheX86_64Target(); - /// Flavour of dwarf regnumbers /// namespace DWARFFlavour { diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 883278b7bc1f..fc7e99f61e5e 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h index 10a282dd2962..3b1e9e7c34fb 100644 --- a/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h +++ b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h @@ -1,9 +1,8 @@ //===- X86TargetStreamer.h ------------------------------*- C++ -*---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 2aec695b2dbf..3baab9da1c41 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 0085787e576a..796a27a17255 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -1,9 +1,8 @@ //===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index bee9b7046338..e9987d1f62bd 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -1,9 +1,8 @@ //===-- X86WinCOFFTargetStreamer.cpp ----------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/ShadowCallStack.cpp b/lib/Target/X86/ShadowCallStack.cpp deleted file mode 100644 index ab2cebcb58ee..000000000000 --- a/lib/Target/X86/ShadowCallStack.cpp +++ /dev/null @@ -1,322 +0,0 @@ -//===------- ShadowCallStack.cpp - Shadow Call Stack pass -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// The ShadowCallStack pass instruments function prologs/epilogs to check that -// the return address has not been corrupted during the execution of the -// function. The return address is stored in a 'shadow call stack' addressed -// using the %gs segment register. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86InstrBuilder.h" -#include "X86InstrInfo.h" -#include "X86Subtarget.h" - -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -class ShadowCallStack : public MachineFunctionPass { -public: - static char ID; - - ShadowCallStack() : MachineFunctionPass(ID) { - initializeShadowCallStackPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &Fn) override; - -private: - // Do not instrument leaf functions with this many or fewer instructions. The - // shadow call stack instrumented prolog/epilog are slightly race-y reading - // and checking the saved return address, so it is better to not instrument - // functions that have fewer instructions than the instrumented prolog/epilog - // race. - static const size_t SkipLeafInstructions = 3; -}; - -char ShadowCallStack::ID = 0; -} // end anonymous namespace. - -static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII, - MachineBasicBlock &MBB, const DebugLoc &DL); -static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII, - MachineBasicBlock &MBB, const DebugLoc &DL, - MCPhysReg FreeRegister); - -static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB); -static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB, - MCPhysReg FreeRegister); -// Generate a longer epilog that only uses r10 when a tailcall branches to r11. 
-static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB); - -// Helper function to add ModR/M references for [Seg: Reg + Offset] memory -// accesses -static inline const MachineInstrBuilder & -addSegmentedMem(const MachineInstrBuilder &MIB, MCPhysReg Seg, MCPhysReg Reg, - int Offset = 0) { - return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset).addReg(Seg); -} - -static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII, - MachineBasicBlock &MBB, const DebugLoc &DL) { - const MCPhysReg ReturnReg = X86::R10; - const MCPhysReg OffsetReg = X86::R11; - - auto MBBI = MBB.begin(); - // mov r10, [rsp] - addDirectMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(ReturnReg), - X86::RSP); - // xor r11, r11 - BuildMI(MBB, MBBI, DL, TII->get(X86::XOR64rr)) - .addDef(OffsetReg) - .addReg(OffsetReg, RegState::Undef) - .addReg(OffsetReg, RegState::Undef); - // add QWORD [gs:r11], 8 - addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::ADD64mi8)), X86::GS, - OffsetReg) - .addImm(8); - // mov r11, [gs:r11] - addSegmentedMem( - BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(OffsetReg), X86::GS, - OffsetReg); - // mov [gs:r11], r10 - addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64mr)), X86::GS, - OffsetReg) - .addReg(ReturnReg); -} - -static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII, - MachineBasicBlock &MBB, const DebugLoc &DL, - MCPhysReg FreeRegister) { - // mov REG, [rsp] - addDirectMem(BuildMI(MBB, MBB.begin(), DL, TII->get(X86::MOV64rm)) - .addDef(FreeRegister), - X86::RSP); -} - -static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB) { - const DebugLoc &DL = MI.getDebugLoc(); - - // xor r11, r11 - BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr)) - .addDef(X86::R11) - .addReg(X86::R11, RegState::Undef) - .addReg(X86::R11, RegState::Undef); - // mov r10, [gs:r11] - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10), - X86::GS, X86::R11); - // mov r10, [gs:r10] - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10), - X86::GS, X86::R10); - // sub QWORD [gs:r11], 8 - // This instruction should not be moved up to avoid a signal race. 
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)), - X86::GS, X86::R11) - .addImm(8); - // cmp [rsp], r10 - addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP) - .addReg(X86::R10); - // jne trap - BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB); - MBB.addSuccessor(&TrapBB); -} - -static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB, - MCPhysReg FreeRegister) { - const DebugLoc &DL = MI.getDebugLoc(); - - // cmp [rsp], REG - addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP) - .addReg(FreeRegister); - // jne trap - BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB); - MBB.addSuccessor(&TrapBB); -} - -static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB) { - const DebugLoc &DL = MI.getDebugLoc(); - - // xor r10, r10 - BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr)) - .addDef(X86::R10) - .addReg(X86::R10, RegState::Undef) - .addReg(X86::R10, RegState::Undef); - // mov r10, [gs:r10] - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10), - X86::GS, X86::R10); - // mov r10, [gs:r10] - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10), - X86::GS, X86::R10); - // sub QWORD [gs:0], 8 - // This instruction should not be moved up to avoid a signal race. - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)), X86::GS, 0) - .addImm(8); - // cmp [rsp], r10 - addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP) - .addReg(X86::R10); - // jne trap - BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB); - MBB.addSuccessor(&TrapBB); -} - -bool ShadowCallStack::runOnMachineFunction(MachineFunction &Fn) { - if (!Fn.getFunction().hasFnAttribute(Attribute::ShadowCallStack) || - Fn.getFunction().hasFnAttribute(Attribute::Naked)) - return false; - - if (Fn.empty() || !Fn.getRegInfo().tracksLiveness()) - return false; - - // FIXME: Skip functions that have r10 or r11 live on entry (r10 can be live - // on entry for parameters with the nest attribute.) - if (Fn.front().isLiveIn(X86::R10) || Fn.front().isLiveIn(X86::R11)) - return false; - - // FIXME: Skip functions with conditional and r10 tail calls for now. - bool HasReturn = false; - for (auto &MBB : Fn) { - if (MBB.empty()) - continue; - - const MachineInstr &MI = MBB.instr_back(); - if (MI.isReturn()) - HasReturn = true; - - if (MI.isReturn() && MI.isCall()) { - if (MI.findRegisterUseOperand(X86::EFLAGS)) - return false; - // This should only be possible on Windows 64 (see GR64_TC versus - // GR64_TCW64.) - if (MI.findRegisterUseOperand(X86::R10) || - MI.hasRegisterImplicitUseOperand(X86::R10)) - return false; - } - } - - if (!HasReturn) - return false; - - // For leaf functions: - // 1. Do not instrument very short functions where it would not improve that - // function's security. - // 2. Detect if there is an unused caller-saved register we can reserve to - // hold the return address instead of writing/reading it from the shadow - // call stack. 
- MCPhysReg LeafFuncRegister = X86::NoRegister; - if (!Fn.getFrameInfo().adjustsStack()) { - size_t InstructionCount = 0; - std::bitset<X86::NUM_TARGET_REGS> UsedRegs; - for (auto &MBB : Fn) { - for (auto &LiveIn : MBB.liveins()) - UsedRegs.set(LiveIn.PhysReg); - for (auto &MI : MBB) { - if (!MI.isDebugValue() && !MI.isCFIInstruction() && !MI.isLabel()) - InstructionCount++; - for (auto &Op : MI.operands()) - if (Op.isReg() && Op.isDef()) - UsedRegs.set(Op.getReg()); - } - } - - if (InstructionCount <= SkipLeafInstructions) - return false; - - std::bitset<X86::NUM_TARGET_REGS> CalleeSavedRegs; - const MCPhysReg *CSRegs = Fn.getRegInfo().getCalleeSavedRegs(); - for (size_t i = 0; CSRegs[i]; i++) - CalleeSavedRegs.set(CSRegs[i]); - - const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); - for (auto &Reg : X86::GR64_NOSPRegClass.getRegisters()) { - // FIXME: Optimization opportunity: spill/restore a callee-saved register - // if a caller-saved register is unavailable. - if (CalleeSavedRegs.test(Reg)) - continue; - - bool Used = false; - for (MCSubRegIterator SR(Reg, TRI, true); SR.isValid(); ++SR) - if ((Used = UsedRegs.test(*SR))) - break; - - if (!Used) { - LeafFuncRegister = Reg; - break; - } - } - } - - const bool LeafFuncOptimization = LeafFuncRegister != X86::NoRegister; - if (LeafFuncOptimization) - // Mark the leaf function register live-in for all MBBs except the entry MBB - for (auto I = ++Fn.begin(), E = Fn.end(); I != E; ++I) - I->addLiveIn(LeafFuncRegister); - - MachineBasicBlock &MBB = Fn.front(); - const MachineBasicBlock *NonEmpty = MBB.empty() ? MBB.getFallThrough() : &MBB; - const DebugLoc &DL = NonEmpty->front().getDebugLoc(); - - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); - if (LeafFuncOptimization) - addPrologLeaf(Fn, TII, MBB, DL, LeafFuncRegister); - else - addProlog(Fn, TII, MBB, DL); - - MachineBasicBlock *Trap = nullptr; - for (auto &MBB : Fn) { - if (MBB.empty()) - continue; - - MachineInstr &MI = MBB.instr_back(); - if (MI.isReturn()) { - if (!Trap) { - Trap = Fn.CreateMachineBasicBlock(); - BuildMI(Trap, MI.getDebugLoc(), TII->get(X86::TRAP)); - Fn.push_back(Trap); - } - - if (LeafFuncOptimization) - addEpilogLeaf(TII, MBB, MI, *Trap, LeafFuncRegister); - else if (MI.findRegisterUseOperand(X86::R11)) - addEpilogOnlyR10(TII, MBB, MI, *Trap); - else - addEpilog(TII, MBB, MI, *Trap); - } - } - - return true; -} - -INITIALIZE_PASS(ShadowCallStack, "shadow-call-stack", "Shadow Call Stack", - false, false) - -FunctionPass *llvm::createShadowCallStackPass() { - return new ShadowCallStack(); -}
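For reference, the ShadowCallStack pass deleted above guarded returns by keeping a second copy of each return address on a %gs-addressed shadow stack and trapping on mismatch in the epilog. A conceptual model of that check (a sketch only: a plain global array stands in for the segment-based stack, and abort() stands in for the pass's TRAP block):

#include <cstdint>
#include <cstdlib>

namespace {
uintptr_t ShadowStack[1024];
uintptr_t *ShadowTop = ShadowStack; // stand-in for the gs-based index slot

// Prolog: remember the return address on the shadow stack.
void prolog(uintptr_t RetAddr) { *++ShadowTop = RetAddr; }

// Epilog: pop the saved copy and compare it against the live one.
void epilog(uintptr_t RetAddr) {
  uintptr_t Saved = *ShadowTop--;
  if (Saved != RetAddr) // return address was corrupted
    std::abort();       // the real pass branches to a TRAP block instead
}
} // namespace

int main() {
  prolog(0x401000);
  epilog(0x401000); // copies match, so no trap
}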
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp index 16c2b56c48b5..47c41626a666 100644 --- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp +++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -1,13 +1,12 @@ //===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/X86MCTargetDesc.h" +#include "TargetInfo/X86TargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.h b/lib/Target/X86/TargetInfo/X86TargetInfo.h new file mode 100644 index 000000000000..caf6b8d424fc --- /dev/null +++ b/lib/Target/X86/TargetInfo/X86TargetInfo.h @@ -0,0 +1,21 @@ +//===-- X86TargetInfo.h - X86 Target Implementation -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H +#define LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheX86_32Target(); +Target &getTheX86_64Target(); + +} + +#endif // LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index bed940d0d0e9..48fd3e0b7ab9 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -1,9 +1,8 @@ //===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -300,7 +299,7 @@ void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, unsigned HalfMask = Imm >> (l * 4); unsigned HalfBegin = (HalfMask & 0x3) * HalfSize; for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i) - ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i); + ShuffleMask.push_back((HalfMask & 8) ? SM_SentinelZero : (int)i); } } @@ -384,7 +383,8 @@ } void DecodeVPERMMask(unsigned NumElts, unsigned Imm, } void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, - unsigned NumDstElts, SmallVectorImpl<int> &Mask) { + unsigned NumDstElts, bool IsAnyExtend, + SmallVectorImpl<int> &Mask) { unsigned Scale = DstScalarBits / SrcScalarBits; assert(SrcScalarBits < DstScalarBits && "Expected zero extension mask to increase scalar size"); @@ -392,7 +392,7 @@ for (unsigned i = 0; i != NumDstElts; i++) { Mask.push_back(i); for (unsigned j = 1; j != Scale; j++) - Mask.push_back(SM_SentinelZero); + Mask.push_back(IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero); } }
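The DecodeZeroExtendMask change in the hunk above adds an IsAnyExtend mode: the widened bits of each lane may be left undefined instead of forced to zero. A sketch of the masks it produces (Z and U are invented stand-ins for LLVM's SM_SentinelZero and SM_SentinelUndef sentinels):

#include <cassert>
#include <vector>

constexpr int Z = -1; // "this element is zero"
constexpr int U = -2; // "this element is undef"

// Mirrors the decode loop above: each destination element reads source
// lane i, then Scale-1 filler slots cover the widened bits.
static std::vector<int> decodeExtendMask(unsigned SrcBits, unsigned DstBits,
                                         unsigned NumDstElts, bool IsAnyExtend) {
  unsigned Scale = DstBits / SrcBits;
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    Mask.push_back(static_cast<int>(i));
    for (unsigned j = 1; j != Scale; ++j)
      Mask.push_back(IsAnyExtend ? U : Z);
  }
  return Mask;
}

int main() {
  // pmovzxwd-style widening: 16-bit lanes to 32-bit lanes, 4 elements.
  assert((decodeExtendMask(16, 32, 4, false) ==
          std::vector<int>{0, Z, 1, Z, 2, Z, 3, Z}));
  // Any-extend leaves the high halves undefined rather than zeroed.
  assert((decodeExtendMask(16, 32, 4, true) ==
          std::vector<int>{0, U, 1, U, 2, U, 3, U}));
}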
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 85cde14a3241..f52785063071 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -1,9 +1,8 @@ //===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -137,7 +136,7 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, /// Decode a zero extension instruction as a shuffle mask. void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, - unsigned NumDstElts, + unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl<int> &ShuffleMask); /// Decode a move lower and zero upper instruction as a shuffle mask. diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 1c8813815b86..a95f68434d12 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -1,9 +1,8 @@ //===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -50,11 +49,6 @@ FunctionPass *createX86FloatingPointStackifierPass(); /// transition penalty between functions encoded with AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); -/// This pass instruments the function prolog to save the return address to a -/// 'shadow call stack' and the function epilog to check that the return address -/// did not change during function execution. -FunctionPass *createShadowCallStackPass(); - /// This pass inserts ENDBR instructions before indirect jump/call /// destinations as part of CET IBT mechanism. FunctionPass *createX86IndirectBranchTrackingPass(); @@ -138,11 +132,12 @@ FunctionPass *createX86SpeculativeLoadHardeningPass(); void initializeEvexToVexInstPassPass(PassRegistry &); void initializeFixupBWInstPassPass(PassRegistry &); void initializeFixupLEAPassPass(PassRegistry &); -void initializeShadowCallStackPass(PassRegistry &); +void initializeFPSPass(PassRegistry &); void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); +void initializeX86ExpandPseudoPass(PassRegistry&); void initializeX86CondBrFoldingPassPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 6b1749fc7500..3112f00c91f2 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -1,9 +1,8 @@ //===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -40,6 +39,9 @@ def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true", def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", "Enable conditional move instructions">; +def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true", + "Support CMPXCHG8B instructions">; + def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", "Support POPCNT instruction">; @@ -165,9 +167,16 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true", "Enable AVX-512 Vector Neural Network Instructions", [FeatureAVX512]>; +def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true", + "Support bfloat16 floating point", + [FeatureBWI]>; def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true", "Enable AVX-512 Bit Algorithms", [FeatureBWI]>; +def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect", + "HasVP2INTERSECT", "true", + "Enable AVX-512 vp2intersect", + [FeatureAVX512]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -258,6 +267,8 @@ def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true", "Support RDPID instructions">; def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true", "Wait and pause enhancements">; +def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true", + "Has ENQCMD instructions">; // On some processors, instructions that implicitly take two memory operands are // slow. In practice, this means that CALL, PUSH, and POP with memory operands // should be avoided in favor of a MOV + register CALL/PUSH/POP. @@ -274,7 +285,7 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", - "Use software floating point features.">; + "Use software floating point features">; def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt", "HasPOPCNTFalseDeps", "true", "POPCNT has a false dependency on dest register">; @@ -342,6 +353,12 @@ def FeatureERMSB "ermsb", "HasERMSB", "true", "REP MOVS/STOS are fast">; +// Bulldozer and newer processors can merge CMP/TEST (but not other +// instructions) with conditional branches. +def FeatureBranchFusion + : SubtargetFeature<"branchfusion", "HasBranchFusion", "true", + "CMP/TEST can be fused with conditional branches">; + // Sandy Bridge and newer processors have many instructions that can be // fused with conditional branches and pass through the CPU as a single // operation. @@ -355,7 +372,7 @@ def FeatureMacroFusion // similar to Skylake Server (AVX-512). 
def FeatureHasFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", - "Indicates if gather is reasonably fast.">; + "Indicates if gather is reasonably fast">; def FeaturePrefer256Bit : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true", @@ -366,7 +383,7 @@ def FeaturePrefer256Bit def FeatureRetpolineIndirectCalls : SubtargetFeature< "retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true", - "Remove speculation of indirect calls from the generated code.">; + "Remove speculation of indirect calls from the generated code">; // Lower indirect branches and switches either using conditional branch trees // or using a special construct called a `retpoline` to mitigate potential @@ -374,7 +391,7 @@ def FeatureRetpolineIndirectCalls def FeatureRetpolineIndirectBranches : SubtargetFeature< "retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true", - "Remove speculation of indirect branches from the generated code.">; + "Remove speculation of indirect branches from the generated code">; // Deprecated umbrella feature for enabling both `retpoline-indirect-calls` and // `retpoline-indirect-branches` above. @@ -382,7 +399,7 @@ def FeatureRetpoline : SubtargetFeature<"retpoline", "DeprecatedUseRetpoline", "true", "Remove speculation of indirect branches from the " "generated code, either by avoiding them entirely or " - "lowering them with a speculation blocking construct.", + "lowering them with a speculation blocking construct", [FeatureRetpolineIndirectCalls, FeatureRetpolineIndirectBranches]>; @@ -395,7 +412,7 @@ def FeatureRetpolineExternalThunk "When lowering an indirect call or branch using a `retpoline`, rely " "on the specified user provided thunk rather than emitting one " "ourselves. Only has effect when combined with some other retpoline " - "feature.", [FeatureRetpolineIndirectCalls]>; + "feature", [FeatureRetpolineIndirectCalls]>; // Direct Move instructions. def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", @@ -405,7 +422,7 @@ def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true", def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true", "Indicates that the BEXTR instruction is implemented as a single uop " - "with good throughput.">; + "with good throughput">; // Combine vector math operations with shuffles into horizontal math // instructions if a CPU implements horizontal operations (introduced with @@ -416,12 +433,33 @@ def FeatureFastHorizontalOps "Prefer horizontal vector math instructions (haddp, phsub, etc.) over " "normal vector instructions with shuffles", [FeatureSSE3]>; +def FeatureFastScalarShiftMasks + : SubtargetFeature< + "fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true", + "Prefer a left/right scalar logical shift pair over a shift+and pair">; + +def FeatureFastVectorShiftMasks + : SubtargetFeature< + "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true", + "Prefer a left/right vector logical shift pair over a shift+and pair">; + // Merge branches using three-way conditional code. 
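The fast-scalar-shift-masks feature just defined encodes a lowering preference rather than a new instruction: on CPUs that set it, a pair of logical shifts is preferred to a shift-plus-AND sequence for masking out high bits. A minimal sketch of the two equivalent forms being arbitrated between (illustrative only; values and helper names are invented, and the three-way-branch feature announced by the preceding comment follows immediately after this sketch):

#include <cassert>
#include <cstdint>

// Clear the top C bits of X with an explicit mask (the AND form)...
static uint64_t clearHighWithMask(uint64_t X, unsigned C) {
  return X & (~0ULL >> C);
}

// ...or with a left/right logical shift pair (the form the feature prefers).
static uint64_t clearHighWithShifts(uint64_t X, unsigned C) {
  return (X << C) >> C; // logical shifts: the operand is unsigned
}

int main() {
  for (unsigned C = 1; C < 64; ++C)
    assert(clearHighWithMask(0xDEADBEEFCAFEF00DULL, C) ==
           clearHighWithShifts(0xDEADBEEFCAFEF00DULL, C));
}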
def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch", "ThreewayBranchProfitable", "true", "Merge branches to a three-way " "conditional branch">; +// Bonnell +def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">; +// Silvermont +def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">; +// Goldmont +def ProcIntelGLM : SubtargetFeature<"", "X86ProcFamily", "IntelGLM", "">; +// Goldmont Plus +def ProcIntelGLP : SubtargetFeature<"", "X86ProcFamily", "IntelGLP", "">; +// Tremont +def ProcIntelTRM : SubtargetFeature<"", "X86ProcFamily", "IntelTRM", "">; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// @@ -440,7 +478,7 @@ include "X86SchedPredicates.td" def X86InstrInfo : InstrInfo; //===----------------------------------------------------------------------===// -// X86 processors supported. +// X86 Scheduler Models //===----------------------------------------------------------------------===// include "X86ScheduleAtom.td" @@ -454,37 +492,468 @@ include "X86ScheduleBtVer2.td" include "X86SchedSkylakeClient.td" include "X86SchedSkylakeServer.td" -def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", - "Intel Atom processors">; -def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", - "Intel Silvermont processors">; -def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM", - "Intel Goldmont processors">; -def ProcIntelGLP : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP", - "Intel Goldmont Plus processors">; -def ProcIntelTRM : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM", - "Intel Tremont processors">; +//===----------------------------------------------------------------------===// +// X86 Processor Feature Lists +//===----------------------------------------------------------------------===// + +def ProcessorFeatures { + // Nehalem + list<SubtargetFeature> NHMInheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureLAHFSAHF, + FeatureMacroFusion]; + list<SubtargetFeature> NHMSpecificFeatures = []; + list<SubtargetFeature> NHMFeatures = + !listconcat(NHMInheritableFeatures, NHMSpecificFeatures); + + // Westmere + list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL]; + list<SubtargetFeature> WSMSpecificFeatures = []; + list<SubtargetFeature> WSMInheritableFeatures = + !listconcat(NHMInheritableFeatures, WSMAdditionalFeatures); + list<SubtargetFeature> WSMFeatures = + !listconcat(WSMInheritableFeatures, WSMSpecificFeatures); + + // Sandybridge + list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX, + FeatureSlowDivide64, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureSlow3OpsLEA, + FeatureFastScalarFSQRT, + FeatureFastSHLDRotate, + FeatureMergeToThreeWayBranch]; + list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> SNBInheritableFeatures = + !listconcat(WSMInheritableFeatures, SNBAdditionalFeatures); + list<SubtargetFeature> SNBFeatures = + !listconcat(SNBInheritableFeatures, SNBSpecificFeatures); + + // Ivybridge + list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase]; + list<SubtargetFeature> IVBSpecificFeatures = [FeatureSlowUAMem32, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> IVBInheritableFeatures = + !listconcat(SNBInheritableFeatures, IVBAdditionalFeatures); + list<SubtargetFeature> IVBFeatures = + !listconcat(IVBInheritableFeatures, IVBSpecificFeatures); + + // Haswell + list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2, +
FeatureBMI, + FeatureBMI2, + FeatureERMSB, + FeatureFMA, + FeatureINVPCID, + FeatureLZCNT, + FeatureMOVBE, + FeatureFastVariableShuffle]; + list<SubtargetFeature> HSWSpecificFeatures = [FeaturePOPCNTFalseDeps, + FeatureLZCNTFalseDeps]; + list<SubtargetFeature> HSWInheritableFeatures = + !listconcat(IVBInheritableFeatures, HSWAdditionalFeatures); + list<SubtargetFeature> HSWFeatures = + !listconcat(HSWInheritableFeatures, HSWSpecificFeatures); + + // Broadwell + list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX, + FeatureRDSEED, + FeaturePRFCHW]; + list<SubtargetFeature> BDWSpecificFeatures = [FeaturePOPCNTFalseDeps, + FeatureLZCNTFalseDeps]; + list<SubtargetFeature> BDWInheritableFeatures = + !listconcat(HSWInheritableFeatures, BDWAdditionalFeatures); + list<SubtargetFeature> BDWFeatures = + !listconcat(BDWInheritableFeatures, BDWSpecificFeatures); + + // Skylake + list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES, + FeatureMPX, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT, + FeatureFastVectorFSQRT]; + list<SubtargetFeature> SKLSpecificFeatures = [FeatureHasFastGather, + FeaturePOPCNTFalseDeps, + FeatureSGX]; + list<SubtargetFeature> SKLInheritableFeatures = + !listconcat(BDWInheritableFeatures, SKLAdditionalFeatures); + list<SubtargetFeature> SKLFeatures = + !listconcat(SKLInheritableFeatures, SKLSpecificFeatures); + + // Skylake-AVX512 + list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureCLWB]; + list<SubtargetFeature> SKXSpecificFeatures = [FeatureHasFastGather, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> SKXInheritableFeatures = + !listconcat(SKLInheritableFeatures, SKXAdditionalFeatures); + list<SubtargetFeature> SKXFeatures = + !listconcat(SKXInheritableFeatures, SKXSpecificFeatures); + + // Cascadelake + list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI]; + list<SubtargetFeature> CLXSpecificFeatures = [FeatureHasFastGather, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> CLXInheritableFeatures = + !listconcat(SKXInheritableFeatures, CLXAdditionalFeatures); + list<SubtargetFeature> CLXFeatures = + !listconcat(CLXInheritableFeatures, CLXSpecificFeatures); + + // Cooperlake + list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16]; + list<SubtargetFeature> CPXSpecificFeatures = [FeatureHasFastGather, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> CPXInheritableFeatures = + !listconcat(CLXInheritableFeatures, CPXAdditionalFeatures); + list<SubtargetFeature> CPXFeatures = + !listconcat(CPXInheritableFeatures, CPXSpecificFeatures); + + // Cannonlake + list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureVBMI, + FeatureIFMA, + FeatureSHA, + FeatureSGX]; + list<SubtargetFeature> CNLSpecificFeatures = [FeatureHasFastGather]; + list<SubtargetFeature> CNLInheritableFeatures = + !listconcat(SKLInheritableFeatures, CNLAdditionalFeatures); + list<SubtargetFeature> CNLFeatures = + !listconcat(CNLInheritableFeatures, CNLSpecificFeatures); + + // Icelake + list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG, + FeatureVAES, + FeatureVBMI2, + FeatureVNNI, + FeatureVPCLMULQDQ, + FeatureVPOPCNTDQ, + FeatureGFNI, + FeatureCLWB, + FeatureRDPID]; + list<SubtargetFeature> ICLSpecificFeatures = [FeatureHasFastGather]; + list<SubtargetFeature> ICLInheritableFeatures = + !listconcat(CNLInheritableFeatures, ICLAdditionalFeatures); + list<SubtargetFeature> ICLFeatures = + !listconcat(ICLInheritableFeatures, ICLSpecificFeatures); + + // Icelake Server + list<SubtargetFeature> ICXSpecificFeatures = [FeaturePCONFIG, + FeatureWBNOINVD, + FeatureHasFastGather]; + list<SubtargetFeature> ICXFeatures = + !listconcat(ICLInheritableFeatures, ICXSpecificFeatures); + + // Atom + list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowTwoMemOps, + FeatureLAHFSAHF]; + list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom, +
FeatureSlowUAMem16, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureLEAUsesAG, + FeaturePadShortFunctions]; + list<SubtargetFeature> AtomFeatures = + !listconcat(AtomInheritableFeatures, AtomSpecificFeatures); + + // Silvermont + list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42, + FeaturePOPCNT, + FeaturePCLMUL, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureRDRAND]; + list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM, + FeatureSlowDivide64, + FeatureSlowPMULLD, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> SLMInheritableFeatures = + !listconcat(AtomInheritableFeatures, SLMAdditionalFeatures); + list<SubtargetFeature> SLMFeatures = + !listconcat(SLMInheritableFeatures, SLMSpecificFeatures); + + // Goldmont + list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES, + FeatureMPX, + FeatureSHA, + FeatureRDSEED, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT, + FeatureFSGSBase]; + list<SubtargetFeature> GLMSpecificFeatures = [ProcIntelGLM, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> GLMInheritableFeatures = + !listconcat(SLMInheritableFeatures, GLMAdditionalFeatures); + list<SubtargetFeature> GLMFeatures = + !listconcat(GLMInheritableFeatures, GLMSpecificFeatures); + + // Goldmont Plus + list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE, + FeatureRDPID, + FeatureSGX]; + list<SubtargetFeature> GLPSpecificFeatures = [ProcIntelGLP]; + list<SubtargetFeature> GLPInheritableFeatures = + !listconcat(GLMInheritableFeatures, GLPAdditionalFeatures); + list<SubtargetFeature> GLPFeatures = + !listconcat(GLPInheritableFeatures, GLPSpecificFeatures); + + // Tremont + list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE, + FeatureGFNI, + FeatureMOVDIRI, + FeatureMOVDIR64B, + FeatureWAITPKG]; + list<SubtargetFeature> TRMSpecificFeatures = [ProcIntelTRM]; + list<SubtargetFeature> TRMFeatures = + !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures, + TRMSpecificFeatures); + + // Knights Landing + list<SubtargetFeature> KNLFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureSlowDivide64, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureLAHFSAHF, + FeatureSlow3OpsLEA, + FeatureSlowIncDec, + FeatureAES, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureAVX512, + FeatureERI, + FeatureCDI, + FeaturePFI, + FeaturePREFETCHWT1, + FeatureADX, + FeatureRDSEED, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeaturePRFCHW, + FeatureSlowTwoMemOps, + FeatureFastPartialYMMorZMMWrite, + FeatureHasFastGather, + FeatureSlowPMADDWD]; + // TODO Add AVX5124FMAPS/AVX5124VNNIW features + list<SubtargetFeature> KNMFeatures = + !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]); + + + // Bobcat + list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureFast15ByteNOP, + FeatureFastScalarShiftMasks, + FeatureFastVectorShiftMasks]; + list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures; + + // Jaguar + list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX, + FeatureAES, + FeaturePCLMUL, + FeatureBMI, + FeatureF16C, + FeatureMOVBE, + FeatureXSAVE, + FeatureXSAVEOPT]; + list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT, + FeatureFastBEXTR, + FeatureFastPartialYMMorZMMWrite, + FeatureFastHorizontalOps]; + list<SubtargetFeature> BtVer2InheritableFeatures = + !listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures); + list<SubtargetFeature> BtVer2Features = + !listconcat(BtVer2InheritableFeatures, BtVer2SpecificFeatures); + + // Bulldozer + list<SubtargetFeature>
BdVer1InheritableFeatures = [FeatureX87,
+                                                      FeatureCMPXCHG8B,
+                                                      FeatureCMOV,
+                                                      FeatureXOP,
+                                                      Feature64Bit,
+                                                      FeatureCMPXCHG16B,
+                                                      FeatureAES,
+                                                      FeaturePRFCHW,
+                                                      FeaturePCLMUL,
+                                                      FeatureMMX,
+                                                      FeatureFXSR,
+                                                      FeatureNOPL,
+                                                      FeatureLZCNT,
+                                                      FeaturePOPCNT,
+                                                      FeatureXSAVE,
+                                                      FeatureLWP,
+                                                      FeatureSlowSHLD,
+                                                      FeatureLAHFSAHF,
+                                                      FeatureFast11ByteNOP,
+                                                      FeatureFastScalarShiftMasks,
+                                                      FeatureBranchFusion];
+  list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
+
+  // PileDriver
+  list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
+                                                     FeatureBMI,
+                                                     FeatureTBM,
+                                                     FeatureFMA,
+                                                     FeatureFastBEXTR];
+  list<SubtargetFeature> BdVer2InheritableFeatures =
+    !listconcat(BdVer1InheritableFeatures, BdVer2AdditionalFeatures);
+  list<SubtargetFeature> BdVer2Features = BdVer2InheritableFeatures;
+
+  // Steamroller
+  list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT,
+                                                     FeatureFSGSBase];
+  list<SubtargetFeature> BdVer3InheritableFeatures =
+    !listconcat(BdVer2InheritableFeatures, BdVer3AdditionalFeatures);
+  list<SubtargetFeature> BdVer3Features = BdVer3InheritableFeatures;
+
+  // Excavator
+  list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
+                                                     FeatureBMI2,
+                                                     FeatureMWAITX];
+  list<SubtargetFeature> BdVer4InheritableFeatures =
+    !listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures);
+  list<SubtargetFeature> BdVer4Features = BdVer4InheritableFeatures;
+
+
+  // AMD Zen Processors common ISAs
+  list<SubtargetFeature> ZNFeatures = [FeatureADX,
+                                       FeatureAES,
+                                       FeatureAVX2,
+                                       FeatureBMI,
+                                       FeatureBMI2,
+                                       FeatureCLFLUSHOPT,
+                                       FeatureCLZERO,
+                                       FeatureCMOV,
+                                       Feature64Bit,
+                                       FeatureCMPXCHG16B,
+                                       FeatureF16C,
+                                       FeatureFMA,
+                                       FeatureFSGSBase,
+                                       FeatureFXSR,
+                                       FeatureNOPL,
+                                       FeatureFastLZCNT,
+                                       FeatureLAHFSAHF,
+                                       FeatureLZCNT,
+                                       FeatureFastBEXTR,
+                                       FeatureFast15ByteNOP,
+                                       FeatureBranchFusion,
+                                       FeatureFastScalarShiftMasks,
+                                       FeatureMMX,
+                                       FeatureMOVBE,
+                                       FeatureMWAITX,
+                                       FeaturePCLMUL,
+                                       FeaturePOPCNT,
+                                       FeaturePRFCHW,
+                                       FeatureRDRAND,
+                                       FeatureRDSEED,
+                                       FeatureSHA,
+                                       FeatureSSE4A,
+                                       FeatureSlowSHLD,
+                                       FeatureX87,
+                                       FeatureXSAVE,
+                                       FeatureXSAVEC,
+                                       FeatureXSAVEOPT,
+                                       FeatureXSAVES];
+  list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
+                                                  FeatureRDPID,
+                                                  FeatureWBNOINVD];
+  list<SubtargetFeature> ZN2Features =
+    !listconcat(ZNFeatures, ZN2AdditionalFeatures);
+}
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//

 class Proc<string Name, list<SubtargetFeature> Features>
     : ProcessorModel<Name, GenericModel, Features>;

-def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16]>;
+// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
+// if i386/i486 is specifically requested.
+def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B]>; def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>; def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>; -def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>; -def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>; -def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; - -def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>; -def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV, - FeatureNOPL]>; - -def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureCMOV, FeatureFXSR, FeatureNOPL]>; +def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B]>; +def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B]>; +def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B, FeatureMMX]>; + +def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureCMOV]>; +def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureCMOV, FeatureNOPL]>; + +def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureCMOV, FeatureFXSR, + FeatureNOPL]>; foreach P = ["pentium3", "pentium3m"] in { - def : Proc; + def : Proc; } // Enable the PostRAScheduler for SSE2 and SSE3 class cpus. @@ -498,13 +967,15 @@ foreach P = ["pentium3", "pentium3m"] in { // changes slightly. def : ProcessorModel<"pentium-m", GenericPostRAModel, - [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV]>; + [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, + FeatureCMOV]>; foreach P = ["pentium4", "pentium4m"] in { def : ProcessorModel; + [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, + FeatureCMOV]>; } // Intel Quark. @@ -512,16 +983,19 @@ def : Proc<"lakemont", []>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, - [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, - FeatureFXSR, FeatureNOPL, FeatureCMOV]>; + [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, + FeatureCMOV]>; // NetBurst. def : ProcessorModel<"prescott", GenericPostRAModel, - [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, - FeatureFXSR, FeatureNOPL, FeatureCMOV]>; + [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, + FeatureCMOV]>; def : ProcessorModel<"nocona", GenericPostRAModel, [ FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE3, @@ -535,6 +1009,7 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [ def : ProcessorModel<"core2", SandyBridgeModel, [ FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSSE3, @@ -548,6 +1023,7 @@ def : ProcessorModel<"core2", SandyBridgeModel, [ def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE41, @@ -560,638 +1036,131 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [ ]>; // Atom CPUs. -class BonnellProc : ProcessorModel; -def : BonnellProc<"bonnell">; -def : BonnellProc<"atom">; // Pin the generic name to the baseline. - -class SilvermontProc : ProcessorModel; -def : SilvermontProc<"silvermont">; -def : SilvermontProc<"slm">; // Legacy alias. 
- -class ProcessorFeatures Inherited, - list NewFeatures> { - list Value = !listconcat(Inherited, NewFeatures); +foreach P = ["bonnell", "atom"] in { + def : ProcessorModel; } -class ProcModel ProcFeatures, - list OtherFeatures> : - ProcessorModel; - -def GLMFeatures : ProcessorFeatures<[], [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureSSE42, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeaturePOPCNT, - FeaturePCLMUL, - FeatureAES, - FeaturePRFCHW, - FeatureSlowTwoMemOps, - FeatureSlowLEA, - FeatureSlowIncDec, - FeatureLAHFSAHF, - FeatureMPX, - FeatureSHA, - FeatureRDRAND, - FeatureRDSEED, - FeatureXSAVE, - FeatureXSAVEOPT, - FeatureXSAVEC, - FeatureXSAVES, - FeatureCLFLUSHOPT, - FeatureFSGSBase -]>; +foreach P = ["silvermont", "slm"] in { + def : ProcessorModel; +} -class GoldmontProc : ProcModel; -def : GoldmontProc<"goldmont">; - -def GLPFeatures : ProcessorFeatures; - -class GoldmontPlusProc : ProcModel; -def : GoldmontPlusProc<"goldmont-plus">; - -class TremontProc : ProcModel; -def : TremontProc<"tremont">; +def : ProcessorModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures>; +def : ProcessorModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures>; +def : ProcessorModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures>; // "Arrandale" along with corei3 and corei5 -class NehalemProc : ProcessorModel; -def : NehalemProc<"nehalem">; -def : NehalemProc<"corei7">; +foreach P = ["nehalem", "corei7"] in { + def : ProcessorModel; +} -// Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge -class WestmereProc : ProcessorModel; -def : WestmereProc<"westmere">; - -// SSE is not listed here since llvm treats AVX as a reimplementation of SSE, -// rather than a superset. -def SNBFeatures : ProcessorFeatures<[], [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePOPCNT, - FeatureSlowDivide64, - FeaturePCLMUL, - FeatureXSAVE, - FeatureXSAVEOPT, - FeatureLAHFSAHF, - FeatureSlow3OpsLEA, - FeatureFastScalarFSQRT, - FeatureFastSHLDRotate, - FeatureSlowIncDec, - FeatureMergeToThreeWayBranch, - FeatureMacroFusion -]>; - -class SandyBridgeProc : ProcModel; -def : SandyBridgeProc<"sandybridge">; -def : SandyBridgeProc<"corei7-avx">; // Legacy alias. - -def IVBFeatures : ProcessorFeatures; - -class IvyBridgeProc : ProcModel; -def : IvyBridgeProc<"ivybridge">; -def : IvyBridgeProc<"core-avx-i">; // Legacy alias. - -def HSWFeatures : ProcessorFeatures; - -class HaswellProc : ProcModel; -def : HaswellProc<"haswell">; -def : HaswellProc<"core-avx2">; // Legacy alias. 
+def : ProcessorModel<"westmere", SandyBridgeModel, + ProcessorFeatures.WSMFeatures>; -def BDWFeatures : ProcessorFeatures; -class BroadwellProc : ProcModel; -def : BroadwellProc<"broadwell">; - -def SKLFeatures : ProcessorFeatures; - -class SkylakeClientProc : ProcModel; -def : SkylakeClientProc<"skylake">; +foreach P = ["sandybridge", "corei7-avx"] in { + def : ProcessorModel; +} -def KNLFeatures : ProcessorFeatures<[], [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePOPCNT, - FeatureSlowDivide64, - FeaturePCLMUL, - FeatureXSAVE, - FeatureXSAVEOPT, - FeatureLAHFSAHF, - FeatureSlow3OpsLEA, - FeatureSlowIncDec, - FeatureAES, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureAVX512, - FeatureERI, - FeatureCDI, - FeaturePFI, - FeaturePREFETCHWT1, - FeatureADX, - FeatureRDSEED, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeaturePRFCHW -]>; +foreach P = ["ivybridge", "core-avx-i"] in { + def : ProcessorModel; +} -// FIXME: define KNL model -class KnightsLandingProc : ProcModel; -def : KnightsLandingProc<"knl">; - -class KnightsMillProc : ProcModel; -def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features - -def SKXFeatures : ProcessorFeatures; +foreach P = ["haswell", "core-avx2"] in { + def : ProcessorModel; +} -class SkylakeServerProc : ProcModel; -def : SkylakeServerProc<"skylake-avx512">; -def : SkylakeServerProc<"skx">; // Legacy alias. +def : ProcessorModel<"broadwell", BroadwellModel, + ProcessorFeatures.BDWFeatures>; -def CLXFeatures : ProcessorFeatures; +def : ProcessorModel<"skylake", SkylakeClientModel, + ProcessorFeatures.SKLFeatures>; -class CascadelakeProc : ProcModel; -def : CascadelakeProc<"cascadelake">; - -def CNLFeatures : ProcessorFeatures; +// FIXME: define KNL scheduler model +def : ProcessorModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures>; +def : ProcessorModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures>; -class CannonlakeProc : ProcModel; -def : CannonlakeProc<"cannonlake">; - -def ICLFeatures : ProcessorFeatures; - -class IcelakeClientProc : ProcModel; -def : IcelakeClientProc<"icelake-client">; +foreach P = ["skylake-avx512", "skx"] in { + def : ProcessorModel; +} -class IcelakeServerProc : ProcModel; -def : IcelakeServerProc<"icelake-server">; +def : ProcessorModel<"cascadelake", SkylakeServerModel, + ProcessorFeatures.CLXFeatures>; +def : ProcessorModel<"cooperlake", SkylakeServerModel, + ProcessorFeatures.CPXFeatures>; +def : ProcessorModel<"cannonlake", SkylakeServerModel, + ProcessorFeatures.CNLFeatures>; +def : ProcessorModel<"icelake-client", SkylakeServerModel, + ProcessorFeatures.ICLFeatures>; +def : ProcessorModel<"icelake-server", SkylakeServerModel, + ProcessorFeatures.ICXFeatures>; // AMD CPUs. 
-def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; -def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX]>; +def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + Feature3DNow]>; +def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + Feature3DNow]>; foreach P = ["athlon", "athlon-tbird"] in { - def : Proc; + def : Proc; } foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { - def : Proc; + def : Proc; } foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { - def : Proc; + def : Proc; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { - def : Proc; + def : Proc; } foreach P = ["amdfam10", "barcelona"] in { - def : Proc; + def : Proc; } // Bobcat -def : Proc<"btver1", [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureSSSE3, - FeatureSSE4A, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePRFCHW, - FeatureLZCNT, - FeaturePOPCNT, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast15ByteNOP -]>; - +def : Proc<"btver1", ProcessorFeatures.BtVer1Features>; // Jaguar -def : ProcessorModel<"btver2", BtVer2Model, [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - FeatureSSE4A, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePRFCHW, - FeatureAES, - FeaturePCLMUL, - FeatureBMI, - FeatureF16C, - FeatureMOVBE, - FeatureLZCNT, - FeatureFastLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureXSAVEOPT, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast15ByteNOP, - FeatureFastBEXTR, - FeatureFastPartialYMMorZMMWrite, - FeatureFastHorizontalOps -]>; +def : ProcessorModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features>; // Bulldozer -def : ProcessorModel<"bdver1", BdVer2Model, [ - FeatureX87, - FeatureCMOV, - FeatureXOP, - FeatureFMA4, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - FeatureSSE4A, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureLWP, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast11ByteNOP, - FeatureMacroFusion -]>; +def : ProcessorModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features>; // Piledriver -def : ProcessorModel<"bdver2", BdVer2Model, [ - FeatureX87, - FeatureCMOV, - FeatureXOP, - FeatureFMA4, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - FeatureSSE4A, - FeatureF16C, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureBMI, - FeatureTBM, - FeatureLWP, - FeatureFMA, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast11ByteNOP, - FeatureFastBEXTR, - FeatureMacroFusion -]>; - +def : ProcessorModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features>; // Steamroller -def : Proc<"bdver3", [ - FeatureX87, - FeatureCMOV, - FeatureXOP, - FeatureFMA4, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - FeatureSSE4A, - FeatureF16C, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureBMI, - FeatureTBM, - FeatureLWP, - FeatureFMA, - FeatureXSAVEOPT, - FeatureSlowSHLD, - FeatureFSGSBase, - FeatureLAHFSAHF, - FeatureFast11ByteNOP, - FeatureFastBEXTR, - FeatureMacroFusion -]>; - +def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>; // Excavator -def : Proc<"bdver4", [ - 
FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureAVX2, - FeatureFXSR, - FeatureNOPL, - FeatureXOP, - FeatureFMA4, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureF16C, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureBMI, - FeatureBMI2, - FeatureTBM, - FeatureLWP, - FeatureFMA, - FeatureXSAVEOPT, - FeatureSlowSHLD, - FeatureFSGSBase, - FeatureLAHFSAHF, - FeatureFastBEXTR, - FeatureFast11ByteNOP, - FeatureMWAITX, - FeatureMacroFusion -]>; +def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>; -// Znver1 -def: ProcessorModel<"znver1", Znver1Model, [ - FeatureADX, - FeatureAES, - FeatureAVX2, - FeatureBMI, - FeatureBMI2, - FeatureCLFLUSHOPT, - FeatureCLZERO, - FeatureCMOV, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureF16C, - FeatureFMA, - FeatureFSGSBase, - FeatureFXSR, - FeatureNOPL, - FeatureFastLZCNT, - FeatureLAHFSAHF, - FeatureLZCNT, - FeatureFastBEXTR, - FeatureFast15ByteNOP, - FeatureMacroFusion, - FeatureMMX, - FeatureMOVBE, - FeatureMWAITX, - FeaturePCLMUL, - FeaturePOPCNT, - FeaturePRFCHW, - FeatureRDRAND, - FeatureRDSEED, - FeatureSHA, - FeatureSSE4A, - FeatureSlowSHLD, - FeatureX87, - FeatureXSAVE, - FeatureXSAVEC, - FeatureXSAVEOPT, - FeatureXSAVES]>; +def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>; +def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>; -def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>; +def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureSSE1, FeatureFXSR, FeatureCMOV]>; +def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureSSE1, FeatureFXSR, + FeatureCMOV]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -1205,6 +1174,7 @@ def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, // forming a common base for them. def : ProcessorModel<"x86-64", SandyBridgeModel, [ FeatureX87, + FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2, diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 36cef98a1ef5..80120722e0e6 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,9 +12,10 @@ //===----------------------------------------------------------------------===// #include "X86AsmPrinter.h" -#include "InstPrinter/X86ATTInstPrinter.h" +#include "MCTargetDesc/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86TargetStreamer.h" +#include "TargetInfo/X86TargetInfo.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "llvm/BinaryFormat/COFF.h" @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -104,16 +105,16 @@ void X86AsmPrinter::EmitFunctionBodyEnd() { } } -/// printSymbolOperand - Print a raw symbol reference operand. This handles +/// PrintSymbolOperand - Print a raw symbol reference operand. This handles /// jump tables, constant pools, global address and external symbols, all of /// which print to a label with various suffixes for relocation types etc. -static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, - raw_ostream &O) { +void X86AsmPrinter::PrintSymbolOperand(const MachineOperand &MO, + raw_ostream &O) { switch (MO.getType()) { default: llvm_unreachable("unknown symbol type!"); case MachineOperand::MO_ConstantPoolIndex: - P.GetCPISymbol(MO.getIndex())->print(O, P.MAI); - P.printOffset(MO.getOffset(), O); + GetCPISymbol(MO.getIndex())->print(O, MAI); + printOffset(MO.getOffset(), O); break; case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); @@ -121,38 +122,37 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, MCSymbol *GVSym; if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) - GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + GVSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); else - GVSym = P.getSymbol(GV); + GVSym = getSymbol(GV); // Handle dllimport linkage. if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) - GVSym = - P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName()); + GVSym = OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName()); else if (MO.getTargetFlags() == X86II::MO_COFFSTUB) GVSym = - P.OutContext.getOrCreateSymbol(Twine(".refptr.") + GVSym->getName()); + OutContext.getOrCreateSymbol(Twine(".refptr.") + GVSym->getName()); if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) { - MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MCSymbol *Sym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = - P.MMI->getObjFileInfo().getGVStubEntry(Sym); + MMI->getObjFileInfo().getGVStubEntry(Sym); if (!StubSym.getPointer()) - StubSym = MachineModuleInfoImpl:: - StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); + StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), + !GV->hasInternalLinkage()); } // If the name begins with a dollar-sign, enclose it in parens. We do this // to avoid having it look like an integer immediate to the assembler. 
if (GVSym->getName()[0] != '$') - GVSym->print(O, P.MAI); + GVSym->print(O, MAI); else { O << '('; - GVSym->print(O, P.MAI); + GVSym->print(O, MAI); O << ')'; } - P.printOffset(MO.getOffset(), O); + printOffset(MO.getOffset(), O); break; } } @@ -169,13 +169,13 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, break; case X86II::MO_GOT_ABSOLUTE_ADDRESS: O << " + [.-"; - P.MF->getPICBaseSymbol()->print(O, P.MAI); + MF->getPICBaseSymbol()->print(O, MAI); O << ']'; break; case X86II::MO_PIC_BASE_OFFSET: case X86II::MO_DARWIN_NONLAZY_PIC_BASE: O << '-'; - P.MF->getPICBaseSymbol()->print(O, P.MAI); + MF->getPICBaseSymbol()->print(O, MAI); break; case X86II::MO_TLSGD: O << "@TLSGD"; break; case X86II::MO_TLSLD: O << "@TLSLD"; break; @@ -193,76 +193,91 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, case X86II::MO_TLVP: O << "@TLVP"; break; case X86II::MO_TLVP_PIC_BASE: O << "@TLVP" << '-'; - P.MF->getPICBaseSymbol()->print(O, P.MAI); + MF->getPICBaseSymbol()->print(O, MAI); break; case X86II::MO_SECREL: O << "@SECREL32"; break; } } -static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, - unsigned OpNo, raw_ostream &O, - const char *Modifier = nullptr, unsigned AsmVariant = 0); - -/// printPCRelImm - This is used to print an immediate value that ends up -/// being encoded as a pc-relative value. These print slightly differently, for -/// example, a $ is not emitted. -static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI, - unsigned OpNo, raw_ostream &O) { +void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); + const bool IsATT = MI->getInlineAsmDialect() == InlineAsm::AD_ATT; switch (MO.getType()) { - default: llvm_unreachable("Unknown pcrel immediate operand"); - case MachineOperand::MO_Register: - // pc-relativeness was handled when computing the value in the reg. - printOperand(P, MI, OpNo, O); + default: llvm_unreachable("unknown operand type!"); + case MachineOperand::MO_Register: { + if (IsATT) + O << '%'; + O << X86ATTInstPrinter::getRegisterName(MO.getReg()); return; + } + case MachineOperand::MO_Immediate: + if (IsATT) + O << '$'; O << MO.getImm(); return; - case MachineOperand::MO_GlobalAddress: - printSymbolOperand(P, MO, O); - return; + + case MachineOperand::MO_GlobalAddress: { + if (IsATT) + O << '$'; + PrintSymbolOperand(MO, O); + break; + } + case MachineOperand::MO_BlockAddress: { + MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress()); + Sym->print(O, MAI); + break; + } } } -static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, - unsigned OpNo, raw_ostream &O, const char *Modifier, - unsigned AsmVariant) { +/// PrintModifiedOperand - Print subregisters based on supplied modifier, +/// deferring to PrintOperand() if no modifier was supplied or if operand is not +/// a register. +void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { const MachineOperand &MO = MI->getOperand(OpNo); - switch (MO.getType()) { - default: llvm_unreachable("unknown operand type!"); - case MachineOperand::MO_Register: { - // FIXME: Enumerating AsmVariant, so we can remove magic number. - if (AsmVariant == 0) O << '%'; - unsigned Reg = MO.getReg(); - if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { - unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : - (strcmp(Modifier+6,"32") == 0) ? 32 : - (strcmp(Modifier+6,"16") == 0) ? 
16 : 8; - Reg = getX86SubSuperRegister(Reg, Size); - } - O << X86ATTInstPrinter::getRegisterName(Reg); - return; + if (!Modifier || MO.getType() != MachineOperand::MO_Register) + return PrintOperand(MI, OpNo, O); + if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT) + O << '%'; + unsigned Reg = MO.getReg(); + if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) { + unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : + (strcmp(Modifier+6,"32") == 0) ? 32 : + (strcmp(Modifier+6,"16") == 0) ? 16 : 8; + Reg = getX86SubSuperRegister(Reg, Size); } + O << X86ATTInstPrinter::getRegisterName(Reg); +} +/// PrintPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value. These print slightly differently, for +/// example, a $ is not emitted. +void X86AsmPrinter::PrintPCRelImm(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + default: llvm_unreachable("Unknown pcrel immediate operand"); + case MachineOperand::MO_Register: + // pc-relativeness was handled when computing the value in the reg. + PrintOperand(MI, OpNo, O); + return; case MachineOperand::MO_Immediate: - if (AsmVariant == 0) O << '$'; O << MO.getImm(); return; - - case MachineOperand::MO_GlobalAddress: { - if (AsmVariant == 0) O << '$'; - printSymbolOperand(P, MO, O); - break; - } + case MachineOperand::MO_GlobalAddress: + PrintSymbolOperand(MO, O); + return; } } -static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, - unsigned Op, raw_ostream &O, - const char *Modifier = nullptr) { - const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); - const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); - const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); +void X86AsmPrinter::PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { + const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg); + const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg); + const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp); // If we really don't want to print out (rip), don't. 
bool HasBaseReg = BaseReg.getReg() != 0; @@ -284,7 +299,8 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, } case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ConstantPoolIndex: - printSymbolOperand(P, DispSpec, O); + PrintSymbolOperand(DispSpec, O); + break; } if (Modifier && strcmp(Modifier, "H") == 0) @@ -296,12 +312,12 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, O << '('; if (HasBaseReg) - printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier); + PrintModifiedOperand(MI, OpNo + X86::AddrBaseReg, O, Modifier); if (IndexReg.getReg()) { O << ','; - printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier); - unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + PrintModifiedOperand(MI, OpNo + X86::AddrIndexReg, O, Modifier); + unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm(); if (ScaleVal != 1) O << ',' << ScaleVal; } @@ -309,31 +325,28 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, } } -static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI, - unsigned Op, raw_ostream &O, - const char *Modifier = nullptr) { - assert(isMem(*MI, Op) && "Invalid memory reference!"); - const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg); +void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { + assert(isMem(*MI, OpNo) && "Invalid memory reference!"); + const MachineOperand &Segment = MI->getOperand(OpNo + X86::AddrSegmentReg); if (Segment.getReg()) { - printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier); + PrintModifiedOperand(MI, OpNo + X86::AddrSegmentReg, O, Modifier); O << ':'; } - printLeaMemReference(P, MI, Op, O, Modifier); + PrintLeaMemReference(MI, OpNo, O, Modifier); } -static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, - unsigned Op, raw_ostream &O, - const char *Modifier = nullptr, - unsigned AsmVariant = 1) { - const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); - unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); - const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); - const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); - const MachineOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); +void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI, + unsigned OpNo, raw_ostream &O) { + const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg); + unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm(); + const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg); + const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp); + const MachineOperand &SegReg = MI->getOperand(OpNo + X86::AddrSegmentReg); // If this has a segment register, print it. 
if (SegReg.getReg()) { - printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant); + PrintOperand(MI, OpNo + X86::AddrSegmentReg, O); O << ':'; } @@ -341,7 +354,7 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, bool NeedPlus = false; if (BaseReg.getReg()) { - printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant); + PrintOperand(MI, OpNo + X86::AddrBaseReg, O); NeedPlus = true; } @@ -349,13 +362,13 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, if (NeedPlus) O << " + "; if (ScaleVal != 1) O << ScaleVal << '*'; - printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant); + PrintOperand(MI, OpNo + X86::AddrIndexReg, O); NeedPlus = true; } if (!DispSpec.isImm()) { if (NeedPlus) O << " + "; - printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant); + PrintOperand(MI, OpNo + X86::AddrDisp, O); } else { int64_t DispVal = DispSpec.getImm(); if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { @@ -418,7 +431,6 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, /// PrintAsmOperand - Print out an operand for an inline asm expression. /// bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { @@ -429,7 +441,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, switch (ExtraCode[0]) { default: // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); case 'a': // This is an address. Currently only 'i' and 'r' are expected. switch (MO.getType()) { default: @@ -442,13 +454,13 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case MachineOperand::MO_ExternalSymbol: llvm_unreachable("unexpected operand type!"); case MachineOperand::MO_GlobalAddress: - printSymbolOperand(*this, MO, O); + PrintSymbolOperand(MO, O); if (Subtarget->isPICStyleRIPRel()) O << "(%rip)"; return false; case MachineOperand::MO_Register: O << '('; - printOperand(*this, MI, OpNo, O); + PrintOperand(MI, OpNo, O); O << ')'; return false; } @@ -456,7 +468,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case 'c': // Don't print "$" before a global var name or constant. 
switch (MO.getType()) { default: - printOperand(*this, MI, OpNo, O); + PrintOperand(MI, OpNo, O); break; case MachineOperand::MO_Immediate: O << MO.getImm(); @@ -466,7 +478,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case MachineOperand::MO_ExternalSymbol: llvm_unreachable("unexpected operand type!"); case MachineOperand::MO_GlobalAddress: - printSymbolOperand(*this, MO, O); + PrintSymbolOperand(MO, O); break; } return false; @@ -474,7 +486,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case 'A': // Print '*' before a register (it must be a register) if (MO.isReg()) { O << '*'; - printOperand(*this, MI, OpNo, O); + PrintOperand(MI, OpNo, O); return false; } return true; @@ -487,11 +499,11 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case 'V': // Print native register without '%' if (MO.isReg()) return printAsmMRegister(*this, MO, ExtraCode[0], O); - printOperand(*this, MI, OpNo, O); + PrintOperand(MI, OpNo, O); return false; case 'P': // This is the operand of a call, treat specially. - printPCRelImm(*this, MI, OpNo, O); + PrintPCRelImm(MI, OpNo, O); return false; case 'n': // Negate the immediate or print a '-' before the operand. @@ -505,16 +517,15 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } } - printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant); + PrintOperand(MI, OpNo, O); return false; } -bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNo, unsigned AsmVariant, +bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) { - if (AsmVariant) { - printIntelMemReference(*this, MI, OpNo, O); + if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) { + PrintIntelMemReference(MI, OpNo, O); return false; } @@ -531,14 +542,14 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, // These only apply to registers, ignore on mem. break; case 'H': - printMemReference(*this, MI, OpNo, O, "H"); + PrintMemReference(MI, OpNo, O, "H"); return false; case 'P': // Don't print @PLT, but do print as memory. - printMemReference(*this, MI, OpNo, O, "no-rip"); + PrintMemReference(MI, OpNo, O, "no-rip"); return false; } } - printMemReference(*this, MI, OpNo, O); + PrintMemReference(MI, OpNo, O, nullptr); return false; } @@ -683,26 +694,31 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { // stripping. Since LLVM never generates code that does this, it is always // safe to set. OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - return; - } - - if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) { - StringRef SymbolName = - (TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused"; - MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName); - OutStreamer->EmitSymbolAttribute(S, MCSA_Global); - return; - } - - if (TT.isOSBinFormatCOFF()) { + } else if (TT.isOSBinFormatCOFF()) { + if (MMI->usesMSVCFloatingPoint()) { + // In Windows' libcmt.lib, there is a file which is linked in only if the + // symbol _fltused is referenced. Linking this in causes some + // side-effects: + // + // 1. For x86-32, it will set the x87 rounding mode to 53-bit instead of + // 64-bit mantissas at program start. + // + // 2. It links in support routines for floating-point in scanf and printf. + // + // MSVC emits an undefined reference to _fltused when there are any + // floating point operations in the program (including calls). 
A program + // that only has: `scanf("%f", &global_float);` may fail to trigger this, + // but oh well...that's a documented issue. + StringRef SymbolName = + (TT.getArch() == Triple::x86) ? "__fltused" : "_fltused"; + MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName); + OutStreamer->EmitSymbolAttribute(S, MCSA_Global); + return; + } emitStackMaps(SM); - return; - } - - if (TT.isOSBinFormatELF()) { + } else if (TT.isOSBinFormatELF()) { emitStackMaps(SM); FM.serializeToFaultMapSection(); - return; } } diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 55abdf2ba601..a011310970b3 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -1,9 +1,8 @@ //===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -103,6 +102,18 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // Choose between emitting .seh_ directives and .cv_fpo_ directives. void EmitSEHInstruction(const MachineInstr *MI); + void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override; + void PrintOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + void PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier); + void PrintPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + void PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier); + void PrintMemReference(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, + const char *Modifier); + void PrintIntelMemReference(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O); + public: X86AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer); @@ -124,11 +135,9 @@ public: } bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; bool doInitialization(Module &M) override { SMShadowTracker.reset(0); diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index 627a6cb14514..3dcc1015dc7c 100644 --- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -1,9 +1,8 @@ //===- X86AvoidStoreForwardingBlockis.cpp - Avoid HW Store Forward Block --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -69,9 +68,7 @@ using DisplacementSizeMap = std::map; class X86AvoidSFBPass : public MachineFunctionPass { public: static char ID; - X86AvoidSFBPass() : MachineFunctionPass(ID) { - initializeX86AvoidSFBPassPass(*PassRegistry::getPassRegistry()); - } + X86AvoidSFBPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 Avoid Store Forwarding Blocks"; @@ -343,6 +340,8 @@ findPotentialBlockers(MachineInstr *LoadInst) { for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)), E = LoadInst->getParent()->rend(); PBInst != E; ++PBInst) { + if (PBInst->isMetaInstruction()) + continue; BlockCount++; if (BlockCount >= InspectionLimit) break; @@ -366,6 +365,8 @@ findPotentialBlockers(MachineInstr *LoadInst) { for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(), PME = PMBB->rend(); PBInst != PME; ++PBInst) { + if (PBInst->isMetaInstruction()) + continue; PredCount++; if (PredCount >= LimitLeft) break; @@ -407,7 +408,10 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, // If the load and store are consecutive, use the loadInst location to // reduce register pressure. MachineInstr *StInst = StoreInst; - if (StoreInst->getPrevNode() == LoadInst) + auto PrevInstrIt = skipDebugInstructionsBackward( + std::prev(MachineBasicBlock::instr_iterator(StoreInst)), + MBB->instr_begin()); + if (PrevInstrIt.getNodePtr() == LoadInst) StInst = LoadInst; MachineInstr *NewStore = BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode)) @@ -492,19 +496,22 @@ void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst, static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) { MachineOperand &LoadBase = getBaseOperand(LoadInst); MachineOperand &StoreBase = getBaseOperand(StoreInst); + auto StorePrevNonDbgInstr = skipDebugInstructionsBackward( + std::prev(MachineBasicBlock::instr_iterator(StoreInst)), + LoadInst->getParent()->instr_begin()).getNodePtr(); if (LoadBase.isReg()) { MachineInstr *LastLoad = LoadInst->getPrevNode(); // If the original load and store to xmm/ymm were consecutive // then the partial copies were also created in // a consecutive order to reduce register pressure, // and the location of the last load is before the last store. 
-    if (StoreInst->getPrevNode() == LoadInst)
+    if (StorePrevNonDbgInstr == LoadInst)
       LastLoad = LoadInst->getPrevNode()->getPrevNode();
     getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
   }
   if (StoreBase.isReg()) {
     MachineInstr *StInst = StoreInst;
-    if (StoreInst->getPrevNode() == LoadInst)
+    if (StorePrevNonDbgInstr == LoadInst)
       StInst = LoadInst;
     getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
   }
@@ -531,7 +538,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
       if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
         continue;
       int DefVR = MI.getOperand(0).getReg();
-      if (!MRI->hasOneUse(DefVR))
+      if (!MRI->hasOneNonDBGUse(DefVR))
         continue;
       for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
            UI != UE;) {
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 24d7a219e751..4df849a2e14c 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -1,9 +1,8 @@
 //===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -60,10 +59,7 @@ namespace {
 class X86CallFrameOptimization : public MachineFunctionPass {
 public:
-  X86CallFrameOptimization() : MachineFunctionPass(ID) {
-    initializeX86CallFrameOptimizationPass(
-        *PassRegistry::getPassRegistry());
-  }
+  X86CallFrameOptimization() : MachineFunctionPass(ID) { }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index 1dc83b76595d..b16b3839c85a 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -1,9 +1,8 @@
 //===- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering ------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -48,8 +47,6 @@ using namespace llvm;
 
-#include "X86GenCallingConv.inc"
-
 X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
     : CallLowering(&TLI) {}
 
@@ -64,6 +61,7 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
   SmallVector<EVT, 4> SplitVTs;
   SmallVector<uint64_t, 4> Offsets;
   ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+  assert(OrigArg.Regs.size() == 1 && "Can't handle multiple regs yet");
 
   if (OrigArg.Ty->isVoidTy())
     return true;
@@ -73,12 +71,12 @@
 
   if (NumParts == 1) {
     // replace the original type ( pointer -> GPR ).
- SplitArgs.emplace_back(OrigArg.Reg, VT.getTypeForEVT(Context), + SplitArgs.emplace_back(OrigArg.Regs[0], VT.getTypeForEVT(Context), OrigArg.Flags, OrigArg.IsFixed); return true; } - SmallVector SplitRegs; + SmallVector SplitRegs; EVT PartVT = TLI.getRegisterType(Context, VT); Type *PartTy = PartVT.getTypeForEVT(Context); @@ -88,7 +86,7 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)), PartTy, OrigArg.Flags}; SplitArgs.push_back(Info); - SplitRegs.push_back(Info.Reg); + SplitRegs.push_back(Info.Regs[0]); } PerformArgSplit(SplitRegs); @@ -104,28 +102,28 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { DL(MIRBuilder.getMF().getDataLayout()), STI(MIRBuilder.getMF().getSubtarget()) {} - unsigned getStackAddress(uint64_t Size, int64_t Offset, + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0)); LLT SType = LLT::scalar(DL.getPointerSizeInBits(0)); - unsigned SPReg = MRI.createGenericVirtualRegister(p0); + Register SPReg = MRI.createGenericVirtualRegister(p0); MIRBuilder.buildCopy(SPReg, STI.getRegisterInfo()->getStackRegister()); - unsigned OffsetReg = MRI.createGenericVirtualRegister(SType); + Register OffsetReg = MRI.createGenericVirtualRegister(SType); MIRBuilder.buildConstant(OffsetReg, Offset); - unsigned AddrReg = MRI.createGenericVirtualRegister(p0); + Register AddrReg = MRI.createGenericVirtualRegister(p0); MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg); MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); return AddrReg; } - void assignValueToReg(unsigned ValVReg, unsigned PhysReg, + void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { MIB.addUse(PhysReg, RegState::Implicit); - unsigned ExtReg; + Register ExtReg; // If we are copying the value to a physical register with the // size larger than the size of the value itself - build AnyExt // to the size of the register first and only then do the copy. 
@@ -146,12 +144,12 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { MIRBuilder.buildCopy(PhysReg, ExtReg); } - void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { - unsigned ExtReg = extendRegister(ValVReg, VA); + Register ExtReg = extendRegister(ValVReg, VA); auto MMO = MIRBuilder.getMF().getMachineMemOperand( MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(), - /* Alignment */ 0); + /* Alignment */ 1); MIRBuilder.buildStore(ExtReg, Addr, *MMO); } @@ -185,7 +183,7 @@ protected: bool X86CallLowering::lowerReturn( MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const { + ArrayRef VRegs) const { assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) && "Return value without a vreg"); auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0); @@ -208,7 +206,7 @@ bool X86CallLowering::lowerReturn( ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)}; setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); if (!splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, - [&](ArrayRef Regs) { + [&](ArrayRef Regs) { MIRBuilder.buildUnmerge(Regs, VRegs[i]); })) return false; @@ -231,7 +229,9 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { : ValueHandler(MIRBuilder, MRI, AssignFn), DL(MIRBuilder.getMF().getDataLayout()) {} - unsigned getStackAddress(uint64_t Size, int64_t Offset, + bool isArgumentHandler() const override { return true; } + + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { auto &MFI = MIRBuilder.getMF().getFrameInfo(); int FI = MFI.CreateFixedObject(Size, Offset, true); @@ -243,15 +243,15 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { return AddrReg; } - void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { auto MMO = MIRBuilder.getMF().getMachineMemOperand( MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, - 0); + 1); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); } - void assignValueToReg(unsigned ValVReg, unsigned PhysReg, + void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { markPhysRegUsed(PhysReg); @@ -320,9 +320,9 @@ protected: } // end anonymous namespace -bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, - const Function &F, - ArrayRef VRegs) const { +bool X86CallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef> VRegs) const { if (F.arg_empty()) return true; @@ -344,14 +344,14 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, Arg.hasAttribute(Attribute::StructRet) || Arg.hasAttribute(Attribute::SwiftSelf) || Arg.hasAttribute(Attribute::SwiftError) || - Arg.hasAttribute(Attribute::Nest)) + Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1) return false; ArgInfo OrigArg(VRegs[Idx], Arg.getType()); setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F); if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](ArrayRef Regs) { - MIRBuilder.buildMerge(VRegs[Idx], Regs); + [&](ArrayRef Regs) { + MIRBuilder.buildMerge(VRegs[Idx][0], Regs); })) return false; Idx++; @@ -409,9 +409,12 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if 
(OrigArg.Flags.isByVal()) return false; + if (OrigArg.Regs.size() > 1) + return false; + if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](ArrayRef Regs) { - MIRBuilder.buildUnmerge(Regs, OrigArg.Reg); + [&](ArrayRef Regs) { + MIRBuilder.buildUnmerge(Regs, OrigArg.Regs[0]); })) return false; } @@ -451,12 +454,15 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. - if (OrigRet.Reg) { + if (!OrigRet.Ty->isVoidTy()) { + if (OrigRet.Regs.size() > 1) + return false; + SplitArgs.clear(); - SmallVector NewRegs; + SmallVector NewRegs; if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI, - [&](ArrayRef Regs) { + [&](ArrayRef Regs) { NewRegs.assign(Regs.begin(), Regs.end()); })) return false; @@ -466,7 +472,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; if (!NewRegs.empty()) - MIRBuilder.buildMerge(OrigRet.Reg, NewRegs); + MIRBuilder.buildMerge(OrigRet.Regs[0], NewRegs); } CallSeqStart.addImm(Handler.getStackSize()) diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index f5f8f9a3ef6d..0445331bc3ff 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -1,9 +1,8 @@ //===- llvm/lib/Target/X86/X86CallLowering.h - Call lowering ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -30,10 +29,10 @@ public: X86CallLowering(const X86TargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const override; + ArrayRef VRegs) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef VRegs) const override; + ArrayRef> VRegs) const override; bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, const MachineOperand &Callee, const ArgInfo &OrigRet, @@ -41,7 +40,7 @@ public: private: /// A function of this type is used to perform value split action. - using SplitArgTy = std::function)>; + using SplitArgTy = std::function)>; bool splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl &SplitArgs, diff --git a/lib/Target/X86/X86CallingConv.cpp b/lib/Target/X86/X86CallingConv.cpp index 59dde982f512..aee344a26764 100644 --- a/lib/Target/X86/X86CallingConv.cpp +++ b/lib/Target/X86/X86CallingConv.cpp @@ -1,9 +1,8 @@ //=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,16 +11,23 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86CallingConv.h" #include "X86Subtarget.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/IR/CallingConv.h" -namespace llvm { - -bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { +using namespace llvm; + +/// When regcall calling convention compiled to 32 bit arch, special treatment +/// is required for 64 bit masks. +/// The value should be assigned to two GPRs. +/// \return true if registers were allocated and false otherwise. +static bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { // List of GPR registers that are available to store values in regcall // calling convention. static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI, @@ -113,9 +119,15 @@ static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT, return false; } -bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { +/// Vectorcall calling convention has special handling for vector types or +/// HVA for 64 bit arch. +/// For HVAs shadow registers might be allocated on the first pass +/// and actual XMM registers are allocated on the second pass. +/// For vector types, actual XMM registers are allocated on the first pass. +/// \return true if registers were allocated and false otherwise. +static bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { // On the second pass, go through the HVAs only. if (ArgFlags.isSecArgPass()) { if (ArgFlags.isHva()) @@ -150,7 +162,10 @@ bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, // created on top of the basic 32 bytes of win64. // It can happen if the fifth or sixth argument is vector type or HVA. // At that case for each argument a shadow stack of 8 bytes is allocated. - if (Reg == X86::XMM4 || Reg == X86::XMM5) + const TargetRegisterInfo *TRI = + State.getMachineFunction().getSubtarget().getRegisterInfo(); + if (TRI->regsOverlap(Reg, X86::XMM4) || + TRI->regsOverlap(Reg, X86::XMM5)) State.AllocateStack(8, 8); if (!ArgFlags.isHva()) { @@ -165,9 +180,14 @@ bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return ArgFlags.isHva(); } -bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { +/// Vectorcall calling convention has special handling for vector types or +/// HVA for 32 bit arch. +/// For HVAs actual XMM registers are allocated on the second pass. +/// For vector types, actual XMM registers are allocated on the first pass. +/// \return true if registers were allocated and false otherwise. +static bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { // On the second pass, go through the HVAs only. 
if (ArgFlags.isSecArgPass()) { if (ArgFlags.isHva()) @@ -205,4 +225,110 @@ bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return false; // No register was assigned - Continue the search. } -} // End llvm namespace +static bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, + CCValAssign::LocInfo &, ISD::ArgFlagsTy &, + CCState &) { + llvm_unreachable("The AnyReg calling convention is only supported by the " + "stackmap and patchpoint intrinsics."); + // gracefully fallback to X86 C calling convention on Release builds. + return false; +} + +static bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure + // not to split i64 and double between a register and stack + static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; + static const unsigned NumRegs = sizeof(RegList) / sizeof(RegList[0]); + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // If this is the first part of a double/i64/i128, or if we're already + // in the middle of a split, add to the pending list. If this is not + // the end of the split, return, otherwise go on to process the pending + // list + if (ArgFlags.isSplit() || !PendingMembers.empty()) { + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + if (!ArgFlags.isSplitEnd()) + return true; + } + + // If there are no pending members, we are not in the middle of a split, + // so do the usual inreg stuff. + if (PendingMembers.empty()) { + if (unsigned Reg = State.AllocateReg(RegList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; + } + + assert(ArgFlags.isSplitEnd()); + + // We now have the entire original argument in PendingMembers, so decide + // whether to use registers or the stack. + // Per the MCU ABI: + // a) To use registers, we need to have enough of them free to contain + // the entire argument. + // b) We never want to use more than 2 registers for a single argument. + + unsigned FirstFree = State.getFirstUnallocated(RegList); + bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); + + for (auto &It : PendingMembers) { + if (UseRegs) + It.convertToReg(State.AllocateReg(RegList[FirstFree++])); + else + It.convertToMem(State.AllocateStack(4, 4)); + State.addLoc(It); + } + + PendingMembers.clear(); + + return true; +} + +/// X86 interrupt handlers can only take one or two stack arguments, but if +/// there are two arguments, they are in the opposite order from the standard +/// convention. Therefore, we have to look at the argument count up front before +/// allocating stack for each argument. +static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + const MachineFunction &MF = State.getMachineFunction(); + size_t ArgCount = State.getMachineFunction().getFunction().arg_size(); + bool Is64Bit = static_cast<const X86Subtarget &>(MF.getSubtarget()).is64Bit(); + unsigned SlotSize = Is64Bit ? 8 : 4; + unsigned Offset; + if (ArgCount == 1 && ValNo == 0) { + // If we have one argument, the argument is five stack slots big, at fixed + // offset zero. + Offset = State.AllocateStack(5 * SlotSize, 4); + } else if (ArgCount == 2 && ValNo == 0) { + // If we have two arguments, the stack slot is *after* the error code + // argument.
Pretend it doesn't consume stack space, and account for it when + // we assign the second argument. + Offset = SlotSize; + } else if (ArgCount == 2 && ValNo == 1) { + // If this is the second of two arguments, it must be the error code. It + // appears first on the stack, and is then followed by the five slot + // interrupt struct. + Offset = 0; + (void)State.AllocateStack(6 * SlotSize, 4); + } else { + report_fatal_error("unsupported x86 interrupt prototype"); + } + + // FIXME: This should be accounted for in + // X86FrameLowering::getFrameIndexReference, not here. + if (Is64Bit && ArgCount == 2) + Offset += SlotSize; + + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return true; +} + +// Provides entry points of CC_X86 and RetCC_X86. +#include "X86GenCallingConv.inc" diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h index d0fcbd313312..191e0fa619b2 100644 --- a/lib/Target/X86/X86CallingConv.h +++ b/lib/Target/X86/X86CallingConv.h @@ -1,9 +1,8 @@ //=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -21,99 +20,12 @@ namespace llvm { -/// When regcall calling convention compiled to 32 bit arch, special treatment -/// is required for 64 bit masks. -/// The value should be assigned to two GPRs. -/// \return true if registers were allocated and false otherwise. -bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State); - -/// Vectorcall calling convention has special handling for vector types or -/// HVA for 64 bit arch. -/// For HVAs shadow registers might be allocated on the first pass -/// and actual XMM registers are allocated on the second pass. -/// For vector types, actual XMM registers are allocated on the first pass. -/// \return true if registers were allocated and false otherwise. -bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State); - -/// Vectorcall calling convention has special handling for vector types or -/// HVA for 32 bit arch. -/// For HVAs actual XMM registers are allocated on the second pass. -/// For vector types, actual XMM registers are allocated on the first pass. -/// \return true if registers were allocated and false otherwise. -bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State); - -inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, - CCValAssign::LocInfo &, ISD::ArgFlagsTy &, - CCState &) { - llvm_unreachable("The AnyReg calling convention is only supported by the " \ - "stackmap and patchpoint intrinsics."); - // gracefully fallback to X86 C calling convention on Release builds. 
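// For illustration only, not part of this patch: the two source-level shapes
// CC_X86_Intr is lowering, written with the Clang/GCC x86 "interrupt"
// attribute. The function names are hypothetical; on a 64-bit target the
// error code argument must be word-sized.
struct interrupt_frame;
__attribute__((interrupt)) void isr(struct interrupt_frame *frame) {}
__attribute__((interrupt)) void isr_err(struct interrupt_frame *frame,
                                        unsigned long error_code) {
  // The CPU pushes error_code after the interrupt frame, so it sits at a
  // lower stack address -- the reversed argument order handled above.
}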
- return false; -} - -inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure - // not to split i64 and double between a register and stack - static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; - static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); - - SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); - - // If this is the first part of an double/i64/i128, or if we're already - // in the middle of a split, add to the pending list. If this is not - // the end of the split, return, otherwise go on to process the pending - // list - if (ArgFlags.isSplit() || !PendingMembers.empty()) { - PendingMembers.push_back( - CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); - if (!ArgFlags.isSplitEnd()) - return true; - } - - // If there are no pending members, we are not in the middle of a split, - // so do the usual inreg stuff. - if (PendingMembers.empty()) { - if (unsigned Reg = State.AllocateReg(RegList)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return true; - } - return false; - } - - assert(ArgFlags.isSplitEnd()); - - // We now have the entire original argument in PendingMembers, so decide - // whether to use registers or the stack. - // Per the MCU ABI: - // a) To use registers, we need to have enough of them free to contain - // the entire argument. - // b) We never want to use more than 2 registers for a single argument. - - unsigned FirstFree = State.getFirstUnallocated(RegList); - bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); - - for (auto &It : PendingMembers) { - if (UseRegs) - It.convertToReg(State.AllocateReg(RegList[FirstFree++])); - else - It.convertToMem(State.AllocateStack(4, 4)); - State.addLoc(It); - } - - PendingMembers.clear(); +bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); - return true; -} +bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); } // End llvm namespace diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index fe49c9ffbd95..1c3034a5116a 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -1,9 +1,8 @@ //===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -148,7 +147,8 @@ def CC_#NAME : CallingConv<[ CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>> + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> ]>; def RetCC_#NAME : CallingConv<[ @@ -477,6 +477,7 @@ def RetCC_X86_64 : CallingConv<[ ]>; // This is the return-value convention used for the entire X86 backend.
+let Entry = 1 in def RetCC_X86 : CallingConv<[ // Check if this is the Intel OpenCL built-ins calling convention @@ -567,7 +568,7 @@ def CC_X86_64_C : CallingConv<[ CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -612,7 +613,7 @@ def CC_X86_Win64_C : CallingConv<[ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, // 512 bit vectors are passed by pointer - CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, + CCIfType<[v64i8, v32i16, v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect<i64>>, @@ -985,14 +986,6 @@ def CC_Intel_OCL_BI : CallingConv<[ CCDelegateTo<CC_X86_32_C> ]>; -def CC_X86_32_Intr : CallingConv<[ - CCAssignToStack<4, 4> -]>; - -def CC_X86_64_Intr : CallingConv<[ - CCAssignToStack<8, 8> -]>; - //===----------------------------------------------------------------------===// // X86 Root Argument Calling Conventions //===----------------------------------------------------------------------===// @@ -1001,7 +994,7 @@ def CC_X86_64_Intr : CallingConv<[ def CC_X86_32 : CallingConv<[ // X86_INTR calling convention is valid in MCU target and should override the // MCU calling convention. Thus, this should be checked before isTargetMCU(). - CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, + CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>, CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, @@ -1029,7 +1022,7 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_RegCall", CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>, CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>, - CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, + CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, @@ -1039,6 +1032,7 @@ def CC_X86_64 : CallingConv<[ ]>; // This is the argument convention used for the entire X86 backend. +let Entry = 1 in def CC_X86 : CallingConv<[ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp index c3e76fd2a856..a61fa3246f09 100644 --- a/lib/Target/X86/X86CmovConversion.cpp +++ b/lib/Target/X86/X86CmovConversion.cpp @@ -1,9 +1,8 @@ //====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -102,9 +101,7 @@ namespace { /// Converts X86 cmov instructions into branches when profitable.
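// A standalone sketch, not from this patch, of the kind of code this pass
// rewrites: a CMOV whose inputs form a long dependency chain can lose to a
// well-predicted branch plus a PHI across two new basic blocks.
int select_max(int a, int b) {
  return a >= b ? a : b; // usually selected as CMOVcc; converted to a
                         // compare-and-branch diamond when judged profitable
}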
class X86CmovConverterPass : public MachineFunctionPass { public: - X86CmovConverterPass() : MachineFunctionPass(ID) { - initializeX86CmovConverterPassPass(*PassRegistry::getPassRegistry()); - } + X86CmovConverterPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 cmov Conversion"; } bool runOnMachineFunction(MachineFunction &MF) override; @@ -281,7 +278,8 @@ bool X86CmovConverterPass::collectCmovCandidates( Group.clear(); // Condition code of first CMOV instruction current processed range and its // opposite condition code. - X86::CondCode FirstCC, FirstOppCC, MemOpCC; + X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID, + MemOpCC = X86::COND_INVALID; // Indicator of a non CMOVrr instruction in the current processed range. bool FoundNonCMOVInst = false; // Indicator for current processed CMOV-group if it should be skipped. @@ -291,7 +289,7 @@ bool X86CmovConverterPass::collectCmovCandidates( // Skip debug instructions. if (I.isDebugInstr()) continue; - X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode()); + X86::CondCode CC = X86::getCondFromCMov(I); // Check if we found a X86::CMOVrr instruction. if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) { if (Group.empty()) { @@ -546,7 +544,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( } unsigned CondCost = - DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth; + DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth; unsigned ValCost = getDepthOfOptCmov( DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth, DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth); @@ -594,7 +592,7 @@ static bool checkEFLAGSLive(MachineInstr *MI) { /// move all debug instructions to after the last CMOV instruction, making the /// CMOV group consecutive. static void packCmovGroup(MachineInstr *First, MachineInstr *Last) { - assert(X86::getCondFromCMovOpc(Last->getOpcode()) != X86::COND_INVALID && + assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID && "Last instruction in a CMOV group must be a CMOV instruction"); SmallVector<MachineInstr *, 2> DBGInstructions; @@ -652,14 +650,14 @@ void X86CmovConverterPass::convertCmovInstsToBranches( MachineInstr *LastCMOV = Group.back(); DebugLoc DL = MI.getDebugLoc(); - X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode())); + X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI)); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); // Potentially swap the condition codes so that any memory operand to a CMOV // is in the *false* position instead of the *true* position. We can invert // any non-memory operand CMOV instructions to cope with this and we ensure // memory operand CMOVs are only included with a single condition code. if (llvm::any_of(Group, [&](MachineInstr *I) { - return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC; + return I->mayLoad() && X86::getCondFromCMov(*I) == CC; })) std::swap(CC, OppCC); @@ -690,7 +688,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( MBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. - BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB); + BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); // Add the sink block to the false block successors. FalseMBB->addSuccessor(SinkMBB); @@ -713,8 +711,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( if (!MI.mayLoad()) { // Remember the false-side register input.
unsigned FalseReg = - MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2) - .getReg(); + MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg(); // Walk back through any intermediate cmovs referenced. while (true) { auto FRIt = FalseBBRegRewriteTable.find(FalseReg); @@ -729,7 +726,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // The condition must be the *opposite* of the one we've decided to branch // on as the branch will go *around* the load and the load should happen // when the CMOV condition is false. - assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC && + assert(X86::getCondFromCMov(MI) == OppCC && "Can only handle memory-operand cmov instructions with a condition " "opposite to the selected branch direction."); @@ -768,7 +765,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // Move the new CMOV to just before the old one and reset any impacted // iterator. auto *NewCMOV = NewMIs.pop_back_val(); - assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC && + assert(X86::getCondFromCMov(*NewCMOV) == OppCC && "Last new instruction isn't the expected CMOV!"); LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump()); MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV); @@ -820,7 +817,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // If this CMOV we are processing is the opposite condition from the jump we // generated, then we have to swap the operands for the PHI that is going to // be generated. - if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC) + if (X86::getCondFromCMov(*MIIt) == OppCC) std::swap(Op1Reg, Op2Reg); auto Op1Itr = RegRewriteTable.find(Op1Reg); diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp index 7ce443c4656a..9dea94f1368d 100644 --- a/lib/Target/X86/X86CondBrFolding.cpp +++ b/lib/Target/X86/X86CondBrFolding.cpp @@ -1,9 +1,8 @@ //===---- X86CondBrFolding.cpp - optimize conditional branches ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This file defines a pass that optimizes condition branches on x86 by taking @@ -62,9 +61,7 @@ STATISTIC(NumFixedCondBrs, "Number of x86 condbr folded"); namespace { class X86CondBrFoldingPass : public MachineFunctionPass { public: - X86CondBrFoldingPass() : MachineFunctionPass(ID) { - initializeX86CondBrFoldingPassPass(*PassRegistry::getPassRegistry()); - } + X86CondBrFoldingPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 CondBr Folding"; } bool runOnMachineFunction(MachineFunction &MF) override; @@ -226,10 +223,9 @@ void X86CondBrFolding::replaceBrDest(MachineBasicBlock *MBB, MachineInstr *BrMI; if (MBBInfo->TBB == OrigDest) { BrMI = MBBInfo->BrInstr; - unsigned JNCC = GetCondBranchFromCond(MBBInfo->BranchCode); MachineInstrBuilder MIB = - BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(JNCC)) - .addMBB(NewDest); + BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(X86::JCC_1)) + .addMBB(NewDest).addImm(MBBInfo->BranchCode); MBBInfo->TBB = NewDest; MBBInfo->BrInstr = MIB.getInstr(); } else { // Should be the unconditional jump stmt. 
@@ -255,8 +251,8 @@ void X86CondBrFolding::fixupModifiedCond(MachineBasicBlock *MBB) { MachineInstr *BrMI = MBBInfo->BrInstr; X86::CondCode CC = MBBInfo->BranchCode; MachineInstrBuilder MIB = BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), - TII->get(GetCondBranchFromCond(CC))) - .addMBB(MBBInfo->TBB); + TII->get(X86::JCC_1)) + .addMBB(MBBInfo->TBB).addImm(CC); BrMI->eraseFromParent(); MBBInfo->BrInstr = MIB.getInstr(); @@ -324,8 +320,8 @@ void X86CondBrFolding::optimizeCondBr( llvm_unreachable("unexpected condtional code."); } BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI), - TII->get(GetCondBranchFromCond(NewCC))) - .addMBB(RootMBBInfo->FBB); + TII->get(X86::JCC_1)) + .addMBB(RootMBBInfo->FBB).addImm(NewCC); // RootMBB: Jump to TargetMBB BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI), @@ -513,7 +509,7 @@ X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) { if (I->isBranch()) { if (TBB) return nullptr; - CC = X86::getCondFromBranchOpc(I->getOpcode()); + CC = X86::getCondFromBranch(*I); switch (CC) { default: return nullptr; diff --git a/lib/Target/X86/X86DiscriminateMemOps.cpp b/lib/Target/X86/X86DiscriminateMemOps.cpp index 3654bf04f4e9..7051550d52e6 100644 --- a/lib/Target/X86/X86DiscriminateMemOps.cpp +++ b/lib/Target/X86/X86DiscriminateMemOps.cpp @@ -1,9 +1,8 @@ //===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -27,6 +26,22 @@ using namespace llvm; #define DEBUG_TYPE "x86-discriminate-memops" +static cl::opt<bool> EnableDiscriminateMemops( + DEBUG_TYPE, cl::init(false), + cl::desc("Generate unique debug info for each instruction with a memory " + "operand. Should be enabled for profile-driven cache prefetching, " + "both in the build of the binary being profiled, as well as in " + "the build of the binary consuming the profile."), + cl::Hidden); + +static cl::opt<bool> BypassPrefetchInstructions( + "x86-bypass-prefetch-instructions", cl::init(true), + cl::desc("When discriminating instructions with memory operands, ignore " + "prefetch instructions. This ensures the other memory operand " + "instructions have the same identifiers after inserting " + "prefetches, allowing for successive insertions."), + cl::Hidden); + namespace { using Location = std::pair<StringRef, unsigned>; @@ -55,6 +70,10 @@ public: X86DiscriminateMemOps(); }; +bool IsPrefetchOpcode(unsigned Opcode) { + return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 || + Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2; +} } // end anonymous namespace //===----------------------------------------------------------------------===// @@ -67,6 +86,9 @@ char X86DiscriminateMemOps::ID = 0; X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {} bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { + if (!EnableDiscriminateMemops) + return false; + DISubprogram *FDI = MF.getFunction().getSubprogram(); if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling()) return false; @@ -75,7 +97,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { // have any debug info.
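// Why prefetches are bypassed by default (illustration, not from this patch):
// if inserted prefetches were themselves given discriminators, every memory
// instruction after an insertion point would be renumbered, and a second
// profiling round could no longer match the identifiers from the first, e.g.:
//   movl (%rdi), %eax       ; discriminated
//   prefetcht0 64(%rdi)     ; inserted by a prior round, ignored here
//   movl 4(%rdi), %ecx      ; keeps the identifier it had before insertion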
const DILocation *ReferenceDI = DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI); - + assert(ReferenceDI && "ReferenceDI should not be nullptr"); DenseMap<Location, unsigned> MemOpDiscriminators; MemOpDiscriminators[diToLocation(ReferenceDI)] = 0; @@ -88,6 +110,8 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { const auto &DI = MI.getDebugLoc(); if (!DI) continue; + if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) + continue; Location Loc = diToLocation(DI); MemOpDiscriminators[Loc] = std::max(MemOpDiscriminators[Loc], DI->getBaseDiscriminator()); @@ -104,15 +128,18 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { for (auto &MI : MBB) { if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0) continue; + if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) + continue; const DILocation *DI = MI.getDebugLoc(); - if (!DI) { + bool HasDebug = DI; + if (!HasDebug) { DI = ReferenceDI; } Location L = diToLocation(DI); DenseSet<unsigned> &Set = Seen[L]; const std::pair<DenseSet<unsigned>::iterator, bool> TryInsert = Set.insert(DI->getBaseDiscriminator()); - if (!TryInsert.second) { + if (!TryInsert.second || !HasDebug) { unsigned BF, DF, CI = 0; DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI); Optional<unsigned> EncodedDiscriminator = DILocation::encodeDiscriminator( @@ -133,6 +160,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { // Since we were able to encode, bump the MemOpDiscriminators. ++MemOpDiscriminators[L]; DI = DI->cloneWithDiscriminator(EncodedDiscriminator.getValue()); + assert(DI && "DI should not be nullptr"); updateDebugInfo(&MI, DI); Changed = true; std::pair<DenseSet<unsigned>::iterator, bool> MustInsert = diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index d9ebbb506ca4..18bbfa32e11b 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -1,9 +1,8 @@ //===--- X86DomainReassignment.cpp - Selectively switch register classes---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -387,9 +386,7 @@ class X86DomainReassignment : public MachineFunctionPass { public: static char ID; - X86DomainReassignment() : MachineFunctionPass(ID) { - initializeX86DomainReassignmentPass(*PassRegistry::getPassRegistry()); - } + X86DomainReassignment() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -557,6 +554,7 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) { // Register already in this closure.
if (!C.insertEdge(CurReg)) continue; + EnclosedEdges.insert(Reg); MachineInstr *DefMI = MRI->getVRegDef(CurReg); encloseInstr(C, DefMI); diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp index 80674c7251fe..58680f1815bb 100755 --- a/lib/Target/X86/X86EvexToVex.cpp +++ b/lib/Target/X86/X86EvexToVex.cpp @@ -1,10 +1,9 @@ //===- X86EvexToVex.cpp ---------------------------------------------------===// // Compress EVEX instructions to VEX encoding when possible to reduce code size // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,15 +12,15 @@ /// are encoded using the EVEX prefix and if possible replaces them by their /// corresponding VEX encoding which is usually shorter by 2 bytes. /// EVEX instructions may be encoded via the VEX prefix when the AVX-512 -/// instruction has a corresponding AVX/AVX2 opcode and when it does not -/// use the xmm or the mask registers or xmm/ymm registers with indexes -/// higher than 15. +/// instruction has a corresponding AVX/AVX2 opcode, when vector length +/// accessed by instruction is less than 512 bits and when it does not use +// the xmm or the mask registers or xmm/ymm registers with indexes higher than 15. /// The pass applies code reduction on the generated code for AVX-512 instrs. // //===----------------------------------------------------------------------===// -#include "InstPrinter/X86InstComments.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86InstComments.h" #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" @@ -69,9 +68,7 @@ class EvexToVexInstPass : public MachineFunctionPass { public: static char ID; - EvexToVexInstPass() : MachineFunctionPass(ID) { - initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry()); - } + EvexToVexInstPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return EVEX2VEX_DESC; } @@ -255,7 +252,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { (Desc.TSFlags & X86II::VEX_L) ? makeArrayRef(X86EvexToVex256CompressTable) : makeArrayRef(X86EvexToVex128CompressTable); - auto I = std::lower_bound(Table.begin(), Table.end(), MI.getOpcode()); + auto I = llvm::lower_bound(Table, MI.getOpcode()); if (I == Table.end() || I->EvexOpcode != MI.getOpcode()) return false; diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index 1dd73163080b..b8624b40f2f7 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -1,9 +1,8 @@ //===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,6 +26,7 @@ using namespace llvm; #define DEBUG_TYPE "x86-pseudo" +#define X86_EXPAND_PSEUDO_NAME "X86 pseudo instruction expansion pass" namespace { class X86ExpandPseudo : public MachineFunctionPass { @@ -66,8 +66,12 @@ private: bool ExpandMBB(MachineBasicBlock &MBB); }; char X86ExpandPseudo::ID = 0; + } // End anonymous namespace. +INITIALIZE_PASS(X86ExpandPseudo, DEBUG_TYPE, X86_EXPAND_PSEUDO_NAME, false, + false) + void X86ExpandPseudo::ExpandICallBranchFunnel( MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) { MachineBasicBlock *JTMBB = MBB; @@ -83,6 +87,8 @@ void X86ExpandPseudo::ExpandICallBranchFunnel( const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal(); auto CmpTarget = [&](unsigned Target) { + if (Selector.isReg()) + MBB->addLiveIn(Selector.getReg()); BuildMI(*MBB, MBBI, DL, TII->get(X86::LEA64r), X86::R11) .addReg(X86::RIP) .addImm(1) @@ -98,11 +104,13 @@ void X86ExpandPseudo::ExpandICallBranchFunnel( auto CreateMBB = [&]() { auto *NewMBB = MF->CreateMachineBasicBlock(BB); MBB->addSuccessor(NewMBB); + if (!MBB->isLiveIn(X86::EFLAGS)) + MBB->addLiveIn(X86::EFLAGS); return NewMBB; }; - auto EmitCondJump = [&](unsigned Opcode, MachineBasicBlock *ThenMBB) { - BuildMI(*MBB, MBBI, DL, TII->get(Opcode)).addMBB(ThenMBB); + auto EmitCondJump = [&](unsigned CC, MachineBasicBlock *ThenMBB) { + BuildMI(*MBB, MBBI, DL, TII->get(X86::JCC_1)).addMBB(ThenMBB).addImm(CC); auto *ElseMBB = CreateMBB(); MF->insert(InsPt, ElseMBB); @@ -110,10 +118,10 @@ void X86ExpandPseudo::ExpandICallBranchFunnel( MBBI = MBB->end(); }; - auto EmitCondJumpTarget = [&](unsigned Opcode, unsigned Target) { + auto EmitCondJumpTarget = [&](unsigned CC, unsigned Target) { auto *ThenMBB = CreateMBB(); TargetMBBs.push_back({ThenMBB, Target}); - EmitCondJump(Opcode, ThenMBB); + EmitCondJump(CC, ThenMBB); }; auto EmitTailCall = [&](unsigned Target) { @@ -130,23 +138,23 @@ void X86ExpandPseudo::ExpandICallBranchFunnel( if (NumTargets == 2) { CmpTarget(FirstTarget + 1); - EmitCondJumpTarget(X86::JB_1, FirstTarget); + EmitCondJumpTarget(X86::COND_B, FirstTarget); EmitTailCall(FirstTarget + 1); return; } if (NumTargets < 6) { CmpTarget(FirstTarget + 1); - EmitCondJumpTarget(X86::JB_1, FirstTarget); - EmitCondJumpTarget(X86::JE_1, FirstTarget + 1); + EmitCondJumpTarget(X86::COND_B, FirstTarget); + EmitCondJumpTarget(X86::COND_E, FirstTarget + 1); EmitBranchFunnel(FirstTarget + 2, NumTargets - 2); return; } auto *ThenMBB = CreateMBB(); CmpTarget(FirstTarget + (NumTargets / 2)); - EmitCondJump(X86::JB_1, ThenMBB); - EmitCondJumpTarget(X86::JE_1, FirstTarget + (NumTargets / 2)); + EmitCondJump(X86::COND_B, ThenMBB); + EmitCondJumpTarget(X86::COND_E, FirstTarget + (NumTargets / 2)); EmitBranchFunnel(FirstTarget + (NumTargets / 2) + 1, NumTargets - (NumTargets / 2) - 1); @@ -254,16 +262,19 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, for (unsigned i = 0; i != 5; ++i) MIB.add(MBBI->getOperand(i)); } else if (Opcode == X86::TCRETURNri64) { + JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(IsWin64 ? 
X86::TAILJMPr64_REX : X86::TAILJMPr64)) - .addReg(JumpTarget.getReg(), RegState::Kill); + .add(JumpTarget); } else { + JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr)) - .addReg(JumpTarget.getReg(), RegState::Kill); + .add(JumpTarget); } MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); + MBB.getParent()->updateCallSiteInfo(&*MBBI, &NewMI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 9dd3f2652543..7b9ce0271205 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1,9 +1,8 @@ //===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -85,7 +84,7 @@ private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, const DebugLoc &DL); - bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO, + bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg, unsigned Alignment = 1); bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM, @@ -290,7 +289,7 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, } bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { - EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true); + EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) // Unhandled type. Halt "fast" selection and bail. return false; @@ -312,12 +311,10 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT); } -#include "X86GenCallingConv.inc" - /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. -bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, +bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg, unsigned Alignment) { bool HasSSE41 = Subtarget->hasSSE41(); @@ -327,46 +324,42 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, bool HasVLX = Subtarget->hasVLX(); bool IsNonTemporal = MMO && MMO->isNonTemporal(); + // Treat i1 loads the same as i8 loads. Masking will be done when storing. + if (VT == MVT::i1) + VT = MVT::i8; + // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; - const TargetRegisterClass *RC = nullptr; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return false; - case MVT::i1: case MVT::i8: Opc = X86::MOV8rm; - RC = &X86::GR8RegClass; break; case MVT::i16: Opc = X86::MOV16rm; - RC = &X86::GR16RegClass; break; case MVT::i32: Opc = X86::MOV32rm; - RC = &X86::GR32RegClass; break; case MVT::i64: // Must be in x86-64 mode. Opc = X86::MOV64rm; - RC = &X86::GR64RegClass; break; case MVT::f32: - if (X86ScalarSSEf32) { - Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? 
X86::VMOVSSrm : X86::MOVSSrm; - RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass; - } else { + if (X86ScalarSSEf32) + Opc = HasAVX512 ? X86::VMOVSSZrm_alt : + HasAVX ? X86::VMOVSSrm_alt : + X86::MOVSSrm_alt; + else Opc = X86::LD_Fp32m; - RC = &X86::RFP32RegClass; - } break; case MVT::f64: - if (X86ScalarSSEf64) { - Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm; - RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass; - } else { + if (X86ScalarSSEf64) + Opc = HasAVX512 ? X86::VMOVSDZrm_alt : + HasAVX ? X86::VMOVSDrm_alt : + X86::MOVSDrm_alt; + else Opc = X86::LD_Fp64m; - RC = &X86::RFP64RegClass; - } break; case MVT::f80: // No f80 support yet. @@ -381,7 +374,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, else Opc = HasVLX ? X86::VMOVUPSZ128rm : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm; - RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v2f64: if (IsNonTemporal && Alignment >= 16 && HasSSE41) @@ -393,13 +385,12 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, else Opc = HasVLX ? X86::VMOVUPDZ128rm : HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm; - RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: - if (IsNonTemporal && Alignment >= 16) + if (IsNonTemporal && Alignment >= 16 && HasSSE41) Opc = HasVLX ? X86::VMOVNTDQAZ128rm : HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) @@ -408,7 +399,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, else Opc = HasVLX ? X86::VMOVDQU64Z128rm : HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm; - RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v8f32: assert(HasAVX); @@ -420,7 +410,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm; else Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm; - RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v4f64: assert(HasAVX); @@ -432,7 +421,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm; else Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm; - RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v8i32: case MVT::v4i64: @@ -447,7 +435,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm; else Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm; - RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v16f32: assert(HasAVX512); @@ -455,7 +442,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = X86::VMOVNTDQAZrm; else Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm; - RC = &X86::VR512RegClass; break; case MVT::v8f64: assert(HasAVX512); @@ -463,7 +449,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = X86::VMOVNTDQAZrm; else Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm; - RC = &X86::VR512RegClass; break; case MVT::v8i64: case MVT::v16i32: @@ -476,10 +461,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = X86::VMOVNTDQAZrm; else Opc = (Alignment >= 64) ? 
X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm; - RC = &X86::VR512RegClass; break; } + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); @@ -1483,8 +1469,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. static const uint16_t SETFOpcTable[2][3] = { - { X86::SETEr, X86::SETNPr, X86::AND8rr }, - { X86::SETNEr, X86::SETPr, X86::OR8rr } + { X86::COND_E, X86::COND_NP, X86::AND8rr }, + { X86::COND_NE, X86::COND_P, X86::OR8rr } }; const uint16_t *SETFOpc = nullptr; switch (Predicate) { @@ -1500,10 +1486,10 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), - FlagReg1); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), - FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + FlagReg1).addImm(SETFOpc[0]); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + FlagReg2).addImm(SETFOpc[1]); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), ResultReg).addReg(FlagReg1).addReg(FlagReg2); updateValueMap(I, ResultReg); @@ -1514,7 +1500,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { bool SwapArgs; std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); - unsigned Opc = X86::getSETFromCond(CC); if (SwapArgs) std::swap(LHS, RHS); @@ -1523,7 +1508,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + ResultReg).addImm(CC); updateValueMap(I, ResultReg); return true; } @@ -1693,11 +1679,9 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } bool SwapArgs; - unsigned BranchOpc; std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); - BranchOpc = X86::GetCondBranchFromCond(CC); if (SwapArgs) std::swap(CmpLHS, CmpRHS); @@ -1705,14 +1689,14 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc())) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(CC); // X86 requires a second branch to handle UNE (and OEQ, which is mapped // to UNE above). 
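// Standalone illustration, not from this patch, of why OEQ/UNE need two flag
// checks: UCOMISS reports unordered operands (NaN) by setting ZF and PF
// together, so "ordered equal" is ZF set *and* PF clear.
bool oeq(float a, float b) { return a == b; } // SETE + SETNP + AND, as in the
                                              // SETFOpcTable above
bool une(float a, float b) { return a != b; } // SETNE + SETP + OR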
if (NeedExtraBranch) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(X86::COND_P); } finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); @@ -1739,14 +1723,14 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) .addReg(OpReg).addImm(1); - unsigned JmpOpc = X86::JNE_1; + unsigned JmpCond = X86::COND_NE; if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); - JmpOpc = X86::JE_1; + JmpCond = X86::COND_E; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(JmpCond); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; @@ -1759,10 +1743,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (TmpReg == 0) return false; - unsigned BranchOpc = X86::GetCondBranchFromCond(CC); - - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(CC); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1786,8 +1768,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(OpReg) .addImm(1); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(X86::COND_NE); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -2050,8 +2032,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
static const uint16_t SETFOpcTable[2][3] = { - { X86::SETNPr, X86::SETEr , X86::TEST8rr }, - { X86::SETPr, X86::SETNEr, X86::OR8rr } + { X86::COND_NP, X86::COND_E, X86::TEST8rr }, + { X86::COND_P, X86::COND_NE, X86::OR8rr } }; const uint16_t *SETFOpc = nullptr; switch (Predicate) { @@ -2083,10 +2065,10 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { if (SETFOpc) { unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), - FlagReg1); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), - FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + FlagReg1).addImm(SETFOpc[0]); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + FlagReg2).addImm(SETFOpc[1]); auto const &II = TII.get(SETFOpc[2]); if (II.getNumDefs()) { unsigned TmpReg = createResultReg(&X86::GR8RegClass); @@ -2147,9 +2129,9 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return false; const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo(); - unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8); - unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, - LHSReg, LHSIsKill); + unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8); + unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CC); updateValueMap(I, ResultReg); return true; } @@ -2194,19 +2176,6 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { if (NeedSwap) std::swap(CmpLHS, CmpRHS); - // Choose the SSE instruction sequence based on data type (float or double). - static const uint16_t OpcTable[2][4] = { - { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr }, - { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr } - }; - - const uint16_t *Opc = nullptr; - switch (RetVT.SimpleTy) { - default: return false; - case MVT::f32: Opc = &OpcTable[0][0]; break; - case MVT::f64: Opc = &OpcTable[1][0]; break; - } - const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); @@ -2277,6 +2246,19 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); } else { + // Choose the SSE instruction sequence based on data type (float or double). + static const uint16_t OpcTable[2][4] = { + { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr }, + { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr } + }; + + const uint16_t *Opc = nullptr; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::f32: Opc = &OpcTable[0][0]; break; + case MVT::f64: Opc = &OpcTable[1][0]; break; + } + const TargetRegisterClass *VR128 = &X86::VR128RegClass; unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); @@ -2303,8 +2285,10 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { case MVT::i8: Opc = X86::CMOV_GR8; break; case MVT::i16: Opc = X86::CMOV_GR16; break; case MVT::i32: Opc = X86::CMOV_GR32; break; - case MVT::f32: Opc = X86::CMOV_FR32; break; - case MVT::f64: Opc = X86::CMOV_FR64; break; + case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X + : X86::CMOV_FR32; break; + case MVT::f64: Opc = Subtarget->hasAVX512() ? 
X86::CMOV_FR64X + : X86::CMOV_FR64; break; } const Value *Cond = I->getOperand(0); @@ -2485,13 +2469,14 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, assert((I->getOpcode() == Instruction::FPExt || I->getOpcode() == Instruction::FPTrunc) && "Instruction must be an FPExt or FPTrunc!"); + bool HasAVX = Subtarget->hasAVX(); unsigned OpReg = getRegForValue(I->getOperand(0)); if (OpReg == 0) return false; unsigned ImplicitDefReg; - if (Subtarget->hasAVX()) { + if (HasAVX) { ImplicitDefReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); @@ -2503,7 +2488,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), ResultReg); - if (Subtarget->hasAVX()) + if (HasAVX) MIB.addReg(ImplicitDefReg); MIB.addReg(OpReg); @@ -2519,8 +2504,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { unsigned Opc = HasAVX512 ? X86::VCVTSS2SDZrr : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; - return X86SelectFPExtOrFPTrunc( - I, Opc, HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass); + return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64)); } return false; @@ -2534,8 +2518,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { unsigned Opc = HasAVX512 ? X86::VCVTSD2SSZrr : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr; - return X86SelectFPExtOrFPTrunc( - I, Opc, HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass); + return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32)); } return false; @@ -2900,21 +2883,21 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { isCommutativeIntrinsic(II)) std::swap(LHS, RHS); - unsigned BaseOpc, CondOpc; + unsigned BaseOpc, CondCode; switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::sadd_with_overflow: - BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break; + BaseOpc = ISD::ADD; CondCode = X86::COND_O; break; case Intrinsic::uadd_with_overflow: - BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; + BaseOpc = ISD::ADD; CondCode = X86::COND_B; break; case Intrinsic::ssub_with_overflow: - BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break; + BaseOpc = ISD::SUB; CondCode = X86::COND_O; break; case Intrinsic::usub_with_overflow: - BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; + BaseOpc = ISD::SUB; CondCode = X86::COND_B; break; case Intrinsic::smul_with_overflow: - BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; + BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break; case Intrinsic::umul_with_overflow: - BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; + BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break; } unsigned LHSReg = getRegForValue(LHS); @@ -2931,7 +2914,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { }; if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) && - CondOpc == X86::SETOr) { + CondCode == X86::COND_O) { // We can use INC/DEC. ResultReg = createResultReg(TLI.getRegClassFor(VT)); bool IsDec = BaseOpc == ISD::SUB; @@ -2990,8 +2973,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // Assign to a GPR since the overflow return value is lowered to a SETcc. 
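// Standalone sketch, not from this patch, of the CondCode mapping above:
// signed overflow is read from OF (COND_O), unsigned carry from CF (COND_B).
// These builtins lower to llvm.[su]add.with.overflow plus a SETcc.
bool sadd_overflows(int a, int b, int *sum) {
  return __builtin_sadd_overflow(a, b, sum); // overflow bit via SETO
}
bool uadd_overflows(unsigned a, unsigned b, unsigned *sum) {
  return __builtin_uadd_overflow(a, b, sum); // carry bit via SETB
}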
unsigned ResultReg2 = createResultReg(&X86::GR8RegClass); assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), - ResultReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + ResultReg2).addImm(CondCode); updateValueMap(II, ResultReg, 2); return true; @@ -3509,8 +3492,9 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // This will be a direct call, or an indirect call through memory for // NonLazyBind calls or dllimport calls. - bool NeedLoad = - OpFlags == X86II::MO_DLLIMPORT || OpFlags == X86II::MO_GOTPCREL; + bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT || + OpFlags == X86II::MO_GOTPCREL || + OpFlags == X86II::MO_COFFSTUB; unsigned CallOpc = NeedLoad ? (Is64Bit ? X86::CALL64m : X86::CALL32m) : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32); @@ -3595,7 +3579,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)), FI) .addReg(CopyReg); - Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; + Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt; addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg + i), FI); } @@ -3662,24 +3646,19 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { return true; } case Instruction::BitCast: { - // Select SSE2/AVX bitcasts between 128/256 bit vector types. + // Select SSE2/AVX bitcasts between 128/256/512 bit vector types. if (!Subtarget->hasSSE2()) return false; - EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(DL, I->getType()); - - if (!SrcVT.isSimple() || !DstVT.isSimple()) + MVT SrcVT, DstVT; + if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) || + !isTypeLegal(I->getType(), DstVT)) return false; - MVT SVT = SrcVT.getSimpleVT(); - MVT DVT = DstVT.getSimpleVT(); - - if (!SVT.is128BitVector() && - !(Subtarget->hasAVX() && SVT.is256BitVector()) && - !(Subtarget->hasAVX512() && SVT.is512BitVector() && - (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 && - DVT.getScalarSizeInBits() >= 32)))) + // Only allow vectors that use xmm/ymm/zmm. + if (!SrcVT.isVector() || !DstVT.isVector() || + SrcVT.getVectorElementType() == MVT::i1 || + DstVT.getVectorElementType() == MVT::i1) return false; unsigned Reg = getRegForValue(I->getOperand(0)); @@ -3757,30 +3736,25 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; - const TargetRegisterClass *RC = nullptr; + bool HasAVX = Subtarget->hasAVX(); + bool HasAVX512 = Subtarget->hasAVX512(); switch (VT.SimpleTy) { default: return 0; case MVT::f32: - if (X86ScalarSSEf32) { - Opc = Subtarget->hasAVX512() - ? X86::VMOVSSZrm - : Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; - RC = Subtarget->hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; - } else { + if (X86ScalarSSEf32) + Opc = HasAVX512 ? X86::VMOVSSZrm_alt : + HasAVX ? X86::VMOVSSrm_alt : + X86::MOVSSrm_alt; + else Opc = X86::LD_Fp32m; - RC = &X86::RFP32RegClass; - } break; case MVT::f64: - if (X86ScalarSSEf64) { - Opc = Subtarget->hasAVX512() - ? X86::VMOVSDZrm - : Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; - RC = Subtarget->hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; - } else { + if (X86ScalarSSEf64) + Opc = HasAVX512 ? X86::VMOVSDZrm_alt : + HasAVX ? 
X86::VMOVSDrm_alt : + X86::MOVSDrm_alt; + else Opc = X86::LD_Fp64m; - RC = &X86::RFP64RegClass; - } break; case MVT::f80: // No f80 support yet. @@ -3806,7 +3780,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { // Create the load from the constant pool. unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); - unsigned ResultReg = createResultReg(RC); + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy)); if (CM == CodeModel::Large) { unsigned AddrReg = createResultReg(&X86::GR64RegClass); @@ -3916,33 +3890,26 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { // Get opcode and regclass for the given zero. bool HasAVX512 = Subtarget->hasAVX512(); unsigned Opc = 0; - const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::f32: - if (X86ScalarSSEf32) { + if (X86ScalarSSEf32) Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS; - RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass; - } else { + else Opc = X86::LD_Fp032; - RC = &X86::RFP32RegClass; - } break; case MVT::f64: - if (X86ScalarSSEf64) { + if (X86ScalarSSEf64) Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD; - RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass; - } else { + else Opc = X86::LD_Fp064; - RC = &X86::RFP64RegClass; - } break; case MVT::f80: // No f80 support yet. return 0; } - unsigned ResultReg = createResultReg(RC); + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); return ResultReg; } @@ -3992,6 +3959,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, } Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); + Result->cloneInstrSymbols(*FuncInfo.MF, *MI); MachineBasicBlock::iterator I(MI); removeDeadCode(I, std::next(I)); return true; diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index ed297e678203..bf541d933790 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -1,9 +1,8 @@ //===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -103,9 +102,7 @@ public: StringRef getPassName() const override { return FIXUPBW_DESC; } - FixupBWInstPass() : MachineFunctionPass(ID) { - initializeFixupBWInstPassPass(*PassRegistry::getPassRegistry()); - } + FixupBWInstPass() : MachineFunctionPass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to @@ -151,7 +148,7 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { this->MF = &MF; TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); - OptForSize = MF.getFunction().optForSize(); + OptForSize = MF.getFunction().hasOptSize(); MLI = &getAnalysis<MachineLoopInfo>(); LiveRegs.init(TII->getRegisterInfo()); diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index a346085a52cb..041529a0be68 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -1,15 +1,14 @@ //===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the pass that finds instructions that can be // re-written as LEA instructions in order to reduce pipeline delays. -// When optimizing for size it replaces suitable LEAs with INC or DEC. +// It replaces LEAs with ADD/INC/DEC when that is better for size/speed. // //===----------------------------------------------------------------------===// @@ -36,31 +35,25 @@ namespace { class FixupLEAPass : public MachineFunctionPass { enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - /// Loop over all of the instructions in the basic block - /// replacing applicable instructions with LEA instructions, - /// where appropriate. - bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI, - bool IsSlowLEA, bool IsSlow3OpsLEA); - /// Given a machine register, look for the instruction /// which writes it in the current basic block. If found, /// try to replace it with an equivalent LEA instruction. /// If replacement succeeds, then also process the newly created /// instruction. void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); /// Given a memory access or LEA instruction /// whose address mode uses a base and/or index register, look for /// an opportunity to replace the instruction which sets the base or index /// register with an equivalent LEA instruction. void processInstruction(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); /// Given a LEA instruction which is unprofitable /// on SlowLEA targets try to replace it with an equivalent ADD instruction. void processInstructionForSlowLEA(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); /// Given a LEA instruction which is unprofitable /// on SNB+ try to replace it with other instructions.
@@ -75,12 +68,13 @@ class FixupLEAPass : public MachineFunctionPass { /// - LEA that uses 16-bit addressing mode " /// This function currently handles the first 2 cases only. MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); - /// Look for LEAs that add 1 to reg or subtract 1 from reg - /// and convert them to INC or DEC respectively. - bool fixupIncDec(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) const; + /// Look for LEAs that are really two address LEAs that we might be able to + /// turn into regular ADD instructions. + bool optTwoAddrLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, bool OptIncDec, + bool UseLEAForSP) const; /// Determine if an instruction references a machine register /// and, if so, whether it reads or writes the register. @@ -91,12 +85,12 @@ class FixupLEAPass : public MachineFunctionPass { /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. MachineBasicBlock::iterator searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); /// if an instruction can be converted to an /// equivalent LEA, insert the new instruction into the basic block /// and return a pointer to it. Otherwise, return zero. - MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineInstr *postRAConvertToLEA(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const; public: @@ -104,9 +98,7 @@ public: StringRef getPassName() const override { return FIXUPLEA_DESC; } - FixupLEAPass() : MachineFunctionPass(ID) { - initializeFixupLEAPassPass(*PassRegistry::getPassRegistry()); - } + FixupLEAPass() : MachineFunctionPass(ID) { } /// Loop over all of the basic blocks, /// replacing instructions by equivalent LEA instructions @@ -121,10 +113,8 @@ public: private: TargetSchedModel TSM; - MachineFunction *MF; - const X86InstrInfo *TII; // Machine instruction info. - bool OptIncDec; - bool OptLEA; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; }; } @@ -133,7 +123,7 @@ char FixupLEAPass::ID = 0; INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false) MachineInstr * -FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, +FixupLEAPass::postRAConvertToLEA(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const { MachineInstr &MI = *MBBI; switch (MI.getOpcode()) { @@ -142,7 +132,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, const MachineOperand &Src = MI.getOperand(1); const MachineOperand &Dest = MI.getOperand(0); MachineInstr *NewMI = - BuildMI(*MF, MI.getDebugLoc(), + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r)) .add(Dest) @@ -151,9 +141,17 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, .addReg(0) .addImm(0) .addReg(0); - MFI->insert(MBBI, NewMI); // Insert the new inst return NewMI; } + } + + if (!MI.isConvertibleTo3Addr()) + return nullptr; + + switch (MI.getOpcode()) { + default: + // Only convert instructions that we've verified are safe. 
+ return nullptr; case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64ri32_DB: @@ -162,52 +160,80 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD32ri_DB: case X86::ADD32ri8_DB: if (!MI.getOperand(2).isImm()) { // convertToThreeAddress will call getImm() // which requires isImm() to be true return nullptr; } break; - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD16ri_DB: - case X86::ADD16ri8_DB: - case X86::ADD16rr: - case X86::ADD16rr_DB: - if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg()) { - // if src1 != src2, then convertToThreeAddress will - // need to create a Virtual register, which we cannot do - // after register allocation. - return nullptr; - } + case X86::SHL64ri: + case X86::SHL32ri: + case X86::INC64r: + case X86::INC32r: + case X86::DEC64r: + case X86::DEC32r: + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD32rr: + case X86::ADD32rr_DB: + // These instructions are all fine to convert. + break; } + MachineFunction::iterator MFI = MBB.getIterator(); return TII->convertToThreeAddress(MFI, MI, nullptr); } FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } -bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { - if (skipFunction(Func.getFunction())) +static bool isLEA(unsigned Opcode) { + return Opcode == X86::LEA32r || Opcode == X86::LEA64r || + Opcode == X86::LEA64_32r; +} + +bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) return false; - MF = &Func; - const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); bool IsSlowLEA = ST.slowLEA(); bool IsSlow3OpsLEA = ST.slow3OpsLEA(); + bool LEAUsesAG = ST.LEAusesAG(); - OptIncDec = !ST.slowIncDec() || Func.getFunction().optForMinSize(); - OptLEA = ST.LEAusesAG() || IsSlowLEA || IsSlow3OpsLEA; - - if (!OptLEA && !OptIncDec) - return false; + bool OptIncDec = !ST.slowIncDec() || MF.getFunction().hasOptSize(); + bool UseLEAForSP = ST.useLeaForSP(); - TSM.init(&Func.getSubtarget()); + TSM.init(&ST); TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";); - // Process all basic blocks. - for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I) - processBasicBlock(Func, I, IsSlowLEA, IsSlow3OpsLEA); + for (MachineBasicBlock &MBB : MF) { + // First pass. Try to remove or optimize existing LEAs. + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + if (!isLEA(I->getOpcode())) + continue; + + if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP)) + continue; + + if (IsSlowLEA) { + processInstructionForSlowLEA(I, MBB); + } else if (IsSlow3OpsLEA) { + if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) { + MBB.erase(I); + I = NewMI; + } + } + } + + // Second pass for creating LEAs. This may reverse some of the + // transformations above.
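+    // Illustrative sketch (hypothetical registers, not part of the patch):
+    // with LEAUsesAG set, processInstruction may rewrite the producer of an
+    // address register into an LEA so it executes on an AGU, e.g.
+    //   addl $8, %ebx            =>   leal 8(%ebx), %ebx
+    //   movl (%ebx), %eax             movl (%ebx), %eax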
+ if (LEAUsesAG) { + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) + processInstruction(I, MBB); + } + } + LLVM_DEBUG(dbgs() << "End X86FixupLEAs\n";); return true; @@ -218,7 +244,7 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { RegUsageState RegUsage = RU_NotUsed; MachineInstr &MI = *I; - for (unsigned int i = 0; i < MI.getNumOperands(); ++i) { + for (unsigned i = 0; i < MI.getNumOperands(); ++i) { MachineOperand &opnd = MI.getOperand(i); if (opnd.isReg() && opnd.getReg() == p.getReg()) { if (opnd.isDef()) @@ -234,10 +260,10 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { /// wrapping around to the last instruction of the block if the block /// branches to itself. static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { - if (I == MFI->begin()) { - if (MFI->isPredecessor(&*MFI)) { - I = --MFI->end(); + MachineBasicBlock &MBB) { + if (I == MBB.begin()) { + if (MBB.isPredecessor(&MBB)) { + I = --MBB.end(); return true; } else return false; @@ -248,14 +274,14 @@ static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { + MachineBasicBlock &MBB) { int InstrDistance = 1; MachineBasicBlock::iterator CurInst; static const int INSTR_DISTANCE_THRESHOLD = 5; CurInst = I; bool Found; - Found = getPreviousInstr(CurInst, MFI); + Found = getPreviousInstr(CurInst, MBB); while (Found && I != CurInst) { if (CurInst->isCall() || CurInst->isInlineAsm()) break; @@ -265,17 +291,12 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return CurInst; } InstrDistance += TSM.computeInstrLatency(&*CurInst); - Found = getPreviousInstr(CurInst, MFI); + Found = getPreviousInstr(CurInst, MBB); } return MachineBasicBlock::iterator(); } -static inline bool isLEA(const int Opcode) { - return Opcode == X86::LEA16r || Opcode == X86::LEA32r || - Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; -} - -static inline bool isInefficientLEAReg(unsigned int Reg) { +static inline bool isInefficientLEAReg(unsigned Reg) { return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13D || Reg == X86::R13; } @@ -298,27 +319,24 @@ static inline bool hasLEAOffset(const MachineOperand &Offset) { return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal(); } -static inline int getADDrrFromLEA(int LEAOpcode) { +static inline unsigned getADDrrFromLEA(unsigned LEAOpcode) { switch (LEAOpcode) { default: llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - return X86::ADD16rr; case X86::LEA32r: - return X86::ADD32rr; case X86::LEA64_32r: + return X86::ADD32rr; case X86::LEA64r: return X86::ADD64rr; } } -static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { +static inline unsigned getADDriFromLEA(unsigned LEAOpcode, + const MachineOperand &Offset) { bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm()); switch (LEAOpcode) { default: llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri; case X86::LEA32r: case X86::LEA64_32r: return IsInt8 ? 
X86::ADD32ri8 : X86::ADD32ri; @@ -327,56 +345,110 @@ static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { } } -/// isLEASimpleIncOrDec - Does this LEA have one these forms: -/// lea %reg, 1(%reg) -/// lea %reg, -1(%reg) -static inline bool isLEASimpleIncOrDec(MachineInstr &LEA) { - unsigned SrcReg = LEA.getOperand(1 + X86::AddrBaseReg).getReg(); - unsigned DstReg = LEA.getOperand(0).getReg(); - const MachineOperand &AddrDisp = LEA.getOperand(1 + X86::AddrDisp); - return SrcReg == DstReg && - LEA.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && - LEA.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 && - AddrDisp.isImm() && - (AddrDisp.getImm() == 1 || AddrDisp.getImm() == -1); +static inline unsigned getINCDECFromLEA(unsigned LEAOpcode, bool IsINC) { + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA32r: + case X86::LEA64_32r: + return IsINC ? X86::INC32r : X86::DEC32r; + case X86::LEA64r: + return IsINC ? X86::INC64r : X86::DEC64r; + } } -bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) const { +bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, bool OptIncDec, + bool UseLEAForSP) const { MachineInstr &MI = *I; - int Opcode = MI.getOpcode(); - if (!isLEA(Opcode)) + + const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); + const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt); + const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg); + const MachineOperand &Disp = MI.getOperand(1 + X86::AddrDisp); + const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); + + if (Segment.getReg() != 0 || !Disp.isImm() || Scale.getImm() > 1 || + !TII->isSafeToClobberEFLAGS(MBB, I)) return false; - if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) { - int NewOpcode; - bool isINC = MI.getOperand(1 + X86::AddrDisp).getImm() == 1; - switch (Opcode) { - case X86::LEA16r: - NewOpcode = isINC ? X86::INC16r : X86::DEC16r; - break; - case X86::LEA32r: - case X86::LEA64_32r: - NewOpcode = isINC ? X86::INC32r : X86::DEC32r; - break; - case X86::LEA64r: - NewOpcode = isINC ? X86::INC64r : X86::DEC64r; - break; - } + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned BaseReg = Base.getReg(); + unsigned IndexReg = Index.getReg(); - MachineInstr *NewMI = - BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode)) - .add(MI.getOperand(0)) - .add(MI.getOperand(1 + X86::AddrBaseReg)); - MFI->erase(I); - I = static_cast<MachineBasicBlock::iterator>(NewMI); - return true; + // Don't change stack adjustment LEAs. + if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP)) + return false; + + // LEA64_32 has 64-bit operands but 32-bit result. + if (MI.getOpcode() == X86::LEA64_32r) { + if (BaseReg != 0) + BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit); + if (IndexReg != 0) + IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit); } - return false; + + MachineInstr *NewMI = nullptr; + + // Look for lea(%reg1, %reg2), %reg1 or lea(%reg2, %reg1), %reg1 + // which can be turned into add %reg2, %reg1 + if (BaseReg != 0 && IndexReg != 0 && Disp.getImm() == 0 && + (DestReg == BaseReg || DestReg == IndexReg)) { + unsigned NewOpcode = getADDrrFromLEA(MI.getOpcode()); + if (DestReg != BaseReg) + std::swap(BaseReg, IndexReg); + + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use?
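+      // Hedged note: the implicit operands added below re-reference the
+      // original 64-bit Base/Index registers, so liveness still sees them
+      // after the explicit operands have been narrowed to their 32-bit
+      // sub-registers for LEA64_32r.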
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addReg(IndexReg) + .addReg(Base.getReg(), RegState::Implicit) + .addReg(Index.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addReg(IndexReg); + } + } else if (DestReg == BaseReg && IndexReg == 0) { + // This is an LEA with only a base register and a displacement. + // We can use ADDri or INC/DEC. + + // Does this LEA have one of these forms: + // lea %reg, 1(%reg) + // lea %reg, -1(%reg) + if (OptIncDec && (Disp.getImm() == 1 || Disp.getImm() == -1)) { + bool IsINC = Disp.getImm() == 1; + unsigned NewOpcode = getINCDECFromLEA(MI.getOpcode(), IsINC); + + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use? + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addReg(Base.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg); + } + } else { + unsigned NewOpcode = getADDriFromLEA(MI.getOpcode(), Disp); + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use? + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addImm(Disp.getImm()) + .addReg(Base.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addImm(Disp.getImm()); + } + } + } else + return false; + + MBB.erase(I); + I = NewMI; + return true; } void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { + MachineBasicBlock &MBB) { // Process a load, store, or LEA instruction. MachineInstr &MI = *I; const MCInstrDesc &Desc = MI.getDesc(); @@ -385,40 +457,38 @@ void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, AddrOffset += X86II::getOperandBias(Desc); MachineOperand &p = MI.getOperand(AddrOffset + X86::AddrBaseReg); if (p.isReg() && p.getReg() != X86::ESP) { - seekLEAFixup(p, I, MFI); + seekLEAFixup(p, I, MBB); } MachineOperand &q = MI.getOperand(AddrOffset + X86::AddrIndexReg); if (q.isReg() && q.getReg() != X86::ESP) { - seekLEAFixup(q, I, MFI); + seekLEAFixup(q, I, MBB); } } } void FixupLEAPass::seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { - MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); + MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBI = searchBackwards(p, I, MBB); if (MBI != MachineBasicBlock::iterator()) { - MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI); + MachineInstr *NewMI = postRAConvertToLEA(MBB, MBI); if (NewMI) { ++NumLEAs; LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); // now to replace with an equivalent LEA...
LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); - MFI->erase(MBI); + MBB.erase(MBI); MachineBasicBlock::iterator J = static_cast<MachineBasicBlock::iterator>(NewMI); - processInstruction(J, MFI); + processInstruction(J, MBB); } } } void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { + MachineBasicBlock &MBB) { MachineInstr &MI = *I; - const int Opcode = MI.getOpcode(); - if (!isLEA(Opcode)) - return; + const unsigned Opcode = MI.getOpcode(); const MachineOperand &Dst = MI.getOperand(0); const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); @@ -428,7 +498,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); if (Segment.getReg() != 0 || !Offset.isImm() || - !TII->isSafeToClobberEFLAGS(*MFI, I)) + !TII->isSafeToClobberEFLAGS(MBB, I)) return; const unsigned DstR = Dst.getReg(); const unsigned SrcR1 = Base.getReg(); @@ -445,7 +515,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode)); const MachineOperand &Src = SrcR1 == DstR ? Index : Base; NewMI = - BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src); + BuildMI(MBB, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src); LLVM_DEBUG(NewMI->dump();); } // Make ADD instruction for immediate @@ -453,24 +523,21 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(Opcode, Offset)); const MachineOperand &SrcR = SrcR1 == DstR ? Base : Index; - NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR) + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), ADDri, DstR) .add(SrcR) .addImm(Offset.getImm()); LLVM_DEBUG(NewMI->dump();); } if (NewMI) { - MFI->erase(I); + MBB.erase(I); I = NewMI; } } MachineInstr * FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, - MachineFunction::iterator MFI) { - - const int LEAOpcode = MI.getOpcode(); - if (!isLEA(LEAOpcode)) - return nullptr; + MachineBasicBlock &MBB) { + const unsigned LEAOpcode = MI.getOpcode(); const MachineOperand &Dst = MI.getOperand(0); const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); @@ -481,13 +548,13 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) || - !TII->isSafeToClobberEFLAGS(*MFI, MI) || + !TII->isSafeToClobberEFLAGS(MBB, MI) || Segment.getReg() != X86::NoRegister) return nullptr; - unsigned int DstR = Dst.getReg(); - unsigned int BaseR = Base.getReg(); - unsigned int IndexR = Index.getReg(); + unsigned DstR = Dst.getReg(); + unsigned BaseR = Base.getReg(); + unsigned IndexR = Index.getReg(); unsigned SSDstR = (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR; bool IsScale1 = Scale.getImm() == 1; @@ -516,11 +583,11 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { const MachineOperand &Src = DstR == BaseR ? Index : Base; MachineInstr *NewMI = - BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); + BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); LLVM_DEBUG(NewMI->dump();); // Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) { - NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); LLVM_DEBUG(NewMI->dump();); } return NewMI; @@ -530,7 +597,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, // lea offset(%base,%index,scale),%dst => // lea (%base,%index,scale); add offset,%dst if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { - MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) .add(Dst) .add(IsInefficientBase ? Index : Base) .add(Scale) @@ -540,7 +607,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, LLVM_DEBUG(NewMI->dump();); // Create ADD instruction for the Offset in case of 3-Ops LEA. if (hasLEAOffset(Offset)) { - NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); LLVM_DEBUG(NewMI->dump();); } return NewMI; @@ -552,17 +619,17 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst if (IsScale1 && !hasLEAOffset(Offset)) { bool BIK = Base.isKill() && BaseR != IndexR; - TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, BIK); + TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK); LLVM_DEBUG(MI.getPrevNode()->dump();); MachineInstr *NewMI = - BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); LLVM_DEBUG(NewMI->dump();); return NewMI; } // lea offset(%base,%index,scale), %dst => // lea offset( ,%index,scale), %dst; add %base,%dst - MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) .add(Dst) .addReg(0) .add(Scale) @@ -571,35 +638,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, .add(Segment); LLVM_DEBUG(NewMI->dump();); - NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); LLVM_DEBUG(NewMI->dump();); return NewMI; } - -bool FixupLEAPass::processBasicBlock(MachineFunction &MF, - MachineFunction::iterator MFI, - bool IsSlowLEA, bool IsSlow3OpsLEA) { - for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { - if (OptIncDec) - if (fixupIncDec(I, MFI)) - continue; - - if (OptLEA) { - if (IsSlowLEA) { - processInstructionForSlowLEA(I, MFI); - continue; - } - - if (IsSlow3OpsLEA) { - if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) { - MFI->erase(I); - I = NewMI; - } - continue; - } - - processInstruction(I, MFI); - } - } - return false; -} diff --git a/lib/Target/X86/X86FixupSetCC.cpp b/lib/Target/X86/X86FixupSetCC.cpp index a86eb997635e..e2d4d1ede6f3 100644 --- a/lib/Target/X86/X86FixupSetCC.cpp +++ b/lib/Target/X86/X86FixupSetCC.cpp @@ -1,9 +1,8 @@ //===---- X86FixupSetCC.cpp - optimize usage of setcc instructions --------===// // -// The LLVM Compiler Infrastructure -- // -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -68,30 +67,6 @@ char X86FixupSetCCPass::ID = 0; FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); } -bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) { - switch (Opcode) { - default: - return false; - case X86::SETOr: - case X86::SETNOr: - case X86::SETBr: - case X86::SETAEr: - case X86::SETEr: - case X86::SETNEr: - case X86::SETBEr: - case X86::SETAr: - case X86::SETSr: - case X86::SETNSr: - case X86::SETPr: - case X86::SETNPr: - case X86::SETLr: - case X86::SETGEr: - case X86::SETLEr: - case X86::SETGr: - return true; - } -} - // We expect the instruction *immediately* before the setcc to imp-def // EFLAGS (because of scheduling glue). To make this less brittle w.r.t // scheduling, look backwards until we hit the beginning of the @@ -103,7 +78,7 @@ X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB, auto MBBStart = MBB->rend(); for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI) for (auto &Op : MI->implicit_operands()) - if ((Op.getReg() == X86::EFLAGS) && (Op.isDef())) + if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isDef()) return &*MI; return nullptr; @@ -111,7 +86,7 @@ X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB, bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) { for (auto &Op : MI->implicit_operands()) - if ((Op.getReg() == X86::EFLAGS) && (Op.isUse())) + if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isUse()) return true; return false; @@ -129,7 +104,7 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { // Find a setcc that is used by a zext. // This doesn't have to be the only use, the transformation is safe // regardless. - if (!isSetCCr(MI.getOpcode())) + if (MI.getOpcode() != X86::SETCCr) continue; MachineInstr *ZExt = nullptr; diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp index 778aa505b2d9..5ce3255ea96a 100644 --- a/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -1,9 +1,8 @@ //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -71,12 +70,6 @@ STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted"); STATISTIC(NumTestsInserted, "Number of test instructions inserted"); STATISTIC(NumAddsInserted, "Number of adds instructions inserted"); -namespace llvm { - -void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); - -} // end namespace llvm - namespace { // Convenient array type for storing registers associated with each condition. 
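The hunks that follow all apply one mechanical convention change, sketched here in isolation (MBB, I, DL, Reg, and TII are assumed surrounding context, not code from this patch):

// Before: one machine opcode per condition, e.g. X86::SETNEr.
BuildMI(MBB, I, DL, TII->get(X86::SETNEr), Reg);
// After: a single X86::SETCCr opcode carries the condition as an immediate
// operand, so rewriting a condition becomes a setImm() call instead of a
// setDesc() opcode swap.
BuildMI(MBB, I, DL, TII->get(X86::SETCCr), Reg).addImm(X86::COND_NE);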
@@ -84,9 +77,7 @@ using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>; class X86FlagsCopyLoweringPass : public MachineFunctionPass { public: - X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { - initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry()); - } + X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; } bool runOnMachineFunction(MachineFunction &MF) override; @@ -252,13 +243,13 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB, "Split instruction must be in the split block!"); assert(SplitI.isBranch() && "Only designed to split a tail of branch instructions!"); - assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID && + assert(X86::getCondFromBranch(SplitI) != X86::COND_INVALID && "Must split on an actual jCC instruction!"); // Dig out the previous instruction to the split point. MachineInstr &PrevI = *std::prev(SplitI.getIterator()); assert(PrevI.isBranch() && "Must split after a branch!"); - assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID && + assert(X86::getCondFromBranch(PrevI) != X86::COND_INVALID && "Must split after an actual jCC instruction!"); assert(!std::prev(PrevI.getIterator())->isTerminator() && "Must only have this one terminator prior to the split!"); @@ -588,22 +579,21 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { // branch folding or block placement. As a consequence, we get to deal // with the simpler formulation of conditional branches followed by tail // calls. - if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { + if (X86::getCondFromBranch(MI) != X86::COND_INVALID) { auto JmpIt = MI.getIterator(); do { JmpIs.push_back(&*JmpIt); ++JmpIt; } while (JmpIt != UseMBB.instr_end() && - X86::getCondFromBranchOpc(JmpIt->getOpcode()) != + X86::getCondFromBranch(*JmpIt) != X86::COND_INVALID); break; } // Otherwise we can just rewrite in-place. - if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { + if (X86::getCondFromCMov(MI) != X86::COND_INVALID) { rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); - } else if (X86::getCondFromSETOpc(MI.getOpcode()) != - X86::COND_INVALID) { + } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) { rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); } else if (MI.getOpcode() == TargetOpcode::COPY) { rewriteCopy(MI, *FlagUse, CopyDefI); @@ -730,7 +720,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( // Scan backwards across the range of instructions with live EFLAGS.
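  // For example (a sketch; the virtual register name and MIR spelling are
  // illustrative), a prior
  //   %c:gr8 = SETCCr 4, implicit $eflags   ; 4 == X86::COND_E
  // seeds the COND_E slot so later users can test %c instead of recomputing
  // EFLAGS.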
for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { - X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode()); + X86::CondCode Cond = X86::getCondFromSETCC(MI); if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && TRI->isVirtualRegister(MI.getOperand(0).getReg())) { assert(MI.getOperand(0).isDef() && @@ -751,7 +741,7 @@ unsigned X86FlagsCopyLoweringPass::promoteCondToReg( DebugLoc TestLoc, X86::CondCode Cond) { unsigned Reg = MRI->createVirtualRegister(PromoteRC); auto SetI = BuildMI(TestMBB, TestPos, TestLoc, - TII->get(X86::getSETFromCond(Cond)), Reg); + TII->get(X86::SETCCr), Reg).addImm(Cond); (void)SetI; LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump()); ++NumSetCCsInserted; @@ -842,7 +832,7 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, MachineOperand &FlagUse, CondRegArray &CondRegs) { // First get the register containing this specific condition. - X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode()); + X86::CondCode Cond = X86::getCondFromCMov(CMovI); unsigned CondReg; bool Inverted; std::tie(CondReg, Inverted) = @@ -853,12 +843,10 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, // Insert a direct test of the saved register. insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); - // Rewrite the CMov to use the !ZF flag from the test (but match register - // size and memory operand), and then kill its use of the flags afterward. - auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg()); - CMovI.setDesc(TII->get(X86::getCMovFromCond( - Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8, - !CMovI.memoperands_empty()))); + // Rewrite the CMov to use the !ZF flag from the test, and then kill its use + // of the flags afterward. + CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1) + .setImm(Inverted ? X86::COND_E : X86::COND_NE); FlagUse.setIsKill(true); LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump()); } @@ -867,7 +855,7 @@ void X86FlagsCopyLoweringPass::rewriteCondJmp( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) { // First get the register containing this specific condition. - X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode()); + X86::CondCode Cond = X86::getCondFromBranch(JmpI); unsigned CondReg; bool Inverted; std::tie(CondReg, Inverted) = @@ -880,10 +868,8 @@ void X86FlagsCopyLoweringPass::rewriteCondJmp( // Rewrite the jump to use the !ZF flag from the test, and kill its use of // flags afterward. - JmpI.setDesc(TII->get( - X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE))); - const int ImplicitEFLAGSOpIdx = 1; - JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true); + JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE); + JmpI.findRegisterUseOperand(X86::EFLAGS)->setIsKill(true); LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump()); } @@ -1026,7 +1012,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB, MachineInstr &SetCCI, MachineOperand &FlagUse, CondRegArray &CondRegs) { - X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode()); + X86::CondCode Cond = X86::getCondFromSETCC(SetCCI); // Note that we can't usefully rewrite this to the inverse without complex // analysis of the users of the setCC. Largely we rely on duplicates which // could have been avoided already being avoided here. 
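A condensed, hedged view of the rewrite the hunks above implement (the saved-condition register and block name are illustrative, not from the patch):

// insertTest materializes a direct test of the saved condition byte:
//   TEST8rr %cond, %cond         ; ZF = (%cond == 0)
//   JCC_1 %bb.use, COND_NE       ; taken iff the original condition held
// Consumers reached through an inverted save use COND_E instead.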
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index f330acff61a1..074cf21d03f5 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -1,9 +1,8 @@ //===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===// // -// The LLVM Compiler Infrastructure -- // -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -60,7 +59,6 @@ namespace { struct FPS : public MachineFunctionPass { static char ID; FPS() : MachineFunctionPass(ID) { - initializeEdgeBundlesPass(*PassRegistry::getPassRegistry()); // This is really only to keep valgrind quiet. // The logic in isLive() is too much for it. memset(Stack, 0, sizeof(Stack)); @@ -299,9 +297,16 @@ namespace { void setKillFlags(MachineBasicBlock &MBB) const; }; - char FPS::ID = 0; } +char FPS::ID = 0; + +INITIALIZE_PASS_BEGIN(FPS, DEBUG_TYPE, "X86 FP Stackifier", + false, false) +INITIALIZE_PASS_DEPENDENCY(EdgeBundles) +INITIALIZE_PASS_END(FPS, DEBUG_TYPE, "X86 FP Stackifier", + false, false) + FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } /// getFPReg - Return the X86::FPx register number for the specified operand. @@ -591,7 +596,7 @@ namespace { } static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { - const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode); + const TableEntry *I = llvm::lower_bound(Table, Opcode); if (I != Table.end() && I->from == Opcode) return I->to; return -1; } @@ -1096,6 +1101,8 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) { // Change from the pseudo instruction to the concrete instruction. MI.RemoveOperand(0); // Remove the explicit ST(0) operand MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); + MI.addOperand( + MachineOperand::CreateReg(X86::ST0, /*isDef*/ true, /*isImp*/ true)); // Result gets pushed on the stack. pushReg(DestReg); @@ -1140,6 +1147,8 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { // Convert from the pseudo instruction to the concrete instruction. MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); + MI.addOperand( + MachineOperand::CreateReg(X86::ST0, /*isDef*/ false, /*isImp*/ true)); if (MI.getOpcode() == X86::IST_FP64m || MI.getOpcode() == X86::ISTT_FP16m || MI.getOpcode() == X86::ISTT_FP32m || MI.getOpcode() == X86::ISTT_FP64m || @@ -1369,8 +1378,6 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { /// register arguments and no explicit destinations. /// void FPS::handleCompareFP(MachineBasicBlock::iterator &I) { - ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); - ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); MachineInstr &MI = *I; unsigned NumOperands = MI.getDesc().getNumOperands(); @@ -1475,7 +1482,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { break; } - case TargetOpcode::INLINEASM: { + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: { // The inline asm MachineInstr currently only *uses* FP registers for the // 'f' constraint. These should be turned into the current ST(x) register // in the machine instr.
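The Lookup() change above leans on llvm::lower_bound from STLExtras; roughly, it is the range form of std::lower_bound (a sketch under that assumption, not LLVM's exact definition):

#include <algorithm>
#include <iterator>

// Range-based lower_bound: removes the begin()/end() boilerplate at call
// sites such as Lookup(Table, Opcode).
template <typename RangeT, typename ValueT>
auto range_lower_bound(RangeT &&Range, const ValueT &Value) {
  return std::lower_bound(std::begin(Range), std::end(Range), Value);
}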
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 984db12201ed..e310fe069117 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1,9 +1,8 @@ //===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -585,23 +584,23 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, // registers. For the prolog expansion we use RAX, RCX and RDX. MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RegClass = &X86::GR64RegClass; - const unsigned SizeReg = InProlog ? (unsigned)X86::RAX + const Register SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass), - ZeroReg = InProlog ? (unsigned)X86::RCX + ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), - CopyReg = InProlog ? (unsigned)X86::RDX + CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), - TestReg = InProlog ? (unsigned)X86::RDX + TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), - FinalReg = InProlog ? (unsigned)X86::RDX + FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), - RoundedReg = InProlog ? (unsigned)X86::RDX + RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), - LimitReg = InProlog ? (unsigned)X86::RCX + LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), - JoinReg = InProlog ? (unsigned)X86::RCX + JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), - ProbeReg = InProlog ? (unsigned)X86::RCX + ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass); // SP-relative offsets where we can save RCX and RDX. @@ -654,9 +653,10 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg) .addReg(CopyReg) .addReg(SizeReg); - BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg) + BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg) .addReg(TestReg) - .addReg(ZeroReg); + .addReg(ZeroReg) + .addImm(X86::COND_B); // FinalReg now holds final stack pointer value, or zero if // allocation would overflow. Compare against the current stack @@ -673,7 +673,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, .addReg(X86::GS); BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); // Jump if the desired stack pointer is at or above the stack limit. - BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); + BuildMI(&MBB, DL, TII.get(X86::JCC_1)).addMBB(ContinueMBB).addImm(X86::COND_AE); // Add code to roundMBB to round the final stack pointer to a page boundary. 
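// Concretely (a sketch; the AND itself falls outside this hunk's context),
// the rounding computes RoundedReg = FinalReg & ~(PageSize - 1), i.e. a
// mask of ~0xFFF for 4 KiB pages.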
RoundMBB->addLiveIn(FinalReg); @@ -710,7 +710,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) .addReg(RoundedReg) .addReg(ProbeReg); - BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); + BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)).addMBB(LoopMBB).addImm(X86::COND_NE); MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); @@ -794,8 +794,8 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addExternalSymbol(MF.createExternalSymbolName(Symbol)); } - unsigned AX = Is64Bit ? X86::RAX : X86::EAX; - unsigned SP = Is64Bit ? X86::RSP : X86::ESP; + unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX; + unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP; CI.addReg(AX, RegState::Implicit) .addReg(SP, RegState::Implicit) .addReg(AX, RegState::Define | RegState::Implicit) @@ -809,7 +809,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, // adjusting %rsp. // All other platforms do not specify a particular ABI for the stack probe // function, so we arbitrarily define it to not adjust %esp/%rsp itself. - BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP) + BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP) .addReg(SP) .addReg(AX); } @@ -872,6 +872,17 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, MI->getOperand(3).setIsDead(); } +bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { + // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be + // clobbered by any interrupt handler. + assert(&STI == &MF.getSubtarget<X86Subtarget>() && + "MF used frame lowering for wrong subtarget"); + const Function &Fn = MF.getFunction(); + const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv()); + return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone); +} + + /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to @@ -976,7 +987,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR; bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); - bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv()); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry(); // FIXME: Emit FPO data for EH funclets. @@ -1030,12 +1040,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. - if (Is64Bit && !Fn.hasFnAttribute(Attribute::NoRedZone) && + if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. !UseStackProbe && // No stack probes. - !IsWin64CC && // Win64 has no Red Zone !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); @@ -1774,6 +1783,15 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int64_t FPDelta = 0; + // In an x86 interrupt, remove the offset we added to account for the return + // address from any stack object allocated in the caller's frame. Interrupts + // do not have a standard return address. Fixed objects in the current frame, + // such as SSE register spills, should not get this treatment. + if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR && + Offset >= 0) { + Offset += getOffsetOfLocalArea(); + } + if (IsWin64Prologue) { assert(!MFI.hasCalls() || (StackSize % 16) == 8); @@ -1888,8 +1906,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, // If !hasReservedCallFrame the function might have SP adjustment in the // body. So, even though the offset is statically known, it depends on where // we are in the function. - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - if (!IgnoreSPUpdates && !TFI->hasReservedCallFrame(MF)) + if (!IgnoreSPUpdates && !hasReservedCallFrame(MF)) return getFrameIndexReference(MF, FI, FrameReg); // We don't handle tail calls, and shouldn't be seeing them either. @@ -2407,7 +2424,7 @@ void X86FrameLowering::adjustForSegmentedStacks( // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. - BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB); + BuildMI(checkMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_A); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. @@ -2637,7 +2654,7 @@ void X86FrameLowering::adjustForHiPEPrologue( // SPLimitOffset is in a fixed heap location (pointed by BP). addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB); + BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_AE); // Create new MBB for IncStack: BuildMI(incStackMBB, DL, TII.get(CALLop)). addExternalSymbol("inc_stack_0"); addRegOffset(BuildMI(incStackMBB, DL, TII.get(MOVrm), ScratchReg), SPReg, false, -MaxStack); addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); + BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)).addMBB(incStackMBB).addImm(X86::COND_LE); stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); @@ -2802,7 +2819,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, StackAdjustment += mergeSPUpdates(MBB, InsertPos, false); if (StackAdjustment) { - if (!(F.optForMinSize() && + if (!(F.hasMinSize() && adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment))) BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment, /*InEpilogue=*/false); @@ -3079,8 +3096,7 @@ void X86FrameLowering::orderFrameObjects( // Sort the objects using X86FrameSortingAlgorithm (see its comment for // info).
- std::stable_sort(SortingObjects.begin(), SortingObjects.end(), - X86FrameSortingComparator()); + llvm::stable_sort(SortingObjects, X86FrameSortingComparator()); // Now modify the original list to represent the final order that // we want. The order will depend on whether we're going to access them @@ -3154,7 +3170,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8; int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; int UnwindHelpFI = - MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); + MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false); EHInfo.UnwindHelpFrameIdx = UnwindHelpFI; // Store -2 into UnwindHelp on function entry. We have to scan forwards past diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 3bd805aae123..d32746e3a36e 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -1,9 +1,8 @@ //===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==// // -// The LLVM Compiler Infrastructure -- // -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -172,6 +171,10 @@ public: unsigned getInitialCFARegister(const MachineFunction &MF) const override; + /// Return true if the function has a redzone (accessible bytes past the + /// frame of the top of stack function) as part of its ABI. + bool has128ByteRedZone(const MachineFunction& MF) const; + private: uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; diff --git a/lib/Target/X86/X86GenRegisterBankInfo.def b/lib/Target/X86/X86GenRegisterBankInfo.def index 9cd3f96f83ac..0fdea9071c29 100644 --- a/lib/Target/X86/X86GenRegisterBankInfo.def +++ b/lib/Target/X86/X86GenRegisterBankInfo.def @@ -1,9 +1,8 @@ //===- X86GenRegisterBankInfo.def ----------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -- // -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 5ac153244df9..95d31e62cafc 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// // -// The LLVM Compiler Infrastructure -- // -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -74,6 +73,7 @@ namespace { int JT; unsigned Align; // CP alignment.
unsigned char SymbolFlags; // X86II::MO_* + bool NegateIndex = false; X86ISelAddressMode() : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), @@ -116,6 +116,8 @@ namespace { dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; dbgs() << " Scale " << Scale << '\n' << "IndexReg "; + if (NegateIndex) + dbgs() << "negate "; if (IndexReg.getNode()) IndexReg.getNode()->dump(DAG); else @@ -170,8 +172,8 @@ namespace { public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), OptForSize(false), - OptForMinSize(false) {} + : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false), + OptForMinSize(false), IndirectTlsSegRefs(false) {} StringRef getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -182,6 +184,13 @@ namespace { Subtarget = &MF.getSubtarget<X86Subtarget>(); IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( "indirect-tls-seg-refs"); + + // OptFor[Min]Size are used in pattern predicates that isel is matching. + OptForSize = MF.getFunction().hasOptSize(); + OptForMinSize = MF.getFunction().hasMinSize(); + assert((!OptForMinSize || OptForSize) && + "OptForMinSize implies OptForSize"); + SelectionDAGISel::runOnMachineFunction(MF); return true; } @@ -204,7 +213,7 @@ namespace { bool matchWrapper(SDValue N, X86ISelAddressMode &AM); bool matchAddress(SDValue N, X86ISelAddressMode &AM); bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); - bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); + bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); @@ -252,16 +261,32 @@ namespace { void emitSpecialCodeForMain(); inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, - SDValue &Base, SDValue &Scale, + MVT VT, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { - Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) - ? CurDAG->getTargetFrameIndex( - AM.Base_FrameIndex, - TLI->getPointerTy(CurDAG->getDataLayout())) - : AM.Base_Reg; + if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) + Base = CurDAG->getTargetFrameIndex( + AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout())); + else if (AM.Base_Reg.getNode()) + Base = AM.Base_Reg; + else + Base = CurDAG->getRegister(0, VT); + Scale = getI8Imm(AM.Scale, DL); - Index = AM.IndexReg; + + // Negate the index if needed. + if (AM.NegateIndex) { + unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r; + SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, + AM.IndexReg), 0); + AM.IndexReg = Neg; + } + + if (AM.IndexReg.getNode()) + Index = AM.IndexReg; + else + Index = CurDAG->getRegister(0, VT); + // These are 32-bit even in 64-bit mode since RIP-relative offset // is 32-bit. if (AM.GV) @@ -290,7 +315,7 @@ namespace { if (AM.Segment.getNode()) Segment = AM.Segment; else - Segment = CurDAG->getRegister(0, MVT::i32); + Segment = CurDAG->getRegister(0, MVT::i16); } // Utility function to determine whether we should avoid selecting // @@ -400,6 +425,19 @@ namespace { return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); } + // Helper to detect unneeded AND instructions on shift amounts. Called + // from PatFrags in tablegen.
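+  // Example: a 32-bit shift consumes only the low 5 bits of its amount, so
+  // for (x << (amt & 31)) this returns true with Width == 5: 31 has five
+  // trailing ones, and the AND cannot change the shift result.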
+ bool isUnneededShiftMask(SDNode *N, unsigned Width) const { + assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); + const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); + + if (Val.countTrailingOnes() >= Width) + return true; + + APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero; + return Mask.countTrailingOnes() >= Width; + } + /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. @@ -464,6 +502,8 @@ namespace { bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); + bool tryShrinkShlLogicImm(SDNode *N); + bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node); @@ -485,7 +525,7 @@ namespace { static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC || - Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) { + Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) { // We can get 256-bit 8 element types here without VLX being enabled. When // this happens we will use 512-bit operations and the mask will not be // zero extended. @@ -497,7 +537,7 @@ static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { } // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || - Opcode == X86ISD::FSETCCM_RND) + Opcode == X86ISD::FSETCCM_SAE) return true; return false; @@ -571,6 +611,21 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { Imm->getAPIntValue().getBitWidth() == 64 && Imm->getAPIntValue().isIntN(32)) return false; + + // If this is really a zext_inreg that can be represented with a movzx + // instruction, prefer that. + // TODO: We could shrink the load and fold if it is non-volatile. + if (U->getOpcode() == ISD::AND && + (Imm->getAPIntValue() == UINT8_MAX || + Imm->getAPIntValue() == UINT16_MAX || + Imm->getAPIntValue() == UINT32_MAX)) + return false; + + // ADD/SUB can negate the immediate and use the opposite operation + // to fit 128 into a sign extended 8 bit immediate. + if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && + (-Imm->getAPIntValue()).isSignedIntN(8)) + return false; } // If the other operand is a TLS address, we should fold it instead. @@ -720,11 +775,6 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { } void X86DAGToDAGISel::PreprocessISelDAG() { - // OptFor[Min]Size are used in pattern predicates that isel is matching. - OptForSize = MF->getFunction().optForSize(); - OptForMinSize = MF->getFunction().optForMinSize(); - assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize"); - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. @@ -741,6 +791,143 @@ void X86DAGToDAGISel::PreprocessISelDAG() { continue; } + switch (N->getOpcode()) { + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: { + // Replace vector fp_to_s/uint with their X86 specific equivalent so we + // don't need 2 sets of patterns.
+ if (!N->getSimpleValueType(0).isVector()) + break; + + unsigned NewOpc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; + case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; + } + SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), + N->getOperand(0)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: { + // Replace vector shifts with their X86 specific equivalent so we don't + // need 2 sets of patterns. + if (!N->getValueType(0).isVector()) + break; + + unsigned NewOpc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::SHL: NewOpc = X86ISD::VSHLV; break; + case ISD::SRA: NewOpc = X86ISD::VSRAV; break; + case ISD::SRL: NewOpc = X86ISD::VSRLV; break; + } + SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), + N->getOperand(0), N->getOperand(1)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND_VECTOR_INREG: { + // Replace vector any extend with the zero extend equivalents so we don't + // need 2 sets of patterns. Ignore vXi1 extensions. + if (!N->getValueType(0).isVector() || + N->getOperand(0).getScalarValueSizeInBits() == 1) + break; + + unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND + ? ISD::ZERO_EXTEND + : ISD::ZERO_EXTEND_VECTOR_INREG; + + SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), + N->getOperand(0)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FTRUNC: + case ISD::FNEARBYINT: + case ISD::FRINT: { + // Replace fp rounding with their X86 specific equivalent so we don't + // need 2 sets of patterns. + unsigned Imm; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::FCEIL: Imm = 0xA; break; + case ISD::FFLOOR: Imm = 0x9; break; + case ISD::FTRUNC: Imm = 0xB; break; + case ISD::FNEARBYINT: Imm = 0xC; break; + case ISD::FRINT: Imm = 0x4; break; + } + SDLoc dl(N); + SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, + N->getValueType(0), + N->getOperand(0), + CurDAG->getConstant(Imm, dl, MVT::i8)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + case X86ISD::FANDN: + case X86ISD::FAND: + case X86ISD::FOR: + case X86ISD::FXOR: { + // Widen scalar fp logic ops to vector to reduce isel patterns. + // FIXME: Can we do this during lowering/combine. + MVT VT = N->getSimpleValueType(0); + if (VT.isVector() || VT == MVT::f128) + break; + + MVT VecVT = VT == MVT::f64 ? 
MVT::v2f64 : MVT::v4f32; + SDLoc dl(N); + SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, + N->getOperand(0)); + SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, + N->getOperand(1)); + + SDValue Res; + if (Subtarget->hasSSE2()) { + EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); + Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0); + Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1); + unsigned Opc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; + case X86ISD::FAND: Opc = ISD::AND; break; + case X86ISD::FOR: Opc = ISD::OR; break; + case X86ISD::FXOR: Opc = ISD::XOR; break; + } + Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1); + Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res); + } else { + Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1); + } + Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, + CurDAG->getIntPtrConstant(0, dl)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + } + if (OptLevel != CodeGenOpt::None && // Only do this when the target can fold the load into the call or // jmp. @@ -786,65 +973,135 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // and the node legalization. As such this pass basically does "really // late" legalization of these inline with the X86 isel pass. // FIXME: This should only happen when not compiled with -O0. - if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) - continue; + switch (N->getOpcode()) { + default: continue; + case ISD::FP_ROUND: + case ISD::FP_EXTEND: + { + MVT SrcVT = N->getOperand(0).getSimpleValueType(); + MVT DstVT = N->getSimpleValueType(0); + + // If any of the sources are vectors, no fp stack involved. + if (SrcVT.isVector() || DstVT.isVector()) + continue; - MVT SrcVT = N->getOperand(0).getSimpleValueType(); - MVT DstVT = N->getSimpleValueType(0); + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. + const X86TargetLowering *X86Lowering = + static_cast(TLI); + bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); + if (SrcIsSSE && DstIsSSE) + continue; - // If any of the sources are vectors, no fp stack involved. - if (SrcVT.isVector() || DstVT.isVector()) - continue; + if (!SrcIsSSE && !DstIsSSE) { + // If this is an FPStack extension, it is a noop. + if (N->getOpcode() == ISD::FP_EXTEND) + continue; + // If this is a value-preserving FPStack truncation, it is a noop. + if (N->getConstantOperandVal(1)) + continue; + } - // If the source and destination are SSE registers, then this is a legal - // conversion that should not be lowered. - const X86TargetLowering *X86Lowering = - static_cast(TLI); - bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); - bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); - if (SrcIsSSE && DstIsSSE) - continue; + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. + // FPStack has extload and truncstore. SSE can fold direct loads into other + // operations. Based on this, decide what we want to do. + MVT MemVT; + if (N->getOpcode() == ISD::FP_ROUND) + MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. + else + MemVT = SrcIsSSE ? 
SrcVT : DstVT; + + SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + SDLoc dl(N); - if (!SrcIsSSE && !DstIsSSE) { - // If this is an FPStack extension, it is a noop. - if (N->getOpcode() == ISD::FP_EXTEND) + // FIXME: optimize the case where the src/dest is a load or store? + + SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0), + MemTmp, MachinePointerInfo(), MemVT); + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, + MachinePointerInfo(), MemVT); + + // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the + // extload we created. This will cause general havok on the dag because + // anything below the conversion could be folded into other existing nodes. + // To avoid invalidating 'I', back it up to the convert node. + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + break; + } + + //The sequence of events for lowering STRICT_FP versions of these nodes requires + //dealing with the chain differently, as there is already a preexisting chain. + case ISD::STRICT_FP_ROUND: + case ISD::STRICT_FP_EXTEND: + { + MVT SrcVT = N->getOperand(1).getSimpleValueType(); + MVT DstVT = N->getSimpleValueType(0); + + // If any of the sources are vectors, no fp stack involved. + if (SrcVT.isVector() || DstVT.isVector()) continue; - // If this is a value-preserving FPStack truncation, it is a noop. - if (N->getConstantOperandVal(1)) + + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. + const X86TargetLowering *X86Lowering = + static_cast(TLI); + bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); + if (SrcIsSSE && DstIsSSE) continue; - } - // Here we could have an FP stack truncation or an FPStack <-> SSE convert. - // FPStack has extload and truncstore. SSE can fold direct loads into other - // operations. Based on this, decide what we want to do. - MVT MemVT; - if (N->getOpcode() == ISD::FP_ROUND) - MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. - else - MemVT = SrcIsSSE ? SrcVT : DstVT; + if (!SrcIsSSE && !DstIsSSE) { + // If this is an FPStack extension, it is a noop. + if (N->getOpcode() == ISD::STRICT_FP_EXTEND) + continue; + // If this is a value-preserving FPStack truncation, it is a noop. + if (N->getConstantOperandVal(2)) + continue; + } + + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. + // FPStack has extload and truncstore. SSE can fold direct loads into other + // operations. Based on this, decide what we want to do. + MVT MemVT; + if (N->getOpcode() == ISD::STRICT_FP_ROUND) + MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. + else + MemVT = SrcIsSSE ? SrcVT : DstVT; + + SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + SDLoc dl(N); + + // FIXME: optimize the case where the src/dest is a load or store? - SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); - SDLoc dl(N); + //Since the operation is StrictFP, use the preexisting chain. + SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1), + MemTmp, MachinePointerInfo(), MemVT); + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, + MachinePointerInfo(), MemVT); - // FIXME: optimize the case where the src/dest is a load or store? 
-    SDValue Store =
-        CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
-                              MemTmp, MachinePointerInfo(), MemVT);
-    SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
-                                        MachinePointerInfo(), MemVT);
+      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+      // extload we created. This will cause general havoc on the DAG because
+      // anything below the conversion could be folded into other existing nodes.
+      // To avoid invalidating 'I', back it up to the convert node.
+      --I;
+      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
+      break;
+    }
+    }

-    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
-    // extload we created. This will cause general havok on the dag because
-    // anything below the conversion could be folded into other existing nodes.
-    // To avoid invalidating 'I', back it up to the convert node.
-    --I;
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);

     // Now that we did that, the node is dead. Increment the iterator to the
     // next node to process, then delete N.
     ++I;
     CurDAG->DeleteNode(N);
   }
+
+  // The load+call transform above can leave some dead nodes in the graph. Make
+  // sure we remove them. It's possible some of the other transforms do so too,
+  // so just remove dead nodes unconditionally.
+  CurDAG->RemoveDeadNodes();
 }

 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
@@ -1138,15 +1395,23 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
   if (AM.hasSymbolicDisplacement())
     return true;

+  bool IsRIPRelTLS = false;
   bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
+  if (IsRIPRel) {
+    SDValue Val = N.getOperand(0);
+    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+      IsRIPRelTLS = true;
+  }

-  // We can't use an addressing mode in the 64-bit large code model. In the
-  // medium code model, we use can use an mode when RIP wrappers are present.
-  // That signifies access to globals that are known to be "near", such as the
-  // GOT itself.
+  // We can't use an addressing mode in the 64-bit large code model.
+  // Global TLS addressing is an exception. In the medium code model,
+  // we can use a mode when RIP wrappers are present.
+  // That signifies access to globals that are known to be "near",
+  // such as the GOT itself.
   CodeModel::Model M = TM.getCodeModel();
   if (Subtarget->is64Bit() &&
-      (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel)))
+      ((M == CodeModel::Large && !IsRIPRelTLS) ||
+       (M == CodeModel::Medium && !IsRIPRel)))
     return true;

   // Base and index reg must be 0 in order to use %rip as base.
@@ -1212,20 +1477,25 @@ bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
   // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
   // because it has a smaller encoding.
   // TODO: Which other code models can use this?
- if (TM.getCodeModel() == CodeModel::Small && - Subtarget->is64Bit() && - AM.Scale == 1 && - AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == nullptr && - AM.IndexReg.getNode() == nullptr && - AM.SymbolFlags == X86II::MO_NO_FLAG && - AM.hasSymbolicDisplacement()) - AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); + switch (TM.getCodeModel()) { + default: break; + case CodeModel::Small: + case CodeModel::Kernel: + if (Subtarget->is64Bit() && + AM.Scale == 1 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr && + AM.SymbolFlags == X86II::MO_NO_FLAG && + AM.hasSymbolicDisplacement()) + AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); + break; + } return false; } -bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, +bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth) { // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. @@ -1317,6 +1587,7 @@ static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, insertDAGNode(DAG, N, ShlCount); insertDAGNode(DAG, N, Shl); DAG.ReplaceAllUsesWith(N, Shl); + DAG.RemoveDeadNode(N.getNode()); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); return false; @@ -1326,13 +1597,31 @@ static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // allows us to fold the shift into this addressing mode. Returns false if the // transform succeeded. static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, - uint64_t Mask, - SDValue Shift, SDValue X, X86ISelAddressMode &AM) { + SDValue Shift = N.getOperand(0); + + // Use a signed mask so that shifting right will insert sign bits. These + // bits will be removed when we shift the result left so it doesn't matter + // what we use. This might allow a smaller immediate encoding. + int64_t Mask = cast(N->getOperand(1))->getSExtValue(); + + // If we have an any_extend feeding the AND, look through it to see if there + // is a shift behind it. But only if the AND doesn't use the extended bits. + // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? + bool FoundAnyExtend = false; + if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && + Shift.getOperand(0).getSimpleValueType() == MVT::i32 && + isUInt<32>(Mask)) { + FoundAnyExtend = true; + Shift = Shift.getOperand(0); + } + if (Shift.getOpcode() != ISD::SHL || !isa(Shift.getOperand(1))) return true; + SDValue X = Shift.getOperand(0); + // Not likely to be profitable if either the AND or SHIFT node has more // than one use (unless all uses are for address computation). Besides, // isel mechanism requires their node ids to be reused. 
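(Aside, not part of the patch: the rewrite above leans on a scalar identity --
masking after a left shift equals shifting after masking with the right-shifted
mask, because the low ShiftAmt bits of the shifted value are already zero.
A minimal standalone check in plain C++:)

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical values; any Y, M and S < 64 behave the same way.
      uint64_t Y = 0x123456789abcdef0ULL, M = 0xff00ULL;
      unsigned S = 3;
      // (Y << S) & M == ((Y & (M >> S)) << S): bits of M below position S can
      // never match a set bit of (Y << S), so pre-shifting the mask loses
      // nothing, and the outer shift can become an addressing-mode scale.
      assert(((Y << S) & M) == ((Y & (M >> S)) << S));
      return 0;
    }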
@@ -1346,6 +1635,12 @@ static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, MVT VT = N.getSimpleValueType(); SDLoc DL(N); + if (FoundAnyExtend) { + SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); + insertDAGNode(DAG, N, NewX); + X = NewX; + } + SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); @@ -1359,6 +1654,7 @@ static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, insertDAGNode(DAG, N, NewAnd); insertDAGNode(DAG, N, NewShift); DAG.ReplaceAllUsesWith(N, NewShift); + DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << ShiftAmt; AM.IndexReg = NewAnd; @@ -1469,6 +1765,7 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); + DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewSRL; @@ -1527,6 +1824,7 @@ static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); + DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewAnd; @@ -1634,14 +1932,15 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + assert(N.getSimpleValueType().getSizeInBits() <= 64 && + "Unexpected value size!"); + SDValue And = N.getOperand(0); if (And.getOpcode() != ISD::AND) break; SDValue X = And.getOperand(0); - // We only handle up to 64-bit values here as those are what matter for - // addressing mode optimizations. - if (X.getSimpleValueType().getSizeInBits() > 64) break; - // The mask used for the transform is expected to be post-shift, but we // found the shift first so just apply the shift to the mask before passing // it down. @@ -1712,9 +2011,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { + N = Handle.getValue(); AM = Backup; break; } + N = Handle.getValue(); // Test if the index field is free for use. if (AM.IndexReg.getNode() || AM.isRIPRelative()) { AM = Backup; @@ -1722,7 +2023,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } int Cost = 0; - SDValue RHS = Handle.getValue().getOperand(1); + SDValue RHS = N.getOperand(1); // If the RHS involves a register with multiple uses, this // transformation incurs an extra mov, due to the neg instruction // clobbering its operand. @@ -1735,9 +2036,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ++Cost; // If the base is a register with multiple uses, this // transformation may save a mov. - // FIXME: Don't rely on DELETED_NODEs. if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && - AM.Base_Reg->getOpcode() != ISD::DELETED_NODE && !AM.Base_Reg.getNode()->hasOneUse()) || AM.BaseType == X86ISelAddressMode::FrameIndexBase) --Cost; @@ -1754,14 +2053,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } // Ok, the transformation is legal and appears profitable. Go for it. 
- SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType()); - SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS); - AM.IndexReg = Neg; + // Negation will be emitted later to avoid creating dangling nodes if this + // was an unprofitable LEA. + AM.IndexReg = RHS; + AM.NegateIndex = true; AM.Scale = 1; - - // Insert the new nodes into the topological ordering. - insertDAGNode(*CurDAG, Handle.getValue(), Zero); - insertDAGNode(*CurDAG, Handle.getValue(), Neg); return false; } @@ -1789,37 +2085,77 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; - SDValue Shift = N.getOperand(0); - if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; - SDValue X = Shift.getOperand(0); - // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. - if (X.getSimpleValueType().getSizeInBits() > 64) break; + assert(N.getSimpleValueType().getSizeInBits() <= 64 && + "Unexpected value size!"); if (!isa(N.getOperand(1))) break; - uint64_t Mask = N.getConstantOperandVal(1); - // Try to fold the mask and shift into an extract and scale. - if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) - return false; + if (N.getOperand(0).getOpcode() == ISD::SRL) { + SDValue Shift = N.getOperand(0); + SDValue X = Shift.getOperand(0); - // Try to fold the mask and shift directly into the scale. - if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) - return false; + uint64_t Mask = N.getConstantOperandVal(1); + + // Try to fold the mask and shift into an extract and scale. + if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to fold the mask and shift directly into the scale. + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to fold the mask and shift into BEXTR and scale. + if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) + return false; + } // Try to swap the mask and shift to place shifts which can be done as // a scale on the outside of the mask. - if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) - return false; - - // Try to fold the mask and shift into BEXTR and scale. - if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) + if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM)) return false; break; } + case ISD::ZERO_EXTEND: { + // Try to widen a zexted shift left to the same size as its use, so we can + // match the shift as a scale factor. + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) + break; + if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse()) + break; + + // Give up if the shift is not a valid scale factor [1,2,3]. + SDValue Shl = N.getOperand(0); + auto *ShAmtC = dyn_cast(Shl.getOperand(1)); + if (!ShAmtC || ShAmtC->getZExtValue() > 3) + break; + + // The narrow shift must only shift out zero bits (it must be 'nuw'). + // That makes it safe to widen to the destination type. 
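+    // Illustrative example (not in the patch): if i8 %x is known to be < 32,
+    // then (zext (shl nuw i8 %x, 2) to i32) becomes
+    // (shl (zext i8 %x to i32), 2), and the shift folds away as scale 4.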
+ APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(), + ShAmtC->getZExtValue()); + if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros)) + break; + + // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C) + MVT VT = N.getSimpleValueType(); + SDLoc DL(N); + SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0)); + SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1)); + + // Convert the shift to scale factor. + AM.Scale = 1 << ShAmtC->getZExtValue(); + AM.IndexReg = Zext; + + insertDAGNode(*CurDAG, N, Zext); + insertDAGNode(*CurDAG, N, NewShl); + CurDAG->ReplaceAllUsesWith(N, NewShl); + CurDAG->RemoveDeadNode(N.getNode()); + return false; + } } return matchAddressBase(N, AM); @@ -1885,17 +2221,14 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, if (AddrSpace == 258) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + // Try to match into the base and displacement fields. if (matchVectorAddress(N, AM)) return false; - MVT VT = N.getSimpleValueType(); - if (AM.BaseType == X86ISelAddressMode::RegBase) { - if (!AM.Base_Reg.getNode()) - AM.Base_Reg = CurDAG->getRegister(0, VT); - } - - getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } @@ -1917,6 +2250,8 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores Parent->getOpcode() != X86ISD::TLSCALL && // Fixme + Parent->getOpcode() != X86ISD::ENQCMD && // Fixme + Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp unsigned AddrSpace = @@ -1930,19 +2265,14 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); } - if (matchAddress(N, AM)) - return false; - + // Save the DL and VT before calling matchAddress, it can invalidate N. + SDLoc DL(N); MVT VT = N.getSimpleValueType(); - if (AM.BaseType == X86ISelAddressMode::RegBase) { - if (!AM.Base_Reg.getNode()) - AM.Base_Reg = CurDAG->getRegister(0, VT); - } - if (!AM.IndexReg.getNode()) - AM.IndexReg = CurDAG->getRegister(0, VT); + if (matchAddress(N, AM)) + return false; - getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } @@ -1974,12 +2304,14 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, if (!hasSingleUsesFromRoot(Root, Parent)) return false; - // We can allow a full vector load here since narrowing a load is ok. + // We can allow a full vector load here since narrowing a load is ok unless + // it's volatile. 
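+  // (Annotation, not in the patch: folding the load would narrow the memory
+  // access, and volatile accesses must be performed exactly as written, so
+  // volatile loads are left alone.)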
  if (ISD::isNON_EXTLoad(N.getNode())) {
-    PatternNodeWithChain = N;
-    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
-        IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
-      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    if (!LD->isVolatile() &&
+        IsProfitableToFold(N, LD, Root) &&
+        IsLegalToFold(N, Parent, Root, OptLevel)) {
+      PatternNodeWithChain = N;
       return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
                         Segment);
     }
@@ -2010,23 +2342,6 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
     }
   }

-  // Also handle the case where we explicitly require zeros in the top
-  // elements. This is a vector shuffle from the zero vector.
-  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
-      // Check to see if the top elements are all zeros (or bitcast of zeros).
-      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
-      N.getOperand(0).getNode()->hasOneUse()) {
-    PatternNodeWithChain = N.getOperand(0).getOperand(0);
-    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
-        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
-        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
-      // Okay, this is a zero extending load. Fold it.
-      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
-      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
-                        Segment);
-    }
-  }
-
   return false;
 }

@@ -2077,14 +2392,12 @@ bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
   RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
   if (RN && RN->getReg() == 0)
     Base = CurDAG->getRegister(0, MVT::i64);
-  else if (Base.getValueType() == MVT::i32 && !dyn_cast<RegisterSDNode>(Base)) {
+  else if (Base.getValueType() == MVT::i32 && !isa<RegisterSDNode>(Base)) {
     // Base could already be %rip, particularly in the x32 ABI.
-    Base = SDValue(CurDAG->getMachineNode(
-                       TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
-                       CurDAG->getTargetConstant(0, DL, MVT::i64),
-                       Base,
-                       CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
-                   0);
+    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
+                                                     MVT::i64), 0);
+    Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
+                                         Base);
   }

   RN = dyn_cast<RegisterSDNode>(Index);
   if (RN && RN->getReg() == 0)
     Index = CurDAG->getRegister(0, MVT::i64);
   else {
     assert(Index.getValueType() == MVT::i32 &&
            "Expect to be extending 32-bit registers for use in LEA");
-    Index = SDValue(CurDAG->getMachineNode(
-                        TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
-                        CurDAG->getTargetConstant(0, DL, MVT::i64),
-                        Index,
-                        CurDAG->getTargetConstant(X86::sub_32bit, DL,
-                                                  MVT::i32)),
-                    0);
+    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
+                                                     MVT::i64), 0);
+    Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
+                                          Index);
   }

   return true;
@@ -2128,18 +2438,13 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
   AM.Segment = Copy;

   unsigned Complexity = 0;
-  if (AM.BaseType == X86ISelAddressMode::RegBase)
-    if (AM.Base_Reg.getNode())
-      Complexity = 1;
-    else
-      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
+    Complexity = 1;
   else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
     Complexity = 4;

   if (AM.IndexReg.getNode())
     Complexity++;
-  else
-    AM.IndexReg = CurDAG->getRegister(0, VT);

   // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
   // a simple shift.
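(Aside, not part of the patch: a compact scalar model of the complexity
scoring in selectLEAAddr, with hypothetical flags standing in for the matched
address parts; the real code also scores globals and scale factors in the
same way, as the next hunk shows.)

    struct AddrParts {
      bool HasBaseReg, HasFrameIndex, HasIndexReg, HasDisp;
    };

    // An LEA only pays off once the address combines enough components that
    // the equivalent add/shift sequence would be longer (score above 2).
    static bool worthUsingLEA(const AddrParts &P) {
      int Complexity = 0;
      if (P.HasBaseReg)
        Complexity = 1;
      else if (P.HasFrameIndex)
        Complexity = 4;
      if (P.HasIndexReg)
        ++Complexity;
      if (P.HasDisp)
        ++Complexity;
      return Complexity > 2;
    }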
@@ -2159,14 +2464,14 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
       Complexity += 2;
   }

-  if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+  if (AM.Disp)
     Complexity++;

   // If it isn't worth using an LEA, reject it.
   if (Complexity <= 2)
     return false;

-  getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
+  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
   return true;
 }

@@ -2180,17 +2485,15 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
   X86ISelAddressMode AM;
   AM.GV = GA->getGlobal();
   AM.Disp += GA->getOffset();
-  AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
   AM.SymbolFlags = GA->getTargetFlags();

-  if (N.getValueType() == MVT::i32) {
+  MVT VT = N.getSimpleValueType();
+  if (VT == MVT::i32) {
     AM.Scale = 1;
     AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
-  } else {
-    AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
   }

-  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
   return true;
 }

@@ -2274,14 +2577,22 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
          CR->getSignedMax().slt(1ull << Width);
 }

-static X86::CondCode getCondFromOpc(unsigned Opc) {
+static X86::CondCode getCondFromNode(SDNode *N) {
+  assert(N->isMachineOpcode() && "Unexpected node");
   X86::CondCode CC = X86::COND_INVALID;
-  if (CC == X86::COND_INVALID)
-    CC = X86::getCondFromBranchOpc(Opc);
-  if (CC == X86::COND_INVALID)
-    CC = X86::getCondFromSETOpc(Opc);
-  if (CC == X86::COND_INVALID)
-    CC = X86::getCondFromCMovOpc(Opc);
+  unsigned Opc = N->getMachineOpcode();
+  if (Opc == X86::JCC_1)
+    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
+  else if (Opc == X86::SETCCr)
+    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
+  else if (Opc == X86::SETCCm)
+    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
+  else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
+           Opc == X86::CMOV64rr)
+    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
+  else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
+           Opc == X86::CMOV64rm)
+    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));

   return CC;
 }

@@ -2307,7 +2618,7 @@ bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
       // Anything unusual: assume conservatively.
       if (!FlagUI->isMachineOpcode()) return false;
       // Examine the condition code of the user.
-      X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
+      X86::CondCode CC = getCondFromNode(*FlagUI);

       switch (CC) {
       // Comparisons which only use the zero flag.
@@ -2343,7 +2654,7 @@ bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
       // Anything unusual: assume conservatively.
       if (!FlagUI->isMachineOpcode()) return false;
       // Examine the condition code of the user.
-      X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
+      X86::CondCode CC = getCondFromNode(*FlagUI);

       switch (CC) {
       // Comparisons which don't examine the SF flag.
@@ -2404,7 +2715,7 @@ static bool mayUseCarryFlag(X86::CondCode CC) {
       if (!FlagUI->isMachineOpcode())
         return false;
       // Examine the condition code of the user.
- X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode()); + X86::CondCode CC = getCondFromNode(*FlagUI); if (mayUseCarryFlag(CC)) return false; @@ -2582,10 +2893,13 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { return false; bool IsCommutable = false; + bool IsNegate = false; switch (Opc) { default: return false; case X86ISD::SUB: + IsNegate = isNullConstant(StoredVal.getOperand(0)); + break; case X86ISD::SBB: break; case X86ISD::ADD: @@ -2597,7 +2911,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { break; } - unsigned LoadOpNo = 0; + unsigned LoadOpNo = IsNegate ? 1 : 0; LoadSDNode *LoadNode = nullptr; SDValue InputChain; if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, @@ -2635,11 +2949,20 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { MachineSDNode *Result; switch (Opc) { - case X86ISD::ADD: case X86ISD::SUB: + // Handle negate. + if (IsNegate) { + unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, + X86::NEG8m); + const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; + Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, + MVT::Other, Ops); + break; + } + LLVM_FALLTHROUGH; + case X86ISD::ADD: // Try to match inc/dec. - if (!Subtarget->slowIncDec() || - CurDAG->getMachineFunction().getFunction().optForSize()) { + if (!Subtarget->slowIncDec() || OptForSize) { bool IsOne = isOneConstant(StoredVal.getOperand(1)); bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. @@ -2740,16 +3063,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { // See if the operand is a constant that we can fold into an immediate // operand. if (auto *OperandC = dyn_cast(Operand)) { - auto OperandV = OperandC->getAPIntValue(); + int64_t OperandV = OperandC->getSExtValue(); // Check if we can shrink the operand enough to fit in an immediate (or // fit into a smaller immediate) by negating it and switching the // operation. if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && - ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 && - (-OperandV).getMinSignedBits() <= 8) || - (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 && - (-OperandV).getMinSignedBits() <= 32)) && + ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) || + (MemVT == MVT::i64 && !isInt<32>(OperandV) && + isInt<32>(-OperandV))) && hasNoCarryFlagUses(StoredVal.getValue(1))) { OperandV = -OperandV; Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; @@ -2757,11 +3079,10 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { // First try to fit this into an Imm8 operand. If it doesn't fit, then try // the larger immediate operand. - if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) { + if (MemVT != MVT::i8 && isInt<8>(OperandV)) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImm8Opcode(Opc); - } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() && - (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) { + } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImmOpcode(Opc); } @@ -2821,8 +3142,6 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { if (NVT != MVT::i32 && NVT != MVT::i64) return false; - unsigned Size = NVT.getSizeInBits(); - SDValue NBits; // If we have BMI2's BZHI, we are ok with muti-use patterns. 
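(Aside, not part of the patch: every mask shape matched below means "keep the
low NBits bits", which is exactly what BZHI computes. A scalar model of the
equivalences, checked on plain integers:)

    #include <cassert>
    #include <cstdint>

    // Models BZHI for 0 < N < 64: zero all bits at positions >= N.
    static uint64_t bzhiModel(uint64_t X, unsigned N) {
      return X & ((uint64_t(1) << N) - 1);
    }

    int main() {
      uint64_t X = 0xdeadbeefcafef00dULL;
      unsigned N = 13;
      assert((X & ((uint64_t(1) << N) - 1)) == bzhiModel(X, N));   // pattern a
      assert((X & ~(~uint64_t(0) << N)) == bzhiModel(X, N));       // pattern b
      assert((X & (~uint64_t(0) >> (64 - N))) == bzhiModel(X, N)); // pattern c
      assert(((X << (64 - N)) >> (64 - N)) == bzhiModel(X, N));    // pattern d
      return 0;
    }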
@@ -2835,16 +3154,27 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); }; auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); }; + auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { + if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { + assert(V.getSimpleValueType() == MVT::i32 && + V.getOperand(0).getSimpleValueType() == MVT::i64 && + "Expected i64 -> i32 truncation"); + V = V.getOperand(0); + } + return V; + }; + // a) x & ((1 << nbits) + (-1)) - auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool { + auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, + &NBits](SDValue Mask) -> bool { // Match `add`. Must only have one use! if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) return false; // We should be adding all-ones constant (i.e. subtracting one.) if (!isAllOnesConstant(Mask->getOperand(1))) return false; - // Match `1 << nbits`. Must only have one use! - SDValue M0 = Mask->getOperand(0); + // Match `1 << nbits`. Might be truncated. Must only have one use! + SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; if (!isOneConstant(M0->getOperand(0))) @@ -2853,23 +3183,36 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { return true; }; + auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { + V = peekThroughOneUseTruncation(V); + return CurDAG->MaskedValueIsAllOnes( + V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(), + NVT.getSizeInBits())); + }; + // b) x & ~(-1 << nbits) - auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool { + auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, + &NBits](SDValue Mask) -> bool { // Match `~()`. Must only have one use! - if (!isBitwiseNot(Mask) || !checkOneUse(Mask)) + if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) return false; - // Match `-1 << nbits`. Must only have one use! - SDValue M0 = Mask->getOperand(0); + // The -1 only has to be all-ones for the final Node's NVT. + if (!isAllOnes(Mask->getOperand(1))) + return false; + // Match `-1 << nbits`. Might be truncated. Must only have one use! + SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; - if (!isAllOnesConstant(M0->getOperand(0))) + // The -1 only has to be all-ones for the final Node's NVT. + if (!isAllOnes(M0->getOperand(0))) return false; NBits = M0->getOperand(1); return true; }; // Match potentially-truncated (bitwidth - y) - auto matchShiftAmt = [checkOneUse, Size, &NBits](SDValue ShiftAmt) { + auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt, + unsigned Bitwidth) { // Skip over a truncate of the shift amount. if (ShiftAmt.getOpcode() == ISD::TRUNCATE) { ShiftAmt = ShiftAmt.getOperand(0); @@ -2881,52 +3224,56 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { if (ShiftAmt.getOpcode() != ISD::SUB) return false; auto V0 = dyn_cast(ShiftAmt.getOperand(0)); - if (!V0 || V0->getZExtValue() != Size) + if (!V0 || V0->getZExtValue() != Bitwidth) return false; NBits = ShiftAmt.getOperand(1); return true; }; // c) x & (-1 >> (32 - y)) - auto matchPatternC = [&checkOneUse, matchShiftAmt](SDValue Mask) -> bool { + auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, + matchShiftAmt](SDValue Mask) -> bool { + // The mask itself may be truncated. 
+ Mask = peekThroughOneUseTruncation(Mask); + unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); // Match `l>>`. Must only have one use! if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) return false; - // We should be shifting all-ones constant. + // We should be shifting truly all-ones constant. if (!isAllOnesConstant(Mask.getOperand(0))) return false; SDValue M1 = Mask.getOperand(1); // The shift amount should not be used externally. if (!checkOneUse(M1)) return false; - return matchShiftAmt(M1); + return matchShiftAmt(M1, Bitwidth); }; SDValue X; // d) x << (32 - y) >> (32 - y) - auto matchPatternD = [&checkOneUse, &checkTwoUse, matchShiftAmt, + auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt, &X](SDNode *Node) -> bool { if (Node->getOpcode() != ISD::SRL) return false; SDValue N0 = Node->getOperand(0); if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0)) return false; + unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); SDValue N1 = Node->getOperand(1); SDValue N01 = N0->getOperand(1); // Both of the shifts must be by the exact same value. // There should not be any uses of the shift amount outside of the pattern. if (N1 != N01 || !checkTwoUse(N1)) return false; - if (!matchShiftAmt(N1)) + if (!matchShiftAmt(N1, Bitwidth)) return false; X = N0->getOperand(0); return true; }; - auto matchLowBitMask = [&matchPatternA, &matchPatternB, - &matchPatternC](SDValue Mask) -> bool { - // FIXME: pattern c. + auto matchLowBitMask = [matchPatternA, matchPatternB, + matchPatternC](SDValue Mask) -> bool { return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); }; @@ -2946,42 +3293,46 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { SDLoc DL(Node); - // If we do *NOT* have BMI2, let's find out if the if the 'X' is *logically* - // shifted (potentially with one-use trunc inbetween), - // and if so look past one-use truncation. - MVT XVT = NVT; - if (!Subtarget->hasBMI2() && X.getOpcode() == ISD::TRUNCATE && - X.hasOneUse() && X.getOperand(0).getOpcode() == ISD::SRL) { - assert(NVT == MVT::i32 && "Expected target valuetype to be i32"); - X = X.getOperand(0); - XVT = X.getSimpleValueType(); - assert(XVT == MVT::i64 && "Expected truncation from i64"); - } + // Truncate the shift amount. + NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits); + insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); - SDValue OrigNBits = NBits; - if (NBits.getValueType() != XVT) { - // Truncate the shift amount. - NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits); - insertDAGNode(*CurDAG, OrigNBits, NBits); - - // Insert 8-bit NBits into lowest 8 bits of XVT-sized (32 or 64-bit) - // register. All the other bits are undefined, we do not care about them. - SDValue ImplDef = - SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, XVT), 0); - insertDAGNode(*CurDAG, OrigNBits, ImplDef); - NBits = - CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, XVT, ImplDef, NBits); - insertDAGNode(*CurDAG, OrigNBits, NBits); - } + // Insert 8-bit NBits into lowest 8 bits of 32-bit register. + // All the other bits are undefined, we do not care about them. + SDValue ImplDef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0); + insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef); + NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef, + NBits); + insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); if (Subtarget->hasBMI2()) { // Great, just emit the the BZHI.. 
- SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, XVT, X, NBits); + if (NVT != MVT::i32) { + // But have to place the bit count into the wide-enough register first. + NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits); + insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); + } + + SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits); ReplaceNode(Node, Extract.getNode()); SelectCode(Extract.getNode()); return true; } + // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is + // *logically* shifted (potentially with one-use trunc inbetween), + // and the truncation was the only use of the shift, + // and if so look past one-use truncation. + { + SDValue RealX = peekThroughOneUseTruncation(X); + // FIXME: only if the shift is one-use? + if (RealX != X && RealX.getOpcode() == ISD::SRL) + X = RealX; + } + + MVT XVT = X.getSimpleValueType(); + // Else, emitting BEXTR requires one more step. // The 'control' of BEXTR has the pattern of: // [15...8 bit][ 7...0 bit] location @@ -2991,10 +3342,11 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { // Shift NBits left by 8 bits, thus producing 'control'. // This makes the low 8 bits to be zero. SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); - SDValue Control = CurDAG->getNode(ISD::SHL, DL, XVT, NBits, C8); - insertDAGNode(*CurDAG, OrigNBits, Control); + SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8); + insertDAGNode(*CurDAG, SDValue(Node, 0), Control); // If the 'X' is *logically* shifted, we can fold that shift into 'control'. + // FIXME: only if the shift is one-use? if (X.getOpcode() == ISD::SRL) { SDValue ShiftAmt = X.getOperand(1); X = X.getOperand(0); @@ -3003,13 +3355,20 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { "Expected shift amount to be i8"); // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! + // We could zext to i16 in some form, but we intentionally don't do that. SDValue OrigShiftAmt = ShiftAmt; - ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, XVT, ShiftAmt); + ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt); insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt); // And now 'or' these low 8 bits of shift amount into the 'control'. - Control = CurDAG->getNode(ISD::OR, DL, XVT, Control, ShiftAmt); - insertDAGNode(*CurDAG, OrigNBits, Control); + Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt); + insertDAGNode(*CurDAG, SDValue(Node, 0), Control); + } + + // But have to place the 'control' into the wide-enough register first. + if (XVT != MVT::i32) { + Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control); + insertDAGNode(*CurDAG, SDValue(Node, 0), Control); } // And finally, form the BEXTR itself. @@ -3017,7 +3376,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { // The 'X' was originally truncated. Do that now. if (XVT != NVT) { - insertDAGNode(*CurDAG, OrigNBits, Extract); + insertDAGNode(*CurDAG, SDValue(Node, 0), Extract); Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract); } @@ -3098,14 +3457,14 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; - SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); + SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. 
-    ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
     // Record the mem-refs
     CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
   } else {
-    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
   }

   return NewNode;
@@ -3263,6 +3622,119 @@ bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
   return true;
 }

+bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
+  MVT NVT = N->getSimpleValueType(0);
+  unsigned Opcode = N->getOpcode();
+  SDLoc dl(N);
+
+  // For operations of the form (x << C1) op C2, check if we can use a smaller
+  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+  SDValue Shift = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+  if (!Cst)
+    return false;
+
+  int64_t Val = Cst->getSExtValue();
+
+  // If we have an any_extend feeding the AND, look through it to see if there
+  // is a shift behind it. But only if the AND doesn't use the extended bits.
+  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
+  bool FoundAnyExtend = false;
+  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
+      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
+      isUInt<32>(Val)) {
+    FoundAnyExtend = true;
+    Shift = Shift.getOperand(0);
+  }
+
+  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
+    return false;
+
+  // i8 is unshrinkable, i16 should be promoted to i32.
+  if (NVT != MVT::i32 && NVT != MVT::i64)
+    return false;
+
+  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  if (!ShlCst)
+    return false;
+
+  uint64_t ShAmt = ShlCst->getZExtValue();
+
+  // Make sure that we don't change the operation by removing bits.
+  // This only matters for OR and XOR, AND is unaffected.
+  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
+  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+    return false;
+
+  // Check the minimum bitwidth for the new constant.
+  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
+    if (Opcode == ISD::AND) {
+      // AND32ri is the same as AND64ri32 with zext imm.
+      // Try this before sign extended immediates below.
+      ShiftedVal = (uint64_t)Val >> ShAmt;
+      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
+        return true;
+      // Also swap order when the AND can become MOVZX.
+      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
+        return true;
+    }
+    ShiftedVal = Val >> ShAmt;
+    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
+        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
+      return true;
+    if (Opcode != ISD::AND) {
+      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
+      ShiftedVal = (uint64_t)Val >> ShAmt;
+      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
+        return true;
+    }
+    return false;
+  };
+
+  int64_t ShiftedVal;
+  if (!CanShrinkImmediate(ShiftedVal))
+    return false;
+
+  // Ok, we can reorder to get a smaller immediate.
+
+  // But, it's possible the original immediate allowed an AND to become MOVZX.
+  // Doing this check late to delay the MaskedValueIsZero call as long as
+  // possible.
+  if (Opcode == ISD::AND) {
+    // Find the smallest zext this could possibly be.
+    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
+    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
+
+    // Figure out which bits need to be zero to achieve that mask.
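+    // Worked example (annotation, not in the patch): for Cst = 0x0F00,
+    // getActiveBits() is 12, so ZExtWidth rounds up to 16 and NeededMask
+    // becomes 0xFFFF & ~0x0F00 = 0xF0FF; if those bits are already known
+    // zero, the AND can stay as-is and become a movzx, so we bail out.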
+ APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), + ZExtWidth); + NeededMask &= ~Cst->getAPIntValue(); + + if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask)) + return false; + } + + SDValue X = Shift.getOperand(0); + if (FoundAnyExtend) { + SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X); + insertDAGNode(*CurDAG, SDValue(N, 0), NewX); + X = NewX; + } + + SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT); + insertDAGNode(*CurDAG, SDValue(N, 0), NewCst); + SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst); + insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp); + SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, + Shift.getOperand(1)); + ReplaceNode(N, NewSHL.getNode()); + SelectCode(NewSHL.getNode()); + return true; +} + /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large @@ -3333,6 +3805,347 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { return true; } +static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, + bool FoldedBCast, bool Masked) { + if (Masked) { + if (FoldedLoad) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk; + } + } + + if (FoldedBCast) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk; + } + } + + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk; + case MVT::v16i16: + return IsTestN ? 
X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk; + } + } + + if (FoldedLoad) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm; + } + } + + if (FoldedBCast) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb; + } + } + + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr; + } +} + +// Try to create VPTESTM instruction. If InMask is not null, it will be used +// to form a masked operation. 
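+// (Annotation, not in the patch) Per lane, VPTESTM computes
+//   k[i] = ((Src0[i] & Src1[i]) != 0)
+// and VPTESTNM its negation, so an equality/inequality setcc of an AND
+// against zero can select to a single instruction.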
+bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
+                                 SDValue InMask) {
+  assert(Subtarget->hasAVX512() && "Expected AVX512!");
+  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+         "Unexpected VT!");
+
+  // Look for equal and not equal compares.
+  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
+  if (CC != ISD::SETEQ && CC != ISD::SETNE)
+    return false;
+
+  // See if we're comparing against zero. This should have been canonicalized
+  // to RHS during lowering.
+  if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
+    return false;
+
+  SDValue N0 = Setcc.getOperand(0);
+
+  MVT CmpVT = N0.getSimpleValueType();
+  MVT CmpSVT = CmpVT.getVectorElementType();
+
+  // Start with both operands the same. We'll try to refine this.
+  SDValue Src0 = N0;
+  SDValue Src1 = N0;
+
+  {
+    // Look through single use bitcasts.
+    SDValue N0Temp = N0;
+    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
+      N0Temp = N0.getOperand(0);
+
+    // Look for single use AND.
+    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
+      Src0 = N0Temp.getOperand(0);
+      Src1 = N0Temp.getOperand(1);
+    }
+  }
+
+  // Without VLX we need to widen the load.
+  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
+
+  // We can only fold loads if the sources are unique.
+  bool CanFoldLoads = Src0 != Src1;
+
+  // Try to fold loads unless we need to widen.
+  bool FoldedLoad = false;
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
+  if (!Widen && CanFoldLoads) {
+    Load = Src1;
+    FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
+                             Tmp4);
+    if (!FoldedLoad) {
+      // And is commutative.
+      Load = Src0;
+      FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
+                               Tmp3, Tmp4);
+      if (FoldedLoad)
+        std::swap(Src0, Src1);
+    }
+  }
+
+  auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
+    // Look through single use bitcasts.
+    if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
+      Src = Src.getOperand(0);
+
+    if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
+      Parent = Src.getNode();
+      Src = Src.getOperand(0);
+      if (Src.getSimpleValueType() == CmpSVT)
+        return Src;
+    }
+
+    return SDValue();
+  };
+
+  // If we didn't fold a load, try to match broadcast. No widening limitation
+  // for this. But only 32 and 64 bit types are supported.
+  bool FoldedBCast = false;
+  if (!FoldedLoad && CanFoldLoads &&
+      (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
+    SDNode *ParentNode = nullptr;
+    if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
+      FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+                                Tmp1, Tmp2, Tmp3, Tmp4);
+    }
+
+    // Try the other operand.
+    if (!FoldedBCast) {
+      if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
+        FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+                                  Tmp1, Tmp2, Tmp3, Tmp4);
+        if (FoldedBCast)
+          std::swap(Src0, Src1);
+      }
+    }
+  }
+
+  auto getMaskRC = [](MVT MaskVT) {
+    switch (MaskVT.SimpleTy) {
+    default: llvm_unreachable("Unexpected VT!");
+    case MVT::v2i1:  return X86::VK2RegClassID;
+    case MVT::v4i1:  return X86::VK4RegClassID;
+    case MVT::v8i1:  return X86::VK8RegClassID;
+    case MVT::v16i1: return X86::VK16RegClassID;
+    case MVT::v32i1: return X86::VK32RegClassID;
+    case MVT::v64i1: return X86::VK64RegClassID;
+    }
+  };
+
+  bool IsMasked = InMask.getNode() != nullptr;
+
+  SDLoc dl(Root);
+
+  MVT ResVT = Setcc.getSimpleValueType();
+  MVT MaskVT = ResVT;
+  if (Widen) {
+    // Widen the inputs using insert_subreg or copy_to_regclass.
+    unsigned Scale = CmpVT.is128BitVector() ?
4 : 2; + unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; + unsigned NumElts = CmpVT.getVectorNumElements() * Scale; + CmpVT = MVT::getVectorVT(CmpSVT, NumElts); + MaskVT = MVT::getVectorVT(MVT::i1, NumElts); + SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl, + CmpVT), 0); + Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); + + assert(!FoldedLoad && "Shouldn't have folded the load"); + if (!FoldedBCast) + Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); + + if (IsMasked) { + // Widen the mask. + unsigned RegClass = getMaskRC(MaskVT); + SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); + InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, MaskVT, InMask, RC), 0); + } + } + + bool IsTestN = CC == ISD::SETEQ; + unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast, + IsMasked); + + MachineSDNode *CNode; + if (FoldedLoad || FoldedBCast) { + SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); + + if (IsMasked) { + SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, + Load.getOperand(0) }; + CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + } else { + SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, + Load.getOperand(0) }; + CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + } + + // Update the chain. + ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); + // Record the mem-refs + CurDAG->setNodeMemRefs(CNode, {cast(Load)->getMemOperand()}); + } else { + if (IsMasked) + CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); + else + CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1); + } + + // If we widened, we need to shrink the mask VT. + if (Widen) { + unsigned RegClass = getMaskRC(ResVT); + SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); + CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, ResVT, SDValue(CNode, 0), RC); + } + + ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0)); + CurDAG->RemoveDeadNode(Root); + return true; +} + void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); @@ -3346,6 +4159,61 @@ void X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case ISD::INTRINSIC_VOID: { + unsigned IntNo = Node->getConstantOperandVal(1); + switch (IntNo) { + default: break; + case Intrinsic::x86_sse3_monitor: + case Intrinsic::x86_monitorx: + case Intrinsic::x86_clzero: { + bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64; + + unsigned Opc = 0; + switch (IntNo) { + case Intrinsic::x86_sse3_monitor: + if (!Subtarget->hasSSE3()) + break; + Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; + break; + case Intrinsic::x86_monitorx: + if (!Subtarget->hasMWAITX()) + break; + Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; + break; + case Intrinsic::x86_clzero: + if (!Subtarget->hasCLZERO()) + break; + Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; + break; + } + + if (Opc) { + unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; + SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg, + Node->getOperand(2), SDValue()); + SDValue InFlag = Chain.getValue(1); + + if (IntNo == Intrinsic::x86_sse3_monitor || + IntNo == Intrinsic::x86_monitorx) { + // Copy the other two operands to ECX and EDX. 
+          Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
+                                       InFlag);
+          InFlag = Chain.getValue(1);
+          Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
+                                       InFlag);
+          InFlag = Chain.getValue(1);
+        }
+
+        MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+                                                      { Chain, InFlag});
+        ReplaceNode(Node, CNode);
+        return;
+      }
+    }
+
+    break;
+  }
   case ISD::BRIND: {
     if (Subtarget->isTargetNaCl())
       // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
@@ -3381,13 +4249,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     }
     break;

-  case X86ISD::BLENDV: {
-    // BLENDV selects like a regular VSELECT.
-    SDValue VSelect = CurDAG->getNode(
-        ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+  case ISD::VSELECT: {
+    // Replace VSELECT with non-mask conditions with BLENDV.
+    if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+      break;
+
+    assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+    SDValue Blendv = CurDAG->getNode(
+        X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
         Node->getOperand(1), Node->getOperand(2));
-    ReplaceNode(Node, VSelect.getNode());
-    SelectCode(VSelect.getNode());
+    ReplaceNode(Node, Blendv.getNode());
+    SelectCode(Blendv.getNode());
     // We already called ReplaceUses.
     return;
   }
@@ -3403,6 +4275,18 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     break;

   case ISD::AND:
+    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
+      // Try to form a masked VPTESTM. Operands can be in either order.
+      SDValue N0 = Node->getOperand(0);
+      SDValue N1 = Node->getOperand(1);
+      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
+          tryVPTESTM(Node, N0, N1))
+        return;
+      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
+          tryVPTESTM(Node, N1, N0))
+        return;
+    }
+
     if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
       ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
       CurDAG->RemoveDeadNode(Node);
@@ -3415,89 +4299,113 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     LLVM_FALLTHROUGH;

   case ISD::OR:
-  case ISD::XOR: {
-
-    // For operations of the form (x << C1) op C2, check if we can use a smaller
-    // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
-    SDValue N0 = Node->getOperand(0);
-    SDValue N1 = Node->getOperand(1);
+  case ISD::XOR:
+    if (tryShrinkShlLogicImm(Node))
+      return;

-    if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+    LLVM_FALLTHROUGH;
+  case ISD::ADD:
+  case ISD::SUB: {
+    // Try to avoid folding immediates with multiple uses for optsize.
+    // This code tries to select to register form directly to avoid going
+    // through the isel table which might fold the immediate. We can't change
+    // the patterns on the add/sub/and/or/xor with immediate patterns in the
+    // tablegen files to check immediate use count without making the patterns
+    // unavailable to the fast-isel table.
+    if (!OptForSize)
       break;

-    // i8 is unshrinkable, i16 should be promoted to i32.
-    if (NVT != MVT::i32 && NVT != MVT::i64)
+    // Only handle i8/i16/i32/i64.
+    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
       break;

+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
     ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
-    ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
-    if (!Cst || !ShlCst)
+    if (!Cst)
       break;

     int64_t Val = Cst->getSExtValue();
-    uint64_t ShlVal = ShlCst->getZExtValue();

-    // Make sure that we don't change the operation by removing bits.
-    // This only matters for OR and XOR, AND is unaffected.
- uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1; - if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) + // Make sure its an immediate that is considered foldable. + // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. + if (!isInt<8>(Val) && !isInt<32>(Val)) break; - unsigned ShlOp, AddOp, Op; - MVT CstVT = NVT; - - // Check the minimum bitwidth for the new constant. - // TODO: AND32ri is the same as AND64ri32 with zext imm. - // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr - // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. - if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) - CstVT = MVT::i8; - else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) - CstVT = MVT::i32; - - // Bail if there is no smaller encoding. - if (NVT == CstVT) + // Check if we should avoid folding this immediate. + if (!shouldAvoidImmediateInstFormsForSize(N1.getNode())) break; + // We should not fold the immediate. So we need a register form instead. + unsigned ROpc, MOpc; switch (NVT.SimpleTy) { - default: llvm_unreachable("Unsupported VT!"); + default: llvm_unreachable("Unexpected VT!"); + case MVT::i8: + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break; + case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break; + case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break; + case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break; + case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break; + } + break; + case MVT::i16: + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break; + case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break; + case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break; + case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break; + case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break; + } + break; case MVT::i32: - assert(CstVT == MVT::i8); - ShlOp = X86::SHL32ri; - AddOp = X86::ADD32rr; - switch (Opcode) { - default: llvm_unreachable("Impossible opcode"); - case ISD::AND: Op = X86::AND32ri8; break; - case ISD::OR: Op = X86::OR32ri8; break; - case ISD::XOR: Op = X86::XOR32ri8; break; + default: llvm_unreachable("Unexpected opcode!"); + case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break; + case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break; + case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break; + case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break; + case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break; } break; case MVT::i64: - assert(CstVT == MVT::i8 || CstVT == MVT::i32); - ShlOp = X86::SHL64ri; - AddOp = X86::ADD64rr; - switch (Opcode) { - default: llvm_unreachable("Impossible opcode"); - case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; - case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; - case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + default: llvm_unreachable("Unexpected opcode!"); + case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break; + case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break; + case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break; + case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break; + case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break; } break; } - // Emit the smaller op and the shift. 
-    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
-    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
-    if (ShlVal == 1)
-      CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
-                           SDValue(New, 0));
-    else
-      CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
-                           getI8Imm(ShlVal, dl));
+    // OK, this is an AND/OR/XOR/ADD/SUB with constant.
+
+    // If this is not a subtract, we can still try to fold a load.
+    if (Opcode != ISD::SUB) {
+      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
+        MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        // Update the chain.
+        ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
+        // Record the mem-refs
+        CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
+        ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+        CurDAG->RemoveDeadNode(Node);
+        return;
+      }
+    }
+
+    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
     return;
   }
+
   case X86ISD::SMUL:
     // i16/i32/i64 are handled with isel patterns.
     if (NVT != MVT::i8)
@@ -3895,7 +4803,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         unsigned TrailingZeros = countTrailingZeros(Mask);
         SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
         SDValue Shift =
-          SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64,
+          SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
                                          N0.getOperand(0), Imm), 0);
         MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                      MVT::i32, Shift, Shift);
@@ -3906,7 +4814,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         unsigned LeadingZeros = countLeadingZeros(Mask);
         SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
         SDValue Shift =
-          SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64,
+          SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
                                          N0.getOperand(0), Imm), 0);
         MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                      MVT::i32, Shift, Shift);
@@ -3964,8 +4872,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         break;
       }

-      // FIXME: We should be able to fold loads here.
-
       SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
       SDValue Reg = N0.getOperand(0);
@@ -4058,10 +4964,46 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     return;
   }

+  case ISD::SETCC: {
+    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
+      return;
+
+    break;
+  }
+
   case ISD::STORE:
     if (foldLoadStoreIntoMemOperand(Node))
       return;
     break;
+  case ISD::FCEIL:
+  case ISD::FFLOOR:
+  case ISD::FTRUNC:
+  case ISD::FNEARBYINT:
+  case ISD::FRINT: {
+    // Replace fp rounding ops with their X86-specific equivalents so we don't
+    // need 2 sets of patterns.
+    // FIXME: This can only happen when the nodes started as STRICT_* and have
+    // been mutated into their non-STRICT equivalents. Eventually this
+    // mutation will be removed and we should switch the STRICT_ nodes to a
+    // strict version of RNDSCALE in PreProcessISelDAG.
+    unsigned Imm;
+    switch (Node->getOpcode()) {
+    default: llvm_unreachable("Unexpected opcode!");
+    case ISD::FCEIL:      Imm = 0xA; break;
+    case ISD::FFLOOR:     Imm = 0x9; break;
+    case ISD::FTRUNC:     Imm = 0xB; break;
+    case ISD::FNEARBYINT: Imm = 0xC; break;
+    case ISD::FRINT:      Imm = 0x4; break;
+    }
+    SDLoc dl(Node);
+    SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
+                                  Node->getValueType(0),
+                                  Node->getOperand(0),
+                                  CurDAG->getConstant(Imm, dl, MVT::i8));
+    ReplaceNode(Node, Res.getNode());
+    SelectCode(Res.getNode());
+    return;
+  }
   }

   SelectCode(Node);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b6a692ee187d..0b4bf687e6cf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,9 +1,8 @@
 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -131,7 +130,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       addBypassSlowDiv(64, 32);
   }

-  if (Subtarget.isTargetKnownWindowsMSVC() ||
+  if (Subtarget.isTargetWindowsMSVC() ||
       Subtarget.isTargetWindowsItanium()) {
     // Setup Windows compiler runtime calls.
     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
@@ -159,6 +158,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setUseUnderscoreLongJmp(true);
   }

+  // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
+  // to 32 bits so AtomicExpandPass will expand it and we don't need cmpxchg8b.
+  // FIXME: Should we be limiting the atomic size on other configs? Default is
+  // 1024.
+  if (!Subtarget.hasCmpxchg8b())
+    setMaxAtomicSizeInBitsSupported(32);
+
   // Set up the register classes.
   addRegisterClass(MVT::i8, &X86::GR8RegClass);
   addRegisterClass(MVT::i16, &X86::GR16RegClass);
@@ -190,10 +196,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // Integer absolute.
   if (Subtarget.hasCMov()) {
     setOperationAction(ISD::ABS            , MVT::i16  , Custom);
-    setOperationAction(ISD::ABS            , MVT::i32  , Custom);
-    if (Subtarget.is64Bit())
-      setOperationAction(ISD::ABS          , MVT::i64  , Custom);
+    setOperationAction(ISD::ABS            , MVT::i32  , Custom);
   }
+  setOperationAction(ISD::ABS              , MVT::i64  , Custom);

   // Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { @@ -258,14 +263,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); - if (X86ScalarSSEf32) { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); - } else { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); - } + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); @@ -415,6 +414,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); + else + setOperationAction(ISD::CTPOP , MVT::i64 , Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); @@ -486,6 +487,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } + if (!Subtarget.is64Bit()) + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); + if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } @@ -530,6 +534,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); + // Disable f32->f64 extload as we can only generate this in one instruction + // under optsize. So its easier to pattern match (fpext (load)) for that + // case instead of needing to emit 2 instructions for extload in the + // non-optsize case. + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); @@ -668,6 +678,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); + setOperationAction(ISD::LROUND, MVT::f80, Expand); + setOperationAction(ISD::LLROUND, MVT::f80, Expand); + setOperationAction(ISD::LRINT, MVT::f80, Expand); + setOperationAction(ISD::LLRINT, MVT::f80, Expand); } // Always use a library call for pow. 
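The rounding immediates matched earlier for X86ISD::VRNDSCALE (0xA for ceil, 0x9 for floor, 0xB for trunc, 0xC for nearbyint, 0x4 for rint) follow the SSE4.1 ROUNDPS/RNDSCALE control encoding: imm[1:0] selects the rounding mode, imm[2] defers to MXCSR.RC instead of the immediate mode, and imm[3] suppresses precision (inexact) exceptions. A minimal standalone sketch of that decoding, outside of LLVM (the names here are illustrative, not LLVM's):

    #include <cassert>
    #include <cstdint>

    // Decode an SSE4.1 rounding-control immediate as used by ROUNDPS/RNDSCALE.
    // imm[1:0]: 00 nearest, 01 down, 10 up, 11 truncate.
    struct RoundImm {
      unsigned Mode;        // imm[1:0]
      bool UseMXCSR;        // imm[2]: use MXCSR.RC instead of imm[1:0]
      bool SuppressInexact; // imm[3]: suppress precision exceptions
    };

    RoundImm decodeRoundImm(uint8_t Imm) {
      return {Imm & 0x3u, (Imm & 0x4u) != 0, (Imm & 0x8u) != 0};
    }

    int main() {
      assert(decodeRoundImm(0x9).Mode == 1 && decodeRoundImm(0x9).SuppressInexact);  // floor
      assert(decodeRoundImm(0xA).Mode == 2 && decodeRoundImm(0xA).SuppressInexact);  // ceil
      assert(decodeRoundImm(0xB).Mode == 3 && decodeRoundImm(0xB).SuppressInexact);  // trunc
      assert(decodeRoundImm(0xC).UseMXCSR && decodeRoundImm(0xC).SuppressInexact);   // nearbyint
      assert(decodeRoundImm(0x4).UseMXCSR && !decodeRoundImm(0x4).SuppressInexact);  // rint
      return 0;
    }

This is why nearbyint and rint share imm[2] (both honor the current MXCSR rounding mode) and differ only in whether the inexact exception is suppressed.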
@@ -780,6 +794,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + + setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::STORE, MVT::v2f32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -841,6 +858,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); + setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); + setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); + setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); + setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); if (!ExperimentalVectorWideningLegalization) { // Use widening instead of promotion. @@ -950,17 +971,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); - for (MVT VT : MVT::fp_vector_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); - // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. - setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i16, Custom); setOperationAction(ISD::LOAD, MVT::v8i8, Custom); - setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); @@ -1128,14 +1144,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); - for (MVT VT : MVT::fp_vector_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); - // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { @@ -1144,13 +1156,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, VT, Custom); } - if (ExperimentalVectorWideningLegalization) { - // These types need custom splitting if their input is a 128-bit vector. - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - } + // These types need custom splitting if their input is a 128-bit vector. 
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i32, Custom); setOperationAction(ISD::ROTL, MVT::v16i16, Custom); @@ -1182,9 +1192,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); - // TODO - remove this once 256-bit X86ISD::ANDNP correctly split. - setOperationAction(ISD::CTTZ, VT, HasInt256 ? Expand : Custom); - // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); @@ -1260,7 +1267,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { - setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Legal); } @@ -1282,6 +1289,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); } if (HasInt256) @@ -1352,19 +1360,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); } - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } @@ -1378,9 +1381,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); - for (MVT VT : MVT::fp_vector_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); - for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); @@ -1413,10 +1413,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE + // to 512-bit rather than use the AVX2 instructions so that we can use + // k-masks. 
if (!Subtarget.hasVLX()) { - // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE - // to 512-bit rather than use the AVX2 instructions so that we can use - // k-masks. for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); @@ -1446,6 +1446,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); + + setOperationAction(ISD::SELECT, VT, Custom); } // Without BWI we need to use custom lowering to handle MVT::v64i8 input. @@ -1465,13 +1467,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v16i32, Custom); - setOperationAction(ISD::SELECT, MVT::v8f64, Custom); - setOperationAction(ISD::SELECT, MVT::v8i64, Custom); - setOperationAction(ISD::SELECT, MVT::v16i32, Custom); - setOperationAction(ISD::SELECT, MVT::v32i16, Custom); - setOperationAction(ISD::SELECT, MVT::v64i8, Custom); - setOperationAction(ISD::SELECT, MVT::v16f32, Custom); - for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); @@ -1485,6 +1480,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1705,6 +1701,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1788,7 +1785,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't @@ -1842,8 +1838,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. 
- if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() || - Subtarget.isTargetWindowsItanium())) + if (Subtarget.is32Bit() && + (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, ISD::FLOG10, ISD::FPOW, ISD::FSIN}) @@ -1854,6 +1850,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); @@ -1881,6 +1878,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); + setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG); + setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); + setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); @@ -2050,20 +2050,19 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. -EVT -X86TargetLowering::getOptimalMemOpType(uint64_t Size, - unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { - const Function &F = MF.getFunction(); - if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) { - if (Size >= 16 && - (!Subtarget.isUnalignedMem16Slow() || - ((DstAlign == 0 || DstAlign >= 16) && - (SrcAlign == 0 || SrcAlign >= 16)))) { +/// For vector ops we check that the overall size isn't larger than our +/// preferred vector width. +EVT X86TargetLowering::getOptimalMemOpType( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { + if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16)))) { // FIXME: Check if unaligned 32-byte accesses are slow. - if (Size >= 32 && Subtarget.hasAVX()) { + if (Size >= 32 && Subtarget.hasAVX() && + (Subtarget.getPreferVectorWidth() >= 256)) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we // choose an optimal type with a vector element larger than a byte, @@ -2071,11 +2070,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, // multiply) before we splat as a vector. return MVT::v32i8; } - if (Subtarget.hasSSE2()) + if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v16i8; // TODO: Can SSE1 handle a byte vector? // If we have SSE1 registers we should be able to use them. 
-      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
+      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+          (Subtarget.getPreferVectorWidth() >= 128))
         return MVT::v4f32;
     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
@@ -2104,11 +2104,9 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   return true;
 }

-bool
-X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
-                                                  unsigned,
-                                                  unsigned,
-                                                  bool *Fast) const {
+bool X86TargetLowering::allowsMisalignedMemoryAccesses(
+    EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
+    bool *Fast) const {
   if (Fast) {
     switch (VT.getSizeInBits()) {
     default:
@@ -2124,6 +2122,16 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
       // TODO: What about AVX-512 (512-bit) accesses?
     }
   }
+  // NonTemporal vector memory ops must be aligned.
+  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
+    // NT loads can only be vector aligned, so if it's less aligned than the
+    // minimum vector size (which we can split the vector down to), we might as
+    // well use a regular unaligned vector load.
+    // We don't have any NT loads pre-SSE41.
+    if (!!(Flags & MachineMemOperand::MOLoad))
+      return (Align < 16 || !Subtarget.hasSSE41());
+    return false;
+  }
   // Misaligned accesses of any size are always allowed.
   return true;
 }
@@ -2281,12 +2289,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
                        Type::getInt8PtrTy(M.getContext()));

     // MSVC CRT has a function to validate security cookie.
-    auto *SecurityCheckCookie = cast<Function>(
-        M.getOrInsertFunction("__security_check_cookie",
-                              Type::getVoidTy(M.getContext()),
-                              Type::getInt8PtrTy(M.getContext())));
-    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
-    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
+    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
+        "__security_check_cookie", Type::getVoidTy(M.getContext()),
+        Type::getInt8PtrTy(M.getContext()));
+    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
+      F->setCallingConv(CallingConv::X86_FastCall);
+      F->addAttribute(1, Attribute::AttrKind::InReg);
+    }
     return;
   }
   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
@@ -2304,7 +2313,7 @@ Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
   return TargetLowering::getSDagStackGuard(M);
 }

-Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
   // MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { @@ -2347,8 +2356,6 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// -#include "X86GenCallingConv.inc" - bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { @@ -2703,7 +2710,6 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, "The values should reside in two registers"); SDValue Lo, Hi; - unsigned Reg; SDValue ArgValueLo, ArgValueHi; MachineFunction &MF = DAG.getMachineFunction(); @@ -2713,7 +2719,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register. - Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); @@ -2934,6 +2940,8 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: + // Swift: + case CallingConv::Swift: return true; default: return canGuaranteeTCO(CC); @@ -2986,22 +2994,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, else ValVT = VA.getValVT(); - // Calculate SP offset of interrupt parameter, re-arrange the slot normally - // taken by a return address. - int Offset = 0; - if (CallConv == CallingConv::X86_INTR) { - // X86 interrupts may take one or two arguments. - // On the stack there will be no return address as in regular call. - // Offset of last argument need to be set to -4/-8 bytes. - // Where offset of the first argument out of two, should be set to 0 bytes. - Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); - if (Subtarget.is64Bit() && Ins.size() == 2) { - // The stack pointer needs to be realigned for 64 bit handlers with error - // code, so the argument offset changes by 8 bytes. - Offset += 8; - } - } - // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they @@ -3014,15 +3006,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, // can be improved with deeper analysis. int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable, /*isAliased=*/true); - // Adjust SP offset of interrupt parameter. - if (CallConv == CallingConv::X86_INTR) { - MFI.setObjectOffset(FI, Offset); - } return DAG.getFrameIndex(FI, PtrVT); } // This is an argument in memory. We might be able to perform copy elision. - if (Flags.isCopyElisionCandidate()) { + // If the argument is passed directly in memory without any extension, then we + // can perform copy elision. Large vector types, for example, may be passed + // indirectly by pointer. + if (Flags.isCopyElisionCandidate() && + VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) { EVT ArgVT = Ins[i].ArgVT; SDValue PartAddr; if (Ins[i].PartOffset == 0) { @@ -3031,7 +3023,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, // load from our portion of it. 
This assumes that if the first part of an // argument is in memory, the rest will also be in memory. int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), - /*Immutable=*/false); + /*IsImmutable=*/false); PartAddr = DAG.getFrameIndex(FI, PtrVT); return DAG.getLoad( ValVT, dl, Chain, PartAddr, @@ -3072,11 +3064,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, MFI.setObjectSExt(FI, true); } - // Adjust SP offset of interrupt parameter. - if (CallConv == CallingConv::X86_INTR) { - MFI.setObjectOffset(FI, Offset); - } - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, @@ -3166,14 +3153,6 @@ SDValue X86TargetLowering::LowerFormalArguments( !(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); - if (CallConv == CallingConv::X86_INTR) { - bool isLegal = Ins.size() == 1 || - (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || - (!Is64Bit && Ins[1].VT == MVT::i32))); - if (!isLegal) - report_fatal_error("X86 interrupts may take one or two arguments"); - } - // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); @@ -3454,11 +3433,11 @@ SDValue X86TargetLowering::LowerFormalArguments( } // Copy all forwards from physical to virtual registers. - for (ForwardedRegister &F : Forwards) { + for (ForwardedRegister &FR : Forwards) { // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); - Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); + FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); + Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); } } @@ -3610,6 +3589,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const Module *M = MF.getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); + MachineFunction::CallSiteInfo CSInfo; + if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); @@ -3805,6 +3786,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), I); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. @@ -3975,46 +3959,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. - } else if (Callee->getOpcode() == ISD::GlobalAddress) { - // If the callee is a GlobalAddress node (quite common, every direct call - // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack - // it. - GlobalAddressSDNode* G = cast(Callee); - - // We should use extra load for direct calls to dllimported functions in - // non-JIT mode. 
- const GlobalValue *GV = G->getGlobal(); - if (!GV->hasDLLImportStorageClass()) { - unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV); - - Callee = DAG.getTargetGlobalAddress( - GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); - - if (OpFlags == X86II::MO_GOTPCREL) { - // Add a wrapper. - Callee = DAG.getNode(X86ISD::WrapperRIP, dl, - getPointerTy(DAG.getDataLayout()), Callee); - // Add extra indirection - Callee = DAG.getLoad( - getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction())); - } - } - } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { - const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); - unsigned char OpFlags = - Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); - - Callee = DAG.getTargetExternalSymbol( - S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); - - if (OpFlags == X86II::MO_GOTPCREL) { - Callee = DAG.getNode(X86ISD::WrapperRIP, dl, - getPointerTy(DAG.getDataLayout()), Callee); - Callee = DAG.getLoad( - getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction())); - } + } else if (Callee->getOpcode() == ISD::GlobalAddress || + Callee->getOpcode() == ISD::ExternalSymbol) { + // Lower direct calls to global addresses and external symbols. Setting + // ForCall to true here has the effect of removing WrapperRIP when possible + // to allow direct calls to be selected without first materializing the + // address into a register. + Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true); } else if (Subtarget.isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI @@ -4105,7 +4056,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); + SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } if (HasNoCfCheck && IsCFProtectionSupported) { @@ -4114,6 +4067,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); } InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPop; @@ -4787,7 +4741,6 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (!IntrData) return false; - Info.opc = ISD::INTRINSIC_W_CHAIN; Info.flags = MachineMemOperand::MONone; Info.offset = 0; @@ -4795,6 +4748,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { + Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = I.getArgOperand(0); MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; @@ -4810,6 +4764,31 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; break; } + case GATHER: + case GATHER_AVX2: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = nullptr; + MVT DataVT = MVT::getVT(I.getType()); + MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); + unsigned NumElts = std::min(DataVT.getVectorNumElements(), + IndexVT.getVectorNumElements()); + Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); + Info.align = 1; + Info.flags |= MachineMemOperand::MOLoad; + break; + } + case SCATTER: { + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = nullptr; + MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType()); + MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); + unsigned NumElts = std::min(DataVT.getVectorNumElements(), + IndexVT.getVectorNumElements()); + Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); + Info.align = 1; + Info.flags |= MachineMemOperand::MOStore; + break; + } default: return false; } @@ -4820,7 +4799,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. -bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { +bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; @@ -4837,6 +4817,26 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; + + // If this is an (1) AVX vector load with (2) multiple uses and (3) all of + // those uses are extracted directly into a store, then the extract + store + // can be store-folded. Therefore, it's probably not worth splitting the load. + EVT VT = Load->getValueType(0); + if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) { + for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) { + // Skip uses of the chain value. Result 0 of the node is the load value. + if (UI.getUse().getResNo() != 0) + continue; + + // If this use is not an extract + store, it's probably worth splitting. + if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() || + UI->use_begin()->getOpcode() != ISD::STORE) + return true; + } + // All non-chain uses are extract + store. + return false; + } + return true; } @@ -4909,15 +4909,29 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, } bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { + unsigned Opc = VecOp.getOpcode(); + + // Assume target opcodes can't be scalarized. + // TODO - do we have any exceptions? 
+ if (Opc >= ISD::BUILTIN_OP_END) + return false; + // If the vector op is not supported, try to convert to scalar. EVT VecVT = VecOp.getValueType(); - if (!isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), VecVT)) + if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) return true; // If the vector op is supported, but the scalar op is not, the transform may // not be worthwhile. EVT ScalarVT = VecVT.getScalarType(); - return isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), ScalarVT); + return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); +} + +bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const { + // TODO: Allow vectors? + if (VT.isVector()) + return false; + return VT.isSimple() || !isOperationExpand(Opcode, VT); } bool X86TargetLowering::isCheapToSpeculateCttz() const { @@ -4930,8 +4944,9 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasLZCNT(); } -bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, - EVT BitcastVT) const { +bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, + const SelectionDAG &DAG, + const MachineMemOperand &MMO) const { if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() && BitcastVT.getVectorElementType() == MVT::i1) return false; @@ -4939,7 +4954,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8) return false; - return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT); + // If both types are legal vectors, it's always ok to convert them. + if (LoadVT.isVector() && BitcastVT.isVector() && + isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) + return true; + + return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, @@ -4953,6 +4973,10 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; return (MemVT.getSizeInBits() <= MaxIntSize); } + // Make sure we don't merge greater than our preferred vector + // width. + if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth()) + return false; return true; } @@ -4998,7 +5022,25 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const { return Subtarget.hasSSE2(); } -bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const { +bool X86TargetLowering::shouldFoldConstantShiftPairToMask( + const SDNode *N, CombineLevel Level) const { + assert(((N->getOpcode() == ISD::SHL && + N->getOperand(0).getOpcode() == ISD::SRL) || + (N->getOpcode() == ISD::SRL && + N->getOperand(0).getOpcode() == ISD::SHL)) && + "Expected shift-shift mask"); + EVT VT = N->getValueType(0); + if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || + (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { + // Only fold if the shift values are equal - so it folds to AND. + // TODO - we should fold if either is a non-uniform vector but we don't do + // the fold for non-splats yet. + return N->getOperand(1) == N->getOperand(0).getOperand(1); + } + return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level); +} + +bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { EVT VT = Y.getValueType(); // For vectors, we don't have a preference, but we probably want a mask. 
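shouldFoldMaskToVariableShiftPair and shouldFoldConstantShiftPairToMask above steer the same canonicalization from opposite ends; the constant case folds a shl/srl pair into a single AND only when the two shift amounts are equal, because only then is the pair exactly a mask. A standalone sketch of the identity the hook relies on (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // For equal shift amount C, srl+shl clears the low C bits and shl+srl
    // clears the high C bits -- each is a single AND with a constant mask.
    uint32_t clearLowViaShifts(uint32_t X, unsigned C) { return (X >> C) << C; }
    uint32_t clearLowViaMask(uint32_t X, unsigned C) { return X & ~((1u << C) - 1); }

    uint32_t clearHighViaShifts(uint32_t X, unsigned C) { return (X << C) >> C; }
    uint32_t clearHighViaMask(uint32_t X, unsigned C) { return X & (0xFFFFFFFFu >> C); }

    int main() {
      for (unsigned C = 0; C < 32; ++C) {
        assert(clearLowViaShifts(0xDEADBEEFu, C) == clearLowViaMask(0xDEADBEEFu, C));
        assert(clearHighViaShifts(0xDEADBEEFu, C) == clearHighViaMask(0xDEADBEEFu, C));
      }
      return 0;
    }

With unequal shift amounts the pair both masks and repositions bits, so no single AND is equivalent, which is why the hook bails to the base implementation in that case.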
@@ -5048,8 +5090,8 @@ static bool isUndefOrZero(int Val) { return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); } -/// Return true if every element in Mask, beginning -/// from position Pos and ending in Pos+Size is the undef sentinel value. +/// Return true if every element in Mask, beginning from position Pos and ending +/// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) if (Mask[i] != SM_SentinelUndef) @@ -5057,6 +5099,18 @@ static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { return true; } +/// Return true if the mask creates a vector whose lower half is undefined. +static bool isUndefLowerHalf(ArrayRef Mask) { + unsigned NumElts = Mask.size(); + return isUndefInRange(Mask, 0, NumElts / 2); +} + +/// Return true if the mask creates a vector whose upper half is undefined. +static bool isUndefUpperHalf(ArrayRef Mask) { + unsigned NumElts = Mask.size(); + return isUndefInRange(Mask, NumElts / 2, NumElts / 2); +} + /// Return true if Val falls within the specified range (L, H]. static bool isInRange(int Val, int Low, int Hi) { return (Val >= Low && Val < Hi); @@ -5409,6 +5463,53 @@ static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, DAG.getIntPtrConstant(0, dl)); } +/// Widen a vector to a larger size with the same scalar type, with the new +/// elements either zero or undef. +static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, + const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl, unsigned WideSizeInBits) { + assert(Vec.getValueSizeInBits() < WideSizeInBits && + (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && + "Unsupported vector widening type"); + unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits(); + MVT SVT = Vec.getSimpleValueType().getScalarType(); + MVT VT = MVT::getVectorVT(SVT, WideNumElts); + return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); +} + +// Helper function to collect subvector ops that are concated together, +// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. +// The subvectors in Ops are guaranteed to be the same type. +static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops) { + assert(Ops.empty() && "Expected an empty ops vector"); + + if (N->getOpcode() == ISD::CONCAT_VECTORS) { + Ops.append(N->op_begin(), N->op_end()); + return true; + } + + if (N->getOpcode() == ISD::INSERT_SUBVECTOR && + isa(N->getOperand(2))) { + SDValue Src = N->getOperand(0); + SDValue Sub = N->getOperand(1); + const APInt &Idx = N->getConstantOperandAPInt(2); + EVT VT = Src.getValueType(); + EVT SubVT = Sub.getValueType(); + + // TODO - Handle more general insert_subvector chains. + if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && + Idx == (VT.getVectorNumElements() / 2) && + Src.getOpcode() == ISD::INSERT_SUBVECTOR && + isNullConstant(Src.getOperand(2))) { + Ops.push_back(Src.getOperand(1)); + Ops.push_back(Sub); + return true; + } + } + + return false; +} + // Helper for splitting operands of an operation to legal target size and // apply a function on each part. // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in @@ -5457,19 +5558,6 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); } -// Return true if the instruction zeroes the unused upper part of the -// destination and accepts mask. 
-static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { - switch (Opcode) { - default: - return false; - case X86ISD::CMPM: - case X86ISD::CMPM_RND: - case ISD::SETCC: - return true; - } -} - /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -5626,10 +5714,29 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { return DAG.getBitcast(VT, Vec); } -static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In, - SelectionDAG &DAG) { +// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode. +static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) { + switch (Opcode) { + case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND_VECTOR_INREG: + return ISD::ANY_EXTEND_VECTOR_INREG; + case ISD::ZERO_EXTEND: + case ISD::ZERO_EXTEND_VECTOR_INREG: + return ISD::ZERO_EXTEND_VECTOR_INREG; + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: + return ISD::SIGN_EXTEND_VECTOR_INREG; + } + llvm_unreachable("Unknown opcode"); +} + +static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, + SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); + assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || + ISD::ZERO_EXTEND == Opcode) && + "Unknown extension opcode"); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. @@ -5642,13 +5749,10 @@ static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In, InVT = In.getValueType(); } - if (VT.getVectorNumElements() == InVT.getVectorNumElements()) - return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, - DL, VT, In); + if (VT.getVectorNumElements() != InVT.getVectorNumElements()) + Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); - return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG, - DL, VT, In); + return DAG.getNode(Opcode, DL, VT, In); } /// Returns a vector_shuffle node for an unpackl operation. @@ -5686,18 +5790,8 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } -// Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops. -static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) { - while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) - V = V.getOperand(0); - return V; -} - -static const Constant *getTargetConstantFromNode(SDValue Op) { - Op = peekThroughBitcasts(Op); - - auto *Load = dyn_cast(Op); - if (!Load) +static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { + if (!Load || !ISD::isNormalLoad(Load)) return nullptr; SDValue Ptr = Load->getBasePtr(); @@ -5712,6 +5806,17 @@ static const Constant *getTargetConstantFromNode(SDValue Op) { return CNode->getConstVal(); } +static const Constant *getTargetConstantFromNode(SDValue Op) { + Op = peekThroughBitcasts(Op); + return getTargetConstantFromNode(dyn_cast(Op)); +} + +const Constant * +X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { + assert(LD && "Unexpected null LoadSDNode"); + return getTargetConstantFromNode(LD); +} + // Extract raw constant bits from constant pools. 
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, @@ -5778,8 +5883,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) return false; - APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset); - EltBits[i] = Bits.getZExtValue(); + EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset); } return true; }; @@ -5899,6 +6003,19 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } } + // Extract constant bits from a subvector broadcast. + if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { + SmallVector SubEltBits; + if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, + UndefElts, SubEltBits, AllowWholeUndefs, + AllowPartialUndefs)) { + UndefElts = APInt::getSplat(NumElts, UndefElts); + while (EltBits.size() < NumElts) + EltBits.append(SubEltBits.begin(), SubEltBits.end()); + return true; + } + } + // Extract a rematerialized scalar constant insertion. if (Op.getOpcode() == X86ISD::VZEXT_MOVL && Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && @@ -5914,6 +6031,29 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return CastBitData(UndefSrcElts, SrcEltBits); } + // Insert constant bits from a base and sub vector sources. + if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && + isa(Op.getOperand(2))) { + // TODO - support insert_subvector through bitcasts. + if (EltSizeInBits != VT.getScalarSizeInBits()) + return false; + + APInt UndefSubElts; + SmallVector EltSubBits; + if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, + UndefSubElts, EltSubBits, + AllowWholeUndefs, AllowPartialUndefs) && + getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, + UndefElts, EltBits, AllowWholeUndefs, + AllowPartialUndefs)) { + unsigned BaseIdx = Op.getConstantOperandVal(2); + UndefElts.insertBits(UndefSubElts, BaseIdx); + for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) + EltBits[BaseIdx + i] = EltSubBits[i]; + return true; + } + } + // Extract constant bits from a subvector's source. if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && isa(Op.getOperand(1))) { @@ -6068,6 +6208,34 @@ static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, } } +// Split the demanded elts of a HADD/HSUB node between its operands. +static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, + APInt &DemandedLHS, APInt &DemandedRHS) { + int NumLanes = VT.getSizeInBits() / 128; + int NumElts = DemandedElts.getBitWidth(); + int NumEltsPerLane = NumElts / NumLanes; + int HalfEltsPerLane = NumEltsPerLane / 2; + + DemandedLHS = APInt::getNullValue(NumElts); + DemandedRHS = APInt::getNullValue(NumElts); + + // Map DemandedElts to the horizontal operands. + for (int Idx = 0; Idx != NumElts; ++Idx) { + if (!DemandedElts[Idx]) + continue; + int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; + int LocalIdx = Idx % NumEltsPerLane; + if (LocalIdx < HalfEltsPerLane) { + DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0); + DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1); + } else { + LocalIdx -= HalfEltsPerLane; + DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0); + DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1); + } + } +} + /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. 
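getHorizDemandedElts, added just above, splits the demanded elements of a horizontal add/sub between its operands: within each 128-bit lane, result element i of the low half consumes LHS elements 2i and 2i+1, and the high half consumes the matching RHS pairs. A standalone model, with std::vector<bool> standing in for the APInt demanded-element masks:

#include <cassert>
#include <vector>

static void getHorizDemandedElts(unsigned NumElts, unsigned NumLanes,
                                 const std::vector<bool> &Demanded,
                                 std::vector<bool> &LHS,
                                 std::vector<bool> &RHS) {
  unsigned PerLane = NumElts / NumLanes;
  unsigned Half = PerLane / 2;
  LHS.assign(NumElts, false);
  RHS.assign(NumElts, false);
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    if (!Demanded[Idx])
      continue;
    unsigned LaneIdx = (Idx / PerLane) * PerLane;
    unsigned Local = Idx % PerLane;
    // Result element i is produced from source elements 2i and 2i+1.
    if (Local < Half) {
      LHS[LaneIdx + 2 * Local] = LHS[LaneIdx + 2 * Local + 1] = true;
    } else {
      Local -= Half;
      RHS[LaneIdx + 2 * Local] = RHS[LaneIdx + 2 * Local + 1] = true;
    }
  }
}

int main() {
  // v4i32 HADD (one 128-bit lane): result[1] reads LHS[2] and LHS[3].
  std::vector<bool> L, R;
  getHorizDemandedElts(4, 1, {false, true, false, false}, L, R);
  assert(L[2] && L[3] && !R[0] && !R[1] && !R[2] && !R[3]);
}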
@@ -6468,14 +6636,15 @@ static bool setTargetShuffleZeroElements(SDValue N, static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, - const SelectionDAG &DAG); + SelectionDAG &DAG); // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the // destination value type. -static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, +static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, + SmallVectorImpl &Mask, SmallVectorImpl &Ops, - const SelectionDAG &DAG) { + SelectionDAG &DAG) { Mask.clear(); Ops.clear(); @@ -6483,8 +6652,9 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, unsigned NumElts = VT.getVectorNumElements(); unsigned NumSizeInBits = VT.getSizeInBits(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 && - "Expected byte aligned value types"); + if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0) + return false; + assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"); unsigned Opcode = N.getOpcode(); switch (Opcode) { @@ -6524,6 +6694,40 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, return true; } case ISD::OR: { + // Inspect each operand at the byte level. We can merge these into a + // blend shuffle mask if for each byte at least one is masked out (zero). + KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts); + KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts); + if (Known0.One.isNullValue() && Known1.One.isNullValue()) { + bool IsByteMask = true; + unsigned NumSizeInBytes = NumSizeInBits / 8; + unsigned NumBytesPerElt = NumBitsPerElt / 8; + APInt ZeroMask = APInt::getNullValue(NumBytesPerElt); + APInt SelectMask = APInt::getNullValue(NumBytesPerElt); + for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) { + unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue(); + unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue(); + if (LHS == 255 && RHS == 0) + SelectMask.setBit(i); + else if (LHS == 255 && RHS == 255) + ZeroMask.setBit(i); + else if (!(LHS == 0 && RHS == 255)) + IsByteMask = false; + } + if (IsByteMask) { + for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) { + for (unsigned j = 0; j != NumBytesPerElt; ++j) { + unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0); + int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs)); + Mask.push_back(Idx); + } + } + Ops.push_back(N.getOperand(0)); + Ops.push_back(N.getOperand(1)); + return true; + } + } + // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); @@ -6558,9 +6762,6 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, return true; } case ISD::INSERT_SUBVECTOR: { - // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) where - // SRC0/SRC1 are both of the same valuetype VT. - // TODO - add peekThroughOneUseBitcasts support. SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); EVT SubVT = Sub.getValueType(); @@ -6568,28 +6769,57 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, if (!isa(N.getOperand(2)) || !N->isOnlyUserOf(Sub.getNode())) return false; + uint64_t InsertIdx = N.getConstantOperandVal(2); + // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)). 
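The new ISD::OR case above treats OR(x, y) as a byte blend when every byte is wholly supplied by one operand, i.e. the other operand's byte is known zero. The real code additionally requires that neither operand has known-one bits; this sketch assumes that precondition and classifies bytes from per-byte known-zero flags:

#include <cassert>
#include <cstddef>
#include <optional>
#include <vector>

enum class BytePick { FromLHS, FromRHS, Zero };

// KnownZeroL/R hold, per byte, whether that byte of each OR operand is
// fully known to be zero. A byte can be blended only if at least one side
// is zero; otherwise the OR genuinely mixes bits and is not a shuffle.
static std::optional<std::vector<BytePick>>
classifyOrBytes(const std::vector<bool> &KnownZeroL,
                const std::vector<bool> &KnownZeroR) {
  std::vector<BytePick> Picks;
  for (std::size_t i = 0; i != KnownZeroL.size(); ++i) {
    if (KnownZeroL[i] && KnownZeroR[i])
      Picks.push_back(BytePick::Zero);    // 0 | 0 == 0
    else if (KnownZeroL[i])
      Picks.push_back(BytePick::FromRHS); // 0 | y == y
    else if (KnownZeroR[i])
      Picks.push_back(BytePick::FromLHS); // x | 0 == x
    else
      return std::nullopt;                // both sides contribute bits
  }
  return Picks;
}

int main() {
  auto P = classifyOrBytes({true, false}, {false, true});
  assert(P && (*P)[0] == BytePick::FromRHS && (*P)[1] == BytePick::FromLHS);
  assert(!classifyOrBytes({false}, {false}));
}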
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Sub.getOperand(0).getValueType() == VT && + isa(Sub.getOperand(1))) { + uint64_t ExtractIdx = Sub.getConstantOperandVal(1); + for (int i = 0; i != (int)NumElts; ++i) + Mask.push_back(i); + for (int i = 0; i != (int)NumSubElts; ++i) + Mask[InsertIdx + i] = NumElts + ExtractIdx + i; + Ops.push_back(Src); + Ops.push_back(Sub.getOperand(0)); + return true; + } + // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). SmallVector SubMask; SmallVector SubInputs; - if (!resolveTargetShuffleInputs(Sub, SubInputs, SubMask, DAG) || - SubMask.size() != NumSubElts) + if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, + SubMask, DAG)) return false; + if (SubMask.size() != NumSubElts) { + assert(((SubMask.size() % NumSubElts) == 0 || + (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale"); + if ((NumSubElts % SubMask.size()) == 0) { + int Scale = NumSubElts / SubMask.size(); + SmallVector ScaledSubMask; + scaleShuffleMask(Scale, SubMask, ScaledSubMask); + SubMask = ScaledSubMask; + } else { + int Scale = SubMask.size() / NumSubElts; + NumSubElts = SubMask.size(); + NumElts *= Scale; + InsertIdx *= Scale; + } + } Ops.push_back(Src); for (SDValue &SubInput : SubInputs) { - if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR || - SubInput.getOperand(0).getValueType() != VT || - !isa(SubInput.getOperand(1))) - return false; - Ops.push_back(SubInput.getOperand(0)); + EVT SubSVT = SubInput.getValueType().getScalarType(); + EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT, + NumSizeInBits / SubSVT.getSizeInBits()); + Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT, + DAG.getUNDEF(AltVT), SubInput, + DAG.getIntPtrConstant(0, SDLoc(N)))); } - int InsertIdx = N.getConstantOperandVal(2); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { int M = SubMask[i]; if (0 <= M) { int InputIdx = M / NumSubElts; - int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1); - M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts); + M = (NumElts * (1 + InputIdx)) + (M % NumSubElts); } Mask[i + InsertIdx] = M; } @@ -6674,16 +6904,21 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, N1.getValueType().getVectorNumElements() == (NumElts / 2) && "Unexpected input value type"); + APInt EltsLHS, EltsRHS; + getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS); + // If we know input saturation won't happen we can treat this // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { - if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) || - (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt)) + if ((!N0.isUndef() && + DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) || + (!N1.isUndef() && + DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); - if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) || - (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask))) + if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) || + (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS))) return false; } @@ -6728,15 +6963,54 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, } return true; } - case ISD::ZERO_EXTEND_VECTOR_INREG: - case ISD::ZERO_EXTEND: { - // TODO - add support for VPMOVZX with smaller input vector types. 
+ case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); - if (NumSizeInBits != SrcVT.getSizeInBits()) - break; - DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts, + if (!SrcVT.isVector()) + return false; + + if (NumSizeInBits != SrcVT.getSizeInBits()) { + assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && + "Illegal broadcast type"); + SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), + NumSizeInBits / SrcVT.getScalarSizeInBits()); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, + DAG.getUNDEF(SrcVT), Src, + DAG.getIntPtrConstant(0, SDLoc(N))); + } + + Ops.push_back(Src); + Mask.append(NumElts, 0); + return true; + } + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND_VECTOR_INREG: + case ISD::ANY_EXTEND_VECTOR_INREG: { + SDValue Src = N.getOperand(0); + EVT SrcVT = Src.getValueType(); + + // Extended source must be a simple vector. + if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || + (SrcVT.getScalarSizeInBits() % 8) != 0) + return false; + + unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits(); + bool IsAnyExtend = + (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode); + DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend, Mask); + + if (NumSizeInBits != SrcVT.getSizeInBits()) { + assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && + "Illegal zero-extension type"); + SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(), + NumSizeInBits / NumSrcBitsPerElt); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, + DAG.getUNDEF(SrcVT), Src, + DAG.getIntPtrConstant(0, SDLoc(N))); + } + Ops.push_back(Src); return true; } @@ -6745,7 +7019,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, return false; } -/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly. +/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask. static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, SmallVectorImpl &Mask) { int MaskWidth = Mask.size(); @@ -6761,13 +7035,28 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, M = SM_SentinelUndef; // Check for unused inputs. - if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { - UsedInputs.push_back(Inputs[i]); + if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { + for (int &M : Mask) + if (lo <= M) + M -= MaskWidth; continue; } - for (int &M : Mask) - if (lo <= M) - M -= MaskWidth; + + // Check for repeated inputs. + bool IsRepeat = false; + for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) { + if (UsedInputs[j] != Inputs[i]) + continue; + for (int &M : Mask) + if (lo <= M) + M = (M < hi) ? 
((M - lo) + (j * MaskWidth)) : (M - MaskWidth); + IsRepeat = true; + break; + } + if (IsRepeat) + continue; + + UsedInputs.push_back(Inputs[i]); } Inputs = UsedInputs; } @@ -6780,9 +7069,11 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, - const SelectionDAG &DAG) { + SelectionDAG &DAG) { + unsigned NumElts = Op.getValueType().getVectorNumElements(); + APInt DemandedElts = APInt::getAllOnesValue(NumElts); if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) - if (!getFauxShuffleMask(Op, Mask, Inputs, DAG)) + if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG)) return false; resolveTargetShuffleInputsAndMask(Inputs, Mask); @@ -6838,6 +7129,28 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, Depth+1); } + // Recurse into insert_subvector base/sub vector to find scalars. + if (Opcode == ISD::INSERT_SUBVECTOR && + isa(N->getOperand(2))) { + SDValue Vec = N->getOperand(0); + SDValue Sub = N->getOperand(1); + EVT SubVT = Sub.getValueType(); + unsigned NumSubElts = SubVT.getVectorNumElements(); + uint64_t SubIdx = N->getConstantOperandVal(2); + + if (SubIdx <= Index && Index < (SubIdx + NumSubElts)) + return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1); + return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1); + } + + // Recurse into extract_subvector src vector to find scalars. + if (Opcode == ISD::EXTRACT_SUBVECTOR && + isa(N->getOperand(1))) { + SDValue Src = N->getOperand(0); + uint64_t SrcIdx = N->getConstantOperandVal(1); + return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1); + } + // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); @@ -6880,7 +7193,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, // If the build vector contains zeros or our first insertion is not the // first index then insert into zero vector to break any register - // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. + // dependency else use SCALAR_TO_VECTOR. if (First) { First = false; if (NumZero || 0 != i) @@ -6889,7 +7202,6 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, assert(0 == i && "Expected insertion into zero-index"); V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); - V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); V = DAG.getBitcast(VT, V); continue; } @@ -6916,50 +7228,51 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, SDLoc dl(Op); SDValue V; - bool First = true; // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. - for (unsigned i = 0; i < 16; ++i) { + for (unsigned i = 0; i < 16; i += 2) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; - if (ThisIsNonZero && First) { - if (NumZero) - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0; + if (!ThisIsNonZero && !NextIsNonZero) + continue; + + // FIXME: Investigate combining the first 4 bytes as a i32 instead. + SDValue Elt; + if (ThisIsNonZero) { + if (NumZero || NextIsNonZero) + Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32); else - V = DAG.getUNDEF(MVT::v8i16); - First = false; + Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); } - if ((i & 1) != 0) { - // FIXME: Investigate extending to i32 instead of just i16. 
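The rewritten pre-SSE4.1 v16i8 path, whose new loop begins above, merges bytes 2i and 2i+1 into one 16-bit lane value (Elt | (NextElt << 8)) so a single PINSRW covers two build-vector operands, skipping pairs that are entirely zero. A standalone model of the pair merge (hypothetical helper, not the DAG lowering itself):

#include <array>
#include <cassert>
#include <cstdint>

// Merge 16 bytes pairwise into 8 x i16 lanes; NonZeros has one bit per
// byte. Pairs whose two bytes are both zero are skipped and stay zero.
static std::array<uint16_t, 8> mergeBytePairs(const std::array<uint8_t, 16> &B,
                                              uint16_t NonZeros) {
  std::array<uint16_t, 8> Lanes{};
  for (unsigned i = 0; i < 16; i += 2) {
    bool ThisNZ = (NonZeros >> i) & 1, NextNZ = (NonZeros >> (i + 1)) & 1;
    if (!ThisNZ && !NextNZ)
      continue; // nothing to insert for this PINSRW lane
    // Little-endian: byte 2i is the low half, byte 2i+1 the high half.
    Lanes[i / 2] = static_cast<uint16_t>(B[i] | (B[i + 1] << 8));
  }
  return Lanes;
}

int main() {
  std::array<uint8_t, 16> B{};
  B[2] = 0x34; B[3] = 0x12; // only pair 1 is nonzero
  assert(mergeBytePairs(B, 0b1100)[1] == 0x1234);
}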
- // FIXME: Investigate combining the first 4 bytes as a i32 instead. - SDValue ThisElt, LastElt; - bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0; - if (LastIsNonZero) { - LastElt = - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1)); - } - if (ThisIsNonZero) { - ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); - ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, - DAG.getConstant(8, dl, MVT::i8)); - if (LastIsNonZero) - ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); - } else - ThisElt = LastElt; - - if (ThisElt) { - if (1 == i) { - V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) - : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); - V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); - V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); - V = DAG.getBitcast(MVT::v8i16, V); - } else { - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, - DAG.getIntPtrConstant(i / 2, dl)); - } + if (NextIsNonZero) { + SDValue NextElt = Op.getOperand(i + 1); + if (i == 0 && NumZero) + NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32); + else + NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32); + NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt, + DAG.getConstant(8, dl, MVT::i8)); + if (ThisIsNonZero) + Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt); + else + Elt = NextElt; + } + + // If our first insertion is not the first index then insert into zero + // vector to break any register dependency else use SCALAR_TO_VECTOR. + if (!V) { + if (i != 0) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else { + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt); + V = DAG.getBitcast(MVT::v8i16, V); + continue; } } + Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt); + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt, + DAG.getIntPtrConstant(i / 2, dl)); } return DAG.getBitcast(MVT::v16i8, V); @@ -7002,9 +7315,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, } // Find all zeroable elements. - std::bitset<4> Zeroable; - for (int i=0; i < 4; ++i) { - SDValue Elt = Op->getOperand(i); + std::bitset<4> Zeroable, Undefs; + for (int i = 0; i < 4; ++i) { + SDValue Elt = Op.getOperand(i); + Undefs[i] = Elt.isUndef(); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && @@ -7014,10 +7328,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; - for (unsigned i=0; i < 4; ++i) { + for (unsigned i = 0; i < 4; ++i) { if (Zeroable[i]) continue; - SDValue Elt = Op->getOperand(i); + SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Elt.getOperand(1))) return SDValue(); @@ -7056,10 +7370,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. - SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); + SDValue VZeroOrUndef = (Zeroable == Undefs) + ? DAG.getUNDEF(VT) + : getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); - return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask); } // See if we can lower this build_vector to a INSERTPS. 
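The LowerBuildVectorv4x32 change above tracks undef lanes separately from zeroable ones, so the second shuffle input is a real zero vector only when some lane holds a literal zero; when Zeroable == Undefs, an undef operand suffices and no zeroing instruction is emitted. The decision predicate in isolation:

#include <bitset>
#include <cassert>

// Zeroable marks lanes that are undef *or* literal zero; Undefs marks lanes
// that are undef. A zero vector must be materialized only when they differ,
// i.e. some zeroable lane is a constant zero the result must preserve.
static bool needsZeroVector(std::bitset<4> Zeroable, std::bitset<4> Undefs) {
  return Zeroable != Undefs;
}

int main() {
  assert(!needsZeroVector(0b1100, 0b1100)); // both upper lanes undef
  assert(needsZeroVector(0b1100, 0b0100));  // lane 3 is a literal zero
}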
@@ -7079,7 +7395,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; - CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i); + CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i); } if (!CanFold) @@ -7200,9 +7516,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, unsigned NumElems = Elts.size(); int LastLoadedElt = -1; - SmallBitVector LoadMask(NumElems, false); - SmallBitVector ZeroMask(NumElems, false); - SmallBitVector UndefMask(NumElems, false); + APInt LoadMask = APInt::getNullValue(NumElems); + APInt ZeroMask = APInt::getNullValue(NumElems); + APInt UndefMask = APInt::getNullValue(NumElems); + + SmallVector Loads(NumElems, nullptr); // For each element in the initializer, see if we've found a load, zero or an // undef. @@ -7210,38 +7528,52 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, SDValue Elt = peekThroughBitcasts(Elts[i]); if (!Elt.getNode()) return SDValue(); + if (Elt.isUndef()) { + UndefMask.setBit(i); + continue; + } + if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) { + ZeroMask.setBit(i); + continue; + } - if (Elt.isUndef()) - UndefMask[i] = true; - else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) - ZeroMask[i] = true; - else if (ISD::isNON_EXTLoad(Elt.getNode())) { - LoadMask[i] = true; - LastLoadedElt = i; - // Each loaded element must be the correct fractional portion of the - // requested vector load. - if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) - return SDValue(); - } else + // Each loaded element must be the correct fractional portion of the + // requested vector load. + if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) return SDValue(); + + if (!ISD::isNON_EXTLoad(Elt.getNode())) + return SDValue(); + + Loads[i] = cast(Elt); + LoadMask.setBit(i); + LastLoadedElt = i; } - assert((ZeroMask | UndefMask | LoadMask).count() == NumElems && + assert((ZeroMask.countPopulation() + UndefMask.countPopulation() + + LoadMask.countPopulation()) == NumElems && "Incomplete element masks"); // Handle Special Cases - all undef or undef/zero. - if (UndefMask.count() == NumElems) + if (UndefMask.countPopulation() == NumElems) return DAG.getUNDEF(VT); // FIXME: Should we return this as a BUILD_VECTOR instead? - if ((ZeroMask | UndefMask).count() == NumElems) + if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems) return VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - int FirstLoadedElt = LoadMask.find_first(); + int FirstLoadedElt = LoadMask.countTrailingZeros(); SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); - LoadSDNode *LDBase = cast(EltBase); - EVT LDBaseVT = EltBase.getValueType(); + EVT EltBaseVT = EltBase.getValueType(); + assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && + "Register/Memory size mismatch"); + LoadSDNode *LDBase = Loads[FirstLoadedElt]; + assert(LDBase && "Did not find base load for merging consecutive loads"); + unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); + unsigned BaseSizeInBytes = BaseSizeInBits / 8; + int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits; + assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); // Consecutive loads can contain UNDEFS but not ZERO elements. 
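EltsFromConsecutiveLoads now classifies every operand up front into load/zero/undef masks (APInt in the patch) and records the first and last load, which bound the span a merged load would have to cover. A standalone model with uint64_t bitmasks standing in for APInt:

#include <cassert>
#include <cstdint>
#include <vector>

enum class EltKind { Undef, Zero, Load };

struct EltMasks {
  uint64_t Undef = 0, Zero = 0, Load = 0;
  int First = -1, Last = -1; // positions of the first/last loaded element
};

// Every element must land in exactly one mask, mirroring the
// "Incomplete element masks" assertion in the patch.
static EltMasks classifyElts(const std::vector<EltKind> &Elts) {
  EltMasks M;
  for (size_t i = 0; i != Elts.size(); ++i) {
    switch (Elts[i]) {
    case EltKind::Undef: M.Undef |= 1ull << i; break;
    case EltKind::Zero:  M.Zero  |= 1ull << i; break;
    case EltKind::Load:
      M.Load |= 1ull << i;
      if (M.First < 0) M.First = (int)i;
      M.Last = (int)i;
      break;
    }
  }
  return M;
}

int main() {
  auto M = classifyElts({EltKind::Zero, EltKind::Load, EltKind::Load,
                         EltKind::Undef});
  assert(M.First == 1 && M.Last == 2 && M.Zero == 0b0001 && M.Undef == 0b1000);
}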
// Consecutive loads with UNDEFs and ZEROs elements require a @@ -7250,11 +7582,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { - SDValue Elt = peekThroughBitcasts(Elts[i]); - LoadSDNode *LD = cast(Elt); - if (!DAG.areNonVolatileConsecutiveLoads( - LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8, - i - FirstLoadedElt)) { + if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes, + i - FirstLoadedElt)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; @@ -7264,11 +7593,6 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, } } - SmallVector Loads; - for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i) - if (LoadMask[i]) - Loads.push_back(cast(peekThroughBitcasts(Elts[i]))); - auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); assert(!(MMOFlags & MachineMemOperand::MOVolatile) && @@ -7277,23 +7601,23 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); for (auto *LD : Loads) - DAG.makeEquivalentMemoryOrdering(LD, NewLd); + if (LD) + DAG.makeEquivalentMemoryOrdering(LD, NewLd); return NewLd; }; - // LOAD - all consecutive load/undefs (must start/end with a load). - // If we have found an entire vector of loads and undefs, then return a large - // load of the entire vector width starting at the base pointer. - // If the vector contains zeros, then attempt to shuffle those elements. - if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) && + // Check if the base load is entirely dereferenceable. + bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable( + VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout()); + + // LOAD - all consecutive load/undefs (must start/end with a load or be + // entirely dereferenceable). If we have found an entire vector of loads and + // undefs, then return a large load of the entire vector width starting at the + // base pointer. If the vector contains zeros, then attempt to shuffle those + // elements. + if (FirstLoadedElt == 0 && + (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { - assert(LDBase && "Did not find base load for merging consecutive loads"); - EVT EltVT = LDBase->getValueType(0); - // Ensure that the input vector size for the merged loads matches the - // cumulative size of the input elements. - if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) - return SDValue(); - if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); @@ -7303,12 +7627,15 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); - if (IsConsecutiveLoad) + if (NumElems == 1) + return DAG.getBitcast(VT, Elts[FirstLoadedElt]); + + if (!ZeroMask) return CreateLoad(VT, LDBase); // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. 
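The consecutiveness test is offset arithmetic: loaded element i must lie exactly (i - First) * BaseSizeInBytes past the first load, while zero/undef slots may be skipped and blended away afterwards. A sketch using INT64_MIN to mark non-load slots (a convention of this model, not of areNonVolatileConsecutiveLoads):

#include <cassert>
#include <cstdint>
#include <vector>

// Offsets[i] is element i's byte offset from a common base pointer, or
// INT64_MIN when the slot is undef/zero rather than a load.
static bool loadsAreConsecutive(const std::vector<int64_t> &Offsets,
                                unsigned EltBytes, unsigned First,
                                unsigned Last) {
  for (unsigned i = First + 1; i <= Last; ++i) {
    if (Offsets[i] == INT64_MIN)
      continue; // gap handled by a later shuffle against zero/undef
    if (Offsets[i] - Offsets[First] != (int64_t)(i - First) * EltBytes)
      return false;
  }
  return true;
}

int main() {
  // 4 x i32: loads at +0, +4 and +12 with one gap are still consecutive.
  assert(loadsAreConsecutive({0, 4, INT64_MIN, 12}, 4, 0, 3));
  assert(!loadsAreConsecutive({0, 8}, 4, 0, 1));
}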
- if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) { + if (!isAfterLegalize && VT.isVector()) { SmallVector ClearMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { if (ZeroMask[i]) @@ -7323,16 +7650,28 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, } } - int LoadSize = - (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); + // If the upper half of a ymm/zmm load is undef then just load the lower half. + if (VT.is256BitVector() || VT.is512BitVector()) { + unsigned HalfNumElems = NumElems / 2; + if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) { + EVT HalfVT = + EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); + SDValue HalfLD = + EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, + DAG, Subtarget, isAfterLegalize); + if (HalfLD) + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), + HalfLD, DAG.getIntPtrConstant(0, DL)); + } + } // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. if (IsConsecutiveLoad && FirstLoadedElt == 0 && - (LoadSize == 32 || LoadSize == 64) && + (LoadSizeInBits == 32 || LoadSizeInBits == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { - MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize) - : MVT::getIntegerVT(LoadSize); - MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize); + MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) + : MVT::getIntegerVT(LoadSizeInBits); + MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; @@ -7342,14 +7681,85 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, LDBase->getAlignment(), MachineMemOperand::MOLoad); for (auto *LD : Loads) - DAG.makeEquivalentMemoryOrdering(LD, ResNode); + if (LD) + DAG.makeEquivalentMemoryOrdering(LD, ResNode); return DAG.getBitcast(VT, ResNode); } } + // BROADCAST - match the smallest possible repetition pattern, load that + // scalar/subvector element and then broadcast to the entire vector. + if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() && + (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) { + for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) { + unsigned RepeatSize = SubElems * BaseSizeInBits; + unsigned ScalarSize = std::min(RepeatSize, 64u); + if (!Subtarget.hasAVX2() && ScalarSize < 32) + continue; + + bool Match = true; + SmallVector RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT)); + for (unsigned i = 0; i != NumElems && Match; ++i) { + if (!LoadMask[i]) + continue; + SDValue Elt = peekThroughBitcasts(Elts[i]); + if (RepeatedLoads[i % SubElems].isUndef()) + RepeatedLoads[i % SubElems] = Elt; + else + Match &= (RepeatedLoads[i % SubElems] == Elt); + } + + // We must have loads at both ends of the repetition. + Match &= !RepeatedLoads.front().isUndef(); + Match &= !RepeatedLoads.back().isUndef(); + if (!Match) + continue; + + EVT RepeatVT = + VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64)) + ? 
EVT::getIntegerVT(*DAG.getContext(), ScalarSize) + : EVT::getFloatingPointVT(ScalarSize); + if (RepeatSize > ScalarSize) + RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT, + RepeatSize / ScalarSize); + EVT BroadcastVT = + EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(), + VT.getSizeInBits() / ScalarSize); + if (TLI.isTypeLegal(BroadcastVT)) { + if (SDValue RepeatLoad = EltsFromConsecutiveLoads( + RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) { + unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST + : X86ISD::VBROADCAST; + SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad); + return DAG.getBitcast(VT, Broadcast); + } + } + } + } + return SDValue(); } +// Combine a vector ops (shuffles etc.) that is equal to build_vector load1, +// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses +// are consecutive, non-overlapping, and in the right order. +static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + bool isAfterLegalize) { + SmallVector Elts; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { + Elts.push_back(Elt); + continue; + } + return SDValue(); + } + assert(Elts.size() == VT.getVectorNumElements()); + return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, + isAfterLegalize); +} + static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); @@ -7373,12 +7783,20 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue, return ConstantVector::get(ArrayRef(ConstantVec)); } -static bool isUseOfShuffle(SDNode *N) { +static bool isFoldableUseOfShuffle(SDNode *N) { for (auto *U : N->uses()) { - if (isTargetShuffle(U->getOpcode())) + unsigned Opc = U->getOpcode(); + // VPERMV/VPERMV3 shuffles can never fold their index operands. + if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N) + return false; + if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N) + return false; + if (isTargetShuffle(Opc)) + return true; + if (Opc == ISD::BITCAST) // Ignore bitcasts + return isFoldableUseOfShuffle(U); + if (N->hasOneUse()) return true; - if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts - return isUseOfShuffle(U); } return false; } @@ -7486,7 +7904,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. - if (isUseOfShuffle(BVOp) || BVOp->hasOneUse()) + if (isFoldableUseOfShuffle(BVOp)) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -7581,7 +7999,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. 
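The BROADCAST matching above looks for the smallest power-of-two repetition period among the element loads, letting undef slots match anything but insisting the repeated pattern starts and ends with real loads (the front()/back() checks). A standalone model over ints, where 0 marks an undef slot:

#include <cassert>
#include <vector>

static int findRepeatPeriod(const std::vector<int> &Elts) {
  int N = (int)Elts.size();
  for (int Period = 1; Period < N; Period *= 2) {
    std::vector<int> Rep(Period, 0);
    bool Match = true;
    for (int i = 0; i != N && Match; ++i) {
      if (!Elts[i])
        continue; // undef matches whatever the pattern holds
      if (!Rep[i % Period])
        Rep[i % Period] = Elts[i];
      else
        Match = Rep[i % Period] == Elts[i];
    }
    // The repeated pattern must be anchored by loads at both of its ends.
    if (Match && Rep.front() && Rep.back())
      return Period;
  }
  return 0; // no repetition smaller than the whole vector
}

int main() {
  assert(findRepeatPeriod({7, 9, 7, 0}) == 2); // {7,9} repeats, one undef
  assert(findRepeatPeriod({7, 9, 8, 9}) == 0);
}

Once a period is found, the patch loads just that repeated scalar/subvector and emits VBROADCAST or SUBV_BROADCAST to fill the full width.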
@@ -8330,6 +8748,22 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, else if (V1.getValueSizeInBits() < Width) V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width); + unsigned NumElts = VT.getVectorNumElements(); + APInt DemandedElts = APInt::getAllOnesValue(NumElts); + for (unsigned i = 0; i != NumElts; ++i) + if (BV->getOperand(i).isUndef()) + DemandedElts.clearBit(i); + + // If we don't need the upper xmm, then perform as a xmm hop. + unsigned HalfNumElts = NumElts / 2; + if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { + MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts); + V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); + V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); + SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); + return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256); + } + return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1); } @@ -8338,11 +8772,8 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We need at least 2 non-undef elements to make this worthwhile by default. - unsigned NumNonUndefs = 0; - for (const SDValue &V : BV->op_values()) - if (!V.isUndef()) - ++NumNonUndefs; - + unsigned NumNonUndefs = + count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); }); if (NumNonUndefs < 2) return SDValue(); @@ -8350,23 +8781,15 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, // int/FP at 128-bit/256-bit. Each type was introduced with a different // subtarget feature. Try to match those "native" patterns first. MVT VT = BV->getSimpleValueType(0); - unsigned HOpcode; - SDValue V0, V1; - if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) - if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); - - if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) - if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); - - if ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) - if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); - - if ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2()) + if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) || + ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) || + ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) || + ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) { + unsigned HOpcode; + SDValue V0, V1; if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); + } // Try harder to match 256-bit ops by using extract/concat. if (!Subtarget.hasAVX() || !VT.is256BitVector()) @@ -8481,9 +8904,15 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, return SDValue(); // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). + bool IsShift = false; switch (Opcode) { default: return SDValue(); + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + IsShift = true; + break; case ISD::AND: case ISD::XOR: case ISD::OR: @@ -8504,10 +8933,24 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, // We expect the canonicalized RHS operand to be the constant. if (!isa(RHS)) return SDValue(); + + // Extend shift amounts. 
+ if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) { + if (!IsShift) + return SDValue(); + RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType()); + } + LHSElts.push_back(LHS); RHSElts.push_back(RHS); } + // Limit to shifts by uniform immediates. + // TODO: Only accept vXi8/vXi64 special cases? + // TODO: Permit non-uniform XOP/AVX2/MULLO cases? + if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; })) + return SDValue(); + SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); return DAG.getNode(Opcode, DL, VT, LHS, RHS); @@ -9288,60 +9731,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, return Vec; } -// Return true if all the operands of the given CONCAT_VECTORS node are zeros -// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0) -static bool isExpandWithZeros(const SDValue &Op) { - assert(Op.getOpcode() == ISD::CONCAT_VECTORS && - "Expand with zeros only possible in CONCAT_VECTORS nodes!"); - - for (unsigned i = 1; i < Op.getNumOperands(); i++) - if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode())) - return false; - - return true; -} - // Returns true if the given node is a type promotion (by concatenating i1 // zeros) of the result of a node that already zeros all upper bits of // k-register. -static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) { - unsigned Opc = Op.getOpcode(); - - assert(Opc == ISD::CONCAT_VECTORS && - Op.getSimpleValueType().getVectorElementType() == MVT::i1 && - "Unexpected node to check for type promotion!"); - - // As long as we are concatenating zeros to the upper part of a previous node - // result, climb up the tree until a node with different opcode is - // encountered - while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) { - if (Opc == ISD::INSERT_SUBVECTOR) { - if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) && - Op.getConstantOperandVal(2) == 0) - Op = Op.getOperand(1); - else - return SDValue(); - } else { // Opc == ISD::CONCAT_VECTORS - if (isExpandWithZeros(Op)) - Op = Op.getOperand(0); - else - return SDValue(); - } - Opc = Op.getOpcode(); - } - - // Check if the first inserted node zeroes the upper bits, or an 'and' result - // of a node that zeros the upper bits (its masked version). - if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) || - (Op.getOpcode() == ISD::AND && - (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) || - isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) { - return Op; - } - - return SDValue(); -} - // TODO: Merge this with LowerAVXCONCAT_VECTORS? static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, @@ -9353,13 +9745,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); - // If this node promotes - by concatenating zeroes - the type of the result - // of a node with instruction that zeroes all upper (irrelevant) bits of the - // output register, mark it as legal and catch the pattern in instruction - // selection to avoid emitting extra instructions (for zeroing upper bits). 
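lowerBuildVectorToBitOp now also matches SHL/SRL/SRA build vectors, but per the check above it only folds them when every shift amount is the same immediate; non-uniform amounts are left alone (the TODOs reserve those for XOP/AVX2-style paths). The guard in isolation:

#include <algorithm>
#include <cassert>
#include <vector>

// A build vector of per-element shifts can be rewritten as one vector shift
// only when all shift amounts agree, i.e. a single uniform immediate.
static bool canUseUniformVectorShift(const std::vector<unsigned> &Amounts) {
  return !Amounts.empty() &&
         std::all_of(Amounts.begin(), Amounts.end(),
                     [&](unsigned A) { return A == Amounts.front(); });
}

int main() {
  assert(canUseUniformVectorShift({3, 3, 3, 3}));
  assert(!canUseUniformVectorShift({3, 1, 3, 3})); // non-uniform: bail out
}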
- if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) - return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl); - unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; @@ -9618,6 +10003,8 @@ static bool isTargetShuffleEquivalent(ArrayRef Mask, int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; + assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && + "Illegal target shuffle mask"); for (int i = 0; i < Size; ++i) if (Mask[i] == SM_SentinelUndef) @@ -9687,6 +10074,40 @@ static bool isUnpackWdShuffleMask(ArrayRef Mask, MVT VT) { return IsUnpackwdMask; } +static bool is128BitUnpackShuffleMask(ArrayRef Mask) { + // Create 128-bit vector type based on mask size. + MVT EltVT = MVT::getIntegerVT(128 / Mask.size()); + MVT VT = MVT::getVectorVT(EltVT, Mask.size()); + + // We can't assume a canonical shuffle mask, so try the commuted version too. + SmallVector CommutedMask(Mask.begin(), Mask.end()); + ShuffleVectorSDNode::commuteMask(CommutedMask); + + // Match any of unary/binary or low/high. + for (unsigned i = 0; i != 4; ++i) { + SmallVector UnpackMask; + createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2); + if (isTargetShuffleEquivalent(Mask, UnpackMask) || + isTargetShuffleEquivalent(CommutedMask, UnpackMask)) + return true; + } + return false; +} + +/// Return true if a shuffle mask chooses elements identically in its top and +/// bottom halves. For example, any splat mask has the same top and bottom +/// halves. If an element is undefined in only one half of the mask, the halves +/// are not considered identical. +static bool hasIdenticalHalvesShuffleMask(ArrayRef Mask) { + assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask"); + unsigned HalfSize = Mask.size() / 2; + for (unsigned i = 0; i != HalfSize; ++i) { + if (Mask[i] != Mask[i + HalfSize]) + return false; + } + return true; +} + /// Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to @@ -9826,12 +10247,11 @@ static bool isNonZeroElementsInOrder(const APInt &Zeroable, } /// Try to lower a shuffle with a single PSHUFB of V1 or V2. -static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, + ArrayRef Mask, SDValue V1, + SDValue V2, const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); const int NumBytes = VT.getSizeInBits() / 8; @@ -9885,11 +10305,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND -static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, - const APInt &Zeroable, - ArrayRef Mask, SDValue &V1, - SDValue &V2, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, + const APInt &Zeroable, + ArrayRef Mask, SDValue &V1, + SDValue &V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) @@ -9905,9 +10325,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? 
V2 : V1; - return DAG.getSelect(DL, VT, VMask, - DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), - ZeroVector); + return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, @@ -9997,9 +10415,9 @@ static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. -static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, + ArrayRef Mask, SDValue V1, SDValue V2, + SelectionDAG &DAG) { SmallVector Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) @@ -10061,10 +10479,10 @@ static bool matchVectorShuffleAsVPMOV(ArrayRef Mask, bool SwappedOps, // // But when avx512vl is available, one can just use a single vpmovdw // instruction. -static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef Mask, + MVT VT, SDValue V1, SDValue V2, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { if (VT != MVT::v16i8 && VT != MVT::v8i16) return SDValue(); @@ -10169,10 +10587,9 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, return false; } -static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef Mask, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, @@ -10187,14 +10604,32 @@ static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. -static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { - assert(!VT.isFloatingPoint() && "Floating point types are not supported"); +static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT MaskVT = VT; MVT EltVT = VT.getVectorElementType(); - SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); + SDValue Zero, AllOnes; + // Use f64 if i64 isn't legal. + if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { + EltVT = MVT::f64; + MaskVT = MVT::getVectorVT(EltVT, Mask.size()); + } + + MVT LogicVT = VT; + if (EltVT == MVT::f32 || EltVT == MVT::f64) { + Zero = DAG.getConstantFP(0.0, DL, EltVT); + AllOnes = DAG.getConstantFP( + APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT); + LogicVT = + MVT::getVectorVT(EltVT == MVT::f64 ? 
MVT::i64 : MVT::i32, Mask.size()); + } else { + Zero = DAG.getConstant(0, DL, EltVT); + AllOnes = DAG.getAllOnesConstant(DL, EltVT); + } + SmallVector VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { @@ -10212,8 +10647,11 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, if (!V) return SDValue(); // No non-zeroable elements! - SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); - return DAG.getNode(ISD::AND, DL, VT, V, VMask); + SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps); + VMask = DAG.getBitcast(LogicVT, VMask); + V = DAG.getBitcast(LogicVT, V); + SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask); + return DAG.getBitcast(VT, And); } /// Try to emit a blend instruction for a shuffle using bit math. @@ -10221,9 +10659,9 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. -static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Mask, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); @@ -10305,11 +10743,11 @@ static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. -static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Original, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Original, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { SmallVector Mask = createTargetShuffleMask(Original, Zeroable); uint64_t BlendMask = 0; @@ -10325,45 +10763,24 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, V2 = getZeroVector(VT, Subtarget, DAG, DL); switch (VT.SimpleTy) { - case MVT::v2f64: - case MVT::v4f32: - case MVT::v4f64: - case MVT::v8f32: - return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v4i64: case MVT::v8i32: assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); LLVM_FALLTHROUGH; + case MVT::v4f64: + case MVT::v8f32: + assert(Subtarget.hasAVX() && "256-bit float blends require AVX!"); + LLVM_FALLTHROUGH; + case MVT::v2f64: case MVT::v2i64: + case MVT::v4f32: case MVT::v4i32: - // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into - // that instruction. - if (Subtarget.hasAVX2()) { - // Scale the blend by the number of 32-bit dwords per element. - int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); - MVT BlendVT = VT.getSizeInBits() > 128 ? 
MVT::v8i32 : MVT::v4i32; - V1 = DAG.getBitcast(BlendVT, V1); - V2 = DAG.getBitcast(BlendVT, V2); - return DAG.getBitcast( - VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); - } - LLVM_FALLTHROUGH; - case MVT::v8i16: { - // For integer shuffles we need to expand the mask and cast the inputs to - // v8i16s prior to blending. - int Scale = 8 / VT.getVectorNumElements(); - BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); - V1 = DAG.getBitcast(MVT::v8i16, V1); - V2 = DAG.getBitcast(MVT::v8i16, V2); - return DAG.getBitcast(VT, - DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); - } + case MVT::v8i16: + assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v16i16: { - assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); + assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. @@ -10391,14 +10808,15 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, } LLVM_FALLTHROUGH; } - case MVT::v16i8: - case MVT::v32i8: { - assert((VT.is128BitVector() || Subtarget.hasAVX2()) && - "256-bit byte-blends require AVX2 support!"); + case MVT::v32i8: + assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!"); + LLVM_FALLTHROUGH; + case MVT::v16i8: { + assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!"); // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. - if (SDValue Masked = - lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return Masked; if (Subtarget.hasBWI() && Subtarget.hasVLX()) { @@ -10456,6 +10874,16 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: { + // Attempt to lower to a bitmask if we can. Only if not optimizing for size. + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + if (!OptForSize) { + if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return Masked; + } + + // Otherwise load an immediate into a GPR, cast to k-register, and use a + // masked move. MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); @@ -10471,11 +10899,11 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. -static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef Mask, - SelectionDAG &DAG, - bool ImmBlends = false) { +static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef Mask, + SelectionDAG &DAG, + bool ImmBlends = false) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. 
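lowerShuffleAsBlendAndPermute, whose mask construction begins above, first blends each referenced source element into lane Mask[i] % N and then applies a single-input permute; the decomposition fails when the two inputs collide on a lane. A standalone model (-1 = undef, source elements numbered 0..2N-1 across both inputs):

#include <cassert>
#include <vector>

static bool decomposeBlendAndPermute(const std::vector<int> &Mask,
                                     std::vector<int> &Blend,
                                     std::vector<int> &Permute) {
  int N = (int)Mask.size();
  Blend.assign(N, -1);
  Permute.assign(N, -1);
  for (int i = 0; i != N; ++i) {
    if (Mask[i] < 0)
      continue;
    int Lane = Mask[i] % N;
    if (Blend[Lane] >= 0 && Blend[Lane] != Mask[i])
      return false; // lane already claimed by the other input
    Blend[Lane] = Mask[i];  // element blended into its own lane first
    Permute[i] = Lane;      // then moved to its final position
  }
  return true;
}

int main() {
  std::vector<int> B, P;
  // v4 mask <V2[1], V1[0], V1[2], V1[3]>: blend lanes, then swap 0 and 1.
  assert(decomposeBlendAndPermute({5, 0, 2, 3}, B, P));
  assert(P == std::vector<int>({1, 0, 2, 3}));
  assert(!decomposeBlendAndPermute({0, 4, -1, -1}, B, P)); // lane 0 conflict
}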
SmallVector BlendMask(Mask.size(), -1); @@ -10510,10 +10938,10 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, /// /// This matches the pattern where we can unpack elements from two inputs and /// then reduce the shuffle to a single-input (wider) permutation. -static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef Mask, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef Mask, + SelectionDAG &DAG) { int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; @@ -10573,7 +11001,7 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then /// permuting the elements of the result in place. -static SDValue lowerVectorShuffleAsByteRotateAndPermute( +static SDValue lowerShuffleAsByteRotateAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || @@ -10664,7 +11092,7 @@ static SDValue lowerVectorShuffleAsByteRotateAndPermute( /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. -static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( +static SDValue lowerShuffleAsDecomposedShuffleBlend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and @@ -10688,18 +11116,18 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { // Only prefer immediate blends to unpack/rotate. - if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( - DL, VT, V1, V2, Mask, DAG, true)) + if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, + DAG, true)) return BlendPerm; - if (SDValue UnpackPerm = - lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) + if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, + DAG)) return UnpackPerm; - if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute( + if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( DL, VT, V1, V2, Mask, Subtarget, DAG)) return RotatePerm; // Unpack/rotate failed - try again with variable blends. - if (SDValue BlendPerm = - lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, + DAG)) return BlendPerm; } @@ -10711,8 +11139,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( /// Try to lower a vector shuffle as a rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. -static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, - ArrayRef Mask) { +static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: @@ -10796,8 +11223,8 @@ static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, /// elements, and takes the low elements as the result. 
Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, - ArrayRef Mask) { +static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef Mask) { // Don't accept any shuffles with zero elements. if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return -1; @@ -10807,7 +11234,7 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; - int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask); + int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; @@ -10818,15 +11245,14 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, return Rotation * Scale; } -static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; - int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask); + int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); @@ -10874,11 +11300,10 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); @@ -10887,7 +11312,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; - int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask); + int Rotation = matchShuffleAsRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); @@ -10895,6 +11320,69 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, DAG.getConstant(Rotation, DL, MVT::i8)); } +/// Try to lower a vector shuffle as a byte shift sequence. +static SDValue lowerVectorShuffleAsByteShiftMask( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + assert(VT.is128BitVector() && "Only 128-bit vectors supported"); + + // We need a shuffle that has zeros at one/both ends and a sequential + // shuffle from one source within. 
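+  // e.g. a v16i8 mask [0..11,z,z,z,z] has ZeroLo = 0, ZeroHi = 4 and a
+  // sequential inner run, so VSHLDQ by 4 then VSRLDQ by 4 produces it.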
+  unsigned ZeroLo = Zeroable.countTrailingOnes();
+  unsigned ZeroHi = Zeroable.countLeadingOnes();
+  if (!ZeroLo && !ZeroHi)
+    return SDValue();
+
+  unsigned NumElts = Mask.size();
+  unsigned Len = NumElts - (ZeroLo + ZeroHi);
+  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
+    return SDValue();
+
+  unsigned Scale = VT.getScalarSizeInBits() / 8;
+  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
+  if (!isUndefOrInRange(StubMask, 0, NumElts) &&
+      !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
+    return SDValue();
+
+  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
+  Res = DAG.getBitcast(MVT::v16i8, Res);
+
+  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
+  // inner sequential set of elements, possibly offset:
+  // 01234567 --> zzzzzz01 --> 1zzzzzzz
+  // 01234567 --> 4567zzzz --> zzzzz456
+  // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
+  if (ZeroLo == 0) {
+    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * Shift, DL, MVT::i8));
+    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * ZeroHi, DL, MVT::i8));
+  } else if (ZeroHi == 0) {
+    unsigned Shift = Mask[ZeroLo] % NumElts;
+    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * Shift, DL, MVT::i8));
+    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+  } else if (!Subtarget.hasSSSE3()) {
+    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
+    // by performing 3 byte shifts. Shuffle combining can kick in above that.
+    // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
+    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * Shift, DL, MVT::i8));
+    Shift += Mask[ZeroLo] % NumElts;
+    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * Shift, DL, MVT::i8));
+    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+  } else
+    return SDValue();
+
+  return DAG.getBitcast(VT, Res);
+}
+
 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
/// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and @@ -10918,11 +11406,10 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] -static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, - unsigned ScalarSizeInBits, - ArrayRef Mask, int MaskOffset, - const APInt &Zeroable, - const X86Subtarget &Subtarget) { +static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, + unsigned ScalarSizeInBits, ArrayRef Mask, + int MaskOffset, const APInt &Zeroable, + const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; @@ -10981,11 +11468,11 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, return -1; } -static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Mask, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); @@ -10994,14 +11481,13 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, unsigned Opcode; // Try to match shuffle against V1 shift. - int ShiftAmt = matchVectorShuffleAsShift( - ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); + int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), + Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. if (ShiftAmt < 0) { - ShiftAmt = - matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), - Mask, Size, Zeroable, Subtarget); + ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), + Mask, Size, Zeroable, Subtarget); V = V2; } @@ -11018,16 +11504,16 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. -static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, - ArrayRef Mask, uint64_t &BitLen, - uint64_t &BitIdx, const APInt &Zeroable) { +static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef Mask, uint64_t &BitLen, + uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. - if (!isUndefInRange(Mask, HalfSize, HalfSize)) + if (!isUndefUpperHalf(Mask)) return false; // Determine the extraction length from the part of the @@ -11074,15 +11560,15 @@ static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... 
} -static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, - ArrayRef Mask, uint64_t &BitLen, - uint64_t &BitIdx) { +static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef Mask, uint64_t &BitLen, + uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. - if (!isUndefInRange(Mask, HalfSize, HalfSize)) + if (!isUndefUpperHalf(Mask)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { @@ -11140,17 +11626,16 @@ static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, } /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. -static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { +static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; - if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) + if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); - if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) + if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getConstant(BitLen, DL, MVT::i8), @@ -11168,7 +11653,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. -static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( +static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); @@ -11203,6 +11688,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. + // TODO: Add AnyExt support. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. @@ -11211,7 +11697,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG); + InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -11234,7 +11720,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFWMask[4] = {1, -1, -1, -1}; - unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); + unsigned OddEvenOp = (Offset & 1) ? 
X86ISD::PSHUFLW : X86ISD::PSHUFHW; return DAG.getBitcast( VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), @@ -11253,8 +11739,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(LoIdx, DL, MVT::i8))); - if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || - !SafeOffset(Offset + 1)) + if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); int HiIdx = (Offset + 1) * EltBits; @@ -11326,7 +11811,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. -static SDValue lowerVectorShuffleAsZeroOrAnyExtend( +static SDValue lowerShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11397,8 +11882,8 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( if (Offset != 0 && Matches < 2) return SDValue(); - return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); + return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt, + InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -11482,7 +11967,7 @@ static bool isShuffleFoldableLoad(SDValue V) { /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. -static SDValue lowerVectorShuffleAsElementInsertion( +static SDValue lowerShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11580,10 +12065,10 @@ static SDValue lowerVectorShuffleAsElementInsertion( /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. -static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, - SDValue V0, int BroadcastIdx, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, + int BroadcastIdx, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); @@ -11629,16 +12114,90 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } +/// Test whether this can be lowered with a single SHUFPS instruction. +/// +/// This is used to disable more specialized lowerings when the shufps lowering +/// will happen to be efficient. +static bool isSingleSHUFPSMask(ArrayRef Mask) { + // This routine only handles 128-bit shufps. + assert(Mask.size() == 4 && "Unsupported mask size!"); + assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); + assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); + assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); + assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); + + // To lower with a single SHUFPS we need to have the low half and high half + // each requiring a single input. 
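+  // e.g. [0,1,6,7] takes its low half from V1 and its high half from V2, so
+  // a single SHUFPS suffices, while [0,5,2,7] mixes inputs within a half
+  // and needs more work.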
+ if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) + return false; + if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) + return false; + + return true; +} + +/// If we are extracting two 128-bit halves of a vector and shuffling the +/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a +/// multi-shuffle lowering. +static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, + SDValue N1, ArrayRef Mask, + SelectionDAG &DAG) { + EVT VT = N0.getValueType(); + assert((VT.is128BitVector() && + (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && + "VPERM* family of shuffles requires 32-bit or 64-bit elements"); + + // Check that both sources are extracts of the same source vector. + if (!N0.hasOneUse() || !N1.hasOneUse() || + N0.getOpcode() != ISD::EXTRACT_SUBVECTOR || + N1.getOpcode() != ISD::EXTRACT_SUBVECTOR || + N0.getOperand(0) != N1.getOperand(0)) + return SDValue(); + + SDValue WideVec = N0.getOperand(0); + EVT WideVT = WideVec.getValueType(); + if (!WideVT.is256BitVector() || !isa(N0.getOperand(1)) || + !isa(N1.getOperand(1))) + return SDValue(); + + // Match extracts of each half of the wide source vector. Commute the shuffle + // if the extract of the low half is N1. + unsigned NumElts = VT.getVectorNumElements(); + SmallVector NewMask(Mask.begin(), Mask.end()); + const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1); + const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1); + if (ExtIndex1 == 0 && ExtIndex0 == NumElts) + ShuffleVectorSDNode::commuteMask(NewMask); + else if (ExtIndex0 != 0 || ExtIndex1 != NumElts) + return SDValue(); + + // Final bailout: if the mask is simple, we are better off using an extract + // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps + // because that avoids a constant load from memory. + if (NumElts == 4 && + (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask))) + return SDValue(); + + // Extend the shuffle mask with undef elements. + NewMask.append(NumElts, -1); + + // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0 + SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), + NewMask); + // This is free: ymm -> xmm. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf, + DAG.getIntPtrConstant(0, DL)); +} + /// Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. -static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || (Subtarget.hasAVX() && VT.isFloatingPoint()) || (Subtarget.hasAVX2() && VT.isInteger()))) @@ -11647,6 +12206,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. unsigned NumElts = Mask.size(); + unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? 
X86ISD::MOVDDUP : X86ISD::VBROADCAST; @@ -11670,29 +12230,19 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. + int BitOffset = BroadcastIdx * NumEltBits; SDValue V = V1; for (;;) { switch (V.getOpcode()) { case ISD::BITCAST: { - // Peek through bitcasts as long as BroadcastIdx can be adjusted. - SDValue VSrc = V.getOperand(0); - unsigned NumEltBits = V.getScalarValueSizeInBits(); - unsigned NumSrcBits = VSrc.getScalarValueSizeInBits(); - if ((NumEltBits % NumSrcBits) == 0) - BroadcastIdx *= (NumEltBits / NumSrcBits); - else if ((NumSrcBits % NumEltBits) == 0 && - (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0) - BroadcastIdx /= (NumSrcBits / NumEltBits); - else - break; - V = VSrc; + V = V.getOperand(0); continue; } case ISD::CONCAT_VECTORS: { - int OperandSize = - V.getOperand(0).getSimpleValueType().getVectorNumElements(); - V = V.getOperand(BroadcastIdx / OperandSize); - BroadcastIdx %= OperandSize; + int OpBitWidth = V.getOperand(0).getValueSizeInBits(); + int OpIdx = BitOffset / OpBitWidth; + V = V.getOperand(OpIdx); + BitOffset %= OpBitWidth; continue; } case ISD::INSERT_SUBVECTOR: { @@ -11701,11 +12251,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, if (!ConstantIdx) break; - int BeginIdx = (int)ConstantIdx->getZExtValue(); - int EndIdx = - BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); - if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { - BroadcastIdx -= BeginIdx; + int EltBitWidth = VOuter.getScalarValueSizeInBits(); + int Idx = (int)ConstantIdx->getZExtValue(); + int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements(); + int BeginOffset = Idx * EltBitWidth; + int EndOffset = BeginOffset + NumSubElts * EltBitWidth; + if (BeginOffset <= BitOffset && BitOffset < EndOffset) { + BitOffset -= BeginOffset; V = VInner; } else { V = VOuter; @@ -11715,48 +12267,34 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, } break; } + assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset"); + BroadcastIdx = BitOffset / NumEltBits; - // Ensure the source vector and BroadcastIdx are for a suitable type. - if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) { - unsigned NumEltBits = VT.getScalarSizeInBits(); - unsigned NumSrcBits = V.getScalarValueSizeInBits(); - if ((NumSrcBits % NumEltBits) == 0) - BroadcastIdx *= (NumSrcBits / NumEltBits); - else if ((NumEltBits % NumSrcBits) == 0 && - (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0) - BroadcastIdx /= (NumEltBits / NumSrcBits); - else - return SDValue(); - - unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; - MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts); - V = DAG.getBitcast(SrcVT, V); - } + // Do we need to bitcast the source to retrieve the original broadcast index? + bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits; // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. - // First, look through bitcast: if the original value has a larger element - // type than the shuffle, the broadcast element is in essence truncated. - // Make that explicit to ease folding. 
- if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) - if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( - DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + // If the original value has a larger element type than the shuffle, the + // broadcast element is in essence truncated. Make that explicit to ease + // folding. + if (BitCastSrc && VT.isInteger()) + if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast( + DL, VT, V, BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; MVT BroadcastVT = VT; - // Peek through any bitcast (only useful for loads). - SDValue BC = peekThroughBitcasts(V); - // Also check the simpler case, where we can directly reuse the scalar. - if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || - (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { + if (!BitCastSrc && + ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) { V = V.getOperand(BroadcastIdx); // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(BC) && !cast(BC)->isVolatile()) { + } else if (MayFoldLoad(V) && !cast(V)->isVolatile()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); @@ -11767,10 +12305,11 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. - LoadSDNode *Ld = cast(BC); + LoadSDNode *Ld = cast(V); SDValue BaseAddr = Ld->getOperand(1); EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( @@ -11779,7 +12318,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. return SDValue(); - } else if (BroadcastIdx != 0) { + } else if (BitOffset != 0) { // We can only broadcast from the zero-element of a vector register, // but it can be advantageous to broadcast from the zero-element of a // subvector. @@ -11791,18 +12330,15 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, return SDValue(); // Only broadcast the zero-element of a 128-bit subvector. - unsigned EltSize = VT.getScalarSizeInBits(); - if (((BroadcastIdx * EltSize) % 128) != 0) + if ((BitOffset % 128) != 0) return SDValue(); - // The shuffle input might have been a bitcast we looked through; look at - // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll - // later bitcast it to BroadcastVT. 
- assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() && - "Unexpected vector element size"); + assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && + "Unexpected bit-offset"); assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); - V = extract128BitVector(V, BroadcastIdx, DAG, DL); + unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits(); + V = extract128BitVector(V, ExtractIdx, DAG, DL); } if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) @@ -11810,21 +12346,21 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, DAG.getBitcast(MVT::f64, V)); // Bitcast back to the same scalar type as BroadcastVT. - MVT SrcVT = V.getSimpleValueType(); - if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) { - assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && + if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) { + assert(NumEltBits == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); - if (SrcVT.isVector()) { - unsigned NumSrcElts = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); + MVT ExtVT; + if (V.getValueType().isVector()) { + unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; + ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); } else { - SrcVT = BroadcastVT.getScalarType(); + ExtVT = BroadcastVT.getScalarType(); } - V = DAG.getBitcast(SrcVT, V); + V = DAG.getBitcast(ExtVT, V); } // 32-bit targets need to load i64 as a f64 and then bitcast the result. - if (!Subtarget.is64Bit() && SrcVT == MVT::i64) { + if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) { V = DAG.getBitcast(MVT::f64, V); unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); @@ -11833,9 +12369,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. - if (SrcVT.getSizeInBits() > 128) { - MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), - 128 / SrcVT.getScalarSizeInBits()); + if (V.getValueSizeInBits() > 128) { + MVT ExtVT = V.getSimpleValueType().getScalarType(); + ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits()); V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); V = DAG.getBitcast(ExtVT, V); } @@ -11849,11 +12385,10 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. 
-static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, - unsigned &InsertPSMask, - const APInt &Zeroable, - ArrayRef Mask, - SelectionDAG &DAG) { +static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, + unsigned &InsertPSMask, + const APInt &Zeroable, + ArrayRef Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -11938,16 +12473,15 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, return false; } -static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, - SDValue V2, ArrayRef Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, + ArrayRef Mask, const APInt &Zeroable, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. unsigned InsertPSMask; - if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) + if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); // Insert the V2 element into the desired position. @@ -11964,7 +12498,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. -static SDValue lowerVectorShuffleAsPermuteAndUnpack( +static SDValue lowerShuffleAsPermuteAndUnpack( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && @@ -12079,19 +12613,18 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack( /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. -static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. Simulate this by using the @@ -12116,16 +12649,20 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. 
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; @@ -12141,13 +12678,12 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget.hasSSE41()) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); @@ -12161,19 +12697,18 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. -static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 @@ -12193,20 +12728,24 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. 
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; @@ -12214,33 +12753,32 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. @@ -12252,36 +12790,14 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } -/// Test whether this can be lowered with a single SHUFPS instruction. -/// -/// This is used to disable more specialized lowerings when the shufps lowering -/// will happen to be efficient. -static bool isSingleSHUFPSMask(ArrayRef Mask) { - // This routine only handles 128-bit shufps. - assert(Mask.size() == 4 && "Unsupported mask size!"); - assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); - assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); - assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); - assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); - - // To lower with a single SHUFPS we need to have the low half and high half - // each requiring a single input. - if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) - return false; - if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) - return false; - - return true; -} - /// Lower a vector shuffle using the SHUFPS instruction. 
/// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. -static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, + ArrayRef Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; @@ -12366,11 +12882,10 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. -static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -12379,8 +12894,8 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. @@ -12413,29 +12928,32 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. - if (SDValue V = - lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) + if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) - if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( - DL, MVT::v4f32, V1, V2, Mask, DAG)) + if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, + V2, Mask, DAG)) return BlendPerm; } @@ -12449,23 +12967,21 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, } // Use dedicated unpack instructions for masks that match their pattern. 
- if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. - return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); + return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. -static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -12473,16 +12989,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 @@ -12501,14 +13017,18 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; @@ -12516,29 +13036,28 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // *exact* same predicate. 
bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; } @@ -12549,12 +13068,12 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) return Unpack; } @@ -12585,7 +13104,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. 
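For the v8i16 hunks that follow, it helps to see the bookkeeping step in isolation: the routine buckets the mask's used inputs by which half of the source they come from and which half of the destination they feed. A minimal standalone sketch of that classification, using plain STL in place of the llvm:: helpers (the HalfCrossings/countCrossings names are illustrative, not part of this patch):

#include <algorithm>
#include <vector>

struct HalfCrossings { int LToL, HToL, LToH, HToH; };

// Classify the used inputs of an 8-lane mask by source half -> dest half.
// Negative mask entries are undef and ignored, mirroring the code below.
static HalfCrossings countCrossings(const int (&Mask)[8]) {
  std::vector<int> Lo, Hi; // inputs feeding the dest low/high half
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0)
      (i < 4 ? Lo : Hi).push_back(Mask[i]);
  auto Prep = [](std::vector<int> &V) { // sort + unique, as in the diff
    std::sort(V.begin(), V.end());
    V.erase(std::unique(V.begin(), V.end()), V.end());
  };
  Prep(Lo);
  Prep(Hi);
  // Inputs < 4 live in the source's low half, >= 4 in its high half.
  int LToL = std::lower_bound(Lo.begin(), Lo.end(), 4) - Lo.begin();
  int LToH = std::lower_bound(Hi.begin(), Hi.end(), 4) - Hi.begin();
  return {LToL, (int)Lo.size() - LToL, LToH, (int)Hi.size() - LToH};
}

e.g. Mask = {0,7,1,6,4,5,2,3} yields LToL=2, HToL=2, LToH=2, HToH=2, the fully crossed case the routine then fixes up with intermediate word shuffles.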
-static SDValue lowerV8I16GeneralSingleInputVectorShuffle( +static SDValue lowerV8I16GeneralSingleInputShuffle( const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); @@ -12617,11 +13136,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); - int NumLToL = - std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); + int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; - int NumLToH = - std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); + int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); @@ -12730,7 +13247,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. - int ADWord, BDWord; + int ADWord = 0, BDWord = 0; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; @@ -12825,8 +13342,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. - return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, - DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); @@ -13084,7 +13600,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. -static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( +static SDValue lowerShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { assert(!is128BitLaneCrossingShuffleMask(VT, Mask) && @@ -13147,54 +13663,51 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. -static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. 
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, + Subtarget, DAG)) return Rotate; // Make a copy of the mask so it can be modified. SmallVector MutableMask(Mask.begin(), Mask.end()); - return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, - MutableMask, Subtarget, - DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask, + Subtarget, DAG); } assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && @@ -13202,19 +13715,19 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, "shuffles."); // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) - if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; @@ -13222,50 +13735,54 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, // *exact* same predicate. 
bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) return Rotate; if (SDValue BitBlend = - lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) + lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; + // Try to use byte shift instructions to mask. + if (SDValue V = lowerVectorShuffleAsByteShiftMask( + DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + return V; + // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; - return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG, V1InUse, V2InUse); + return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG); } /// Check whether a compaction lowering can be done by dropping even @@ -13334,9 +13851,9 @@ static int canLowerByDroppingEvenElements(ArrayRef Mask, return 0; } -static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, + ArrayRef Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); @@ -13354,39 +13871,38 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. 
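Once SSSE3 is available, the v16i8 paths below lean heavily on PSHUFB, including the blend-of-PSHUFBs fallback where each input is shuffled separately and the results are OR'd together. A scalar model of the instruction's semantics, as a sketch (the pshufb helper is illustrative, not from this patch):

#include <array>
#include <cstdint>

// Scalar model of PSHUFB: each result byte selects a source byte by the
// low 4 bits of its control byte, or is zeroed when the control byte's
// top bit is set.
static std::array<std::uint8_t, 16>
pshufb(const std::array<std::uint8_t, 16> &Src,
       const std::array<std::uint8_t, 16> &Ctl) {
  std::array<std::uint8_t, 16> Res;
  for (int i = 0; i != 16; ++i)
    Res[i] = (Ctl[i] & 0x80) ? 0 : Src[Ctl[i] & 0x0F];
  return Res;
}

A two-input byte shuffle then becomes pshufb(V1, M1) | pshufb(V2, M2), with each control mask zeroing the lanes owned by the other input; that is the shape lowerShuffleAsBlendOfPSHUFBs builds when no cheaper pattern matches.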
-static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use a zext lowering. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) - if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); @@ -13394,12 +13910,11 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check whether we can widen this to an i16 shuffle by duplicating bytes. @@ -13492,13 +14007,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, return V; } - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; + + // Try to use byte shift instructions to mask. 
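The shift lowerings tried early in these cascades rest on the fact that PSLLDQ/PSLLW-style shifts are shuffles with zero fill: a left shift by S moves V1[i] to position i + S and zeroes positions 0..S-1. A sketch of the match, using the Zeroable set (result positions proven zero from the operands) the callers pass around:

#include <vector>

// Returns the element shift amount S if Mask[i] == i - S for all
// defined i >= S and the low S positions are undef or provably zero;
// returns -1 otherwise. Sketch only, not LLVM's helper.
static int matchShiftLeftWithZeros(const std::vector<int> &Mask,
                                   const std::vector<bool> &Zeroable) {
  int N = (int)Mask.size();
  for (int S = 1; S < N; ++S) {
    bool OK = true;
    for (int i = 0; i < N && OK; ++i)
      OK = (i < S) ? (Mask[i] < 0 || Zeroable[i])
                   : (Mask[i] < 0 || Mask[i] == i - S);
    if (OK)
      return S;
  }
  return -1;
}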
+ if (SDValue V = lowerVectorShuffleAsByteShiftMask( + DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly @@ -13518,7 +14037,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, bool V1InUse = false; bool V2InUse = false; - SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( + SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs( DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, @@ -13526,8 +14045,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) - if (SDValue Blend = lowerVectorShuffleAsBlend( - DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some @@ -13538,17 +14057,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) - return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. - if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute( + if (SDValue V = lowerShuffleAsByteRotateAndPermute( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; } @@ -13558,13 +14077,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; - if (SDValue BitBlend = - lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) - return BitBlend; + if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Blend; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for @@ -13605,8 +14123,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Handle multi-input cases by blending single-input shuffles. if (NumV2Elements > 0) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, + Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together @@ -13661,24 +14179,24 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. 
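The PSHUFB blend used for v16i8 works because a control byte with its high bit set forces that output byte to zero, so two PSHUFBs can each cover their operand's elements and the OR acts as the blend. A scalar model of what lowerShuffleAsBlendOfPSHUFBs emits (the real code builds X86ISD::PSHUFB nodes):

#include <cstdint>

// Model of PSHUFB(V1, C1) | PSHUFB(V2, C2) for an arbitrary two-input
// v16i8 shuffle: 0x80 in a control byte zeroes that lane, so exactly
// one of the two intermediate results contributes per output byte.
static void blendOfPshufbs(const int Mask[16], const uint8_t V1[16],
                           const uint8_t V2[16], uint8_t Out[16]) {
  uint8_t C1[16], C2[16];
  for (int i = 0; i < 16; ++i) {
    C1[i] = (Mask[i] >= 0 && Mask[i] < 16) ? (uint8_t)Mask[i] : 0x80;
    C2[i] = (Mask[i] >= 16) ? (uint8_t)(Mask[i] - 16) : 0x80;
  }
  for (int i = 0; i < 16; ++i) {
    uint8_t A = (C1[i] & 0x80) ? 0 : V1[C1[i] & 15];
    uint8_t B = (C2[i] & 0x80) ? 0 : V2[C2[i] & 15];
    Out[i] = A | B; // the zeroed operand contributes nothing
  }
}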
-static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, + MVT VT, SDValue V1, SDValue V2, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: - return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v2f64: - return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i32: - return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4f32: - return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: - return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: - return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); @@ -13690,9 +14208,9 @@ static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. -static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Mask, - SelectionDAG &DAG) { +static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); @@ -13816,11 +14334,10 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. 
-static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " "shuffles as it could then recurse on itself."); int Size = Mask.size(); @@ -13845,8 +14362,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, return true; }; if (DoBothBroadcast()) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to @@ -13860,12 +14377,12 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, + DAG); } /// Lower a vector shuffle crossing multiple 128-bit lanes as @@ -13874,9 +14391,9 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, /// This is mainly for cases where we can have non-repeating permutes /// in each lane. /// -/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes, +/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask, /// we should investigate merging them. -static SDValue lowerVectorShuffleAsLanePermuteAndPermute( +static SDValue lowerShuffleAsLanePermuteAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); @@ -13884,7 +14401,6 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute( int NumEltsPerLane = NumElts / NumLanes; SmallVector SrcLaneMask(NumLanes, SM_SentinelUndef); - SmallVector LaneMask(NumElts, SM_SentinelUndef); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) { @@ -13899,10 +14415,20 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute( return SDValue(); SrcLaneMask[DstLane] = SrcLane; - LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane); PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane); } + // Make sure we set all elements of the lane mask, to avoid undef propagation. + SmallVector LaneMask(NumElts, SM_SentinelUndef); + for (int DstLane = 0; DstLane != NumLanes; ++DstLane) { + int SrcLane = SrcLaneMask[DstLane]; + if (0 <= SrcLane) + for (int j = 0; j != NumEltsPerLane; ++j) { + LaneMask[(DstLane * NumEltsPerLane) + j] = + (SrcLane * NumEltsPerLane) + j; + } + } + // If we're only shuffling a single lowest lane and the rest are identity // then don't bother. // TODO - isShuffleMaskInputInPlace could be extended to something like this. 
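The lane-permute-and-permute strategy above decomposes one cross-lane shuffle into a lane-granular move followed by an in-lane permute; note how the patched code fills every element of each used lane in LaneMask rather than only the referenced ones, so later folds cannot propagate undef into elements the second permute still reads. A standalone sketch of the mask derivation for the unary case (indices < NumElts):

#include <vector>

// Derives the two masks: LaneMask moves whole 128-bit lanes into place,
// PermMask then arranges elements within each lane. Fails if any
// destination lane needs elements from two different source lanes.
static bool laneSplitMasks(const std::vector<int> &Mask, int EltsPerLane,
                           std::vector<int> &LaneMask,
                           std::vector<int> &PermMask) {
  int NumElts = (int)Mask.size(), NumLanes = NumElts / EltsPerLane;
  std::vector<int> SrcLane(NumLanes, -1);
  LaneMask.assign(NumElts, -1);
  PermMask.assign(NumElts, -1);
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int Src = M / EltsPerLane, Dst = i / EltsPerLane;
    if (SrcLane[Dst] >= 0 && SrcLane[Dst] != Src)
      return false; // lane would need two sources
    SrcLane[Dst] = Src;
    PermMask[i] = Dst * EltsPerLane + (M % EltsPerLane);
  }
  // Cover used lanes completely to avoid undef propagation.
  for (int L = 0; L < NumLanes; ++L)
    if (SrcLane[L] >= 0)
      for (int j = 0; j < EltsPerLane; ++j)
        LaneMask[L * EltsPerLane + j] = SrcLane[L] * EltsPerLane + j;
  return true;
}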
@@ -13931,11 +14457,9 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute( /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. -static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef Mask, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleAsLanePermuteAndBlend( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); @@ -13950,14 +14474,14 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] / LaneSize)] = true; if (!LaneUsed[0] || !LaneUsed[1]) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } assert(V2.isUndef() && @@ -13981,11 +14505,11 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, } /// Handle lowering 2-lane 128-bit shuffles. -static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Mask, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); @@ -14012,8 +14536,8 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return Blend; // If either input operand is a zero vector, use VPERM2X128 because its mask @@ -14084,9 +14608,7 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, /// or two of the lanes of the inputs. The lanes of the input vectors are /// shuffled in one or two independent shuffles to get the lanes into the /// position needed by the final shuffle. -/// -/// FIXME: This should be generalized to 512-bit shuffles. 
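The 2x128 path selects whole 128-bit lanes, which VPERM2F128/VPERM2X128 encode directly in an immediate. A sketch of the encoding, assuming the shuffle mask has been widened to one index per destination lane, with -1 for undef and -2 standing in for a known-zero lane (LLVM's SM_SentinelZero); the field layout follows the instruction's documented imm8 format:

// imm8: bits [1:0]/[5:4] pick a source lane per destination lane
// (0 = V1lo, 1 = V1hi, 2 = V2lo, 3 = V2hi); bits 3/7 zero that lane.
static int permute2x128Imm(int WideLo, int WideHi) {
  auto Field = [](int L) {
    if (L == -2) return 0x8; // force this destination lane to zero
    if (L < 0) return 0x0;   // undef: any choice works, pick lane 0
    return L & 0x3;
  };
  return Field(WideLo) | (Field(WideHi) << 4);
}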
-static SDValue lowerVectorShuffleByMerging128BitLanes( +static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This is only useful with multiple inputs."); @@ -14095,12 +14617,10 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return SDValue(); int Size = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; int LaneSize = 128 / VT.getScalarSizeInBits(); - int NumLanes = Size / LaneSize; - assert(NumLanes == 2 && "Only handles 256-bit shuffles."); - SmallVector RepeatMask(LaneSize, -1); - int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } }; + SmallVector, 2> LaneSrcs(NumLanes, {{-1, -1}}); // First pass will try to fill in the RepeatMask from lanes that need two // sources. @@ -14111,7 +14631,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( int M = Mask[(Lane * LaneSize) + i]; if (M < 0) continue; - // Determine which of the 4 possible input lanes (2 from each source) + // Determine which of the possible input lanes (NumLanes from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. @@ -14250,54 +14770,30 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } -/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. -/// This allows for fast cases such as subvector extraction/insertion -/// or shuffling smaller vector types which can lower more efficiently. -static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - assert((VT.is256BitVector() || VT.is512BitVector()) && - "Expected 256-bit or 512-bit vector"); - - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); - - bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); - bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); - if (!UndefLower && !UndefUpper) - return SDValue(); - - // Upper half is undef and lower half is whole upper subvector. - // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - if (UndefUpper && - isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { - SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getIntPtrConstant(HalfNumElts, DL)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, - DAG.getIntPtrConstant(0, DL)); - } - - // Lower half is undef and upper half is whole lower subvector. - // e.g. vector_shuffle or - if (UndefLower && - isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { - SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, - DAG.getIntPtrConstant(HalfNumElts, DL)); - } +/// If the input shuffle mask results in a vector that is undefined in all upper +/// or lower half elements and that mask accesses only 2 halves of the +/// shuffle's operands, return true. A mask of half the width with mask indexes +/// adjusted to access the extracted halves of the original shuffle operands is +/// returned in HalfMask. 
HalfIdx1 and HalfIdx2 return whether the upper or +/// lower half of each input operand is accessed. +static bool +getHalfShuffleMask(ArrayRef Mask, MutableArrayRef HalfMask, + int &HalfIdx1, int &HalfIdx2) { + assert((Mask.size() == HalfMask.size() * 2) && + "Expected input mask to be twice as long as output"); + + // Exactly one half of the result must be undef to allow narrowing. + bool UndefLower = isUndefLowerHalf(Mask); + bool UndefUpper = isUndefUpperHalf(Mask); + if (UndefLower == UndefUpper) + return false; - // If the shuffle only uses two of the four halves of the input operands, - // then extract them and perform the 'half' shuffle at half width. - // e.g. vector_shuffle or - int HalfIdx1 = -1, HalfIdx2 = -1; - SmallVector HalfMask(HalfNumElts); - unsigned Offset = UndefLower ? HalfNumElts : 0; + unsigned HalfNumElts = HalfMask.size(); + unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0; + HalfIdx1 = -1; + HalfIdx2 = -1; for (unsigned i = 0; i != HalfNumElts; ++i) { - int M = Mask[i + Offset]; + int M = Mask[i + MaskIndexOffset]; if (M < 0) { HalfMask[i] = M; continue; @@ -14324,42 +14820,27 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, } // Too many half vectors referenced. - return SDValue(); + return false; } - assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); - // Only shuffle the halves of the inputs when useful. - int NumLowerHalves = - (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); - int NumUpperHalves = - (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); - - // uuuuXXXX - don't extract uppers just to insert again. - if (UndefLower && NumUpperHalves != 0) - return SDValue(); - - // XXXXuuuu - don't extract both uppers, instead shuffle and then extract. - if (UndefUpper && NumUpperHalves == 2) - return SDValue(); + return true; +} - // AVX2 - XXXXuuuu - always extract lowers. - if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) { - // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. - if (VT == MVT::v4f64 || VT == MVT::v4i64) - return SDValue(); - // AVX2 supports variable 32-bit element cross-lane shuffles. - if (VT == MVT::v8f32 || VT == MVT::v8i32) { - // XXXXuuuu - don't extract lowers and uppers. - if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0) - return SDValue(); - } - } +/// Given the output values from getHalfShuffleMask(), create a half width +/// shuffle of extracted vectors followed by an insert back to full width. +static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, + ArrayRef HalfMask, int HalfIdx1, + int HalfIdx2, bool UndefLower, + SelectionDAG &DAG) { + assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); + assert(V1.getValueType().isSimple() && "Expecting only simple types"); - // AVX512 - XXXXuuuu - always extract lowers. - if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0)) - return SDValue(); + MVT VT = V1.getSimpleValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); - auto GetHalfVector = [&](int HalfIdx) { + auto getHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); SDValue V = (HalfIdx < 2 ? 
V1 : V2); @@ -14368,13 +14849,126 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, DAG.getIntPtrConstant(HalfIdx, DL)); }; - SDValue Half1 = GetHalfVector(HalfIdx1); - SDValue Half2 = GetHalfVector(HalfIdx2); + // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset + SDValue Half1 = getHalfVector(HalfIdx1); + SDValue Half2 = getHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + unsigned Offset = UndefLower ? HalfNumElts : 0; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); } +/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. +static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((VT.is256BitVector() || VT.is512BitVector()) && + "Expected 256-bit or 512-bit vector"); + + bool UndefLower = isUndefLowerHalf(Mask); + if (!UndefLower && !isUndefUpperHalf(Mask)) + return SDValue(); + + assert((!UndefLower || !isUndefUpperHalf(Mask)) && + "Completely undef shuffle mask should have been simplified already"); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + if (!UndefLower && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. vector_shuffle or + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + int HalfIdx1, HalfIdx2; + SmallVector HalfMask(HalfNumElts); + if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2)) + return SDValue(); + + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + // Only shuffle the halves of the inputs when useful. + unsigned NumLowerHalves = + (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); + unsigned NumUpperHalves = + (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); + assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed"); + + // Determine the larger pattern of undef/halves, then decide if it's worth + // splitting the shuffle based on subtarget capabilities and types. + unsigned EltWidth = VT.getVectorElementType().getSizeInBits(); + if (!UndefLower) { + // XXXXuuuu: no insert is needed. + // Always extract lowers when setting lower - these are all free subreg ops. + if (NumUpperHalves == 0) + return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, + UndefLower, DAG); + + if (NumUpperHalves == 1) { + // AVX2 has efficient 32/64-bit element cross-lane shuffles. 
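The refactor above splits the old monolithic routine into a matcher (getHalfShuffleMask) and a builder (getShuffleHalfVectors), leaving only the profitability heuristics inline. A standalone restatement of the matcher and a condensation of the cost logic; the subtarget vetoes (AVX2/AVX512 cross-lane cases) are omitted here:

#include <vector>

// The four operand halves are numbered 0 = V1lo, 1 = V1hi, 2 = V2lo,
// 3 = V2hi. Succeeds if the live half of the result reads at most two
// of them, rewriting the mask in terms of those two.
static bool halfShuffleMask(const std::vector<int> &Mask, bool UndefLower,
                            std::vector<int> &HalfMask, int &HalfIdx1,
                            int &HalfIdx2) {
  int HalfN = (int)Mask.size() / 2;
  int Offset = UndefLower ? HalfN : 0; // where the live half sits
  HalfMask.assign(HalfN, -1);
  HalfIdx1 = HalfIdx2 = -1;
  for (int i = 0; i < HalfN; ++i) {
    int M = Mask[i + Offset];
    if (M < 0)
      continue;
    int HalfIdx = M / HalfN, HalfElt = M % HalfN;
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfIdx1 = HalfIdx;
      HalfMask[i] = HalfElt;           // first source: [0, HalfN)
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfIdx2 = HalfIdx;
      HalfMask[i] = HalfN + HalfElt;   // second source: [HalfN, 2*HalfN)
    } else {
      return false;                    // more than two halves referenced
    }
  }
  return true;
}

// Cost intuition: extracting a lower half is a free subregister read,
// extracting an upper half costs an instruction, and inserting into the
// upper half costs another.
static bool narrowingLooksProfitable(bool UndefLower, int HalfIdx1,
                                     int HalfIdx2) {
  int UpperHalves = (HalfIdx1 == 1 || HalfIdx1 == 3) +
                    (HalfIdx2 == 1 || HalfIdx2 == 3);
  if (!UndefLower)           // result lands in the low half: free insert
    return UpperHalves <= 1; // two upper extracts never pay off
  return UpperHalves == 0;   // high-half insert: only free extracts
}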
+ if (Subtarget.hasAVX2()) { + // extract128 + vunpckhps/vshufps, is better than vblend + vpermps. + if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() && + !is128BitUnpackShuffleMask(HalfMask) && + (!isSingleSHUFPSMask(HalfMask) || + Subtarget.hasFastVariableShuffle())) + return SDValue(); + // If this is a unary shuffle (assume that the 2nd operand is + // canonicalized to undef), then we can use vpermpd. Otherwise, we + // are better off extracting the upper half of 1 operand and using a + // narrow shuffle. + if (EltWidth == 64 && V2.isUndef()) + return SDValue(); + } + // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. + if (Subtarget.hasAVX512() && VT.is512BitVector()) + return SDValue(); + // Extract + narrow shuffle is better than the wide alternative. + return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, + UndefLower, DAG); + } + + // Don't extract both uppers, instead shuffle and then extract. + assert(NumUpperHalves == 2 && "Half vector count went wrong"); + return SDValue(); + } + + // UndefLower - uuuuXXXX: an insert to high half is required if we split this. + if (NumUpperHalves == 0) { + // AVX2 has efficient 64-bit element cross-lane shuffles. + // TODO: Refine to account for unary shuffle, splat, and other masks? + if (Subtarget.hasAVX2() && EltWidth == 64) + return SDValue(); + // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. + if (Subtarget.hasAVX512() && VT.is512BitVector()) + return SDValue(); + // Narrow shuffle + insert is better than the wide alternative. + return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, + UndefLower, DAG); + } + + // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert. + return SDValue(); +} + /// Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// @@ -14560,9 +15154,8 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( SubLaneMask); } -static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, - unsigned &ShuffleImm, - ArrayRef Mask) { +static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, + unsigned &ShuffleImm, ArrayRef Mask) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && @@ -14597,14 +15190,14 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, return false; } -static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, + ArrayRef Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; - if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) + if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, @@ -14615,23 +15208,22 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. 
-static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. @@ -14659,29 +15251,33 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, return V; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( - DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget)) + if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2, + Mask, DAG, Subtarget)) return V; // Otherwise, fall back. - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, + Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = - lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; + // If we have one input in place, then we can permute the other input and + // blend the result. + if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); + // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( @@ -14694,52 +15290,51 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, // instruction so skip this pattern. if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) - return Result; + return V; // If we have VLX support, we can use VEXPAND. 
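The SHUFPD check used here matches a rigid shape: output element i comes from V1 when i is even and V2 when i is odd, picking either double within i's own 128-bit lane, with bit i of the immediate choosing which. A sketch of the matcher (the commuted and unary cases the real code also handles are omitted):

#include <vector>

// Returns the SHUFPD immediate for a fitting v2f64/v4f64/v8f64 mask,
// or -1 if the mask doesn't have the SHUFPD shape.
static int matchShufpdImm(const std::vector<int> &Mask) {
  int N = (int)Mask.size(), Imm = 0;
  for (int i = 0; i < N; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int Base = (i % 2 == 0) ? 0 : N; // even lanes from V1, odd from V2
    int LaneLo = i & ~1;             // first element of i's 128-bit lane
    if (M != Base + LaneLo && M != Base + LaneLo + 1)
      return -1;
    Imm |= (M % 2) << i;             // select within the lane
  }
  return Imm;
}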
if (Subtarget.hasVLX()) - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. -static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); - if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; if (V2.isUndef()) { @@ -14763,31 +15358,36 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, } // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; } // Try to use PALIGNR. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. 
- if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; + // If we have one input in place, then we can permute the other input and + // blend the result. + if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG); + // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( @@ -14800,35 +15400,34 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, // instruction so skip this pattern. if (!isShuffleMaskInputInPlace(0, Mask) && !isShuffleMaskInputInPlace(1, Mask)) - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. -static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more @@ -14849,13 +15448,12 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. - return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); + return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -14875,49 +15473,49 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. 
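The new early-out added for v4f64/v4i64 above fires when one operand is already "in place". That predicate is cheap to state: every element drawn from the given input must already sit at its final position, so only the other input needs a pre-shuffle before the blend. A sketch:

#include <vector>

// True if every element taken from input 0 or 1 (per the Input flag)
// is already at its destination index, i.e. Mask[i] == Input*N + i.
static bool inputInPlace(int Input, const std::vector<int> &Mask) {
  int N = (int)Mask.size(), Lo = Input * N;
  for (int i = 0; i < N; ++i)
    if (Mask[i] >= 0 && Mask[i] / N == Input && Mask[i] != Lo + i)
      return false;
  return true;
}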
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, + DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) - if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG); // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. -static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); @@ -14926,8 +15524,8 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split @@ -14935,17 +15533,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // vpunpcklwd and vpunpckhwd instrs. 
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) - if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more @@ -14961,30 +15559,29 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; } // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -15006,31 +15603,30 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); - SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, - CastV1, CastV2, DAG); + SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, + CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v8i32, ShufPS); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 16-lane 16-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. 
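Several of these 256/512-bit paths branch on whether the mask repeats per 128-bit lane, because a repeated mask unlocks the full 128-bit repertoire (SHUFPS, PSHUFD, immediate blends) applied lane-wise. A standalone sketch of that test, canonicalizing V2 elements to an offset of LaneSize as the real helper does:

#include <vector>

// True if Mask is the same LaneSize-element pattern in every 128-bit
// lane and never crosses lanes; Repeated receives the pattern, with V1
// elements in [0, LaneSize) and V2 elements offset by LaneSize.
static bool repeated128Mask(const std::vector<int> &Mask, int LaneSize,
                            std::vector<int> &Repeated) {
  int N = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < N; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if ((M % N) / LaneSize != i / LaneSize)
      return false; // crosses 128-bit lanes
    int Local = (M % LaneSize) + (M < N ? 0 : LaneSize);
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = Local;
    else if (Repeated[i % LaneSize] != Local)
      return false; // lanes disagree on the pattern
  }
  return true;
}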
-static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -15039,37 +15635,36 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -15082,12 +15677,12 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, // There are no generalized cross-lane shuffle operations available on i16 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) { - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, - Mask, DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, + DAG, Subtarget); } SmallVector RepeatedMask; @@ -15095,44 +15690,43 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. 
- return lowerV8I16GeneralSingleInputVectorShuffle( + return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } } - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512BWVL can lower to VPERMW. if (Subtarget.hasBWI() && Subtarget.hasVLX()) - return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. -static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); @@ -15141,37 +15735,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. 
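The PERMV fallback reached on AVX512BW/VBMI subtargets sidesteps all the pattern matching: the shuffle mask is materialized as a constant index vector and a single variable permute (VPERMW/VPERMB, or a two-source VPERMT2 form) performs the arbitrary, even cross-lane, rearrangement. A scalar model of the two-source form, assuming indices are already in range:

#include <vector>

// Two-source variable permute: index M selects V1[M] for M < N and
// V2[M - N] otherwise, with no restriction on lane crossing.
static std::vector<int> permv2(const std::vector<int> &Idx,
                               const std::vector<int> &V1,
                               const std::vector<int> &V2) {
  int N = (int)V1.size();
  std::vector<int> R(Idx.size());
  for (std::size_t i = 0; i < Idx.size(); ++i) {
    int M = Idx[i];
    R[i] = (M < N) ? V1[M] : V2[M - N]; // assumes 0 <= M < 2N
  }
  return R;
}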
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -15183,36 +15776,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // There are no generalized cross-lane shuffle operations available on i8 // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) { - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, + Subtarget); } - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512VBMIVL can lower to VPERMB. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) - return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG); } /// High-level routine to lower various 256-bit x86 vector shuffles. @@ -15220,24 +15813,23 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. -static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, + SDValue V1, SDValue V2, const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. 
int NumElts = VT.getVectorNumElements(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = - lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // There is a really nice hard cut-over between AVX1 and AVX2 that means we @@ -15251,12 +15843,12 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, if (ElementBits < 32) { // No floating point type available, if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors. - if (SDValue V = - lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) + if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; - if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), @@ -15268,17 +15860,17 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, switch (VT.SimpleTy) { case MVT::v4f64: - return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i64: - return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f32: - return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i32: - return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i16: - return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i8: - return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); @@ -15286,12 +15878,10 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, } /// Try to lower a vector shuffle as a 128-bit shuffles. -static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, - ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); @@ -15388,11 +15978,10 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, } /// Handle lowering of 8-lane 64-bit floating point shuffles. 
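For sub-32-bit element types without AVX2, the code above falls back to bit operations before splitting. The bit-blend only applies when no element moves: each output keeps its own position and merely chooses a source, making the shuffle a per-element select expressible as (V1 & C) | (V2 & ~C). A scalar sketch over 16-bit elements:

#include <cstdint>
#include <vector>

// Succeeds only when Mask[i] is undef, i, or i + N for every i; C[i]
// is all-ones when the element comes from V1. Undef picks V2, which is
// valid since undef elements may hold anything.
static bool bitBlend(const std::vector<int> &Mask,
                     const std::vector<uint16_t> &V1,
                     const std::vector<uint16_t> &V2,
                     std::vector<uint16_t> &Out) {
  int N = (int)V1.size();
  Out.assign(N, 0);
  for (int i = 0; i < N; ++i) {
    int M = Mask[i];
    if (M >= 0 && M != i && M != i + N)
      return false; // element moves; not a plain select
    uint16_t C = (M == i) ? 0xFFFF : 0x0000;
    Out[i] = (V1[i] & C) | (V2[i] & (uint16_t)~C);
  }
  return true;
}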
-static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); @@ -15419,37 +16008,33 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); } - if (SDValue Shuf128 = - lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2, - Subtarget, DAG)) + if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1, + V2, Subtarget, DAG)) return Shuf128; - if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) + if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = - lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, - V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit floating point shuffles. -static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -15471,16 +16056,15 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) - return Unpck; + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Otherwise, fall back to a SHUFPS sequence. 
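The repeated-mask paths above end in getV4X86ShuffleImm8ForMask. A sketch of just the bit packing it performs (LLVM's helper additionally wraps the value in a target constant; the undef policy here is a simplification): two bits per destination lane, lane i stored at bits [2*i+1 : 2*i].

#include <cassert>

static unsigned shuffleImm8ForV4Mask(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i] < 0 ? i : Mask[i]; // treat undef as "keep in place"
    assert(M >= 0 && M < 4 && "repeated mask entry out of range");
    Imm |= (unsigned)M << (2 * i);
  }
  return Imm;
}
// Example: {2, 3, 0, 1} packs to 0b01'00'11'10 == 0x4E, the classic
// SHUFPS/VPERMILPS half-swap immediate.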
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); + return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the @@ -15492,19 +16076,18 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, } // If we have AVX512F support, we can use VEXPAND. - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; - return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// Handle lowering of 8-lane 64-bit integer shuffles. -static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); @@ -15530,47 +16113,44 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } - if (SDValue Shuf128 = - lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable, - V1, V2, Subtarget, DAG)) + if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1, + V2, Subtarget, DAG)) return Shuf128; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to use PALIGNR. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) + if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; // If we have AVX512F support, we can use VEXPAND. - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, - V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit integer shuffles. 
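The VALIGN path above (lowerShuffleAsRotate) looks for masks that are a single element rotation. A standalone model of the single-source case — the LLVM matcher also handles rotations that span both operands:

#include <vector>

static int matchAsRotate(const std::vector<int> &Mask) {
  int N = Mask.size();
  int Rot = -1;
  for (int i = 0; i < N; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes match any rotation
    int Cand = ((Mask[i] - i) % N + N) % N; // normalize negatives
    if (Rot >= 0 && Rot != Cand)
      return -1; // two lanes disagree on the rotate amount
    Rot = Cand;
  }
  return Rot; // may still be -1 for an all-undef mask
}
// {1, 2, 3, 0} returns 1: every lane i reads element (i + 1) mod 4.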
-static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -15578,7 +16158,7 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; @@ -15595,25 +16175,24 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to use byte rotation instructions. if (Subtarget.hasBWI()) - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Assume that a single SHUFPS is faster than using a permv shuffle. @@ -15621,27 +16200,26 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); - SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, - CastV1, CastV2, DAG); + SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, + CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } // If we have AVX512F support, we can use VEXPAND. - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// Handle lowering of 32-lane 16-bit integer shuffles. 
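The lowerShuffleAsShift calls above match masks that are a whole-vector shift with zeros shifting in. A simplified standalone model of the left-shift direction only; matchShuffleAsShift in LLVM also checks right shifts, per-128-bit-lane shifts, and V2 as the source:

#include <vector>

static int matchAsLeftShift(const std::vector<int> &Mask,
                            const std::vector<bool> &Zeroable) {
  int N = Mask.size();
  for (int Amt = 1; Amt < N; ++Amt) {
    bool Match = true;
    for (int i = 0; i < N && Match; ++i) {
      if (i < Amt)
        Match = Zeroable[i]; // low lanes receive the shifted-in zeros
      else
        Match = Mask[i] < 0 || Mask[i] == i - Amt;
    }
    if (Match)
      return Amt;
  }
  return -1;
}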
-static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); @@ -15650,23 +16228,22 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask, + Subtarget, DAG)) return Rotate; if (V2.isUndef()) { @@ -15675,28 +16252,27 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. - return lowerV8I16GeneralSingleInputVectorShuffle( + return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); } } - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; - return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// Handle lowering of 64-lane 8-bit integer shuffles. 
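For reference, the instruction the lowerShuffleWithPSHUFB path targets has very simple semantics, which is what makes it such a strong fallback for byte/word shuffles. A reference model of one 128-bit lane (wider forms repeat this per 16-byte lane, which is why only in-lane masks qualify):

#include <cstdint>

static void pshufb128(const uint8_t Src[16], const uint8_t Ctl[16],
                      uint8_t Dst[16]) {
  for (int i = 0; i < 16; ++i)
    Dst[i] = (Ctl[i] & 0x80) ? 0 : Src[Ctl[i] & 0x0F];
}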
-static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); @@ -15705,37 +16281,36 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; // VBMI can use VPERMV/VPERMV3 byte shuffles. if (Subtarget.hasVBMI()) - return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. @@ -15743,12 +16318,19 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (!V2.isUndef()) + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( + DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) + return Result; + // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// High-level routine to lower various 512-bit x86 vector shuffles. 
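The zext fast path that opens lowerV64I8Shuffle (lowerShuffleAsZeroOrAnyExtend) looks for the PMOVZX shape. A simplified standalone model, assuming Zeroable already includes undef lanes as it does in the diff; the real matcher also tries several scales and an "any extend" variant that ignores the gap lanes:

#include <vector>

static bool matchZExtByScale(const std::vector<int> &Mask,
                             const std::vector<bool> &Zeroable, int Scale) {
  for (int i = 0, N = Mask.size(); i < N; ++i) {
    if (i % Scale == 0) {
      // Every Scale-th lane reads consecutive low source elements.
      if (Mask[i] >= 0 && Mask[i] != i / Scale)
        return false;
    } else if (!Zeroable[i]) {
      return false; // the lanes in between must be zero
    }
  }
  return true;
}
// Scale = 2 models a PMOVZXBW of the low bytes, viewed in the byte domain.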
@@ -15756,11 +16338,11 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// This routine either breaks down the specific type of a 512-bit x86 vector
 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
 /// together based on the available instructions.
-static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                        MVT VT, SDValue V1, SDValue V2,
-                                        const APInt &Zeroable,
-                                        const X86Subtarget &Subtarget,
-                                        SelectionDAG &DAG) {
+static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                  MVT VT, SDValue V1, SDValue V2,
+                                  const APInt &Zeroable,
+                                  const X86Subtarget &Subtarget,
+                                  SelectionDAG &DAG) {
   assert(Subtarget.hasAVX512() &&
          "Cannot lower 512-bit vectors w/ basic ISA!");
 
@@ -15770,18 +16352,18 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
   if (NumV2Elements == 1 && Mask[0] >= NumElts)
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+    if (SDValue Insertion = lowerShuffleAsElementInsertion(
             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
       return Insertion;
 
   // Handle special cases where the lower or upper half is UNDEF.
   if (SDValue V =
-          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
     return V;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast =
-          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+                                                  Subtarget, DAG))
     return Broadcast;
 
   // Dispatch to each element type for lowering. If we don't have support for
@@ -15790,17 +16372,17 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // the requisite ISA extensions for that element type are available.
   switch (VT.SimpleTy) {
   case MVT::v8f64:
-    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+    return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16f32:
-    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+    return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v8i64:
-    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+    return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16i32:
-    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+    return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v32i16:
-    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+    return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v64i8:
-    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+    return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
 
   default:
     llvm_unreachable("Not a valid 512-bit x86 vector type!");
@@ -15809,7 +16391,7 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 // Determine if this shuffle can be implemented with a KSHIFT instruction.
 // Returns the shift amount if possible or -1 if not. This is a simplified
-// version of matchVectorShuffleAsShift.
+// version of matchShuffleAsShift.
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef Mask, int MaskOffset, const APInt &Zeroable) { int Size = Mask.size(); @@ -15844,11 +16426,11 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef Mask, // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. -static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, + MVT VT, SDValue V1, SDValue V2, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); @@ -16037,15 +16619,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. - if (V2IsUndef) - for (int M : Mask) - if (M >= NumElements) { - SmallVector NewMask(Mask.begin(), Mask.end()); - for (int &M : NewMask) - if (M >= NumElements) - M = -1; - return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); - } + if (V2IsUndef && + any_of(Mask, [NumElements](int M) { return M >= NumElements; })) { + SmallVector NewMask(Mask.begin(), Mask.end()); + for (int &M : NewMask) + if (M >= NumElements) + M = -1; + return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); + } // Check for illegal shuffle mask element index values. int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; @@ -16083,8 +16664,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. - if (SDValue Broadcast = - lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; MVT NewEltVT = VT.isFloatingPoint() @@ -16122,26 +16703,21 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, if (canonicalizeShuffleMaskWithCommute(Mask)) return DAG.getCommutedVectorShuffle(*SVOp); - if (SDValue V = - lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) return V; // For each vector width, delegate to a specialized lowering routine. 
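A standalone model of what the match1BitShuffleAsKSHIFT routine above decides, shown for the KSHIFTR direction only (the MaskOffset parameter for two-operand masks and the KSHIFTL mirror image are omitted): lane i must read element i + Amt, and the lanes that fall off the top must be zeroable.

#include <vector>

static int matchAsKshiftr(const std::vector<int> &Mask,
                          const std::vector<bool> &Zeroable) {
  int N = Mask.size();
  for (int Amt = 1; Amt < N; ++Amt) {
    bool Match = true;
    for (int i = 0; i < N && Match; ++i) {
      if (i + Amt < N)
        Match = Mask[i] < 0 || Mask[i] == i + Amt;
      else
        Match = Zeroable[i]; // high lanes receive the shifted-in zeros
    }
    if (Match)
      return Amt;
  }
  return -1;
}

This matters because AVX-512 has no general mask-register shuffle: anything KSHIFT cannot express has to round-trip through a SIMD vector, as the comment above the 1-bit lowering explains.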
if (VT.is128BitVector()) - return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is256BitVector()) - return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is512BitVector()) - return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (Is1BitVector) - return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } @@ -16401,7 +16977,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // this can be done with a mask. IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - DAG.getConstant(IdxVal, dl, MVT::i32)); + DAG.getIntPtrConstant(IdxVal, dl)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); @@ -16527,10 +17103,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); - if (!isa(N2)) + + auto *N2C = dyn_cast(N2); + if (!N2C || N2C->getAPIntValue().uge(NumElts)) return SDValue(); - auto *N2C = cast(N2); - unsigned IdxVal = N2C->getZExtValue(); + uint64_t IdxVal = N2C->getZExtValue(); bool IsZeroElt = X86::isZeroNode(N1); bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); @@ -16575,13 +17152,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, - DAG.getConstant(IdxIn128, dl, MVT::i32)); + DAG.getIntPtrConstant(IdxIn128, dl)); // Insert the changed part back into the bigger vector return insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); + // This will be just movd/movq/movss/movsd. + if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) && + (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || + EltVT == MVT::i64)) { + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); + } + // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. SSE41 required for pinsrb. if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { @@ -16613,7 +17198,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. - bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize(); + bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather @@ -16663,7 +17248,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, // Insert the 128-bit vector. 
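The "IdxVal &= ElemsPerChunk - 1" trick in the extract/insert lowering above is ordinary power-of-two index arithmetic: pick the 128-bit chunk, then the lane within it. A worked example:

#include <cstdio>

int main() {
  unsigned NumEltBits = 32;
  unsigned ElemsPerChunk = 128 / NumEltBits;    // 4 elements per 128-bit chunk
  unsigned IdxVal = 6;                          // element 6 of a v8i32
  unsigned Chunk = IdxVal / ElemsPerChunk;      // chunk 1 (the upper half)
  unsigned Lane = IdxVal & (ElemsPerChunk - 1); // lane 2 within that chunk
  std::printf("chunk %u, lane %u\n", Chunk, Lane); // prints "chunk 1, lane 2"
}

Because ElemsPerChunk is always a power of two here, the AND is exactly the modulo, and the comment in the diff ("this can be done with a mask") is referring to precisely this.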
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } - assert(OpVT.is128BitVector() && "Expected an SSE type!"); + assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 && + "Expected an SSE type!"); // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. if (OpVT == MVT::v4i32) @@ -16789,35 +17375,9 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue -X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { - const char *Sym = cast(Op)->getSymbol(); - - // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the - // global base reg. - const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); - unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); - - auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); - - SDLoc DL(Op); - Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); - - // With PIC, the address is actually $g + Offset. - if (OpFlag) { - Result = - DAG.getNode(ISD::ADD, DL, PtrVT, - DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); - } - - // For symbols that require a load from a stub to get the address, emit the - // load. - if (isGlobalStubReference(OpFlag)) - Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(DAG.getMachineFunction())); - - return Result; +SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } SDValue @@ -16841,35 +17401,67 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, - const SDLoc &dl, int64_t Offset, - SelectionDAG &DAG) const { - // Create the TargetGlobalAddress node, folding in the constant - // offset if it is legal. - unsigned char OpFlags = Subtarget.classifyGlobalReference(GV); +/// Creates target global address or external symbol nodes for calls or +/// other uses. +SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, + bool ForCall) const { + // Unpack the global address or external symbol. + const SDLoc &dl = SDLoc(Op); + const GlobalValue *GV = nullptr; + int64_t Offset = 0; + const char *ExternalSym = nullptr; + if (const auto *G = dyn_cast(Op)) { + GV = G->getGlobal(); + Offset = G->getOffset(); + } else { + const auto *ES = cast(Op); + ExternalSym = ES->getSymbol(); + } + + // Calculate some flags for address lowering. + const Module &Mod = *DAG.getMachineFunction().getFunction().getParent(); + unsigned char OpFlags; + if (ForCall) + OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod); + else + OpFlags = Subtarget.classifyGlobalReference(GV, Mod); + bool HasPICReg = isGlobalRelativeToPICBase(OpFlags); + bool NeedsLoad = isGlobalStubReference(OpFlags); + CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; - if (OpFlags == X86II::MO_NO_FLAG && - X86::isOffsetSuitableForCodeModel(Offset, M)) { - // A direct static reference to a global. - Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); - Offset = 0; + + if (GV) { + // Create a target global address if this is a global. If possible, fold the + // offset into the global address reference. Otherwise, ADD it on later. 
+ int64_t GlobalOffset = 0; + if (OpFlags == X86II::MO_NO_FLAG && + X86::isOffsetSuitableForCodeModel(Offset, M)) { + std::swap(GlobalOffset, Offset); + } + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags); } else { - Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); + // If this is not a global address, this must be an external symbol. + Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags); } + // If this is a direct call, avoid the wrapper if we don't need to do any + // loads or adds. This allows SDAG ISel to match direct calls. + if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0) + return Result; + Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. - if (isGlobalRelativeToPICBase(OpFlags)) { + if (HasPICReg) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. - if (isGlobalStubReference(OpFlags)) + if (NeedsLoad) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); @@ -16884,9 +17476,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - const GlobalValue *GV = cast(Op)->getGlobal(); - int64_t Offset = cast(Op)->getOffset(); - return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); + return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } static SDValue @@ -17112,9 +17702,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } - if (Subtarget.isTargetKnownWindowsMSVC() || - Subtarget.isTargetWindowsItanium() || - Subtarget.isTargetWindowsGNU()) { + if (Subtarget.isOSWindows()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage @@ -17254,7 +17842,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, APInt APIntShiftAmt; if (isConstantSplat(Amt, APIntShiftAmt)) { - uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); + uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits()); return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8)); } @@ -17267,7 +17855,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); @@ -17311,6 +17899,70 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, DAG.getIntPtrConstant(0, dl)); } +static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, + const X86Subtarget &Subtarget) { + switch (Opcode) { + case ISD::SINT_TO_FP: + // TODO: Handle wider types with AVX/AVX512. + if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32) + return false; + // CVTDQ2PS or (V)CVTDQ2PD + return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64); + + case ISD::UINT_TO_FP: + // TODO: Handle wider types and i64 elements. 
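On the funnel-shift change above: ISD::FSHL/FSHR define the shift amount modulo the bit width, so the splat constant must be reduced with APInt::urem before being encoded as a VSHLD/VSHRD immediate. A reference model of the 32-bit semantics:

#include <cstdint>

static uint32_t fshl32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  Amt %= 32;
  if (Amt == 0)
    return Hi;                          // avoid the UB of Lo >> 32
  return (Hi << Amt) | (Lo >> (32 - Amt));
}
// fshl32(x, y, 40) must equal fshl32(x, y, 8); taking the raw 40 with
// getZExtValue, as the old code did, would encode an out-of-range shift.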
+ if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32) + return false; + // VCVTUDQ2PS or VCVTUDQ2PD + return ToVT == MVT::v4f32 || ToVT == MVT::v4f64; + + default: + return false; + } +} + +/// Given a scalar cast operation that is extracted from a vector, try to +/// vectorize the cast op followed by extraction. This will avoid an expensive +/// round-trip between XMM and GPR. +static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // TODO: This could be enhanced to handle smaller integer types by peeking + // through an extend. + SDValue Extract = Cast.getOperand(0); + MVT DestVT = Cast.getSimpleValueType(); + if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa(Extract.getOperand(1))) + return SDValue(); + + // See if we have a 128-bit vector cast op for this type of cast. + SDValue VecOp = Extract.getOperand(0); + MVT FromVT = VecOp.getSimpleValueType(); + unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits(); + MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM); + MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM); + if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget)) + return SDValue(); + + // If we are extracting from a non-zero element, first shuffle the source + // vector to allow extracting from element zero. + SDLoc DL(Cast); + if (!isNullConstant(Extract.getOperand(1))) { + SmallVector Mask(FromVT.getVectorNumElements(), -1); + Mask[0] = Extract.getConstantOperandVal(1); + VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask); + } + // If the source vector is wider than 128-bits, extract the low part. Do not + // create an unnecessarily wide vector cast op. + if (FromVT != Vec128VT) + VecOp = extract128BitVector(VecOp, 0, DAG, DL); + + // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0 + // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0 + SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast, + DAG.getIntPtrConstant(0, DL)); +} + SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); @@ -17318,6 +17970,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) + return Extract; + if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, @@ -17371,23 +18026,23 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); - unsigned ByteSize = SrcVT.getSizeInBits()/8; + unsigned ByteSize = SrcVT.getSizeInBits() / 8; FrameIndexSDNode *FI = dyn_cast(StackSlot); - MachineMemOperand *MMO; + MachineMemOperand *LoadMMO; if (FI) { int SSFI = FI->getIndex(); - MMO = DAG.getMachineFunction().getMachineMemOperand( + LoadMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { - MMO = cast(StackSlot)->getMemOperand(); + LoadMMO = cast(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } - SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; - SDValue Result = DAG.getMemIntrinsicNode(useSSE ? 
X86ISD::FILD_FLAG : - X86ISD::FILD, DL, - Tys, Ops, SrcVT, MMO); + SDValue FILDOps[] = {Chain, StackSlot}; + SDValue Result = + DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, + Tys, FILDOps, SrcVT, LoadMMO); if (useSSE) { Chain = Result.getValue(1); @@ -17397,20 +18052,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); - unsigned SSFISize = Op.getValueSizeInBits()/8; + unsigned SSFISize = Op.getValueSizeInBits() / 8; int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); - SDValue Ops[] = { - Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag - }; - MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag}; + MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); - Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, - Ops, Op.getValueType(), MMO); + Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, + Op.getValueType(), StoreMMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); @@ -17545,7 +18198,7 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); // Two to the power of half-word-size. - SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64); + SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64); // Clear upper part of LO, lower HI. SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); @@ -17680,6 +18333,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) + return Extract; + MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); @@ -17732,7 +18388,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); - SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; + SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); @@ -17768,16 +18424,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), -// just return an pair. +// just return an SDValue(). // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 -// to i16, i32 or i64, and we lower it to a legal sequence. -// If lowered to the final integer result we return a pair. -// Otherwise we lower it to a sequence ending with a FIST, return a -// pair, and the caller is responsible for loading -// the final integer result from StackSlot. -std::pair +// to i16, i32 or i64, and we lower it to a legal sequence and return the +// result. 
+SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool IsSigned, bool IsReplace) const { + bool IsSigned) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); @@ -17787,18 +18440,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { // f16 must be promoted before using the lowering in this routine. // fp128 does not use this lowering. - return std::make_pair(SDValue(), SDValue()); + return SDValue(); } // If using FIST to compute an unsigned i64, we'll need some fixup // to handle values above the maximum signed i64. A FIST is always // used for the 32-bit subtarget, but also for f80 on a 64-bit target. - bool UnsignedFixup = !IsSigned && - DstTy == MVT::i64 && - (!Subtarget.is64Bit() || - !isScalarFPTypeInSSEReg(TheVT)); + bool UnsignedFixup = !IsSigned && DstTy == MVT::i64; - if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) { + if (!IsSigned && DstTy != MVT::i64) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); @@ -17809,30 +18459,13 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); - // These are really Legal. - if (DstTy == MVT::i32 && - isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) - return std::make_pair(SDValue(), SDValue()); - if (Subtarget.is64Bit() && - DstTy == MVT::i64 && - isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) - return std::make_pair(SDValue(), SDValue()); - // We lower FP->int64 into FISTP64 followed by a load from a temporary // stack slot. MachineFunction &MF = DAG.getMachineFunction(); - unsigned MemSize = DstTy.getSizeInBits()/8; + unsigned MemSize = DstTy.getStoreSize(); int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - unsigned Opc; - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } - SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. @@ -17874,9 +18507,10 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); - Adjust = DAG.getSelect(DL, MVT::i32, Cmp, - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0x80000000, DL, MVT::i32)); + Adjust = DAG.getSelect(DL, MVT::i64, Cmp, + DAG.getConstant(0, DL, MVT::i64), + DAG.getConstant(APInt::getSignMask(64), + DL, MVT::i64)); SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), @@ -17884,81 +18518,52 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); } + MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); + // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. 
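A scalar model of the UnsignedFixup sequence built above: FIST only performs signed conversions, so inputs at or above 2^63 are rebased below the signed range first and the sign bit is XOR'ed back into the loaded result, which is exactly the role of the Adjust value (APInt::getSignMask(64)) in the diff. Out-of-range and NaN inputs are ignored here, just as they are left to the hardware in the real path.

#include <cstdint>

static uint64_t fpToUint64ViaSigned(double X) {
  const double Thresh = 9223372036854775808.0; // 2^63, exact in double
  uint64_t Adjust = 0;
  if (X >= Thresh) {
    X -= Thresh;           // now representable as a signed i64
    Adjust = 1ULL << 63;   // re-set the sign bit afterwards
  }
  return (uint64_t)(int64_t)X ^ Adjust;
}

Keeping Adjust as a single i64 XOR is also what lets the rewritten code delete the old split load of two i32 halves.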
if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); - Chain = DAG.getStore(Chain, DL, Value, StackSlot, - MachinePointerInfo::getFixedStack(MF, SSFI)); - SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); - SDValue Ops[] = { - Chain, StackSlot, DAG.getValueType(TheVT) - }; - - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), - MachineMemOperand::MOLoad, MemSize, MemSize); - Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); + Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI); + SDVTList Tys = DAG.getVTList(TheVT, MVT::Other); + SDValue Ops[] = { Chain, StackSlot }; + + unsigned FLDSize = TheVT.getStoreSize(); + assert(FLDSize <= MemSize && "Stack slot not big enough"); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO); Chain = Value.getValue(1); - SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); - StackSlot = DAG.getFrameIndex(SSFI, PtrVT); } - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), - MachineMemOperand::MOStore, MemSize, MemSize); - - if (UnsignedFixup) { - - // Insert the FIST, load its result as two i32's, - // and XOR the high i32 with Adjust. + // Build the FP_TO_INT*_IN_MEM + MachineMemOperand *MMO = MF.getMachineMemOperand( + MPI, MachineMemOperand::MOStore, MemSize, MemSize); + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL, + DAG.getVTList(MVT::Other), + Ops, DstTy, MMO); - SDValue FistOps[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - FistOps, DstTy, MMO); + SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI); - SDValue Low32 = - DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo()); - SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL); + // If we need an unsigned fixup, XOR the result with adjust. + if (UnsignedFixup) + Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust); - SDValue High32 = - DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo()); - High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); - - if (Subtarget.is64Bit()) { - // Join High32 and Low32 into a 64-bit result. - // (High32 << 32) | Low32 - Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); - High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); - High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, - DAG.getConstant(32, DL, MVT::i8)); - SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); - return std::make_pair(Result, SDValue()); - } - - SDValue ResultOps[] = { Low32, High32 }; - - SDValue pair = IsReplace - ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) - : DAG.getMergeValues(ResultOps, DL); - return std::make_pair(pair, SDValue()); - } else { - // Build the FP_TO_INT*_IN_MEM - SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, DstTy, MMO); - return std::make_pair(FIST, StackSlot); - } + return Res; } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - MVT VT = Op->getSimpleValueType(0); - SDValue In = Op->getOperand(0); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); + unsigned Opc = Op.getOpcode(); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); + assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) && + "Unexpected extension opcode"); assert(VT.getVectorNumElements() == VT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || @@ -17970,6 +18575,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); + unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc); + // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) @@ -17977,8 +18584,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); - // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input. - return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In); + return DAG.getNode(ExtendInVecOpc, dl, VT, In); } if (Subtarget.hasInt256()) @@ -18000,11 +18606,17 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); - SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In); + SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In); + + // Short-circuit if we can determine that each 128-bit half is the same value. + // Otherwise, this is difficult to match and optimize. + if (auto *Shuf = dyn_cast(In)) + if (hasIdenticalHalvesShuffleMask(Shuf->getMask())) + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo); SDValue ZeroVec = DAG.getConstant(0, dl, InVT); SDValue Undef = DAG.getUNDEF(InVT); - bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; + bool NeedZero = Opc == ISD::ZERO_EXTEND; SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); OpHi = DAG.getBitcast(HalfVT, OpHi); @@ -18179,8 +18791,11 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). - Res = DAG.getBitcast(MVT::v4i64, Res); - Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); + // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. 
+ SmallVector Mask; + int Scale = 64 / OutVT.getScalarSizeInBits(); + scaleShuffleMask(Scale, ArrayRef({ 0, 2, 1, 3 }), Mask); + Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); if (DstVT.is256BitVector()) return DAG.getBitcast(DstVT, Res); @@ -18422,12 +19037,12 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; MVT VT = Op.getSimpleValueType(); + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + SDLoc dl(Op); if (VT.isVector()) { - SDValue Src = Op.getOperand(0); - SDLoc dl(Op); - - if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) { + if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; @@ -18447,7 +19062,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { } assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); - if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { + if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); @@ -18458,19 +19073,34 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { assert(!VT.isVector()); - std::pair Vals = FP_TO_INTHelper(Op, DAG, - IsSigned, /*IsReplace=*/ false); - SDValue FIST = Vals.first, StackSlot = Vals.second; - // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (!FIST.getNode()) + bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); + + if (!IsSigned && Subtarget.hasAVX512()) { + // Conversions from f32/f64 should be legal. + if (UseSSEReg) + return Op; + + // Use default expansion. + if (VT == MVT::i64) + return SDValue(); + } + + // Promote i16 to i32 if we can use a SSE operation. + if (VT == MVT::i16 && UseSSEReg) { + assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"); + SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + // If this is a SINT_TO_FP using SSEReg we're done. + if (UseSSEReg && IsSigned) return Op; - if (StackSlot.getNode()) - // Load the result. - return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo()); + // Fall back to X87. + if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned)) + return V; - // The node is the result. - return FIST; + llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { @@ -18491,7 +19121,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { /// implementation, and likely shuffle complexity of the alternate sequence. 
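The scaleShuffleMask call at the top of this hunk expands a coarse mask over wide elements into an equivalent mask over elements Scale times narrower, so the v4i64 mask {0, 2, 1, 3} with Scale = 2 becomes the v8i32 mask {0,1, 4,5, 2,3, 6,7}. A standalone model of that expansion (undef wide entries expand to undef narrow entries):

#include <vector>

static std::vector<int> scaleMask(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  Out.reserve(Mask.size() * Scale);
  for (int M : Mask)
    for (int i = 0; i < Scale; ++i)
      Out.push_back(M < 0 ? -1 : M * Scale + i);
  return Out;
}

Staying in OutVT instead of bitcasting to v4i64 is what lets ComputeNumSignBits keep seeing through the PACK sequence, per the comment in the diff.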
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize(); + bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } @@ -18513,16 +19143,11 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, if (!IsFP && !Subtarget.hasSSSE3()) return Op; - // Defer forming the minimal horizontal op if the vector source has more than - // the 2 extract element uses that we're matching here. In that case, we might - // form a horizontal op that includes more than 1 add/sub op. + // Extract from a common vector. if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getOperand(0) != RHS.getOperand(0) || - !LHS.getOperand(0)->hasNUsesOfValue(2, 0)) - return Op; - - if (!isa(LHS.getOperand(1)) || + !isa(LHS.getOperand(1)) || !isa(RHS.getOperand(1)) || !shouldUseHorizontalOp(true, DAG, Subtarget)) return Op; @@ -18540,33 +19165,37 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, } unsigned LExtIndex = LHS.getConstantOperandVal(1); unsigned RExtIndex = RHS.getConstantOperandVal(1); - if (LExtIndex == 1 && RExtIndex == 0 && + if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 && (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD)) std::swap(LExtIndex, RExtIndex); - // TODO: This can be extended to handle other adjacent extract pairs. - if (LExtIndex != 0 || RExtIndex != 1) + if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1)) return Op; SDValue X = LHS.getOperand(0); EVT VecVT = X.getValueType(); unsigned BitWidth = VecVT.getSizeInBits(); + unsigned NumLanes = BitWidth / 128; + unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes; assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && "Not expecting illegal vector widths here"); // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit - // equivalent, so extract the 256/512-bit source op to 128-bit. - // This is free: ymm/zmm -> xmm. + // equivalent, so extract the 256/512-bit source op to 128-bit if we can. SDLoc DL(Op); - if (BitWidth == 256 || BitWidth == 512) - X = extract128BitVector(X, 0, DAG, DL); + if (BitWidth == 256 || BitWidth == 512) { + unsigned LaneIdx = LExtIndex / NumEltsPerLane; + X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL); + LExtIndex %= NumEltsPerLane; + } // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0 + // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp, - DAG.getIntPtrConstant(0, DL)); + DAG.getIntPtrConstant(LExtIndex / 2, DL)); } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -18732,36 +19361,25 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); } -// Check whether an OR'd tree is PTEST-able. 
-static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, - const X86Subtarget &Subtarget, - SelectionDAG &DAG, - SDValue &X86CC) { - assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); - - if (!Subtarget.hasSSE41()) - return SDValue(); - - if (!Op->hasOneUse()) - return SDValue(); - - SDNode *N = Op.getNode(); - SDLoc DL(N); - +/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) +/// style scalarized (associative) reduction patterns. +static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp, + SmallVectorImpl &SrcOps) { SmallVector Opnds; - DenseMap VecInMap; - SmallVector VecIns; + DenseMap SrcOpMap; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. - Opnds.push_back(N->getOperand(0)); - Opnds.push_back(N->getOperand(1)); + assert(Op.getOpcode() == unsigned(BinOp) && + "Unexpected bit reduction opcode"); + Opnds.push_back(Op.getOperand(0)); + Opnds.push_back(Op.getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl::const_iterator I = Opnds.begin() + Slot; - // BFS traverse all OR'd operands. - if (I->getOpcode() == ISD::OR) { + // BFS traverse all BinOp operands. + if (I->getOpcode() == unsigned(BinOp)) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. @@ -18771,42 +19389,63 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); + return false; // Quit if without a constant index. SDValue Idx = I->getOperand(1); if (!isa(Idx)) - return SDValue(); + return false; - SDValue ExtractedFromVec = I->getOperand(0); - DenseMap::iterator M = VecInMap.find(ExtractedFromVec); - if (M == VecInMap.end()) { - VT = ExtractedFromVec.getValueType(); - // Quit if not 128/256-bit vector. - if (!VT.is128BitVector() && !VT.is256BitVector()) - return SDValue(); + SDValue Src = I->getOperand(0); + DenseMap::iterator M = SrcOpMap.find(Src); + if (M == SrcOpMap.end()) { + VT = Src.getValueType(); // Quit if not the same type. - if (VecInMap.begin() != VecInMap.end() && - VT != VecInMap.begin()->first.getValueType()) - return SDValue(); - M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; - VecIns.push_back(ExtractedFromVec); + if (SrcOpMap.begin() != SrcOpMap.end() && + VT != SrcOpMap.begin()->first.getValueType()) + return false; + unsigned NumElts = VT.getVectorNumElements(); + APInt EltCount = APInt::getNullValue(NumElts); + M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first; + SrcOps.push_back(Src); } - M->second |= 1U << cast(Idx)->getZExtValue(); + // Quit if element already used. + unsigned CIdx = cast(Idx)->getZExtValue(); + if (M->second[CIdx]) + return false; + M->second.setBit(CIdx); } - assert((VT.is128BitVector() || VT.is256BitVector()) && - "Not extracted from 128-/256-bit vector."); + // Quit if not all elements are used. + for (DenseMap::const_iterator I = SrcOpMap.begin(), + E = SrcOpMap.end(); + I != E; ++I) { + if (!I->second.isAllOnesValue()) + return false; + } - unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; + return true; +} - for (DenseMap::const_iterator - I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { - // Quit if not all elements are used. - if (I->second != FullMask) - return SDValue(); - } +// Check whether an OR'd tree is PTEST-able. 
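The bookkeeping inside matchBitOpReduction above — one APInt of used lanes per source vector — is easiest to see in a standalone model. This sketch takes the already-flattened leaves of the OR tree as (source id, lane) pairs and checks the same two conditions: no lane extracted twice, and every lane of every source covered. The same-value-type requirement from the diff is omitted for brevity.

#include <map>
#include <utility>
#include <vector>

static bool coversAllLanesOnce(
    const std::vector<std::pair<int, unsigned>> &Leaves, // (src id, lane)
    unsigned NumElts) {
  std::map<int, std::vector<bool>> Used;
  for (const auto &L : Leaves) {
    auto &Bits =
        Used.try_emplace(L.first, std::vector<bool>(NumElts, false))
            .first->second;
    if (Bits[L.second])
      return false; // lane extracted twice
    Bits[L.second] = true;
  }
  for (const auto &Src : Used)
    for (bool B : Src.second)
      if (!B)
        return false; // a lane was never OR'ed in
  return !Used.empty();
}

Only when the reduction consumes each source completely is it sound to replace the scalar OR tree with a vector OR feeding a single PTEST.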
+static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, + const X86Subtarget &Subtarget, + SelectionDAG &DAG, SDValue &X86CC) { + assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + + if (!Subtarget.hasSSE41() || !Op->hasOneUse()) + return SDValue(); + + SmallVector VecIns; + if (!matchBitOpReduction(Op, ISD::OR, VecIns)) + return SDValue(); + + // Quit if not 128/256-bit vector. + EVT VT = VecIns[0].getValueType(); + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + SDLoc DL(Op); MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. @@ -18822,10 +19461,9 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } - X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, - DL, MVT::i8); - return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, - VecIns.back(), VecIns.back()); + X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, + MVT::i8); + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// return true if \c Op has a use that doesn't just read flags. @@ -18963,29 +19601,52 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG, Subtarget); - if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || - Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { - // Only promote the compare up to I32 if it is a 16 bit operation - // with an immediate. 16 bit immediates are to be avoided. - if (Op0.getValueType() == MVT::i16 && - ((isa(Op0) && - !cast(Op0)->getAPIntValue().isSignedIntN(8)) || - (isa(Op1) && - !cast(Op1)->getAPIntValue().isSignedIntN(8))) && - !DAG.getMachineFunction().getFunction().optForMinSize() && - !Subtarget.isAtom()) { + EVT CmpVT = Op0.getValueType(); + + if (CmpVT.isFloatingPoint()) + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); + + assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || + CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); + + // Only promote the compare up to I32 if it is a 16 bit operation + // with an immediate. 16 bit immediates are to be avoided. + if (CmpVT == MVT::i16 && !Subtarget.isAtom() && + !DAG.getMachineFunction().getFunction().hasMinSize()) { + ConstantSDNode *COp0 = dyn_cast(Op0); + ConstantSDNode *COp1 = dyn_cast(Op1); + // Don't do this if the immediate can fit in 8-bits. + if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || + (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); - Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); + if (X86CC == X86::COND_E || X86CC == X86::COND_NE) { + // For equality comparisons try to use SIGN_EXTEND if the input was + // truncate from something with enough sign bits. 
+ if (Op0.getOpcode() == ISD::TRUNCATE) { + SDValue In = Op0.getOperand(0); + unsigned EffBits = + In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; + if (EffBits <= 16) + ExtendOp = ISD::SIGN_EXTEND; + } else if (Op1.getOpcode() == ISD::TRUNCATE) { + SDValue In = Op1.getOperand(0); + unsigned EffBits = + In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; + if (EffBits <= 16) + ExtendOp = ISD::SIGN_EXTEND; + } + } + + CmpVT = MVT::i32; + Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0); + Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1); } - // Use SUB instead of CMP to enable CSE between SUB and CMP. - SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); - SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); - return SDValue(Sub.getNode(), 1); } - assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!"); - return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); + // Use SUB instead of CMP to enable CSE between SUB and CMP. + SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); + SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); + return Sub.getValue(1); } /// Convert a comparison if required by the subtarget. @@ -19146,7 +19807,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immedaite won't fit in a byte. - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; @@ -19290,10 +19951,11 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode); } -/// Given a simple buildvector constant, return a new vector constant with each -/// element decremented. If decrementing would result in underflow or this -/// is not a simple vector constant, return an empty value. -static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) { +/// Given a buildvector constant, return a new vector constant with each element +/// incremented or decremented. If incrementing or decrementing would result in +/// unsigned overflow or underflow or this is not a simple vector constant, +/// return an empty value. +static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) { auto *BV = dyn_cast(V.getNode()); if (!BV) return SDValue(); @@ -19308,11 +19970,12 @@ static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) { if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT) return SDValue(); - // Avoid underflow. - if (Elt->getAPIntValue().isNullValue()) + // Avoid overflow/underflow. + const APInt &EltC = Elt->getAPIntValue(); + if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue())) return SDValue(); - NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT)); + NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT)); } return DAG.getBuildVector(VT, DL, NewVecC); @@ -19344,12 +20007,24 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // Only do this pre-AVX since vpcmp* is no longer destructive. 
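The EmitCmp change above dodges 16-bit immediates (which cost a length-changing prefix) by widening the compare to i32, and for EQ/NE it prefers SIGN_EXTEND when an operand was truncated from a value with enough sign bits, since sign-extending such a truncation is lossless. A sketch of that reasoning; numSignBits stands in for DAG.ComputeNumSignBits and is my own helper:

    #include <cassert>
    #include <cstdint>

    // Count of leading bits equal to the sign bit, including the sign bit
    // itself (what ComputeNumSignBits reports for a constant i32).
    static unsigned numSignBits(int32_t V) {
      unsigned N = 1;
      while (N < 32 && (((V >> (31 - N)) & 1) == ((V >> 31) & 1)))
        ++N;
      return N;
    }

    int main() {
      int32_t A = -5; // "effectively" tiny: 29 sign bits
      unsigned EffBits = 32 - numSignBits(A) + 1;
      assert(EffBits <= 16); // so SIGN_EXTEND of a trunc-to-i16 is safe here
      // Lossless round trip: equality of the i16 truncations matches equality
      // of the sign-extended i32 values, which is all an EQ/NE compare needs.
      int16_t T = (int16_t)A;
      assert((int32_t)T == A);
    }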
if (Subtarget.hasAVX()) return SDValue(); - SDValue ULEOp1 = decrementVectorConstant(Op1, DAG); + SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false); if (!ULEOp1) return SDValue(); Op1 = ULEOp1; break; } + case ISD::SETUGT: { + // If the comparison is against a constant, we can turn this into a setuge. + // This is beneficial because materializing a constant 0 for the PCMPEQ is + // probably cheaper than XOR+PCMPGT using 2 different vector constants: + // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0 + SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true); + if (!UGEOp1) + return SDValue(); + Op1 = Op0; + Op0 = UGEOp1; + break; + } // Psubus is better than flip-sign because it requires no inversion. case ISD::SETUGE: std::swap(Op0, Op1); @@ -19446,10 +20121,6 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, assert((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); - // Break 256-bit integer vector compare into smaller ones. - if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntVSETCC(Op, DAG); - // The result is boolean, but operands are int/float if (VT.getVectorElementType() == MVT::i1) { // In AVX-512 architecture setcc returns mask with i1 elements, @@ -19503,6 +20174,27 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } } + // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2. + if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND && + Op0.getOperand(1) == Op1 && Op0.hasOneUse()) { + ConstantSDNode *C1 = isConstOrConstSplat(Op1); + if (C1 && C1->getAPIntValue().isPowerOf2()) { + unsigned BitWidth = VT.getScalarSizeInBits(); + unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1; + + SDValue Result = Op0.getOperand(0); + Result = DAG.getNode(ISD::SHL, dl, VT, Result, + DAG.getConstant(ShiftAmt, dl, VT)); + Result = DAG.getNode(ISD::SRA, dl, VT, Result, + DAG.getConstant(BitWidth - 1, dl, VT)); + return Result; + } + } + + // Break 256-bit integer vector compare into smaller ones. + if (VT.is256BitVector() && !Subtarget.hasInt256()) + return Lower256IntVSETCC(Op, DAG); + // If this is a SETNE against the signed minimum value, change it to SETGT. // If this is a SETNE against the signed maximum value, change it to SETLT. // which will be swapped to SETGT. @@ -19530,17 +20222,20 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, TLI.isOperationLegal(ISD::UMIN, VT)) { // If we have a constant operand, increment/decrement it and change the // condition to avoid an invert. - // TODO: This could be extended to handle a non-splat constant by checking - // that each element of the constant is not the max/null value. 
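The new ICMP_EQ(AND(X,C),C) fold above turns a masked-bit test into shift-left then arithmetic shift-right, producing the all-ones/all-zeros lane mask that a vector compare yields. An exhaustive check for i8 lanes; it assumes the usual two's-complement, arithmetic-right-shift behaviour for signed values (guaranteed since C++20, universal in practice):

    #include <cassert>
    #include <cstdint>

    // (X & C) == C with C a power of two (bit K) becomes:
    //   shift bit K into the sign position, then splat the sign bit.
    static int8_t cmpMaskViaShifts(uint8_t X, unsigned K) {
      int8_t Shl = (int8_t)(X << (7 - K)); // ShiftAmt = BitWidth - log2(C) - 1
      return Shl >> 7;                     // SRA by BitWidth - 1: 0x00 or 0xFF
    }

    int main() {
      for (unsigned K = 0; K < 8; ++K) {
        uint8_t C = (uint8_t)(1u << K);
        for (unsigned X = 0; X < 256; ++X) {
          int8_t Expect = ((X & C) == C) ? (int8_t)0xFF : 0;
          assert(cmpMaskViaShifts((uint8_t)X, K) == Expect);
        }
      }
    }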
- APInt C; - if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) { + if (Cond == ISD::SETUGT && + ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { + return !C->getAPIntValue().isMaxValue(); + })) { // X > C --> X >= (C+1) --> X == umax(X, C+1) - Op1 = DAG.getConstant(C + 1, dl, VT); + Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT)); Cond = ISD::SETUGE; } - if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) { + if (Cond == ISD::SETULT && + ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { + return !C->getAPIntValue().isNullValue(); + })) { // X < C --> X <= (C-1) --> X == umin(X, C-1) - Op1 = DAG.getConstant(C - 1, dl, VT); + Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT)); Cond = ISD::SETULE; } bool Invert = false; @@ -19826,7 +20521,7 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) { break; case ISD::UADDO: BaseOp = X86ISD::ADD; - Cond = X86::COND_B; + Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B; break; case ISD::SSUBO: BaseOp = X86ISD::SUB; @@ -19867,6 +20562,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG); SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG); + assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!"); return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC); } @@ -20036,10 +20732,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); - SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); + SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Zero = DAG.getConstant(0, DL, Op.getValueType()); - return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); + return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero); } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, @@ -20111,7 +20807,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); - unsigned Opc = Cmp.getOpcode(); MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; @@ -20120,7 +20815,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || - Opc == X86ISD::BT) { // FIXME + Cmp.getOpcode() == X86ISD::BT) { // FIXME Cond = Cmp; AddTest = false; } @@ -20193,8 +20888,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } - // Promote i16 cmovs if it won't prevent folding a load. - if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) { + // Or finally, promote i8 cmovs if we have CMOV, + // or i16 cmovs if it won't prevent folding a load. + // FIXME: we should not limit promotion of i8 case to only when the CMOV is + // legal, but EmitLoweredSelect() can not deal with these extensions + // being inserted between two CMOV's. 
(in i16 case too TBN) + // https://bugs.llvm.org/show_bug.cgi?id=40974 + if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) || + (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && + !MayFoldLoad(Op2))) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); SDValue Ops[] = { Op2, Op1, CC, Cond }; @@ -20453,6 +21155,76 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } +/// Change a vector store into a pair of half-size vector stores. +static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { + SDValue StoredVal = Store->getValue(); + assert((StoredVal.getValueType().is256BitVector() || + StoredVal.getValueType().is512BitVector()) && + "Expecting 256/512-bit op"); + + // Splitting volatile memory ops is not allowed unless the operation was not + // legal to begin with. We are assuming the input op is legal (this transform + // is only used for targets with AVX). + if (Store->isVolatile()) + return SDValue(); + + MVT StoreVT = StoredVal.getSimpleValueType(); + unsigned NumElems = StoreVT.getVectorNumElements(); + unsigned HalfSize = StoredVal.getValueSizeInBits() / 2; + unsigned HalfAlign = (128 == HalfSize ? 16 : 32); + + SDLoc DL(Store); + SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize); + SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize); + SDValue Ptr0 = Store->getBasePtr(); + SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL); + unsigned Alignment = Store->getAlignment(); + SDValue Ch0 = + DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), + Alignment, Store->getMemOperand()->getFlags()); + SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, + Store->getPointerInfo().getWithOffset(HalfAlign), + MinAlign(Alignment, HalfAlign), + Store->getMemOperand()->getFlags()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); +} + +/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar +/// type. +static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, + SelectionDAG &DAG) { + SDValue StoredVal = Store->getValue(); + assert(StoreVT.is128BitVector() && + StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op"); + StoredVal = DAG.getBitcast(StoreVT, StoredVal); + + // Splitting volatile memory ops is not allowed unless the operation was not + // legal to begin with. We are assuming the input op is legal (this transform + // is only used for targets with AVX). 
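splitVectorStore above replaces a 256-bit store of freshly concatenated halves with two independent 128-bit stores at byte offsets 0 and HalfAlign. The same shape in AVX intrinsics, as a rough illustration (compile with -mavx; _mm256_set_m128 is assumed to be provided by the toolchain's immintrin.h):

    #include <immintrin.h>
    #include <cassert>
    #include <cstring>

    // Two independent 16-byte stores: what splitVectorStore emits.
    static void storeSplit(float *Dst, __m128 Lo, __m128 Hi) {
      _mm_storeu_ps(Dst, Lo);     // half 0 at byte offset 0
      _mm_storeu_ps(Dst + 4, Hi); // half 1 at byte offset 16 (HalfAlign)
    }

    // The form being avoided: vinsertf128 (the "extra op") + one 32-byte store.
    static void storeConcat(float *Dst, __m128 Lo, __m128 Hi) {
      __m256 V = _mm256_set_m128(Hi, Lo);
      _mm256_storeu_ps(Dst, V);
    }

    int main() {
      float A[8], B[8];
      __m128 Lo = _mm_set_ps(3, 2, 1, 0), Hi = _mm_set_ps(7, 6, 5, 4);
      storeSplit(A, Lo, Hi);
      storeConcat(B, Lo, Hi);
      assert(std::memcmp(A, B, sizeof(A)) == 0); // same bytes either way
    }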
+ if (Store->isVolatile()) + return SDValue(); + + MVT StoreSVT = StoreVT.getScalarType(); + unsigned NumElems = StoreVT.getVectorNumElements(); + unsigned ScalarSize = StoreSVT.getStoreSize(); + unsigned Alignment = Store->getAlignment(); + + SDLoc DL(Store); + SmallVector Stores; + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Offset = i * ScalarSize; + SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL); + SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, + DAG.getIntPtrConstant(i, DL)); + SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, + Store->getPointerInfo().getWithOffset(Offset), + MinAlign(Alignment, Offset), + Store->getMemOperand()->getFlags()); + Stores.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); +} + static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast(Op.getNode()); @@ -20482,28 +21254,47 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, if (St->isTruncatingStore()) return SDValue(); + // If this is a 256-bit store of concatenated ops, we are better off splitting + // that store into two 128-bit stores. This avoids spurious use of 256-bit ops + // and each half can execute independently. Some cores would split the op into + // halves anyway, so the concat (vinsertf128) is purely an extra op. MVT StoreVT = StoredVal.getSimpleValueType(); + if (StoreVT.is256BitVector()) { + SmallVector CatOps; + if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps)) + return splitVectorStore(St, DAG); + return SDValue(); + } + assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != TargetLowering::TypeWidenVector) return SDValue(); - // Widen the vector, cast to a v2x64 type, extract the single 64-bit element - // and store it. MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), StoreVT.getVectorNumElements() * 2); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); - MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; - MVT CastVT = MVT::getVectorVT(StVT, 2); - StoredVal = DAG.getBitcast(CastVT, StoredVal); - StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, - DAG.getIntPtrConstant(0, dl)); - return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); + if (Subtarget.hasSSE2()) { + // Widen the vector, cast to a v2x64 type, extract the single 64-bit element + // and store it. + MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; + MVT CastVT = MVT::getVectorVT(StVT, 2); + StoredVal = DAG.getBitcast(CastVT, StoredVal); + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, + DAG.getIntPtrConstant(0, dl)); + + return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); + } + assert(Subtarget.hasSSE1() && "Expected SSE"); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()}; + return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64, + St->getMemOperand()); } // Lower vector extended loads using a shuffle. 
If SSSE3 is not available we @@ -20694,13 +21485,13 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { - SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG); + SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) { - SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG); + SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } @@ -21240,42 +22031,41 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector Elts; unsigned NumElts = SrcOp->getNumOperands(); - ConstantSDNode *ND; - switch(Opc) { + switch (Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: - for (unsigned i=0; i!=NumElts; ++i) { + for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } - ND = cast(CurrentOp); + auto *ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: - for (unsigned i=0; i!=NumElts; ++i) { + for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } - ND = cast(CurrentOp); + auto *ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRAI: - for (unsigned i=0; i!=NumElts; ++i) { + for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } - ND = cast(CurrentOp); + auto *ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); } @@ -21443,7 +22233,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, DAG.getBitcast(MVT::v8i1, Mask), DAG.getIntPtrConstant(0, dl)); if (Op.getOpcode() == X86ISD::FSETCCM || - Op.getOpcode() == X86ISD::FSETCCM_RND || + Op.getOpcode() == X86ISD::FSETCCM_SAE || Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); @@ -21517,11 +22307,31 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { - if (!isa(Rnd)) - return false; + if (auto *C = dyn_cast(Rnd)) + return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; - unsigned Round = cast(Rnd)->getZExtValue(); - return Round == X86::STATIC_ROUNDING::CUR_DIRECTION; + return false; + }; + auto isRoundModeSAE = [](SDValue Rnd) { + if (auto *C = dyn_cast(Rnd)) + return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC; + + return false; + }; + auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) { + if (auto *C = dyn_cast(Rnd)) { + RC = C->getZExtValue(); + if (RC & X86::STATIC_ROUNDING::NO_EXC) { + // Clear the NO_EXC bit and check remaining bits. 
+ RC ^= X86::STATIC_ROUNDING::NO_EXC; + return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT || + RC == X86::STATIC_ROUNDING::TO_NEG_INF || + RC == X86::STATIC_ROUNDING::TO_POS_INF || + RC == X86::STATIC_ROUNDING::TO_ZERO; + } + } + + return false; }; SDLoc dl(Op); @@ -21537,13 +22347,29 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(2); - if (!isRoundModeCurDirection(Rnd)) { + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), - Op.getOperand(1), Rnd); - } + Op.getOperand(1), + DAG.getTargetConstant(RC, dl, MVT::i32)); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); } + case INTR_TYPE_1OP_SAE: { + SDValue Sae = Op.getOperand(2); + + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1)); + } case INTR_TYPE_2OP: { SDValue Src2 = Op.getOperand(2); @@ -21553,15 +22379,32 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(3); - if (!isRoundModeCurDirection(Rnd)) { + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), - Op.getOperand(1), Src2, Rnd); - } + Op.getOperand(1), Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Src2); } + case INTR_TYPE_2OP_SAE: { + SDValue Sae = Op.getOperand(3); + + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + } case INTR_TYPE_3OP: case INTR_TYPE_3OP_IMM8: { SDValue Src1 = Op.getOperand(1); @@ -21577,11 +22420,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) { - return DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src1, Src2, Src3, Rnd); - } + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), + Src1, Src2, Src3, + DAG.getTargetConstant(RC, dl, MVT::i32)); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), @@ -21590,44 +22435,45 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); - case INTR_TYPE_1OP_MASK_RM: { - SDValue Src = Op.getOperand(1); - SDValue PassThru = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); - SDValue RoundingMode; - // We always add rounding mode to the Node. - // If the rounding mode is not specified, we add the - // "current direction" mode. 
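The three lambdas above classify a rounding-mode immediate as current-direction, plain SAE, or SAE combined with an explicit rounding control. A standalone model of that last check; the immediate encoding is assumed to match X86BaseInfo.h's STATIC_ROUNDING values:

    #include <cassert>

    namespace STATIC_ROUNDING {
    enum : unsigned {
      TO_NEAREST_INT = 0,
      TO_NEG_INF = 1,
      TO_POS_INF = 2,
      TO_ZERO = 3,
      CUR_DIRECTION = 4,
      NO_EXC = 8, // suppress-all-exceptions flag bit
    };
    }

    // Mirrors isRoundModeSAEToX: a valid embedded-rounding immediate must set
    // NO_EXC and combine it with exactly one of the four explicit directions.
    static bool isSAEWithRoundingControl(unsigned Imm, unsigned &RC) {
      if (!(Imm & STATIC_ROUNDING::NO_EXC))
        return false;
      RC = Imm ^ STATIC_ROUNDING::NO_EXC; // clear the flag, keep the direction
      return RC == STATIC_ROUNDING::TO_NEAREST_INT ||
             RC == STATIC_ROUNDING::TO_NEG_INF ||
             RC == STATIC_ROUNDING::TO_POS_INF ||
             RC == STATIC_ROUNDING::TO_ZERO;
    }

    int main() {
      unsigned RC;
      assert(isSAEWithRoundingControl(STATIC_ROUNDING::NO_EXC |
                                      STATIC_ROUNDING::TO_ZERO, RC) && RC == 3);
      assert(!isSAEWithRoundingControl(STATIC_ROUNDING::CUR_DIRECTION, RC));
      assert(!isSAEWithRoundingControl(STATIC_ROUNDING::NO_EXC |
                                       STATIC_ROUNDING::CUR_DIRECTION, RC));
    }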
- if (Op.getNumOperands() == 4) - RoundingMode = - DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); - else - RoundingMode = Op.getOperand(4); - assert(IntrData->Opc1 == 0 && "Unexpected second opcode!"); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, - RoundingMode), - Mask, PassThru, Subtarget, DAG); - } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when - // - RM Opcode is specified and - // - RM is not "current direction". + // - RC Opcode is specified and + // - RC is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src, Rnd), - Mask, PassThru, Subtarget, DAG); - } + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + return getVectorMaskingNode( + DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), + Src, DAG.getTargetConstant(RC, dl, MVT::i32)), + Mask, PassThru, Subtarget, DAG); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } + case INTR_TYPE_1OP_MASK_SAE: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue Rnd = Op.getOperand(4); + + unsigned Opc; + if (isRoundModeCurDirection(Rnd)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Rnd)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), + Mask, PassThru, Subtarget, DAG); + } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); @@ -21641,10 +22487,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (Op.getNumOperands() == (5U + HasRounding)) { if (HasRounding) { SDValue Rnd = Op.getOperand(5); + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + return getScalarMaskingNode( + DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)), + Mask, passThru, Subtarget, DAG); if (!isRoundModeCurDirection(Rnd)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, Rnd), - Mask, passThru, Subtarget, DAG); + return SDValue(); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), @@ -21654,123 +22504,138 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); + unsigned Opc = IntrData->Opc0; if (HasRounding) { SDValue Sae = Op.getOperand(6); - if (!isRoundModeCurDirection(Sae)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, - RoundingMode, Sae), - Mask, passThru, Subtarget, DAG); + if (isRoundModeSAE(Sae)) + Opc = IntrWithRoundingModeOpcode; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); } - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, RoundingMode), Mask, passThru, Subtarget, DAG); } - case INTR_TYPE_SCALAR_MASK_RM: { + case INTR_TYPE_SCALAR_MASK_RND: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); - SDValue Src0 = 
Op.getOperand(3); + SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // There are 2 kinds of intrinsics in this group: - // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands - // (2) With rounding mode and sae - 7 operands. - if (Op.getNumOperands() == 6) { - SDValue Sae = Op.getOperand(5); - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, - Sae), - Mask, Src0, Subtarget, DAG); - } - assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); - SDValue RoundingMode = Op.getOperand(5); - SDValue Sae = Op.getOperand(6); - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, - RoundingMode, Sae), - Mask, Src0, Subtarget, DAG); + SDValue Rnd = Op.getOperand(5); + + SDValue NewOp; + unsigned RC = 0; + if (isRoundModeCurDirection(Rnd)) + NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); + else if (isRoundModeSAEToX(Rnd, RC)) + NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)); + else + return SDValue(); + + return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK_SAE: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue passThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + SDValue Sae = Op.getOperand(5); + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), + Mask, passThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - - // We specify 2 possible opcodes for intrinsics with rounding modes. - // First, we check if the intrinsic may have non-default rounding mode, - // (IntrData->Opc1 != 0), then we check the rounding mode operand. - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { + SDValue NewOp; + if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src1, Src2, Rnd), - Mask, PassThru, Subtarget, DAG); - } + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)); + else if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } - // TODO: Intrinsics should have fast-math-flags to propagate. - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), - Mask, PassThru, Subtarget, DAG); + if (!NewOp) + NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); + return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_2OP_MASK_RM: { + case INTR_TYPE_2OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // We specify 2 possible modes for intrinsics, with/without rounding - // modes. - // First, we check if the intrinsic have rounding mode (6 operands), - // if not, we set rounding mode to "current". 
- SDValue Rnd; - if (Op.getNumOperands() == 6) - Rnd = Op.getOperand(5); - else - Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Rnd), + + unsigned Opc = IntrData->Opc0; + if (IntrData->Opc1 != 0) { + SDValue Sae = Op.getOperand(5); + if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); + } + + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_SCALAR_MASK: { + case INTR_TYPE_3OP_SCALAR_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(6); - if (!isRoundModeCurDirection(Rnd)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, Src3, Rnd), - Mask, PassThru, Subtarget, DAG); - } - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, - Src2, Src3), + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_MASK: { + case INTR_TYPE_3OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); - // We specify 2 possible opcodes for intrinsics with rounding modes. - // First, we check if the intrinsic may have non-default rounding mode, - // (IntrData->Opc1 != 0), then we check the rounding mode operand. - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(6); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src1, Src2, Src3, Rnd), - Mask, PassThru, Subtarget, DAG); - } + unsigned Opc = IntrData->Opc0; + if (IntrData->Opc1 != 0) { + SDValue Sae = Op.getOperand(6); + if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Src3), + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case BLENDV: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + + EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger(); + Src3 = DAG.getBitcast(MaskVT, Src3); + + // Reverse the operands to match VSELECT order. + return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1); + } case VPERM_2OP : { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); @@ -21783,35 +22648,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // first. return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case CVTPD2PS: - // ISD::FP_ROUND has a second argument that indicates if the truncation - // does not change the value. Set it to 0 since it can change. 
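The BLENDV case above hands the intrinsic operands to the node in reverse because the two conventions disagree: x86 blendv reads (a, b, mask) and picks b where the lane's sign bit is set, while a VSELECT-style node reads (cond, iftrue, iffalse) -- hence Src3, Src2, Src1. A per-lane scalar model:

    #include <cassert>
    #include <cstdint>

    // blendv lane semantics: the mask's sign bit selects the second operand.
    static int32_t blendvLane(int32_t A, int32_t B, int32_t Mask) {
      return (Mask < 0) ? B : A;
    }

    // VSELECT lane semantics: condition first, then the "true" value.
    static int32_t vselectLane(bool Cond, int32_t IfTrue, int32_t IfFalse) {
      return Cond ? IfTrue : IfFalse;
    }

    int main() {
      int32_t A = 10, B = 20;
      for (int32_t Mask : {0, -1}) {
        // blendv(A, B, Mask) == vselect(Mask, B, A): operands reversed.
        assert(blendvLane(A, B, Mask) == vselectLane(Mask < 0, B, A));
      }
    }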
- return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), - DAG.getIntPtrConstant(0, dl)); - case CVTPD2PS_RND_MASK: { - SDValue Src = Op.getOperand(1); - SDValue PassThru = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); - // We add rounding mode to the Node when - // - RM Opcode is specified and - // - RM is not "current direction". - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src, Rnd), - Mask, PassThru, Subtarget, DAG); - } - } - assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!"); - // ISD::FP_ROUND has a second argument that indicates if the truncation - // does not change the value. Set it to 0 since it can change. - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, - DAG.getIntPtrConstant(0, dl)), - Mask, PassThru, Subtarget, DAG); - } case FPCLASSS: { SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); @@ -21829,24 +22665,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); - SDValue Cmp; SDValue CC = Op.getOperand(3); CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { - SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), - Op.getOperand(2), CC, Rnd); + SDValue Sae = Op.getOperand(4); + if (isRoundModeSAE(Sae)) + return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), CC, Sae); + if (!isRoundModeCurDirection(Sae)) + return SDValue(); } //default rounding mode - if (!Cmp.getNode()) - Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC); - - return Cmp; } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); @@ -21856,12 +22690,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Cmp; if (IntrData->Opc1 != 0) { - SDValue Rnd = Op.getOperand(5); - if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); + SDValue Sae = Op.getOperand(5); + if (isRoundModeSAE(Sae)) + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae); + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); } //default rounding mode - if(!Cmp.getNode()) + if (!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), @@ -21921,9 +22757,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8)); - else - FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, + else if (isRoundModeSAE(Sae)) + FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8), Sae); + else + return SDValue(); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. 
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, @@ -21940,41 +22778,42 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); - if (isAllOnesConstant(Mask)) // return data as is + if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is return Op.getOperand(1); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - DataToCompress), - Mask, PassThru, Subtarget, DAG); + // Avoid false dependency. + if (PassThru.isUndef()) + PassThru = DAG.getConstant(0, dl, VT); + + return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru, + Mask); } - case FIXUPIMMS: - case FIXUPIMMS_MASKZ: case FIXUPIMM: - case FIXUPIMM_MASKZ:{ + case FIXUPIMM_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Imm = Op.getOperand(4); SDValue Mask = Op.getOperand(5); - SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ? - Src1 : getZeroVector(VT, Subtarget, DAG, dl); - // We specify 2 possible modes for intrinsics, with/without rounding - // modes. - // First, we check if the intrinsic have rounding mode (7 operands), - // if not, we set rounding mode to "current". - SDValue Rnd; - if (Op.getNumOperands() == 7) - Rnd = Op.getOperand(6); - else - Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); - if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ) - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Src3, Imm, Rnd), - Mask, Passthru, Subtarget, DAG); - else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Src3, Imm, Rnd), - Mask, Passthru, Subtarget, DAG); + SDValue Passthru = (IntrData->Type == FIXUPIMM) + ? Src1 + : getZeroVector(VT, Subtarget, DAG, dl); + + unsigned Opc = IntrData->Opc0; + if (IntrData->Opc1 != 0) { + SDValue Sae = Op.getOperand(6); + if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); + } + + SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm); + + if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE) + return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); + + return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); } case ROUNDP: { assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); @@ -22018,7 +22857,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMergeValues(Results, dl); } case CVTPD2PS_MASK: - case CVTPD2I_MASK: + case CVTPD2DQ_MASK: + case CVTQQ2PS_MASK: case TRUNCATE_TO_REG: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); @@ -22049,6 +22889,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, PassThru, Mask); } + case CVTNEPS2BF16_MASK: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); + + // Break false dependency. 
+ if (PassThru.isUndef()) + PassThru = DAG.getConstant(0, dl, PassThru.getValueType()); + + return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, + Mask); + } default: break; } @@ -22279,10 +23134,37 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); - else // This function handles the SP or FP case. - Reg = RegInfo->getPtrSizedFrameRegister(MF); + else { // Handles the SP or FP case. + bool CantUseFP = RegInfo->needsStackRealignment(MF); + if (CantUseFP) + Reg = RegInfo->getPtrSizedStackRegister(MF); + else + Reg = RegInfo->getPtrSizedFrameRegister(MF); + } return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } + + case Intrinsic::x86_avx512_vp2intersect_q_512: + case Intrinsic::x86_avx512_vp2intersect_q_256: + case Intrinsic::x86_avx512_vp2intersect_q_128: + case Intrinsic::x86_avx512_vp2intersect_d_512: + case Intrinsic::x86_avx512_vp2intersect_d_256: + case Intrinsic::x86_avx512_vp2intersect_d_128: { + MVT MaskVT = Op.getSimpleValueType(); + + SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); + SDLoc DL(Op); + + SDValue Operation = + DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs, + Op->getOperand(1), Op->getOperand(2)); + + SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, + MaskVT, Operation); + SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, + MaskVT, Operation); + return DAG.getMergeValues({Result0, Result1}, DL); + } } } @@ -22296,25 +23178,26 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - EVT MaskVT = Mask.getValueType(); + EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? 
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, dl); + + MemIntrinsicSDNode *MemIntr = cast(Op); + + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Res = DAG.getTargetMemSDNode( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } -static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain, - const X86Subtarget &Subtarget) { +static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); @@ -22332,17 +23215,18 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, dl); + + MemIntrinsicSDNode *MemIntr = cast(Op); + + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Res = DAG.getTargetMemSDNode( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -22355,8 +23239,6 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); @@ -22366,10 +23248,13 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + MemIntrinsicSDNode *MemIntr = cast(Op); + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - return SDValue(Res, 1); + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; + SDValue Res = DAG.getTargetMemSDNode( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return Res.getValue(1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -22392,24 +23277,37 @@ static SDValue getPrefetchNode(unsigned 
Opc, SDValue Op, SelectionDAG &DAG, return SDValue(Res, 0); } -/// Handles the lowering of builtin intrinsic that return the value -/// of the extended control register. -static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SmallVectorImpl &Results) { - assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue LO, HI; +/// Handles the lowering of builtin intrinsics with chain that return their +/// value into registers EDX:EAX. +/// If operand ScrReg is a valid register identifier, then operand 2 of N is +/// copied to SrcReg. The assumption is that SrcReg is an implicit input to +/// TargetOpcode. +/// Returns a Glue value which can be used to add extra copy-from-reg if the +/// expanded intrinsics implicitly defines extra registers (i.e. not just +/// EDX:EAX). +static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, + unsigned TargetOpcode, + unsigned SrcReg, + const X86Subtarget &Subtarget, + SmallVectorImpl &Results) { + SDValue Chain = N->getOperand(0); + SDValue Glue; - // The ECX register is used to select the index of the XCR register to - // return. - SDValue Chain = - DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); - SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain); + if (SrcReg) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue); + Glue = Chain.getValue(1); + } + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue N1Ops[] = {Chain, Glue}; + SDNode *N1 = DAG.getMachineNode( + TargetOpcode, DL, Tys, ArrayRef(N1Ops, Glue.getNode() ? 2 : 1)); Chain = SDValue(N1, 0); // Reads the content of XCR and returns it in registers EDX:EAX. + SDValue LO, HI; if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, @@ -22420,60 +23318,15 @@ static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, LO.getValue(2)); } Chain = HI.getValue(1); + Glue = HI.getValue(2); if (Subtarget.is64Bit()) { - // Merge the two 32-bit values into a 64-bit one.. - SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, - DAG.getConstant(32, DL, MVT::i8)); - Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); - Results.push_back(Chain); - return; - } - - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - SDValue Ops[] = { LO, HI }; - SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); - Results.push_back(Pair); - Results.push_back(Chain); -} - -/// Handles the lowering of builtin intrinsics that read performance monitor -/// counters (x86_rdpmc). -static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SmallVectorImpl &Results) { - assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue LO, HI; - - // The ECX register is used to select the index of the performance counter - // to read. - SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, - N->getOperand(2)); - SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); - - // Reads the content of a 64-bit performance counter and returns it in the - // registers EDX:EAX. 
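expandIntrinsicWChainHelper above models instructions such as XGETBV, where ECX is an implicit input selecting what to read (the optional SrcReg copy-to-reg) and the 64-bit result comes back split across EDX:EAX. The equivalent in GCC/Clang inline asm, as a sketch; it assumes the CPU and OS enable XSAVE (CPUID.OSXSAVE), otherwise the instruction faults:

    #include <cstdint>
    #include <cstdio>

    static inline uint64_t xgetbv(uint32_t Index) {
      uint32_t Lo, Hi;
      // ECX selects the XCR register; EDX:EAX receive the value.
      __asm__ volatile("xgetbv" : "=a"(Lo), "=d"(Hi) : "c"(Index));
      return ((uint64_t)Hi << 32) | Lo; // the SHL/OR merge done by the helper
    }

    int main() {
      uint64_t Xcr0 = xgetbv(0); // XCR0: enabled XSAVE feature bits
      printf("XCR0 = %#llx, AVX state %s\n", (unsigned long long)Xcr0,
             (Xcr0 & 4) ? "enabled" : "disabled");
    }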
- if (Subtarget.is64Bit()) { - LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, - LO.getValue(2)); - } else { - LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, - LO.getValue(2)); - } - Chain = HI.getValue(1); - - if (Subtarget.is64Bit()) { - // The EAX register is loaded with the low-order 32 bits. The EDX register - // is loaded with the supported high-order bits of the counter. + // Merge the two 32-bit values into a 64-bit one. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); - return; + return Glue; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. @@ -22481,6 +23334,7 @@ static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); + return Glue; } /// Handles the lowering of builtin intrinsics that read the time stamp counter @@ -22490,59 +23344,28 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); - SDValue LO, HI; - // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. - if (Subtarget.is64Bit()) { - LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, - LO.getValue(2)); - } else { - LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, - LO.getValue(2)); - } - SDValue Chain = HI.getValue(1); - - SDValue TSC; - if (Subtarget.is64Bit()) { - // The EDX register is loaded with the high-order 32 bits of the MSR, and - // the EAX register is loaded with the low-order 32 bits. - TSC = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, - DAG.getConstant(32, DL, MVT::i8)); - TSC = DAG.getNode(ISD::OR, DL, MVT::i64, LO, TSC); - } else { - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - TSC = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, { LO, HI }); - } - - if (Opcode == X86ISD::RDTSCP_DAG) { - assert(N->getNumOperands() == 2 && "Unexpected number of operands!"); - - // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into - // the ECX register. Add 'ecx' explicitly to the chain. - SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, - HI.getValue(2)); - - Results.push_back(TSC); - Results.push_back(ecx); - Results.push_back(ecx.getValue(1)); + SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode, + /* NoRegister */0, Subtarget, + Results); + if (Opcode != X86::RDTSCP) return; - } - Results.push_back(TSC); - Results.push_back(Chain); + SDValue Chain = Results[1]; + // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into + // the ECX register. Add 'ecx' explicitly to the chain. 
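The RDTSCP special case above adds one more copy-from-reg because the instruction defines ECX (IA32_TSC_AUX) on top of the usual EDX:EAX pair. In inline asm the same three outputs look like this:

    #include <cstdint>
    #include <cstdio>

    static inline uint64_t rdtscp(uint32_t &Aux) {
      uint32_t Lo, Hi;
      // EDX:EAX hold the time-stamp counter; ECX holds IA32_TSC_AUX.
      __asm__ volatile("rdtscp" : "=a"(Lo), "=d"(Hi), "=c"(Aux));
      return ((uint64_t)Hi << 32) | Lo;
    }

    int main() {
      uint32_t Aux;
      uint64_t T0 = rdtscp(Aux);
      uint64_t T1 = rdtscp(Aux);
      printf("delta = %llu cycles, TSC_AUX = %u\n",
             (unsigned long long)(T1 - T0), Aux);
    }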
+ SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue); + Results[1] = ecx; + Results.push_back(ecx.getValue(1)); } static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector Results; SDLoc DL(Op); - getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, + getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget, Results); return DAG.getMergeValues(Results, DL); } @@ -22621,6 +23444,22 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return MarkEHRegistrationNode(Op, DAG); case llvm::Intrinsic::x86_seh_ehguard: return MarkEHGuard(Op, DAG); + case llvm::Intrinsic::x86_rdpkru: { + SDLoc dl(Op); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); + // Create a RDPKRU node and pass 0 to the ECX parameter. + return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0), + DAG.getConstant(0, dl, MVT::i32)); + } + case llvm::Intrinsic::x86_wrpkru: { + SDLoc dl(Op); + // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0 + // to the EDX and ECX parameters. + return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, + Op.getOperand(0), Op.getOperand(2), + DAG.getConstant(0, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + } case llvm::Intrinsic::x86_flags_read_u32: case llvm::Intrinsic::x86_flags_read_u64: case llvm::Intrinsic::x86_flags_write_u32: @@ -22630,7 +23469,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later - // during ExpandISelPseudos in EmitInstrWithCustomInserter. + // during FinalizeISel in EmitInstrWithCustomInserter. 
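The RDPKRU/WRPKRU lowering above pins ECX (and, for writes, EDX) to zero because the ISA requires it. The matching inline-asm constraints are shown below; treat this as illustrative only, since the instruction raises #UD unless the OS enables protection keys (CPUID.7.0:ECX.OSPKE):

    #include <cstdint>
    #include <cstdio>

    static inline uint32_t rdpkru() {
      uint32_t Eax, Edx;
      // ECX must be zero; the key rights come back in EAX (EDX reads as zero).
      __asm__ volatile("rdpkru" : "=a"(Eax), "=d"(Edx) : "c"(0));
      return Eax;
    }

    int main() { printf("PKRU = %#x\n", rdpkru()); }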
return SDValue(); } case Intrinsic::x86_lwpins32: @@ -22660,8 +23499,28 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2), Op->getOperand(3), Op->getOperand(4)); SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); - SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC); - return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, + Operation.getValue(1)); + } + case Intrinsic::x86_enqcmd: + case Intrinsic::x86_enqcmds: { + SDLoc dl(Op); + SDValue Chain = Op.getOperand(0); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic!"); + case Intrinsic::x86_enqcmd: + Opcode = X86ISD::ENQCMD; + break; + case Intrinsic::x86_enqcmds: + Opcode = X86ISD::ENQCMDS; + break; + } + SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2), + Op.getOperand(3)); + SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } } @@ -22707,7 +23566,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, + return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { @@ -22743,15 +23602,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. - case RDPMC: { - SmallVector Results; - getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); - return DAG.getMergeValues(Results, dl); - } - // Get Extended Control Register. + case RDPMC: + // GetExtended Control Register. case XGETBV: { SmallVector Results; - getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results); + + // RDPMC uses ECX to select the index of the performance counter to read. + // XGETBV uses ECX to select the index of the XCR register to return. + // The result is stored into registers EDX:EAX. + expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX, + Subtarget, Results); return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. @@ -22861,7 +23721,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); FrameAddrIndex = MF.getFrameInfo().CreateFixedObject( - SlotSize, /*Offset=*/0, /*IsImmutable=*/false); + SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false); FuncInfo->setFAIndex(FrameAddrIndex); } return DAG.getFrameIndex(FrameAddrIndex, VT); @@ -23444,10 +24304,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, SDValue N0 = Op.getOperand(0); SDLoc dl(Op); - // Decompose 256-bit ops into smaller 128-bit ops. 
- if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); - assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); @@ -23539,22 +24395,48 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, return split256IntArith(Op, DAG); } -static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); + unsigned Opcode = Op.getOpcode(); if (VT.getScalarType() == MVT::i1) { SDLoc dl(Op); - switch (Op.getOpcode()) { + switch (Opcode) { default: llvm_unreachable("Expected saturated arithmetic opcode"); case ISD::UADDSAT: case ISD::SADDSAT: - return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1)); + // *addsat i1 X, Y --> X | Y + return DAG.getNode(ISD::OR, dl, VT, X, Y); case ISD::USUBSAT: case ISD::SSUBSAT: - return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), - DAG.getNOT(dl, Op.getOperand(1), VT)); + // *subsat i1 X, Y --> X & ~Y + return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT)); } } + if (VT.is128BitVector()) { + // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), VT); + SDLoc DL(Op); + if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) { + // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y); + SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT); + return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add); + } + if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) { + // usubsat X, Y --> (X >u Y) ? X - Y : 0 + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); + SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); + return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); + } + // Use default expansion. + return SDValue(); + } + assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); @@ -23886,9 +24768,6 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // Signed AVX2 implementation - extend xmm subvectors to ymm. if (VT == MVT::v32i8 && IsSigned) { - SDValue Lo = DAG.getIntPtrConstant(0, dl); - SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl); - MVT ExVT = MVT::v16i16; SDValue ALo = extract128BitVector(A, 0, DAG, dl); SDValue BLo = extract128BitVector(B, 0, DAG, dl); @@ -23898,8 +24777,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, BLo = DAG.getNode(ExAVX, dl, ExVT, BLo); AHi = DAG.getNode(ExAVX, dl, ExVT, AHi); BHi = DAG.getNode(ExAVX, dl, ExVT, BHi); - Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); - Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); + SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); + SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); @@ -24156,6 +25035,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, APInt APIntShiftAmt; if (!isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); + + // If the shift amount is out of range, return undef. 
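The select-based 128-bit expansions above encode two standard identities: uaddsat overflows exactly when the wrapped sum compares unsigned-less-than an operand, and usubsat clamps at zero. A scalar model for sanity-checking them (plain C++, illustrative only, not the DAG code):

#include <cstdint>
#include <limits>

// uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
// The wrapped sum is below X exactly when the addition overflowed.
static uint16_t uaddsat(uint16_t x, uint16_t y) {
  uint16_t sum = uint16_t(x + y);
  return x > sum ? std::numeric_limits<uint16_t>::max() : sum;
}

// usubsat X, Y --> (X >u Y) ? X - Y : 0
static uint16_t usubsat(uint16_t x, uint16_t y) {
  return x > y ? uint16_t(x - y) : uint16_t(0);
}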
+ if (APIntShiftAmt.uge(VT.getScalarSizeInBits())) + return DAG.getUNDEF(VT); + uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) @@ -24197,8 +25081,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. - return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); + APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt); + return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. @@ -24224,54 +25108,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return SDValue(); } -// If V is a splat value, return the source vector and splat index; -static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) { - V = peekThroughEXTRACT_SUBVECTORs(V); - - EVT VT = V.getValueType(); - unsigned Opcode = V.getOpcode(); - switch (Opcode) { - default: { - APInt UndefElts; - APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); - if (DAG.isSplatValue(V, DemandedElts, UndefElts)) { - // Handle case where all demanded elements are UNDEF. - if (DemandedElts.isSubsetOf(UndefElts)) { - SplatIdx = 0; - return DAG.getUNDEF(VT); - } - SplatIdx = (UndefElts & DemandedElts).countTrailingOnes(); - return V; - } - break; - } - case ISD::VECTOR_SHUFFLE: { - // Check if this is a shuffle node doing a splat. - // TODO - remove this and rely purely on SelectionDAG::isSplatValue, - // getTargetVShiftNode currently struggles without the splat source. - auto *SVN = cast(V); - if (!SVN->isSplat()) - break; - int Idx = SVN->getSplatIndex(); - int NumElts = V.getValueType().getVectorNumElements(); - SplatIdx = Idx % NumElts; - return V.getOperand(Idx / NumElts); - } - } - - return SDValue(); -} - -static SDValue GetSplatValue(SDValue V, const SDLoc &dl, - SelectionDAG &DAG) { - int SplatIdx; - if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG)) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - SrcVector.getValueType().getScalarType(), SrcVector, - DAG.getIntPtrConstant(SplatIdx, dl)); - return SDValue(); -} - static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); @@ -24282,7 +25118,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true); - if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) { + if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) { if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) { MVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); @@ -25102,24 +25938,45 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) - return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b - else if (OpWidth == 128) + return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit(); + if (OpWidth == 128) return Subtarget.hasCmpxchg16b(); - else - return false; + + return false; } +// TODO: In 32-bit mode, use MOVLPS when SSE1 is available? +// TODO: In 32-bit mode, use FISTP when X87 is available? 
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { - return needsCmpXchgNb(SI->getValueOperand()->getType()); + Type *MemType = SI->getValueOperand()->getType(); + + bool NoImplicitFloatOps = + SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); + if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && + !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2()) + return false; + + return needsCmpXchgNb(MemType); } // Note: this turns large loads into lock cmpxchg8b/16b. -// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. +// TODO: In 32-bit mode, use MOVLPS when SSE1 is available? TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { - auto PTy = cast<PointerType>(LI->getPointerOperandType()); - return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg - : AtomicExpansionKind::None; + Type *MemType = LI->getType(); + + // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we + // can use movq to do the load. If we have X87 we can load into an 80-bit + // X87 register and store it to a stack temporary. + bool NoImplicitFloatOps = + LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); + if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && + !Subtarget.useSoftFloat() && !NoImplicitFloatOps && + (Subtarget.hasSSE2() || Subtarget.hasX87())) + return AtomicExpansionKind::None; + + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind @@ -25155,6 +26012,8 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. return AtomicExpansionKind::CmpXChg; @@ -25171,13 +26030,20 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { if (MemType->getPrimitiveSizeInBits() > NativeWidth) return nullptr; + // If this is a canonical idempotent atomicrmw with no uses, we have a better + // lowering available in lowerAtomicArith. + // TODO: push more cases through this path. + if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand())) + if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() && + AI->use_empty()) + return nullptr; + auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); - auto Ptr = AI->getPointerOperand(); // Before the load we need a fence. Here is an example lifted from // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence @@ -25212,14 +26078,80 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load.
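The early-out added above targets a specific source idiom: a sequentially consistent read-modify-write that leaves memory unchanged and whose result is discarded, which code uses as a fence. A sketch of the triggering pattern (illustrative only):

#include <atomic>

// An OR of 0 never changes the stored value, so only the seq_cst ordering of
// this atomicrmw matters; with the result unused, lowerAtomicArith can emit a
// locked stack operation (or a bare compiler barrier for weaker scopes)
// instead of a real LOCK OR on the atomic location.
void fence_via_idempotent_rmw(std::atomic<unsigned> &a) {
  a.fetch_or(0u, std::memory_order_seq_cst); // result intentionally unused
}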
- LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, - AI->getType()->getPrimitiveSizeInBits()); + LoadInst *Loaded = + Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(), + AI->getType()->getPrimitiveSizeInBits()); Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; } +/// Emit a locked operation on a stack location which does not change any +/// memory location, but does involve a lock prefix. Location is chosen to be +/// a) very likely accessed only by a single thread to minimize cache traffic, +/// and b) definitely dereferenceable. Returns the new Chain result. +static SDValue emitLockedStackOp(SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SDValue Chain, SDLoc DL) { + // Implementation notes: + // 1) LOCK prefix creates a full read/write reordering barrier for memory + // operations issued by the current processor. As such, the location + // referenced is not relevant for the ordering properties of the instruction. + // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual, + // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions + // 2) Using an immediate operand appears to be the best encoding choice + // here since it doesn't require an extra register. + // 3) OR appears to be very slightly faster than ADD. (Though, the difference + // is small enough it might just be measurement noise.) + // 4) When choosing offsets, there are several contributing factors: + // a) If there's no redzone, we default to TOS. (We could allocate a cache + // line aligned stack object to improve this case.) + // b) To minimize our chances of introducing a false dependence, we prefer + // to offset the stack usage from TOS slightly. + // c) To minimize concerns about cross thread stack usage - in particular, + // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which + // captures state in the TOS frame and accesses it from many threads - + // we want to use an offset such that the offset is in a distinct cache + // line from the TOS frame. + // + // For a general discussion of the tradeoffs and benchmark results, see: + // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ + + auto &MF = DAG.getMachineFunction(); + auto &TFL = *Subtarget.getFrameLowering(); + const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0; + + if (Subtarget.is64Bit()) { + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::RSP, MVT::i64), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + DAG.getRegister(0, MVT::i64), // Index + DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment. + Zero, + Chain}; + SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, + MVT::Other, Ops); + return SDValue(Res, 1); + } + + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment.
+ Zero, + Chain + }; + SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, + MVT::Other, Ops); + return SDValue(Res, 1); +} + static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -25235,19 +26167,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); - SDValue Chain = Op.getOperand(0); - SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Ops[] = { - DAG.getRegister(X86::ESP, MVT::i32), // Base - DAG.getTargetConstant(1, dl, MVT::i8), // Scale - DAG.getRegister(0, MVT::i32), // Index - DAG.getTargetConstant(0, dl, MVT::i32), // Disp - DAG.getRegister(0, MVT::i32), // Segment. - Zero, - Chain - }; - SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, dl, MVT::Other, Ops); - return SDValue(Res, 0); + SDValue Chain = Op.getOperand(0); + return emitLockedStackOp(DAG, Subtarget, Chain, dl); } // MEMBARRIER is a compiler barrier; it codegens to a no-op. @@ -25288,10 +26209,8 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, MVT::i32, cpOut.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); - DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); - DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); - DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); - return SDValue(); + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), + cpOut, Success, EFLAGS.getValue(1)); } // Create MOVMSKB, taking into account whether we need to split for AVX1. @@ -25703,6 +26622,7 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, /// Lower atomic_load_ops into LOCK-prefixed operations. static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); SDValue Chain = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); @@ -25717,7 +26637,6 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to // select LXADD if LOCK_SUB can't be selected. if (Opc == ISD::ATOMIC_LOAD_SUB) { - AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS, AN->getMemOperand()); @@ -25727,35 +26646,93 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, return N; } + // Specialized lowering for the canonical form of an idempotent atomicrmw. + // The core idea here is that since the memory location isn't actually + // changing, all we need is a lowering for the *ordering* impacts of the + // atomicrmw. As such, we can choose a different operation and memory + // location to minimize impact on other code. + if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) { + // On X86, the only ordering which actually requires an instruction is + // seq_cst which isn't SingleThread; everything else just needs to be + // preserved during codegen and then dropped. Note that we expect (but + // don't assume) that orderings other than seq_cst and acq_rel have been + // canonicalized to a store or load. + if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent && + AN->getSyncScopeID() == SyncScope::System) { + // Prefer a locked operation against a stack location to minimize cache + // traffic.
This assumes that stack locations are very likely to be + // accessed only by the owning thread. + SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL); + assert(!N->hasAnyUseOfValue(0)); + // NOTE: The getUNDEF is needed to give something for the unused result 0. + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), + DAG.getUNDEF(VT), NewChain); + } + // MEMBARRIER is a compiler barrier; it codegens to a no-op. + SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain); + assert(!N->hasAnyUseOfValue(0)); + // NOTE: The getUNDEF is needed to give something for the unused result 0. + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), + DAG.getUNDEF(VT), NewChain); + } + SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget); // RAUW the chain, but don't worry about the result, as it's unused. assert(!N->hasAnyUseOfValue(0)); - DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); - return SDValue(); + // NOTE: The getUNDEF is needed to give something for the unused result 0. + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), + DAG.getUNDEF(VT), LockOp.getValue(1)); } -static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { - SDNode *Node = Op.getNode(); +static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + auto *Node = cast<AtomicSDNode>(Op.getNode()); SDLoc dl(Node); - EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + EVT VT = Node->getMemoryVT(); + + bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent; + bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT); + + // If this store is not sequentially consistent and the type is legal + // we can just keep it. + if (!IsSeqCst && IsTypeLegal) + return Op; + + if (VT == MVT::i64 && !IsTypeLegal) { + // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled. + // FIXME: Use movlps with SSE1. + // FIXME: Use fist with X87. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && + Subtarget.hasSSE2()) { + SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Node->getOperand(2)); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() }; + SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, + Ops, MVT::i64, + Node->getMemOperand()); + + // If this is a sequentially consistent store, also emit an appropriate + // barrier. + if (IsSeqCst) + Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl); + + return Chain; + } + } // Convert seq_cst store -> xchg // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) - // FIXME: On 32-bit, store -> fist or movq would be more efficient - // (The only way to get a 16-byte store is cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. - if (cast<AtomicSDNode>(Node)->getOrdering() == - AtomicOrdering::SequentiallyConsistent || - !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, - cast<AtomicSDNode>(Node)->getMemoryVT(), - Node->getOperand(0), - Node->getOperand(1), Node->getOperand(2), - cast<AtomicSDNode>(Node)->getMemOperand()); - return Swap.getValue(1); - } - // Other atomic stores have a simple pattern.
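The MOVQ-based store path above is easiest to see from the source side: on a 32-bit target an i64 atomic store is type-illegal and previously always became an XCHG (or a cmpxchg8b loop). A sketch of the pattern it now serves (illustrative only):

#include <atomic>

// On i386 with SSE2 this becomes a single 64-bit vector-extract store
// (X86ISD::VEXTRACT_STORE); for seq_cst, the cheaper locked stack-op barrier
// is appended rather than paying for a full XCHG.
void store_u64(std::atomic<unsigned long long> &a, unsigned long long v) {
  a.store(v, std::memory_order_seq_cst);
}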
- return Op; + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, + Node->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2), + Node->getMemOperand()); + return Swap.getValue(1); } static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { @@ -25919,7 +26896,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } return SDValue(); @@ -25935,7 +26911,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } // Custom widen all the operands to avoid promotion. @@ -25980,7 +26955,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } @@ -25991,8 +26965,28 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); + MVT MaskVT = Mask.getSimpleValueType(); + SDValue PassThru = N->getPassThru(); SDLoc dl(Op); + // Handle AVX masked loads which don't support passthru other than 0. + if (MaskVT.getVectorElementType() != MVT::i1) { + // We also allow undef in the isel pattern. + if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode())) + return Op; + + SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(), + N->getBasePtr(), Mask, + getZeroVector(VT, Subtarget, DAG, dl), + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType(), + N->isExpandingLoad()); + // Emit a blend. + SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, + PassThru); + return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl); + } + assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); @@ -26011,7 +27005,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); - SDValue PassThru = ExtendToType(N->getPassThru(), WideDataVT, DAG); + PassThru = ExtendToType(PassThru, WideDataVT, DAG); // Mask element has to be i1. 
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && @@ -26179,7 +27173,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); - case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); + case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); @@ -26272,7 +27266,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UADDSAT: case ISD::SADDSAT: case ISD::USUBSAT: - case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG); + case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: @@ -26301,12 +27295,19 @@ void X86TargetLowering::LowerOperationWrapper(SDNode *N, if (!Res.getNode()) return; - assert((N->getNumValues() <= Res->getNumValues()) && + // If the original node has one result, take the return value from + // LowerOperation as is. It might not be result number 0. + if (N->getNumValues() == 1) { + Results.push_back(Res); + return; + } + + // If the original node has multiple results, then the return node should + // have the same number of results. + assert((N->getNumValues() == Res->getNumValues()) && "Lowering returned the wrong number of results!"); // Places new result values based on N result number. - // In some cases (LowerSINT_TO_FP for example) Res has more result values - // than original node, chain should be dropped(last value). for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) Results.push_back(Res.getValue(I)); } @@ -26319,7 +27320,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDLoc dl(N); switch (N->getOpcode()) { default: +#ifndef NDEBUG + dbgs() << "ReplaceNodeResults: "; + N->dump(&DAG); +#endif llvm_unreachable("Do not know how to custom type legalize this operation!"); + case ISD::CTPOP: { + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + // Use a v2i64 if possible. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) { + SDValue Wide = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0)); + Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide); + // Bit count should fit in 32-bits, extract it as that and then zero + // extend to i64. Otherwise we end up extracting bits 63:32 separately.
+ Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide); + Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide, + DAG.getIntPtrConstant(0, dl)); + Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide); + Results.push_back(Wide); + } + return; + } case ISD::MUL: { EVT VT = N->getValueType(0); assert(VT.isVector() && "Unexpected VT"); @@ -26385,6 +27410,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res); return; } + case ISD::ABS: { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + assert(N->getValueType(0) == MVT::i64 && + "Unexpected type (!= i64) on ABS."); + MVT HalfT = MVT::i32; + SDValue Lo, Hi, Tmp; + SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); + + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(0, dl, HalfT)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(1, dl, HalfT)); + Tmp = DAG.getNode( + ISD::SRA, dl, HalfT, Hi, + DAG.getConstant(HalfT.getSizeInBits() - 1, dl, + TLI.getShiftAmountTy(HalfT, DAG.getDataLayout()))); + Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); + Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, + SDValue(Lo.getNode(), 1)); + Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); + Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); + Results.push_back(Lo); + Results.push_back(Hi); + return; + } case ISD::SETCC: { // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when // setCC result type is v2i1 because type legalization will end up with
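The ISD::ABS expansion above splits the i64 into i32 halves and applies the classic (x + t) ^ t trick, where t is the sign mask produced by the SRA and the cross-half carry is threaded through UADDO/ADDCARRY. A scalar model of the same arithmetic (illustrative only):

#include <cstdint>

static uint64_t abs64_via_halves(int64_t x) {
  uint32_t lo = uint32_t(x), hi = uint32_t(uint64_t(x) >> 32);
  uint32_t t = uint32_t(int32_t(hi) >> 31);            // SRA: 0 or ~0 sign mask
  uint64_t sum = uint64_t(t) + lo;                     // UADDO: low add, carry out
  uint32_t newLo = uint32_t(sum) ^ t;                  // XOR the low half
  uint32_t newHi = (hi + t + uint32_t(sum >> 32)) ^ t; // ADDCARRY, then XOR
  return (uint64_t(newHi) << 32) | newLo;              // negative x: ~(x - 1) == -x
}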
unsigned NumElts = InVT.getVectorNumElements(); @@ -26608,7 +27669,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ShufMask[i] = i + HalfNumElts; SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); - Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG); + Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); @@ -26735,17 +27796,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - std::pair Vals = - FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); - SDValue FIST = Vals.first, StackSlot = Vals.second; - if (FIST.getNode()) { - // Return a load from the stack slot. - if (StackSlot.getNode()) - Results.push_back( - DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo())); - else - Results.push_back(FIST); - } + if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned)) + Results.push_back(V); return; } case ISD::SINT_TO_FP: { @@ -26800,31 +27852,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: - return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: - return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, + return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: - return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); - + expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget, + Results); + return; case Intrinsic::x86_xgetbv: - return getExtendedControlRegister(N, dl, DAG, Subtarget, Results); + expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget, + Results); + return; } } - case ISD::INTRINSIC_WO_CHAIN: { - if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG)) - Results.push_back(V); - return; - } case ISD::READCYCLECOUNTER: { - return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, - Results); + return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; + assert((!Regs64bit || Subtarget.hasCmpxchg16b()) && + "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"); MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), @@ -26903,6 +27954,66 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(EFLAGS.getValue(1)); return; } + case ISD::ATOMIC_LOAD: { + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { + auto *Node = cast(N); + if (Subtarget.hasSSE2()) { + // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the + // lower 64-bits. 
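The ATOMIC_LOAD handling in this hunk is the back half of the earlier shouldExpandAtomicLoadInIR change: a 64-bit atomic load on a 32-bit target is performed either as one SSE2 MOVQ-style load or by pulling the value through an 80-bit X87 register and a stack temporary, relying on an aligned 8-byte access being a single atomic load. The source shape being served (illustrative only):

#include <atomic>

// On i386 this must not be split into two 32-bit loads; the lowering here
// uses one MOVQ (SSE2) or an X87 FILD/FIST round trip instead of cmpxchg8b.
unsigned long long load_u64(const std::atomic<unsigned long long> &a) {
  return a.load(std::memory_order_acquire);
}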
+ SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; + SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + MVT::i64, Node->getMemOperand()); + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Ld.getValue(1)); + return; + } + if (Subtarget.hasX87()) { + // First load this into an 80-bit X87 register. This will put the whole + // integer into the significand. + // FIXME: Do we need to glue? See FIXME comment in BuildFILD. + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue); + SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG, + dl, Tys, Ops, MVT::i64, + Node->getMemOperand()); + SDValue Chain = Result.getValue(1); + SDValue InFlag = Result.getValue(2); + + // Now store the X87 register to a stack temporary and convert to i64. + // This store is not atomic and doesn't need to be. + // FIXME: We don't need a stack temporary if the result of the load + // is already being stored. We could just directly store there. + SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64); + int SPFI = cast(StackPtr.getNode())->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); + SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag }; + Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl, + DAG.getVTList(MVT::Other), StoreOps, + MVT::i64, MPI, 0 /*Align*/, + MachineMemOperand::MOStore); + + // Finally load the value back from the stack temporary and return it. + // This load is not atomic and doesn't need to be. + // This load will be further type legalized. + Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI); + Results.push_back(Result); + Results.push_back(Result.getValue(1)); + return; + } + } + // TODO: Use MOVLPS when SSE1 is available? + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by AtomicExpandPass.cpp. + break; + } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: @@ -26914,11 +28025,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: - case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; - } + case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); @@ -27061,19 +28171,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast(N); - MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; - SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), - Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - SDValue Chain = Res.getValue(1); - MVT WideVT = MVT::getVectorVT(LdVT, 2); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); - MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() * 2); - Res = DAG.getBitcast(CastVT, Res); + if (Subtarget.hasSSE2()) { + MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? 
MVT::i64 : MVT::f64; + SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + SDValue Chain = Res.getValue(1); + MVT WideVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + Res = DAG.getBitcast(CastVT, Res); + Results.push_back(Res); + Results.push_back(Chain); + return; + } + assert(Subtarget.hasSSE1() && "Expected SSE"); + SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other); + SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()}; + SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + MVT::i64, Ld->getMemOperand()); Results.push_back(Res); - Results.push_back(Chain); + Results.push_back(Res.getValue(1)); return; } } @@ -27092,26 +28211,22 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; - case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; - case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; - case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; + case X86ISD::FIST: return "X86ISD::FIST"; + case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM"; case X86ISD::FLD: return "X86ISD::FLD"; case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; - case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; - case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; - case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; - case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; + case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; - case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND"; + case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -27140,12 +28255,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAXS: return "X86ISD::FMAXS"; - case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; - case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND"; + case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE"; + case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FMINS: return "X86ISD::FMINS"; - case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; - case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND"; + case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE"; + case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; @@ -27177,6 +28292,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::LAND: return "X86ISD::LAND"; case X86ISD::VZEXT_MOVL: return 
"X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; @@ -27188,11 +28304,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; - case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; - case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; + case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE"; + case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS"; + case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND"; case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; + case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS"; case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; @@ -27202,6 +28320,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::VSHLV: return "X86ISD::VSHLV"; + case X86ISD::VSRLV: return "X86ISD::VSRLV"; case X86ISD::VSRAV: return "X86ISD::VSRAV"; case X86ISD::VROTLI: return "X86ISD::VROTLI"; case X86ISD::VROTRI: return "X86ISD::VROTRI"; @@ -27263,11 +28383,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; + case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE"; case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; + case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; - case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND"; + case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE"; case X86ISD::VRANGES: return "X86ISD::VRANGES"; - case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND"; + case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; @@ -27281,6 +28403,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::RDPKRU: return "X86ISD::RDPKRU"; + case X86ISD::WRPKRU: return "X86ISD::WRPKRU"; case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::VPSHA: return "X86ISD::VPSHA"; @@ -27302,17 +28426,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; - case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND"; + case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; - case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND"; + case X86ISD::VRNDSCALES_SAE: 
return "X86ISD::VRNDSCALES_SAE"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; - case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND"; + case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE"; case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; - case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND"; + case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; - case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND"; + case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; - case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND"; + case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE"; case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR"; case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR"; case X86ISD::XTEST: return "X86ISD::XTEST"; @@ -27323,26 +28447,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::RCP14: return "X86ISD::RCP14"; case X86ISD::RCP14S: return "X86ISD::RCP14S"; case X86ISD::RCP28: return "X86ISD::RCP28"; + case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE"; case X86ISD::RCP28S: return "X86ISD::RCP28S"; + case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE"; case X86ISD::EXP2: return "X86ISD::EXP2"; + case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE"; case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; + case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; + case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FADDS: return "X86ISD::FADDS"; case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FSUBS: return "X86ISD::FSUBS"; case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FMULS: return "X86ISD::FMULS"; case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::FDIVS: return "X86ISD::FDIVS"; case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; + case X86ISD::FSQRTS: return "X86ISD::FSQRTS"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; - case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; - case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND"; + case X86ISD::FGETEXP: return "X86ISD::FGETEXP"; + case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE"; + case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS"; + case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; + case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND"; case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; + case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND"; case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; @@ -27351,23 +28489,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI"; case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI"; - case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND"; - case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND"; + case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE"; + case 
X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE"; case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI"; case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI"; - case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND"; - case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND"; + case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE"; + case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE"; case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; + case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P"; + case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; + case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP"; case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; + case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP"; case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH"; case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; - case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND"; + case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE"; case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI"; @@ -27378,6 +28520,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI"; case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; + case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16"; + case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16"; + case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16"; + case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; case X86ISD::MGATHER: return "X86ISD::MGATHER"; case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; @@ -27393,6 +28539,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND"; case X86ISD::UMWAIT: return "X86ISD::UMWAIT"; case X86ISD::TPAUSE: return "X86ISD::TPAUSE"; + case X86ISD::ENQCMD: return "X86ISD::ENQCMD"; + case X86ISD::ENQCMDS: return "X86ISD::ENQCMDS"; + case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT"; } return nullptr; } @@ -27478,6 +28627,38 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { return true; } +bool X86TargetLowering::isBinOp(unsigned Opcode) const { + switch (Opcode) { + // These are non-commutative binops. + // TODO: Add more X86ISD opcodes once we have test coverage. + case X86ISD::ANDNP: + case X86ISD::PCMPGT: + case X86ISD::FMAX: + case X86ISD::FMIN: + case X86ISD::FANDN: + return true; + } + + return TargetLoweringBase::isBinOp(Opcode); +} + +bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { + switch (Opcode) { + // TODO: Add more X86ISD opcodes once we have test coverage.
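A note on the isBinOp/isCommutativeBinOp split above: FMAX/FMIN sit in the non-commutative list while FMAXC/FMINC are commutative because the underlying x86 max/min instructions return the second operand on unordered (NaN) inputs and on +0/-0 ties, so operand order is observable. A scalar model of that semantic (illustrative only):

// x86 maxss behaves like "a > b ? a : b": NaN comparisons are false, so a
// NaN in either slot yields b, and max(+0.0, -0.0) depends on operand order.
static float x86_maxss(float a, float b) { return a > b ? a : b; }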
+ case X86ISD::PCMPEQ: + case X86ISD::PMULDQ: + case X86ISD::PMULUDQ: + case X86ISD::FMAXC: + case X86ISD::FMINC: + case X86ISD::FAND: + case X86ISD::FOR: + case X86ISD::FXOR: + return true; + } + + return TargetLoweringBase::isCommutativeBinOp(Opcode); +} + bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; @@ -27713,87 +28894,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, return sinkMBB; } -static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget) { - DebugLoc dl = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - - // insert input VAL into EAX - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) - .addReg(MI.getOperand(0).getReg()); - // insert zero to ECX - BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); - - // insert zero to EDX - BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX); - - // insert WRPKRU instruction - BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; -} - -static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget) { - DebugLoc dl = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - - // insert zero to ECX - BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); - - // insert RDPKRU instruction - BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) - .addReg(X86::EAX); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; -} - -static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget, - unsigned Opc) { - DebugLoc dl = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - // Address into RAX/EAX, other two args into ECX, EDX. - unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; - unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.add(MI.getOperand(i)); - - unsigned ValOps = X86::AddrNumOperands; - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) - .addReg(MI.getOperand(ValOps).getReg()); - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) - .addReg(MI.getOperand(ValOps + 1).getReg()); - - // The instruction doesn't actually take any operands though. - BuildMI(*BB, MI, dl, TII->get(Opc)); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; -} - -static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget) { - DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - // Address into RAX/EAX - unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; - unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.add(MI->getOperand(i)); - - // The instruction doesn't actually take any operands though. - BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); - - MI->eraseFromParent(); // The pseudo is gone now. 
- return BB; -} - MachineBasicBlock * @@ -27823,10 +28923,18 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, unsigned ArgMode = MI.getOperand(7).getImm(); unsigned Align = MI.getOperand(8).getImm(); + MachineFunction *MF = MBB->getParent(); + // Memory Reference assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); - SmallVector MMOs(MI.memoperands_begin(), - MI.memoperands_end()); + + MachineMemOperand *OldMMO = MI.memoperands().front(); + + // Clone the MMO into two separate MMOs for loading and storing + MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand( + OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore); + MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand( + OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad); // Machine Information const TargetInstrInfo *TII = Subtarget.getInstrInfo(); @@ -27891,7 +28999,6 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction *MF = MBB->getParent(); overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -27924,7 +29031,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) - .setMemRefs(MMOs); + .setMemRefs(LoadOnlyMMO); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) @@ -27933,8 +29040,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise - BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) - .addMBB(overflowMBB); + BuildMI(thisMBB, DL, TII->get(X86::JCC_1)) + .addMBB(overflowMBB).addImm(X86::COND_AE); } // In offsetMBB, emit code to use the reg_save_area. @@ -27949,7 +29056,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .add(Index) .addDisp(Disp, 16) .add(Segment) - .setMemRefs(MMOs); + .setMemRefs(LoadOnlyMMO); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); @@ -27977,7 +29084,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .addReg(NextOffsetReg) - .setMemRefs(MMOs); + .setMemRefs(StoreOnlyMMO); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) @@ -27996,7 +29103,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .add(Index) .addDisp(Disp, 8) .add(Segment) - .setMemRefs(MMOs); + .setMemRefs(LoadOnlyMMO); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. @@ -28033,7 +29140,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .addDisp(Disp, 8) .add(Segment) .addReg(NextAddrReg) - .setMemRefs(MMOs); + .setMemRefs(StoreOnlyMMO); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { @@ -28091,7 +29198,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { // If %al is 0, branch around the XMM save block. 
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); + BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E); MBB->addSuccessor(EndMBB); } @@ -28371,13 +29478,11 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // Create the conditional branch instructions. X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); - unsigned Opc = X86::GetCondBranchFromCond(FirstCC); - BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); + BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC); X86::CondCode SecondCC = X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); - unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC); - BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB); + BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC); // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] @@ -28463,20 +29568,21 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineInstr *LastCMOV = &MI; - MachineBasicBlock::iterator NextMIIt = - std::next(MachineBasicBlock::iterator(MI)); + MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI); // Check for case 1, where there are multiple CMOVs with the same condition // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the // number of jumps the most. if (isCMOVPseudo(MI)) { - // See if we have a string of CMOVS with the same condition. + // See if we have a string of CMOVS with the same condition. Skip over + // intervening debug insts. while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; ++NextMIIt; + NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end()); } } @@ -28508,8 +29614,18 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, SinkMBB->addLiveIn(X86::EFLAGS); } + // Transfer any debug instructions inside the CMOV sequence to the sunk block. + auto DbgEnd = MachineBasicBlock::iterator(LastCMOV); + auto DbgIt = MachineBasicBlock::iterator(MI); + while (DbgIt != DbgEnd) { + auto Next = std::next(DbgIt); + if (DbgIt->isDebugInstr()) + SinkMBB->push_back(DbgIt->removeFromParent()); + DbgIt = Next; + } + // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. - SinkMBB->splice(SinkMBB->begin(), ThisMBB, + SinkMBB->splice(SinkMBB->end(), ThisMBB, std::next(MachineBasicBlock::iterator(LastCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); @@ -28522,8 +29638,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. 
- unsigned Opc = X86::GetCondBranchFromCond(CC); - BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); + BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); // SinkMBB: // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] @@ -28539,53 +29654,6 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, return SinkMBB; } -MachineBasicBlock * -X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, - MachineBasicBlock *BB) const { - // Combine the following atomic floating-point modification pattern: - // a.store(reg OP a.load(acquire), release) - // Transform them into: - // OPss (%gpr), %xmm - // movss %xmm, (%gpr) - // Or sd equivalent for 64-bit operations. - unsigned MOp, FOp; - switch (MI.getOpcode()) { - default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); - case X86::RELEASE_FADD32mr: - FOp = X86::ADDSSrm; - MOp = X86::MOVSSmr; - break; - case X86::RELEASE_FADD64mr: - FOp = X86::ADDSDrm; - MOp = X86::MOVSDmr; - break; - } - const X86InstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned ValOpIdx = X86::AddrNumOperands; - unsigned VSrc = MI.getOperand(ValOpIdx).getReg(); - MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(FOp), - MRI.createVirtualRegister(MRI.getRegClass(VSrc))) - .addReg(VSrc); - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand &Operand = MI.getOperand(i); - // Clear any kill flags on register operands as we'll create a second - // instruction using the same address operands. - if (Operand.isReg()) - Operand.setIsKill(false); - MIB.add(Operand); - } - MachineInstr *FOpMI = MIB; - MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.add(MI.getOperand(i)); - MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); - MI.eraseFromParent(); // The pseudo instruction is gone now. - return BB; -} - MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -28652,7 +29720,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); - BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); + BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. @@ -29279,7 +30347,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, BuildMI(checkSspMBB, DL, TII->get(TestRROpc)) .addReg(SSPCopyReg) .addReg(SSPCopyReg); - BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB); + BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); checkSspMBB->addSuccessor(sinkMBB); checkSspMBB->addSuccessor(fallMBB); @@ -29309,7 +30377,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, .addReg(SSPCopyReg); // Jump to sink in case PrevSSPReg <= SSPCopyReg. - BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB); + BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE); fallMBB->addSuccessor(sinkMBB); fallMBB->addSuccessor(fixShadowMBB); @@ -29332,7 +30400,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, .addImm(8); // Jump if the result of the shift is zero. 
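Note: the hunk that continues below is another instance of the migration applied throughout this import: each per-condition branch opcode (JE_1, JNE_1, JAE_1, JBE_1, JG_1, ...) is replaced by the single JCC_1 opcode that carries its condition code as a trailing immediate operand. A minimal before/after sketch, assuming MBB, DL, TII and a TargetMBB are in scope exactly as in the surrounding code; the two forms never coexist in one tree, so the pairing is purely illustrative:

    // Before: one opcode per condition, often picked via
    // X86::GetCondBranchFromCond(X86::COND_E).
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(TargetMBB);
    // After: one generic conditional jump; the condition travels as an
    // immediate operand and is encoded/printed from there.
    BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(TargetMBB).addImm(X86::COND_E);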
- BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB); + BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); fixShadowMBB->addSuccessor(sinkMBB); fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB); @@ -29367,7 +30435,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg); // Jump if the counter is not zero yet. - BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB); + BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE); fixShadowLoopMBB->addSuccessor(sinkMBB); fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB); @@ -29512,10 +30580,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); - MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); - int FI = MFI.getFunctionContextIndex(); + int FI = MF->getFrameInfo().getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. @@ -29613,7 +30680,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); - BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB); + BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE); if (Subtarget.is64Bit()) { unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); @@ -29766,7 +30833,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: + case X86::CMOV_FR32X: case X86::CMOV_FR64: + case X86::CMOV_FR64X: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: @@ -29821,10 +30890,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } - case X86::RELEASE_FADD32mr: - case X86::RELEASE_FADD64mr: - return EmitLoweredAtomicFP(MI, BB); - case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: @@ -29836,27 +30901,37 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. - int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FNSTCW16m)), CWFrameIdx); + TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); - // Load the old value of the high byte of the control word... + // Load the old value of the control word... unsigned OldCW = + MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), + OrigCWFrameIdx); + + // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. + unsigned NewCW = + MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) + .addReg(OldCW, RegState::Kill).addImm(0xC00); + + // Extract to 16 bits. 
+ unsigned NewCW16 = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), - CWFrameIdx); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) + .addReg(NewCW, RegState::Kill, X86::sub_16bit); - // Set the high part to be round to zero... - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) - .addImm(0xC7F); + // Prepare memory for FLDCW. + int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), + NewCWFrameIdx) + .addReg(NewCW16, RegState::Kill); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FLDCW16m)), CWFrameIdx); - - // Restore the memory image of control word to original value - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) - .addReg(OldCW); + TII->get(X86::FLDCW16m)), NewCWFrameIdx); // Get the X86 opcode to use. unsigned Opc; @@ -29879,26 +30954,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FLDCW16m)), CWFrameIdx); + TII->get(X86::FLDCW16m)), OrigCWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } - // Thread synchronization. - case X86::MONITOR: - return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); - case X86::MONITORX: - return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); - - // Cache line zero - case X86::CLZERO: - return emitClzero(&MI, BB, Subtarget); - - // PKU feature - case X86::WRPKRU: - return emitWRPKRU(MI, BB, Subtarget); - case X86::RDPKRU: - return emitRDPKRU(MI, BB, Subtarget); + // xbegin case X86::XBEGIN: return emitXBegin(MI, BB, Subtarget.getInstrInfo()); @@ -30093,7 +31154,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1); - Known = Known.zextOrTrunc(BitWidth); + Known = Known.zextOrTrunc(BitWidth, false); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } @@ -30150,6 +31211,27 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = Known.trunc(BitWidth); break; } + case X86ISD::ANDNP: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + // ANDNP = (~X & Y); + Known.One &= Known2.Zero; + Known.Zero |= Known2.One; + break; + } + case X86ISD::FOR: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + // Output known-0 bits are only known if clear in both the LHS & RHS. + Known.Zero &= Known2.Zero; + // Output known-1 are known to be set if set in either the LHS | RHS. + Known.One |= Known2.One; + break; + } case X86ISD::CMOV: { Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1); // If we don't know any bits, early out. 
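Note on the control-word rewrite above: instead of storing the fixed constant 0xC7F as the new FPU control word, the new sequence loads the current control word and ORs in 0xC00, which forces bits 10-11 (the rounding-control field) to 0b11 — round toward zero — while preserving the precision-control and exception-mask bits. A self-contained sketch of just the bit arithmetic; the 0x037F starting value is an assumed example, not taken from the patch:

    #include <cstdint>

    int main() {
      uint16_t OldCW = 0x037F;        // assumed example control word
      uint16_t NewCW = OldCW | 0xC00; // RC field (bits 10-11) := 0b11
      return NewCW == 0x0F7F ? 0 : 1; // truncation mode, other bits intact
    }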
@@ -30219,7 +31301,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
     unsigned Depth) const {
-  unsigned VTBits = Op.getScalarValueSizeInBits();
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getScalarSizeInBits();
   unsigned Opcode = Op.getOpcode();
   switch (Opcode) {
   case X86ISD::SETCC_CARRY:
@@ -30257,7 +31340,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
 
   case X86ISD::VSHLI: {
     SDValue Src = Op.getOperand(0);
-    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+    const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
     if (ShiftVal.uge(VTBits))
       return VTBits; // Shifted all bits out --> zero.
     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
@@ -30268,7 +31351,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
 
   case X86ISD::VSRAI: {
     SDValue Src = Op.getOperand(0);
-    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+    APInt ShiftVal = Op.getConstantOperandAPInt(1);
     if (ShiftVal.uge(VTBits - 1))
       return VTBits; // Sign splat.
     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
@@ -30284,6 +31367,15 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
     // Vector compares return zero/all-bits result values.
     return VTBits;
 
+  case X86ISD::ANDNP: {
+    unsigned Tmp0 =
+        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    if (Tmp0 == 1) return 1; // Early out.
+    unsigned Tmp1 =
+        DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    return std::min(Tmp0, Tmp1);
+  }
+
   case X86ISD::CMOV: {
     unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
     if (Tmp0 == 1) return 1; // Early out.
@@ -30292,6 +31384,54 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
   }
   }
 
+  // Handle target shuffles.
+  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
+  if (isTargetShuffle(Opcode)) {
+    bool IsUnary;
+    SmallVector<int, 64> Mask;
+    SmallVector<SDValue, 2> Ops;
+    if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
+                             IsUnary)) {
+      unsigned NumOps = Ops.size();
+      unsigned NumElts = VT.getVectorNumElements();
+      if (Mask.size() == NumElts) {
+        SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
+        for (unsigned i = 0; i != NumElts; ++i) {
+          if (!DemandedElts[i])
+            continue;
+          int M = Mask[i];
+          if (M == SM_SentinelUndef) {
+            // For UNDEF elements, we don't know anything about the common state
+            // of the shuffle result.
+            return 1;
+          } else if (M == SM_SentinelZero) {
+            // Zero = all sign bits.
+            continue;
+          }
+          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
                 "Shuffle index out of range");
+
+          unsigned OpIdx = (unsigned)M / NumElts;
+          unsigned EltIdx = (unsigned)M % NumElts;
+          if (Ops[OpIdx].getValueType() != VT) {
+            // TODO - handle target shuffle ops with different value types.
+            return 1;
+          }
+          DemandedOps[OpIdx].setBit(EltIdx);
+        }
+        unsigned Tmp0 = VTBits;
+        for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
+          if (!DemandedOps[i])
+            continue;
+          unsigned Tmp1 =
+              DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
+          Tmp0 = std::min(Tmp0, Tmp1);
+        }
+        return Tmp0;
+      }
+    }
+  }
+
+  // Fallback case.
   return 1;
 }
 
@@ -30305,12 +31445,11 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
 
 // Attempt to match a combined shuffle mask against supported unary shuffle
 // instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
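Note on the new X86ISD::ANDNP sign-bits case above, before the matchUnaryShuffle rename continues below: since ~X has exactly as many redundant sign bits as X, the conservative CMOV rule min(signbits(X), signbits(Y)) is also sound for ANDNP = (~X & Y) — if the top K bits of each input are copies of its sign bit, the top K bits of the AND are too. A scalar model of the invariant; numSignBits here is a hypothetical stand-in for DAG.ComputeNumSignBits, not LLVM API:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Leading bits equal to the sign bit, including the sign bit itself.
    static int numSignBits(int32_t V) {
      uint32_t U = static_cast<uint32_t>(V);
      if (V < 0)
        U = ~U; // turn the run of sign copies into a run of zeros
      int N = 0;
      for (int I = 31; I >= 0 && !((U >> I) & 1); --I)
        ++N;
      return N;
    }

    int main() {
      for (int32_t X : {-7, -1, 0, 3, 1 << 20})
        for (int32_t Y : {-5, -1, 0, 9})
          assert(numSignBits(~X & Y) >=
                 std::min(numSignBits(X), numSignBits(Y)));
      return 0;
    }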
-static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, - bool AllowFloatDomain, bool AllowIntDomain, - SDValue &V1, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { +static bool matchUnaryShuffle(MVT MaskVT, ArrayRef Mask, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget, unsigned &Shuffle, + MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); @@ -30322,19 +31461,25 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, return true; } - // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. + // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction. // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { - bool Match = true; + bool MatchAny = true; + bool MatchZero = true; unsigned NumDstElts = NumMaskElts / Scale; - for (unsigned i = 0; i != NumDstElts && Match; ++i) { - Match &= isUndefOrEqual(Mask[i * Scale], (int)i); - Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); + for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) { + if (!isUndefOrEqual(Mask[i * Scale], (int)i)) { + MatchAny = MatchZero = false; + break; + } + MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1); + MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } - if (Match) { + if (MatchAny || MatchZero) { + assert(MatchZero && "Failed to match zext but matched aext?"); unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : MVT::getIntegerVT(MaskEltSize); @@ -30343,10 +31488,9 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); - if (SrcVT.getVectorNumElements() == NumDstElts) - Shuffle = unsigned(ISD::ZERO_EXTEND); - else - Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); + Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND); + if (SrcVT.getVectorNumElements() != NumDstElts) + Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); @@ -30368,7 +31512,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { - if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) { + if (isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; @@ -30426,29 +31570,18 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, } } - // Attempt to match against broadcast-from-vector. - if (Subtarget.hasAVX2()) { - SmallVector BroadcastMask(NumMaskElts, 0); - if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { - SrcVT = DstVT = MaskVT; - Shuffle = X86ISD::VBROADCAST; - return true; - } - } - return false; } // Attempt to match a combined shuffle mask against supported unary immediate // permute instructions. 
// TODO: Investigate sharing more of this with shuffle lowering. -static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, - const APInt &Zeroable, - bool AllowFloatDomain, - bool AllowIntDomain, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &ShuffleVT, - unsigned &PermuteImm) { +static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef Mask, + const APInt &Zeroable, + bool AllowFloatDomain, bool AllowIntDomain, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT, + unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; @@ -30549,9 +31682,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, // FIXME: Add 512-bit support. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { - int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, - MaskScalarSizeInBits, Mask, - 0, Zeroable, Subtarget); + int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, + Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt) { PermuteImm = (unsigned)ShiftAmt; return true; @@ -30564,13 +31696,12 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. -static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef Mask, - bool AllowFloatDomain, bool AllowIntDomain, - SDValue &V1, SDValue &V2, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, - bool IsUnary) { +static bool matchBinaryShuffle(MVT MaskVT, ArrayRef Mask, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, SDValue &V2, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, + bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { @@ -30631,7 +31762,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef Mask, return false; } -static bool matchBinaryPermuteVectorShuffle( +static bool matchBinaryPermuteShuffle( MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, @@ -30642,7 +31773,7 @@ static bool matchBinaryPermuteVectorShuffle( // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { - int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); + int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); @@ -30678,34 +31809,11 @@ static bool matchBinaryPermuteVectorShuffle( return true; } } else { - // Determine a type compatible with X86ISD::BLENDI. 
- ShuffleVT = MaskVT; - if (Subtarget.hasAVX2()) { - if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v8i32; - else if (ShuffleVT == MVT::v2i64) - ShuffleVT = MVT::v4i32; - } else { - if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) - ShuffleVT = MVT::v8i16; - else if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v4f64; - else if (ShuffleVT == MVT::v8i32) - ShuffleVT = MVT::v8f32; - } - - if (!ShuffleVT.isFloatingPoint()) { - int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); - BlendMask = - scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); - ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); - ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); - } - V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; + ShuffleVT = MaskVT; return true; } } @@ -30715,7 +31823,7 @@ static bool matchBinaryPermuteVectorShuffle( if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { if (Zeroable.getBoolValue() && - matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; return true; @@ -30727,7 +31835,7 @@ static bool matchBinaryPermuteVectorShuffle( ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { - if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { + if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; @@ -30784,6 +31892,11 @@ static bool matchBinaryPermuteVectorShuffle( return false; } +static SDValue combineX86ShuffleChainWithExtract( + ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, + bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, + const X86Subtarget &Subtarget); + /// Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// @@ -30841,6 +31954,24 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); + // Attempt to match a subvector broadcast. + // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0) + if (UnaryShuffle && + (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) { + SmallVector BroadcastMask(NumBaseMaskElts, 0); + if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) { + SDValue Src = Inputs[0]; + if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(0).isUndef() && + Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits && + MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) { + return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL, + Src.getValueType(), + Src.getOperand(1))); + } + } + } + // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. // Handle 128-bit lane shuffles of 256-bit vectors. @@ -30894,6 +32025,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. + // TODO: Should we indicate which domain is preferred if both are allowed? 
bool AllowFloatDomain = FloatDomain || (Depth > 3); bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); @@ -30909,8 +32041,11 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // directly if we don't shuffle the lower element and we shuffle the upper // (zero) elements within themselves. if (V1.getOpcode() == X86ISD::VZEXT_LOAD && - (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) { - unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits; + (cast(V1)->getMemoryVT().getScalarSizeInBits() % + MaskEltSizeInBits) == 0) { + unsigned Scale = + cast(V1)->getMemoryVT().getScalarSizeInBits() / + MaskEltSizeInBits; ArrayRef HiMask(Mask.data() + Scale, NumMaskElts - Scale); if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { @@ -30918,10 +32053,35 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } } + // Attempt to match against broadcast-from-vector. + // Limit AVX1 to cases where we're loading+broadcasting a scalar element. + if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) + && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) { + SmallVector BroadcastMask(NumMaskElts, 0); + if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { + if (V1.getValueType() == MaskVT && + V1.getOpcode() == ISD::SCALAR_TO_VECTOR && + MayFoldLoad(V1.getOperand(0))) { + if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + return SDValue(); // Nothing to do! + Res = V1.getOperand(0); + Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); + return DAG.getBitcast(RootVT, Res); + } + if (Subtarget.hasAVX2()) { + if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + return SDValue(); // Nothing to do! + Res = DAG.getBitcast(MaskVT, V1); + Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); + return DAG.getBitcast(RootVT, Res); + } + } + } + SDValue NewV1 = V1; // Save operand in case early exit happens. - if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, - NewV1, DL, DAG, Subtarget, Shuffle, - ShuffleSrcVT, ShuffleVT) && + if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, + DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! @@ -30930,9 +32090,9 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, - AllowIntDomain, Subtarget, Shuffle, - ShuffleVT, PermuteImm) && + if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, + AllowIntDomain, Subtarget, Shuffle, ShuffleVT, + PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! @@ -30945,9 +32105,9 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, SDValue NewV1 = V1; // Save operands in case early exit happens. 
SDValue NewV2 = V2; - if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, - NewV1, NewV2, DL, DAG, Subtarget, Shuffle, - ShuffleSrcVT, ShuffleVT, UnaryShuffle) && + if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, + NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! @@ -30959,7 +32119,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, NewV1 = V1; // Save operands in case early exit happens. NewV2 = V2; - if (matchBinaryPermuteVectorShuffle( + if (matchBinaryPermuteShuffle( MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { @@ -30979,8 +32139,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // Annoyingly, SSE4A instructions don't map into the above match helpers. if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { uint64_t BitLen, BitIdx; - if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, - Zeroable)) { + if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, + Zeroable)) { if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); @@ -30990,7 +32150,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } - if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { + if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); @@ -31057,6 +32217,13 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } + // If that failed and either input is extracted then try to combine as a + // shuffle with the larger type. + if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( + Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask, + DAG, Subtarget)) + return WideShuffle; + // If we have a dual input lane-crossing shuffle then lower to VPERMV3. if (AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && @@ -31222,10 +32389,145 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } + // If that failed and either input is extracted then try to combine as a + // shuffle with the larger type. + if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( + Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask, + DAG, Subtarget)) + return WideShuffle; + + // If we have a dual input shuffle then lower to VPERMV3. 
+  if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+      ((Subtarget.hasAVX512() &&
+        (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+         MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+       (Subtarget.hasVLX() &&
+        (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
+         MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
+         MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+       (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+       (Subtarget.hasBWI() && Subtarget.hasVLX() &&
+        (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
+       (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+       (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
+        (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
+    SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+    V1 = DAG.getBitcast(MaskVT, V1);
+    V2 = DAG.getBitcast(MaskVT, V2);
+    Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+    return DAG.getBitcast(RootVT, Res);
+  }
+
   // Failed to find any combines.
   return SDValue();
 }
 
+// Combine an arbitrary chain of shuffles + extract_subvectors into a single
+// instruction if possible.
+//
+// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
+// type size to attempt to combine:
+// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
+// -->
+// extract_subvector(shuffle(x,y,m2),0)
+static SDValue combineX86ShuffleChainWithExtract(
+    ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
+    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+    const X86Subtarget &Subtarget) {
+  unsigned NumMaskElts = BaseMask.size();
+  unsigned NumInputs = Inputs.size();
+  if (NumInputs == 0)
+    return SDValue();
+
+  SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
+  SmallVector<unsigned, 4> Offsets(NumInputs, 0);
+
+  // Peek through subvectors.
+  // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
+  unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
+  for (unsigned i = 0; i != NumInputs; ++i) {
+    SDValue &Src = WideInputs[i];
+    unsigned &Offset = Offsets[i];
+    Src = peekThroughBitcasts(Src);
+    EVT BaseVT = Src.getValueType();
+    while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+           isa<ConstantSDNode>(Src.getOperand(1))) {
+      Offset += Src.getConstantOperandVal(1);
+      Src = Src.getOperand(0);
+    }
+    WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
+    assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
+           "Unexpected subvector extraction");
+    Offset /= BaseVT.getVectorNumElements();
+    Offset *= NumMaskElts;
+  }
+
+  // Bail if we're always extracting from the lowest subvectors,
+  // combineX86ShuffleChain should match this for the current width.
+  if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
+    return SDValue();
+
+  EVT RootVT = Root.getValueType();
+  unsigned RootSizeInBits = RootVT.getSizeInBits();
+  unsigned Scale = WideSizeInBits / RootSizeInBits;
+  assert((WideSizeInBits % RootSizeInBits) == 0 &&
+         "Unexpected subvector extraction");
+
+  // If the src vector types aren't the same, see if we can extend
+  // them to match each other.
+  // TODO: Support different scalar types?
+  EVT WideSVT = WideInputs[0].getValueType().getScalarType();
+  if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
+        return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
+               Op.getValueType().getScalarType() != WideSVT;
+      }))
+    return SDValue();
+
+  for (SDValue &NewInput : WideInputs) {
+    assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
+           "Shuffle vector size mismatch");
+    if (WideSizeInBits > NewInput.getValueSizeInBits())
+      NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
+                                SDLoc(NewInput), WideSizeInBits);
+    assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
+           "Unexpected subvector extraction");
+  }
+
+  // Create new mask for larger type.
+  for (unsigned i = 1; i != NumInputs; ++i)
+    Offsets[i] += i * Scale * NumMaskElts;
+
+  SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
+  for (int &M : WideMask) {
+    if (M < 0)
+      continue;
+    M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
+  }
+  WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
+
+  // Remove unused/repeated shuffle source ops.
+  resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
+  assert(!WideInputs.empty() && "Shuffle with no inputs detected");
+
+  if (WideInputs.size() > 2)
+    return SDValue();
+
+  // Increase depth for every upper subvector we've peeked through.
+  Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
+
+  // Attempt to combine wider chain.
+  // TODO: Can we use a better Root?
+  SDValue WideRoot = WideInputs[0];
+  if (SDValue WideShuffle = combineX86ShuffleChain(
+          WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
+          AllowVariableMask, DAG, Subtarget)) {
+    WideShuffle =
+        extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
+    return DAG.getBitcast(RootVT, WideShuffle);
+  }
+  return SDValue();
+}
+
 // Attempt to constant fold all of the constant source ops.
 // Returns true if the entire shuffle is folded to a constant.
 // TODO: Extend this to merge multiple constant Ops and update the mask.
@@ -31370,19 +32672,10 @@ static SDValue combineX86ShufflesRecursively(
   if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
     return SDValue();
 
-  // TODO - Add support for more than 2 inputs.
-  if (2 < OpInputs.size())
-    return SDValue();
-
-  SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
-  SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
-
   // Add the inputs to the Ops list, avoiding duplicates.
   SmallVector<SDValue, 2> Ops(SrcOps.begin(), SrcOps.end());
   auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
-    if (!Input)
-      return -1;
     // Attempt to find an existing match.
     SDValue InputBC = peekThroughBitcasts(Input);
     for (int i = 0, e = Ops.size(); i < e; ++i)
@@ -31398,8 +32691,9 @@ static SDValue combineX86ShufflesRecursively(
     return Ops.size() - 1;
   };
 
-  int InputIdx0 = AddOp(Input0, SrcOpIndex);
-  int InputIdx1 = AddOp(Input1, -1);
+  SmallVector<int, 4> OpInputIdx;
+  for (SDValue OpInput : OpInputs)
+    OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
 
   assert(((RootMask.size() > OpMask.size() &&
            RootMask.size() % OpMask.size() == 0) ||
@@ -31471,13 +32765,9 @@ static SDValue combineX86ShufflesRecursively(
                        : (OpMask[OpIdx] << OpRatioLog2) +
                          (RootMaskedIdx & (OpRatio - 1));
 
     OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
-    if (OpMask[OpIdx] < (int)OpMask.size()) {
-      assert(0 <= InputIdx0 && "Unknown target shuffle input");
-      OpMaskedIdx += InputIdx0 * MaskWidth;
-    } else {
-      assert(0 <= InputIdx1 && "Unknown target shuffle input");
-      OpMaskedIdx += InputIdx1 * MaskWidth;
-    }
+    int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
+    assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
+    OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
 
     Mask[i] = OpMaskedIdx;
   }
@@ -31493,7 +32783,7 @@ static SDValue combineX86ShufflesRecursively(
     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
                          SDLoc(Root));
 
-  // Remove unused shuffle source ops.
+  // Remove unused/repeated shuffle source ops.
   resolveTargetShuffleInputsAndMask(Ops, Mask);
   assert(!Ops.empty() && "Shuffle with no inputs detected");
 
@@ -31530,29 +32820,42 @@ static SDValue combineX86ShufflesRecursively(
     return Cst;
 
   // We can only combine unary and binary shuffle mask cases.
-  if (Ops.size() > 2)
-    return SDValue();
+  if (Ops.size() <= 2) {
+    // Minor canonicalization of the accumulated shuffle mask to make it easier
+    // to match below. All this does is detect masks with sequential pairs of
+    // elements, and shrink them to the half-width mask. It does this in a loop
+    // so it will reduce the size of the mask to the minimal width mask which
+    // performs an equivalent shuffle.
+    SmallVector<int, 64> WidenedMask;
+    while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+      Mask = std::move(WidenedMask);
+    }
+
+    // Canonicalization of binary shuffle masks to improve pattern matching by
+    // commuting the inputs.
+    if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+      ShuffleVectorSDNode::commuteMask(Mask);
+      std::swap(Ops[0], Ops[1]);
+    }
 
-  // Minor canonicalization of the accumulated shuffle mask to make it easier
-  // to match below. All this does is detect masks with sequential pairs of
-  // elements, and shrink them to the half-width mask. It does this in a loop
-  // so it will reduce the size of the mask to the minimal width mask which
-  // performs an equivalent shuffle.
-  SmallVector<int, 64> WidenedMask;
-  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
-    Mask = std::move(WidenedMask);
+    // Finally, try to combine into a single shuffle instruction.
+    return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+                                  AllowVariableMask, DAG, Subtarget);
   }
 
-  // Canonicalization of binary shuffle masks to improve pattern matching by
-  // commuting the inputs.
-  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
-    ShuffleVectorSDNode::commuteMask(Mask);
-    std::swap(Ops[0], Ops[1]);
-  }
+  // If that failed and any input is extracted then try to combine as a
+  // shuffle with the larger type.
+  return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
+                                           HasVariableMask, AllowVariableMask,
+                                           DAG, Subtarget);
+}
 
-  // Finally, try to combine into a single shuffle instruction.
-  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
-                                AllowVariableMask, DAG, Subtarget);
+/// Helper entry wrapper to combineX86ShufflesRecursively.
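A note on the index arithmetic in combineX86ShuffleChainWithExtract above (the helper-entry wrapper whose doc comment appears just above is defined immediately below): each extract offset is first converted to mask-element units, each input is then biased by its slot in the implicit wide concatenation, and every mask element M is remapped as (M % NumMaskElts) + Offsets[M / NumMaskElts]. A worked example under assumed shapes — a v4 shuffle whose two inputs are the high halves (extract index 4) of two distinct v8 vectors x and y:

    #include <cassert>
    #include <vector>

    int main() {
      const int NumMaskElts = 4; // root mask width
      const int BaseElts = 4;    // elements in each extracted input
      const int Scale = 2;       // wide size / root size
      std::vector<int> Mask = {1, 6, -1, 3}; // -1 ~ SM_SentinelUndef
      int Offsets[2] = {4, 4};   // element offsets of the two extracts

      // Offset -> mask units, then bias input i by i * Scale * NumMaskElts.
      for (int i = 0; i != 2; ++i)
        Offsets[i] = Offsets[i] / BaseElts * NumMaskElts + i * Scale * NumMaskElts;

      for (int &M : Mask)
        if (M >= 0)
          M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
      Mask.resize(Scale * NumMaskElts, -1); // pad upper half with undef

      // In concat(x, y) element numbering: x[5], y[6], undef, x[7].
      assert((Mask == std::vector<int>{5, 14, -1, 7, -1, -1, -1, -1}));
      return 0;
    }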
+static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, + /*HasVarMask*/ false, + /*AllowVarMask*/ true, DAG, Subtarget); } /// Get the PSHUF-style mask from PSHUF node. @@ -31770,12 +33073,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, switch (Opcode) { case X86ISD::VBROADCAST: { - // If broadcasting from another shuffle, attempt to simplify it. - // TODO - we really need a general SimplifyDemandedVectorElts mechanism. SDValue Src = N.getOperand(0); SDValue BC = peekThroughBitcasts(Src); EVT SrcVT = Src.getValueType(); EVT BCVT = BC.getValueType(); + + // If broadcasting from another shuffle, attempt to simplify it. + // TODO - we really need a general SimplifyDemandedVectorElts mechanism. if (isTargetShuffle(BC.getOpcode()) && VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) { unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits(); @@ -31789,6 +33093,71 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); } + + // broadcast(bitcast(src)) -> bitcast(broadcast(src)) + // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward. + if (Src.getOpcode() == ISD::BITCAST && + SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) { + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(), + VT.getVectorNumElements()); + return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC)); + } + + // Reduce broadcast source vector to lowest 128-bits. + if (SrcVT.getSizeInBits() > 128) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + extract128BitVector(Src, 0, DAG, DL)); + + // broadcast(scalar_to_vector(x)) -> broadcast(x). + if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); + + // Share broadcast with the longest vector and extract low subvector (free). + for (SDNode *User : Src->uses()) + if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && + User->getValueSizeInBits(0) > VT.getSizeInBits()) { + return extractSubVector(SDValue(User, 0), 0, DAG, DL, + VT.getSizeInBits()); + } + + return SDValue(); + } + case X86ISD::BLENDI: { + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + + // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types. + // TODO: Handle MVT::v16i16 repeated blend mask. + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && + N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) { + MVT SrcVT = N0.getOperand(0).getSimpleValueType(); + if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 && + SrcVT.getScalarSizeInBits() >= 32) { + unsigned Mask = N.getConstantOperandVal(2); + unsigned Size = VT.getVectorNumElements(); + unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); + unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), + N1.getOperand(0), + DAG.getConstant(ScaleMask, DL, MVT::i8))); + } + } + return SDValue(); + } + case X86ISD::VPERMI: { + // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements. + // TODO: Remove when we have preferred domains in combineX86ShuffleChain. 
+ SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + if (N0.getOpcode() == ISD::BITCAST && + N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) { + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1); + return DAG.getBitcast(VT, Res); + } return SDValue(); } case X86ISD::PSHUFD: @@ -32212,8 +33581,22 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, /// Eliminate a redundant shuffle of a horizontal math op. static SDValue foldShuffleOfHorizOp(SDNode *N) { - if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) - return SDValue(); + unsigned Opcode = N->getOpcode(); + if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST) + if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) + return SDValue(); + + // For a broadcast, peek through an extract element of index 0 to find the + // horizontal op: broadcast (ext_vec_elt HOp, 0) + EVT VT = N->getValueType(0); + if (Opcode == X86ISD::VBROADCAST) { + SDValue SrcOp = N->getOperand(0); + if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + SrcOp.getValueType() == MVT::f64 && + SrcOp.getOperand(0).getValueType() == VT && + isNullConstant(SrcOp.getOperand(1))) + N = SrcOp.getNode(); + } SDValue HOp = N->getOperand(0); if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD && @@ -32224,13 +33607,25 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { // lanes of each operand as: // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3] // ...similarly for v2f64 and v8i16. - // TODO: Handle UNDEF operands. - if (HOp.getOperand(0) != HOp.getOperand(1)) + if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() && + HOp.getOperand(0) != HOp.getOperand(1)) return SDValue(); // When the operands of a horizontal math op are identical, the low half of - // the result is the same as the high half. If the shuffle is also replicating - // low and high halves, we don't need the shuffle. + // the result is the same as the high half. If a target shuffle is also + // replicating low and high halves, we don't need the shuffle. + if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) { + if (HOp.getScalarValueSizeInBits() == 64) { + // movddup (hadd X, X) --> hadd X, X + // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X + assert((HOp.getValueType() == MVT::v2f64 || + HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT && + "Unexpected type for h-op"); + return HOp; + } + return SDValue(); + } + // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X ArrayRef Mask = cast(N)->getMask(); // TODO: Other mask possibilities like {1,1} and {1,0} could be added here, @@ -32252,14 +33647,51 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { return SDValue(); } +/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the +/// low half of each source vector and does not set any high half elements in +/// the destination vector, narrow the shuffle to half its original size. +static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { + if (!Shuf->getValueType(0).isSimple()) + return SDValue(); + MVT VT = Shuf->getSimpleValueType(0); + if (!VT.is256BitVector() && !VT.is512BitVector()) + return SDValue(); + + // See if we can ignore all of the high elements of the shuffle. 
+  ArrayRef<int> Mask = Shuf->getMask();
+  if (!isUndefUpperHalf(Mask))
+    return SDValue();
+
+  // Check if the shuffle mask accesses only the low half of each input vector
+  // (half-index output is 0 or 2).
+  int HalfIdx1, HalfIdx2;
+  SmallVector<int, 8> HalfMask(Mask.size() / 2);
+  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
+      (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
+    return SDValue();
+
+  // Create a half-width shuffle to replace the unnecessarily wide shuffle.
+  // The trick is knowing that all of the insert/extract are actually free
+  // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
+  // of narrow inputs into a narrow output, and that is always cheaper than
+  // the wide shuffle that we started with.
+  return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
+                               Shuf->getOperand(1), HalfMask, HalfIdx1,
+                               HalfIdx2, false, DAG);
+}
+
 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
+  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
+    if (SDValue V = narrowShuffle(Shuf, DAG))
+      return V;
+
+  // If we have legalized the vector types, look for blends of FADD and FSUB
+  // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  // If we have legalized the vector types, look for blends of FADD and FSUB
-  // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
   if (TLI.isTypeLegal(VT)) {
     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
       return AddSub;
@@ -32328,23 +33760,9 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
-  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
-  // consecutive, non-overlapping, and in the right order.
-  SmallVector<SDValue, 16> Elts;
-  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
-    if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
-      Elts.push_back(Elt);
-      continue;
-    }
-    Elts.clear();
-    break;
-  }
-
-  if (Elts.size() == VT.getVectorNumElements())
-    if (SDValue LD =
-            EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
-      return LD;
+  // Attempt to combine into a vector load/broadcast.
+  if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
+    return LD;
 
   // For AVX2, we sometimes want to combine
   // (vector_shuffle (concat_vectors t1, undef)
@@ -32365,9 +33783,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
     // specific PSHUF instruction sequences into their minimal form so that we
     // can evaluate how many specialized shuffle instructions are involved in
     // a particular chain.
-    if (SDValue Res = combineX86ShufflesRecursively(
-            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
-            /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
       return Res;
 
     // Simplify source operands based on shuffle mask.
@@ -32378,6 +33794,68 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
       return SDValue(N, 0);
   }
 
+  // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
+  // in the upper 64 bits.
+  // TODO: Can we generalize this using computeKnownBits.
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && + (VT == MVT::v2f64 || VT == MVT::v2i64) && + N->getOperand(0).getOpcode() == ISD::BITCAST && + (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 || + N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) { + SDValue In = N->getOperand(0).getOperand(0); + switch (In.getOpcode()) { + default: + break; + case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: + case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI: + case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: + case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI: + case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: + case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P: + case X86ISD::VFPROUND: case X86ISD::VMFPROUND: + if (In.getOperand(0).getValueType() == MVT::v2f64 || + In.getOperand(0).getValueType() == MVT::v2i64) + return N->getOperand(0); // return the bitcast + break; + } + } + + // Pull subvector inserts into undef through VZEXT_MOVL by making it an + // insert into a zero vector. This helps get VZEXT_MOVL closer to + // scalar_to_vectors where 256/512 are canonicalized to an insert and a + // 128-bit scalar_to_vector. This reduces the number of isel patterns. + if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() && + N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR && + N->getOperand(0).hasOneUse() && + N->getOperand(0).getOperand(0).isUndef() && + isNullConstant(N->getOperand(0).getOperand(2))) { + SDValue In = N->getOperand(0).getOperand(1); + SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, + getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl), + Movl, N->getOperand(0).getOperand(2)); + } + + // If this a vzmovl of a full vector load, replace it with a vzload, unless + // the load is volatile. + if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() && + ISD::isNormalLoad(N->getOperand(0).getNode())) { + LoadSDNode *LN = cast(N->getOperand(0)); + if (!LN->isVolatile()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + VT.getVectorElementType(), + LN->getPointerInfo(), + LN->getAlignment(), + MachineMemOperand::MOLoad); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + return VZLoad; + } + } + + // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. // FIXME: This can probably go away once we default to widening legalization. @@ -32436,6 +33914,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Handle special case opcodes. switch (Opc) { + case X86ISD::PMULDQ: + case X86ISD::PMULUDQ: { + APInt LHSUndef, LHSZero; + APInt RHSUndef, RHSZero; + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, + Depth + 1)) + return true; + if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, + Depth + 1)) + return true; + // Multiply by zero. 
+ KnownZero = LHSZero | RHSZero; + break; + } case X86ISD::VSHL: case X86ISD::VSRL: case X86ISD::VSRA: { @@ -32443,11 +33937,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Amt = Op.getOperand(1); MVT AmtVT = Amt.getSimpleValueType(); assert(AmtVT.is128BitVector() && "Unexpected value type"); + + // If we reuse the shift amount just for sse shift amounts then we know that + // only the bottom 64-bits are only ever used. + bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) { + unsigned UseOpc = Use->getOpcode(); + return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL || + UseOpc == X86ISD::VSRA) && + Use->getOperand(0) != Amt; + }); + APInt AmtUndef, AmtZero; unsigned NumAmtElts = AmtVT.getVectorNumElements(); APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2); if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO, - Depth + 1)) + Depth + 1, AssumeSingleUse)) return true; LLVM_FALLTHROUGH; } @@ -32487,6 +33991,58 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: { + APInt DemandedLHS, DemandedRHS; + getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); + + APInt LHSUndef, LHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef, + LHSZero, TLO, Depth + 1)) + return true; + APInt RHSUndef, RHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef, + RHSZero, TLO, Depth + 1)) + return true; + break; + } + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); + APInt SrcUndef, SrcZero; + if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO, + Depth + 1)) + return true; + KnownZero = SrcZero.zextOrTrunc(NumElts); + KnownUndef = SrcUndef.zextOrTrunc(NumElts); + break; + } + case X86ISD::BLENDV: { + APInt SelUndef, SelZero; + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef, + SelZero, TLO, Depth + 1)) + return true; + + // TODO: Use SelZero to adjust LHS/RHS DemandedElts. + APInt LHSUndef, LHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef, + LHSZero, TLO, Depth + 1)) + return true; + + APInt RHSUndef, RHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef, + RHSZero, TLO, Depth + 1)) + return true; + + KnownZero = LHSZero & RHSZero; + KnownUndef = LHSUndef & RHSUndef; + break; + } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -32494,7 +34050,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return false; // Don't bother broadcasting if we just need the 0'th element. if (DemandedElts == 1) { - if(Src.getValueType() != VT) + if (Src.getValueType() != VT) Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG, SDLoc(Op)); return TLO.CombineTo(Op, Src); @@ -32506,8 +34062,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } - case X86ISD::PSHUFB: { - // TODO - simplify other variable shuffle masks. + case X86ISD::SUBV_BROADCAST: { + // Reduce size of broadcast if we don't need the upper half. 
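One note on the PMULDQ/PMULUDQ demanded-elements case above, before the broadcast-narrowing case resumes below: the per-lane known-zero sets of the two factors are combined with a union, because a lane in which either factor is known zero yields a zero product regardless of the other operand — hence KnownZero = LHSZero | RHSZero. A trivial scalar check (illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t A = 0, B = 0x123456789ABCDEF0ULL;
      assert(A * B == 0 && B * A == 0); // either zero factor zeroes the lane
      return 0;
    }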
+ unsigned HalfElts = NumElts / 2; + if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + SDValue Half = Src; + if (SrcVT.getVectorNumElements() != HalfElts) { + MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts); + Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src); + } + + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0, + TLO.DAG, SDLoc(Op), + Half.getValueSizeInBits())); + } + break; + } + case X86ISD::VPERMV: { + SDValue Mask = Op.getOperand(0); + APInt MaskUndef, MaskZero; + if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, + Depth + 1)) + return true; + break; + } + case X86ISD::PSHUFB: + case X86ISD::VPERMV3: + case X86ISD::VPERMILPV: { SDValue Mask = Op.getOperand(1); APInt MaskUndef, MaskZero; if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, @@ -32515,6 +34099,106 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } + case X86ISD::VPPERM: + case X86ISD::VPERMIL2: { + SDValue Mask = Op.getOperand(2); + APInt MaskUndef, MaskZero; + if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, + Depth + 1)) + return true; + break; + } + } + + // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not + // demand any of the high elements, then narrow the op to 128/256-bits: e.g. + // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0 + if ((VT.is256BitVector() || VT.is512BitVector()) && + DemandedElts.lshr(NumElts / 2) == 0) { + unsigned SizeInBits = VT.getSizeInBits(); + unsigned ExtSizeInBits = SizeInBits / 2; + + // See if 512-bit ops only use the bottom 128-bits. + if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0) + ExtSizeInBits = SizeInBits / 4; + + switch (Opc) { + // Zero upper elements. + case X86ISD::VZEXT_MOVL: { + SDLoc DL(Op); + SDValue Ext0 = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue ExtOp = + TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = + insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); + return TLO.CombineTo(Op, Insert); + } + // Byte shifts by immediate. + case X86ISD::VSHLDQ: + case X86ISD::VSRLDQ: + // Shift by uniform. + case X86ISD::VSHL: + case X86ISD::VSRL: + case X86ISD::VSRA: + // Shift by immediate. + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: { + SDLoc DL(Op); + SDValue Ext0 = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue ExtOp = + TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1)); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = + insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); + return TLO.CombineTo(Op, Insert); + } + case X86ISD::VPERMI: { + // Simplify PERMPD/PERMQ to extract_subvector. + // TODO: This should be done in shuffle combining. + if (VT == MVT::v4f64 || VT == MVT::v4i64) { + SmallVector Mask; + DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask); + if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) { + SDLoc DL(Op); + SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128); + return TLO.CombineTo(Op, Insert); + } + } + break; + } + // Target Shuffles. 
+ case X86ISD::PSHUFB:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ // Saturated Packs.
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ // Horizontal Ops.
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: {
+ SDLoc DL(Op);
+ MVT ExtVT = VT.getSimpleVT();
+ ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
+ ExtSizeInBits / ExtVT.getScalarSizeInBits());
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue Ext1 =
+ extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
}
// Simplify target shuffles.
@@ -32606,9 +34290,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
- if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
return true;
- if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
return true;
break;
}
@@ -32727,6 +34413,97 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
break;
}
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW: {
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ MVT VecVT = Vec.getSimpleValueType();
+ unsigned NumVecElts = VecVT.getVectorNumElements();
+
+ if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
+ unsigned Idx = CIdx->getZExtValue();
+ unsigned VecBitWidth = VecVT.getScalarSizeInBits();
+
+ // If we demand no bits from the vector then we must have demanded
+ // bits from the implicit zext - simplify to zero.
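// Illustrative sketch (not part of the patch): the implicit-zext fact the
// combine above relies on, in plain C++. A PEXTRW-style read zero-extends
// the 16-bit lane, so every demanded bit above bit 15 is already known zero;
// the helper name is hypothetical.
#include <cstdint>
static uint32_t pextrwLane(const uint16_t Lanes[8], unsigned Idx) {
  return static_cast<uint32_t>(Lanes[Idx]); // bits 16..31 are always zero
}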
+ APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
+ if (DemandedVecBits == 0)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ APInt KnownUndef, KnownZero;
+ APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
+ if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownVec;
+ if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ Known = KnownVec.zext(BitWidth, true);
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Scl = Op.getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
+ unsigned Idx = CIdx->getZExtValue();
+ if (!OriginalDemandedElts[Idx])
+ return TLO.CombineTo(Op, Vec);
+
+ KnownBits KnownVec;
+ APInt DemandedVecElts(OriginalDemandedElts);
+ DemandedVecElts.clearBit(Idx);
+ if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownScl;
+ unsigned NumSclBits = Scl.getScalarValueSizeInBits();
+ APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
+ if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
+ return true;
+
+ KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
+ Known.One = KnownVec.One & KnownScl.One;
+ Known.Zero = KnownVec.Zero & KnownScl.Zero;
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PACKSS:
+ // PACKSS saturates to MIN/MAX integer values. So if we just want the
+ // sign bit then we can just ask for the source operands' sign bit.
+ // TODO - add known bits handling.
+ if (OriginalDemandedBits.isSignMask()) {
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
+
+ KnownBits KnownLHS, KnownRHS;
+ APInt SignMask = APInt::getSignMask(BitWidth * 2);
+ if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
+ KnownLHS, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
+ KnownRHS, TLO, Depth + 1))
+ return true;
+ }
+ // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
+ break;
+ case X86ISD::PCMPGT:
+ // icmp sgt(0, R) == ashr(R, BitWidth-1).
+ // iff we only need the sign bit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -32868,29 +34645,42 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
+// Helper to peek through bitops/setcc to determine size of source vector.
+// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ return Src.getOperand(0).getValueSizeInBits() == Size;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
+ checkBitcastSrcVectorSize(Src.getOperand(1), Size);
+ }
+ return false;
+}
+
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
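// Illustrative sketch (not part of the patch): the movmsk form of the bitcast
// documented above, written directly with the SSE2 intrinsic. The lane values
// and the function name are made up for the demo.
#include <emmintrin.h>
#include <cstdint>
static uint16_t bitcastV16i1ViaMovmsk() {
  // A v16i1 mask sign-extended to v16i8: every true lane becomes 0xFF.
  __m128i SExt = _mm_setr_epi8(-1, 0, -1, 0, 0, 0, 0, -1,
                               0, 0, 0, 0, -1, -1, 0, 0);
  // PMOVMSKB gathers the 16 byte sign bits into a GPR - the same i16 that
  // (i16 bitcast (v16i1 x)) denotes. Here the result is 0x3085.
  return static_cast<uint16_t>(_mm_movemask_epi8(SExt));
}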
-static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, +static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, + const SDLoc &DL, const X86Subtarget &Subtarget) { - EVT VT = BitCast.getValueType(); - SDValue N0 = BitCast.getOperand(0); - EVT VecVT = N0->getValueType(0); - - if (!VT.isScalarInteger() || !VecVT.isSimple()) + EVT SrcVT = Src.getValueType(); + if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) return SDValue(); // If the input is a truncate from v16i8 or v32i8 go ahead and use a // movmskb even with avx512. This will be better than truncating to vXi1 and // using a kmov. This can especially help KNL if the input is a v16i8/v32i8 // vpcmpeqb/vpcmpgtb. - bool IsTruncated = N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - (N0.getOperand(0).getValueType() == MVT::v16i8 || - N0.getOperand(0).getValueType() == MVT::v32i8 || - N0.getOperand(0).getValueType() == MVT::v64i8); + bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && + (Src.getOperand(0).getValueType() == MVT::v16i8 || + Src.getOperand(0).getValueType() == MVT::v32i8 || + Src.getOperand(0).getValueType() == MVT::v64i8); // With AVX512 vxi1 types are legal and we prefer using k-regs. // MOVMSK is supported in SSE2 or later. @@ -32908,7 +34698,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; - switch (VecVT.getSimpleVT().SimpleTy) { + switch (SrcVT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v2i1: @@ -32918,10 +34708,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SExtVT = MVT::v4i32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. - if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - N0->getOperand(0).getValueType().is256BitVector()) { + if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) SExtVT = MVT::v4i64; - } break; case MVT::v8i1: SExtVT = MVT::v8i16; @@ -32930,9 +34718,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. 
- if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- (N0->getOperand(0).getValueType().is256BitVector() ||
- N0->getOperand(0).getValueType().is512BitVector())) {
+ // TODO: use checkBitcastSrcVectorSize
+ if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
+ (Src.getOperand(0).getValueType().is256BitVector() ||
+ Src.getOperand(0).getValueType().is512BitVector())) {
SExtVT = MVT::v8i32;
}
break;
@@ -32956,8 +34745,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
return SDValue();
};
- SDLoc DL(BitCast);
- SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, N0);
+ SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
if (SExtVT == MVT::v64i8) {
SDValue Lo, Hi;
@@ -32977,7 +34765,11 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
DAG.getUNDEF(MVT::v8i16));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
- return DAG.getZExtOrTrunc(V, DL, VT);
+
+ EVT IntVT =
+ EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
+ V = DAG.getZExtOrTrunc(V, DL, IntVT);
+ return DAG.getBitcast(VT, V);
}
// Convert a vXi1 constant build vector to the same width scalar integer.
@@ -33054,12 +34846,10 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
+static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDLoc DL(N);
- unsigned NumElts = N.getNumOperands();
-
- auto *BV = cast<BuildVectorSDNode>(N);
+ SDLoc DL(BV);
+ unsigned NumElts = BV->getNumOperands();
SDValue Splat = BV->getSplatValue();
// Build MMX element from integer GPR or SSE float values.
@@ -33107,7 +34897,7 @@ static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
Ops.append(NumElts, Splat);
} else {
for (unsigned i = 0; i != NumElts; ++i)
- Ops.push_back(CreateMMXElement(N.getOperand(i)));
+ Ops.push_back(CreateMMXElement(BV->getOperand(i)));
}
// Use tree of PUNPCKLs to build up general MMX vector.
@@ -33141,14 +34931,14 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize()) {
- if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
+ SDLoc dl(N);
+ if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Subtarget.hasAVX512()) {
- SDLoc dl(N);
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
@@ -33159,7 +34949,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
- SDLoc dl(N);
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
@@ -33213,7 +35002,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
SrcVT == MVT::v8i8))
- return createMMXBuildVector(N0, DAG, Subtarget);
+ return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
@@ -33297,66 +35086,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Given a select, detect the following pattern:
-// 1: %2 = zext <N x i8> %0 to <N x i32>
-// 2: %3 = zext <N x i8> %1 to <N x i32>
-// 3: %4 = sub nsw <N x i32> %2, %3
-// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
-// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
-// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+// Given an ABS node, detect the following pattern:
+// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
// This is useful as it is the input into a SAD pattern.
-static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
- SDValue &Op1) {
- // Check the condition of the select instruction is greater-than.
- SDValue SetCC = Select->getOperand(0);
- if (SetCC.getOpcode() != ISD::SETCC)
- return false;
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT && CC != ISD::SETLT)
+static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
+ SDValue AbsOp1 = Abs->getOperand(0);
+ if (AbsOp1.getOpcode() != ISD::SUB)
return false;
- SDValue SelectOp1 = Select->getOperand(1);
- SDValue SelectOp2 = Select->getOperand(2);
-
- // The following instructions assume SelectOp1 is the subtraction operand
- // and SelectOp2 is the negation operand.
- // In the case of SETLT this is the other way around.
- if (CC == ISD::SETLT)
- std::swap(SelectOp1, SelectOp2);
-
- // The second operand of the select should be the negation of the first
- // operand, which is implemented as 0 - SelectOp1.
- if (!(SelectOp2.getOpcode() == ISD::SUB &&
- ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
- SelectOp2.getOperand(1) == SelectOp1))
- return false;
-
- // The first operand of SetCC is the first operand of the select, which is the
- // difference between the two input vectors.
- if (SetCC.getOperand(0) != SelectOp1)
- return false;
-
- // In the SETLT case, the second operand of the comparison can be either 1 or 0.
- APInt SplatVal;
- if ((CC == ISD::SETLT) &&
- !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
- SplatVal.isOneValue()) ||
- (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
- return false;
-
- // In the SETGT case, the second operand of the comparison can be either -1 or 0.
- if ((CC == ISD::SETGT) &&
- !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
- ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
- return false;
-
- // The first operand of the select is the difference between the two input
- // vectors.
- if (SelectOp1.getOpcode() != ISD::SUB)
- return false;
-
- Op0 = SelectOp1.getOperand(0);
- Op1 = SelectOp1.getOperand(1);
+ Op0 = AbsOp1.getOperand(0);
+ Op1 = AbsOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
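// Illustrative sketch (not part of the patch): one scalar lane of the
// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))) shape matched above. PSADBW
// sums exactly these per-lane values; the function name is hypothetical.
#include <cstdint>
#include <cstdlib>
static uint32_t absDiffLane(uint8_t A, uint8_t B) {
  // Zero-extend both i8 inputs before subtracting so the difference cannot
  // wrap around.
  int32_t Diff = static_cast<int32_t>(A) - static_cast<int32_t>(B);
  return static_cast<uint32_t>(std::abs(Diff));
}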
if (Op0.getOpcode() != ISD::ZERO_EXTEND || @@ -33476,23 +35215,25 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, DAG.getIntPtrConstant(0, DL)); } -// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. +// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. static SDValue combineHorizontalPredicateResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // Bail without SSE2 or with AVX512VL (which uses predicate registers). - if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) + // Bail without SSE2. + if (!Subtarget.hasSSE2()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); unsigned BitWidth = ExtractVT.getSizeInBits(); if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && - ExtractVT != MVT::i8) + ExtractVT != MVT::i8 && ExtractVT != MVT::i1) return SDValue(); - // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. + // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns. ISD::NodeType BinOp; SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); + if (!Match && ExtractVT == MVT::i1) + Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR}); if (!Match) return SDValue(); @@ -33501,53 +35242,104 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, if (Match.getScalarValueSizeInBits() != BitWidth) return SDValue(); - // We require AVX2 for PMOVMSKB for v16i16/v32i8; - unsigned MatchSizeInBits = Match.getValueSizeInBits(); - if (!(MatchSizeInBits == 128 || - (MatchSizeInBits == 256 && - ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) - return SDValue(); + SDValue Movmsk; + SDLoc DL(Extract); + EVT MatchVT = Match.getValueType(); + unsigned NumElts = MatchVT.getVectorNumElements(); - // Don't bother performing this for 2-element vectors. - if (Match.getValueType().getVectorNumElements() <= 2) - return SDValue(); + if (ExtractVT == MVT::i1) { + // Special case for (pre-legalization) vXi1 reductions. + if (NumElts > 32) + return SDValue(); + if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) { + // If this is a legal AVX512 predicate type then we can just bitcast. + EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + Movmsk = DAG.getBitcast(MovmskVT, Match); + } else { + // Use combineBitcastvxi1 to create the MOVMSK. + if (NumElts == 32 && !Subtarget.hasInt256()) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); + Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); + NumElts = 16; + } + EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget); + } + if (!Movmsk) + return SDValue(); + Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32); + } else { + // Bail with AVX512VL (which uses predicate registers). + if (Subtarget.hasVLX()) + return SDValue(); - // Check that we are extracting a reduction of all sign bits. - if (DAG.ComputeNumSignBits(Match) != BitWidth) - return SDValue(); + unsigned MatchSizeInBits = Match.getValueSizeInBits(); + if (!(MatchSizeInBits == 128 || + (MatchSizeInBits == 256 && Subtarget.hasAVX()))) + return SDValue(); - // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. 
- MVT MaskVT;
- if (64 == BitWidth || 32 == BitWidth)
- MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
- MatchSizeInBits / BitWidth);
- else
- MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+ // Make sure this isn't a vector of 1 element. The perf win from using
+ // MOVMSK diminishes with fewer elements in the reduction, but it is
+ // generally better to get the comparison over to the GPRs as soon as
+ // possible to reduce the number of vector ops.
+ if (Match.getValueType().getVectorNumElements() < 2)
+ return SDValue();
- APInt CompareBits;
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
+
+ if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
+ Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ MatchSizeInBits = Match.getValueSizeInBits();
+ }
+
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskSrcVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+ SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
+ Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
+ NumElts = MaskSrcVT.getVectorNumElements();
+ }
+ assert(NumElts <= 32 && "Not expecting more than 32 elements");
+
+ if (BinOp == ISD::XOR) {
+ // parity -> (AND (CTPOP(MOVMSK X)), 1)
+ SDValue Mask = DAG.getConstant(1, DL, MVT::i32);
+ SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk);
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask);
+ return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
+ }
+
+ SDValue CmpC;
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
- CompareBits = APInt::getNullValue(32);
+ CmpC = DAG.getConstant(0, DL, MVT::i32);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
- CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+ CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32);
CondCode = ISD::CondCode::SETEQ;
}
- // Perform the select as i32/i64 and then truncate to avoid partial register
- // stalls.
- unsigned ResWidth = std::max(BitWidth, 32u);
- EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
- SDLoc DL(Extract);
- SDValue Zero = DAG.getConstant(0, DL, ResVT);
- SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
- SDValue Res = DAG.getBitcast(MaskVT, Match);
- Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
- Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
- Ones, Zero, CondCode);
- return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+ // The setcc produces an i8 of 0/1, so extend that to the result width and
+ // negate to get the final 0/-1 mask value.
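// Illustrative sketch (not part of the patch): scalar equivalents of the
// three reductions derived from one MOVMSK value above, assuming
// NumElts <= 32 as the assert requires.
#include <bit>     // std::popcount (C++20)
#include <cstdint>
static bool anyOf(uint32_t Movmsk) { return Movmsk != 0; }  // SETNE 0
static bool allOf(uint32_t Movmsk, unsigned NumElts) {
  return Movmsk == ((1ULL << NumElts) - 1);                 // SETEQ all-ones
}
static bool parityOf(uint32_t Movmsk) {
  return (std::popcount(Movmsk) & 1) != 0;                  // CTPOP & 1
}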
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetccVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+ SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
+ SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
+ SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
+ return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
@@ -33592,7 +35384,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// If there was a match, we want Root to be a select that is the root of an
// abs-diff pattern.
- if (!Root || (Root.getOpcode() != ISD::VSELECT))
+ if (!Root || Root.getOpcode() != ISD::ABS)
return SDValue();
// Check whether we have an abs-diff pattern feeding into the select.
@@ -33651,15 +35443,19 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
+ SDValue SrcBC = peekThroughBitcasts(Src);
+
// Handle extract(broadcast(scalar_value)), it doesn't matter what the index is.
- if (X86ISD::VBROADCAST == Src.getOpcode() &&
- Src.getOperand(0).getValueType() == VT)
- return Src.getOperand(0);
+ if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
+ SDValue SrcOp = SrcBC.getOperand(0);
+ if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, SrcOp);
+ }
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
- if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
+ if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
@@ -33704,7 +35500,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
: DAG.getConstant(0, dl, VT);
SDValue SrcOp = Ops[SrcIdx / Mask.size()];
- SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SrcIdx = SrcIdx % Mask.size();
// We can only extract other elements from 128-bit vectors and in certain
if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
assert(SrcSVT == VT && "Unexpected extraction type");
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
}
@@ -33723,6 +35519,7 @@
assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
"Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
@@ -33731,6 +35528,155 @@
return SDValue();
}
+/// Extracting a scalar FP value from vector element 0 is free, so extract each
+/// operand first, then perform the math as a scalar op.
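// Illustrative sketch (not part of the patch): the shape of the fold the
// function below performs, reduced to scalar C++ for one FADD case.
static double scalarizedFAdd(const double X[2], const double Y[2]) {
  // extractelt (fadd X, Y), 0 --> fadd (extractelt X, 0), (extractelt Y, 0)
  return X[0] + Y[0];
}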
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { + assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract"); + SDValue Vec = ExtElt->getOperand(0); + SDValue Index = ExtElt->getOperand(1); + EVT VT = ExtElt->getValueType(0); + EVT VecVT = Vec.getValueType(); + + // TODO: If this is a unary/expensive/expand op, allow extraction from a + // non-zero element because the shuffle+scalar op will be cheaper? + if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT) + return SDValue(); + + // Vector FP compares don't fit the pattern of FP math ops (propagate, not + // extract, the condition code), so deal with those as a special-case. + if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) { + EVT OpVT = Vec.getOperand(0).getValueType().getScalarType(); + if (OpVT != MVT::f32 && OpVT != MVT::f64) + return SDValue(); + + // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC + SDLoc DL(ExtElt); + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, + Vec.getOperand(0), Index); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, + Vec.getOperand(1), Index); + return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2)); + } + + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Vector FP selects don't fit the pattern of FP math ops (because the + // condition has a different type and we have to change the opcode), so deal + // with those here. + // FIXME: This is restricted to pre type legalization by ensuring the setcc + // has i1 elements. If we loosen this we need to convert vector bool to a + // scalar bool. + if (Vec.getOpcode() == ISD::VSELECT && + Vec.getOperand(0).getOpcode() == ISD::SETCC && + Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 && + Vec.getOperand(0).getOperand(0).getValueType() == VecVT) { + // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0) + SDLoc DL(ExtElt); + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + Vec.getOperand(0).getValueType().getScalarType(), + Vec.getOperand(0), Index); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + Vec.getOperand(1), Index); + SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + Vec.getOperand(2), Index); + return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2); + } + + // TODO: This switch could include FNEG and the x86-specific FP logic ops + // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid + // missed load folding and fma+fneg combining. + switch (Vec.getOpcode()) { + case ISD::FMA: // Begin 3 operands + case ISD::FMAD: + case ISD::FADD: // Begin 2 operands + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + case ISD::FCOPYSIGN: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMINIMUM: + case X86ISD::FMAX: + case X86ISD::FMIN: + case ISD::FABS: // Begin 1 operand + case ISD::FSQRT: + case ISD::FRINT: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FFLOOR: + case X86ISD::FRCP: + case X86ISD::FRSQRT: { + // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ... 
+ SDLoc DL(ExtElt);
+ SmallVector<SDValue, 4> ExtOps;
+ for (SDValue Op : Vec->ops())
+ ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
+ return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
+ }
+ default:
+ return SDValue();
+ }
+ llvm_unreachable("All opcodes should return within switch");
+}
+
+/// Try to convert a vector reduction sequence composed of binops and shuffles
+/// into horizontal ops.
+static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ return SDValue();
+ SDValue Index = ExtElt->getOperand(1);
+ if (!isNullConstant(Index))
+ return SDValue();
+
+ // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
+ ISD::NodeType Opc;
+ SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
+ if (!Rdx)
+ return SDValue();
+
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = ExtElt->getOperand(0).getValueType();
+ if (VecVT.getScalarType() != VT)
+ return SDValue();
+
+ unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+ SDLoc DL(ExtElt);
+
+ // 256-bit horizontal instructions operate on 128-bit chunks rather than
+ // across the whole vector, so we need an extract + hop preliminary stage.
+ // This is the only step where the operands of the hop are not the same value.
+ // TODO: We could extend this to handle 512-bit or even longer vectors.
+ if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
+ ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
+ SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
+ VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
+ }
+ if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
+ !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
+ return SDValue();
+
+ // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
+ assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
+ unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
+ for (unsigned i = 0; i != ReductionSteps; ++i)
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+}
+
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -33741,23 +35687,48 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
+ SDValue InputVector = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
+
+ EVT SrcVT = InputVector.getValueType();
+ EVT VT = N->getValueType(0);
+ SDLoc dl(InputVector);
+ bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
+
+ if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+
+ // Integer Constant Folding.
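// Illustrative sketch (not part of the patch): the scalar meaning of the
// integer constant folding implemented just below - extracting a lane whose
// bits are all known simply materializes that constant. The helper and its
// fixed lane values are hypothetical.
#include <cstdint>
static uint32_t foldConstantExtract(unsigned Idx) {
  static const uint32_t Lanes[4] = {10, 20, 30, 40}; // a constant build vector
  return Lanes[Idx]; // extractelt (build_vector C0..C3), Idx --> C[Idx]
}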
+ if (CIdx && VT.isInteger()) {
+ APInt UndefVecElts;
+ SmallVector<APInt, 16> EltBits;
+ unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
+ if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
+ EltBits, true, false)) {
+ uint64_t Idx = CIdx->getZExtValue();
+ if (UndefVecElts[Idx])
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+ return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
+ dl, VT);
+ }
+ }
+
// TODO - Remove this once we can handle the implicit zero-extension of
// X86ISD::PEXTRW/X86ISD::PEXTRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
- if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ if (IsPextr) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(
+ SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
+ return SDValue(N, 0);
return SDValue();
+ }
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
- SDValue InputVector = N->getOperand(0);
- SDValue EltIdx = N->getOperand(1);
-
- EVT SrcVT = InputVector.getValueType();
- EVT VT = N->getValueType(0);
- SDLoc dl(InputVector);
-
// Detect mmx extraction of all bits as an i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
@@ -33778,16 +35749,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
- if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
- isa<ConstantSDNode>(EltIdx) &&
- isa<ConstantSDNode>(InputVector.getOperand(0))) {
- uint64_t ExtractedElt = N->getConstantOperandVal(1);
- auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
- const APInt &InputValue = InputC->getAPIntValue();
- uint64_t Res = InputValue[ExtractedElt];
- return DAG.getConstant(Res, dl, MVT::i1);
- }
-
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
// pre-legalization,
@@ -33802,6 +35763,45 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
+ if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = scalarizeExtEltFP(N, DAG))
+ return V;
+
+ // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
+ // and then testing the relevant element.
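// Illustrative sketch (not part of the patch): the scalar form of the rewrite
// described above - one MOVMSK feeds every i1 extract, each becoming a mask
// test.
#include <cstdint>
static bool extractBoolLane(uint32_t Movmsk, unsigned Idx) {
  // extractelement vXi1 X, Idx --> ((movmsk X) & Mask) == Mask
  uint32_t Mask = 1u << Idx;
  return (Movmsk & Mask) == Mask;
}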
+ if (CIdx && SrcVT.getScalarType() == MVT::i1) {
+ SmallVector<SDNode *, 16> BoolExtracts;
+ auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
+ if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Use->getOperand(1)) &&
+ Use->getValueType(0) == MVT::i1) {
+ BoolExtracts.push_back(Use);
+ return true;
+ }
+ return false;
+ };
+ if (all_of(InputVector->uses(), IsBoolExtract) &&
+ BoolExtracts.size() > 1) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
+ if (SDValue BC =
+ combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
+ for (SDNode *Use : BoolExtracts) {
+ // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
+ unsigned MaskIdx = Use->getConstantOperandVal(1);
+ APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
+ SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
+ SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
+ Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
+ DCI.CombineTo(Use, Res);
+ }
+ return SDValue(N, 0);
+ }
+ }
+ }
+
return SDValue();
}
@@ -33825,11 +35825,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
assert(CondVT.isVector() && "Vector select expects a vector selector!");
- bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
// Check if the first operand is all zeros and Cond type is vXi1.
// This situation only applies to avx512.
- if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
- CondVT.getVectorElementType() == MVT::i1) {
+ // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
+ // TODO: Can we assert that both operands are not zeros (because that should
+ // get simplified at node creation time)?
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+ if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
+ Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
@@ -33844,12 +35848,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
- bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
- bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
-
// Try to invert the condition if true value is not all 1s and false value is
- // not all 0s.
+ // not all 0s. Only do this if the condition has one use.
+ bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+ if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
// Check if the selector will be produced by CMPP*/PCMP*.
Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted.
@@ -33907,6 +35909,39 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// If both arms of a vector select are concatenated vectors, split the select,
+/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
+/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
+/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
+static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
+ return SDValue();
+
+ // TODO: Split 512-bit vectors too?
+ EVT VT = N->getValueType(0);
+ if (!VT.is256BitVector())
+ return SDValue();
+
+ // TODO: Split as long as any 2 of the 3 operands are concatenated?
+ SDValue Cond = N->getOperand(0);
+ SDValue TVal = N->getOperand(1);
+ SDValue FVal = N->getOperand(2);
+ SmallVector<SDValue, 4> CatOpsT, CatOpsF;
+ if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
+ !collectConcatOps(TVal.getNode(), CatOpsT) ||
+ !collectConcatOps(FVal.getNode(), CatOpsF))
+ return SDValue();
+
+ auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
+ makeBlend, /*CheckBWI*/ false);
+}
+
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
@@ -33973,7 +36008,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
-/// This function will also call SimplfiyDemandedBits on already created
+/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -34268,6 +36303,42 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
+ // AVX512 - Extend select with zero to merge with target shuffle.
+ // select(mask, extract_subvector(shuffle(x)), zero) -->
+ // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
+ // TODO - support non-target shuffles as well.
+ if (Subtarget.hasAVX512() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+ auto SelectableOp = [&TLI](SDValue Op) {
+ return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ isTargetShuffle(Op.getOperand(0).getOpcode()) &&
+ isNullConstant(Op.getOperand(1)) &&
+ TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
+ Op.hasOneUse() && Op.getOperand(0).hasOneUse();
+ };
+
+ bool SelectableLHS = SelectableOp(LHS);
+ bool SelectableRHS = SelectableOp(RHS);
+ bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
+ EVT SrcVT = SelectableLHS ?
LHS.getOperand(0).getValueType() + : RHS.getOperand(0).getValueType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts); + LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL, + VT.getSizeInBits()); + RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL, + VT.getSizeInBits()); + Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT, + DAG.getUNDEF(SrcCondVT), Cond, + DAG.getIntPtrConstant(0, DL)); + SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS); + return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits()); + } + } + if (SDValue V = combineSelectOfTwoConstants(N, DAG)) return V; @@ -34338,14 +36409,16 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C - // TODO: Handle build_vectors with undef elements. auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { - return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1); + return (!Op && !Cond) || + (Op && Cond && + Cond->getAPIntValue() == (-Op->getAPIntValue() - 1)); }; if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && - ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) { - OpRHS = DAG.getNode(ISD::SUB, DL, VT, - DAG.getConstant(0, DL, VT), OpRHS); + ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT, + /*AllowUndefs*/ true)) { + OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + OpRHS); return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); } @@ -34432,6 +36505,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget)) return V; + if (SDValue V = narrowVectorSelect(N, DAG, Subtarget)) + return V; + // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); @@ -34715,7 +36791,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, // When legalizing carry, we create carries via add X, -1 // If that comes from an actual carry, via setcc, we use the // carry directly. -static SDValue combineCarryThroughADD(SDValue EFLAGS) { +static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { if (EFLAGS.getOpcode() == X86ISD::ADD) { if (isAllOnesConstant(EFLAGS.getOperand(1))) { SDValue Carry = EFLAGS.getOperand(0); @@ -34728,8 +36804,34 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS) { Carry = Carry.getOperand(0); if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { - if (Carry.getConstantOperandVal(0) == X86::COND_B) - return Carry.getOperand(1); + // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB? + uint64_t CarryCC = Carry.getConstantOperandVal(0); + SDValue CarryOp1 = Carry.getOperand(1); + if (CarryCC == X86::COND_B) + return CarryOp1; + if (CarryCC == X86::COND_A) { + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp + // instruction cannot take an immediate as its first operand. 
+ //
+ if (CarryOp1.getOpcode() == X86ISD::SUB &&
+ CarryOp1.getNode()->hasOneUse() &&
+ CarryOp1.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
+ SDValue SubCommute =
+ DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
+ CarryOp1.getOperand(1), CarryOp1.getOperand(0));
+ return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
+ }
+ }
+ // If this is a check of the z flag of an add with 1, switch to the
+ // C flag.
+ if (CarryCC == X86::COND_E &&
+ CarryOp1.getOpcode() == X86ISD::ADD &&
+ isOneConstant(CarryOp1.getOperand(1)))
+ return CarryOp1;
}
}
}
@@ -34744,7 +36846,7 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (CC == X86::COND_B)
- if (SDValue Flags = combineCarryThroughADD(EFLAGS))
+ if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
return Flags;
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
@@ -34763,6 +36865,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
+ // cmov X, X, ?, ? --> X
+ if (TrueOp == FalseOp)
+ return TrueOp;
+
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
@@ -35044,7 +37150,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
- bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
+ bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
@@ -35283,8 +37389,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
// Use SplitOpsAndApply to handle AVX splitting.
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{ DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
@@ -35352,7 +37458,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
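// Illustrative sketch (not part of the patch): the per-lane semantics of the
// VPMADDWD node built above - two adjacent i16 products widened to i32 and
// summed. The wrap corner case is modeled with a 64-bit intermediate.
#include <cstdint>
static int32_t pmaddwdPair(int16_t A0, int16_t B0, int16_t A1, int16_t B1) {
  return static_cast<int32_t>(static_cast<int64_t>(A0) * B0 +
                              static_cast<int64_t>(A1) * B1);
}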
- if (DAG.getMachineFunction().getFunction().optForMinSize())
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
@@ -35489,7 +37595,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ APInt Mask = N0.getConstantOperandAPInt(1);
Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
@@ -35638,24 +37744,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- if (N->getOpcode() == ISD::SHL)
- if (SDValue V = combineShiftLeft(N, DAG))
- return V;
-
- if (N->getOpcode() == ISD::SRA)
- if (SDValue V = combineShiftRightArithmetic(N, DAG))
- return V;
-
- if (N->getOpcode() == ISD::SRL)
- if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
- return V;
-
- return SDValue();
-}
-
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -35677,8 +37765,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Constant Folding.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
- if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
- (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
+ if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
+ (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
@@ -35750,10 +37838,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Attempt to combine as shuffle.
SDValue Op(N, 0);
- if (SDValue Res =
- combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false,
- /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
@@ -35766,11 +37851,22 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
X86ISD::VSRL == N->getOpcode()) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
// Shift zero -> zero.
- if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
+ // Detect constant shift amounts.
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
+ unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
+ return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
+ EltBits[0].getZExtValue(), DAG);
+ }
+
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
@@ -35829,9 +37925,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
// We can decode 'whole byte' logical bit shifts as shuffles.
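// Illustrative sketch (not part of the patch): why a whole-byte VSHLDQ is
// shuffle-combinable - it only moves bytes and fills the vacated positions
// with zeros. ShiftBytes is assumed to be <= 16.
#include <cstdint>
#include <cstring>
static void byteShiftLeft128(uint8_t Out[16], const uint8_t In[16],
                             unsigned ShiftBytes) {
  std::memset(Out, 0, 16);                            // vacated bytes -> 0
  std::memcpy(Out + ShiftBytes, In, 16 - ShiftBytes); // pure byte movement
}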
if (LogicalShift && (ShiftVal % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -35864,18 +37958,20 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- assert(
- ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
- (N->getOpcode() == X86ISD::PINSRW &&
- N->getValueType(0) == MVT::v8i16)) &&
- "Unexpected vector insertion");
+ EVT VT = N->getValueType(0);
+ assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
+ (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
+ "Unexpected vector insertion");
+
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ return SDValue(N, 0);
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
- if (SDValue Res =
- combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false,
- /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
@@ -35894,8 +37990,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- SDValue CMP0 = N0->getOperand(1);
- SDValue CMP1 = N1->getOperand(1);
+ SDValue CMP0 = N0.getOperand(1);
+ SDValue CMP1 = N1.getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
@@ -35987,6 +38083,34 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Match (xor X, -1) -> X.
+// Match extract_subvector(xor X, -1) -> extract_subvector(X).
+// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
+ V = peekThroughBitcasts(V);
+ if (V.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+ return V.getOperand(0);
+ if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
+ if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
+ Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
+ Not, V.getOperand(1));
+ }
+ }
+ SmallVector<SDValue, 2> CatOps;
+ if (collectConcatOps(V.getNode(), CatOps)) {
+ for (SDValue &CatOp : CatOps) {
+ SDValue NotCat = IsNOT(CatOp, DAG);
+ if (!NotCat) return SDValue();
+ CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
+ }
+ return SDValue();
+}
+
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
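// Illustrative sketch (not part of the patch): the bit identity behind IsNOT
// and the ANDNP fold - xor with all-ones is bitwise NOT.
#include <cstdint>
static uint64_t andnp(uint64_t X, uint64_t Y) {
  return (X ^ ~0ULL) & Y; // == (~X) & Y, the ANDNP semantics
}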
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); @@ -35996,15 +38120,14 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { return SDValue(); SDValue X, Y; - SDValue N0 = peekThroughBitcasts(N->getOperand(0)); - SDValue N1 = peekThroughBitcasts(N->getOperand(1)); - if (N0.getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) { - X = N0.getOperand(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (SDValue Not = IsNOT(N0, DAG)) { + X = Not; Y = N1; - } else if (N1.getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) { - X = N1.getOperand(0); + } else if (SDValue Not = IsNOT(N1, DAG)) { + X = Not; Y = N0; } else return SDValue(); @@ -36046,7 +38169,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); // The type of the truncated inputs. - if (N0->getOperand(0).getValueType() != VT) + if (N0.getOperand(0).getValueType() != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. @@ -36062,9 +38185,9 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. - N0 = N0->getOperand(0); + N0 = N0.getOperand(0); if (RHSTrunc) - N1 = N1->getOperand(0); + N1 = N1.getOperand(0); else N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); @@ -36088,34 +38211,35 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, /// unnecessary moves from SSE to integer registers. static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - unsigned FPOpcode = ISD::DELETED_NODE; - if (N->getOpcode() == ISD::AND) - FPOpcode = X86ISD::FAND; - else if (N->getOpcode() == ISD::OR) - FPOpcode = X86ISD::FOR; - else if (N->getOpcode() == ISD::XOR) - FPOpcode = X86ISD::FXOR; - - assert(FPOpcode != ISD::DELETED_NODE && - "Unexpected input node for FP logic conversion"); - EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); - if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && - ((Subtarget.hasSSE1() && VT == MVT::i32) || - (Subtarget.hasSSE2() && VT == MVT::i64))) { - SDValue N00 = N0.getOperand(0); - SDValue N10 = N1.getOperand(0); - EVT N00Type = N00.getValueType(); - EVT N10Type = N10.getValueType(); - if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { - SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); - return DAG.getBitcast(VT, FPLogic); - } + + if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + EVT N00Type = N00.getValueType(); + EVT N10Type = N10.getValueType(); + + // Ensure that both types are the same and are legal scalar fp types. 
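// Illustrative sketch (not part of the patch): the observation behind this
// combine - the integer AND of two float bit patterns is the bit pattern the
// FAND node produces, so no SSE-to-GPR moves are needed.
#include <bit>     // std::bit_cast (C++20)
#include <cstdint>
static uint32_t intAndOfFloats(float A, float B) {
  return std::bit_cast<uint32_t>(A) & std::bit_cast<uint32_t>(B);
}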
+ if (N00Type != N10Type ||
+ !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
+ (Subtarget.hasSSE2() && N00Type == MVT::f64)))
+ return SDValue();
+
+ unsigned FPOpcode;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected input node for FP logic conversion");
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
}
- return SDValue();
+
+ SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+ return DAG.getBitcast(VT, FPLogic);
}
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
@@ -36371,6 +38495,24 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineParity(N, DAG, Subtarget))
return V;
+ // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (Mask) {
+ APInt AllBits = APInt::getAllOnesValue(NumElts);
+ return DAG.getSetCC(dl, MVT::i1, Mask,
+ DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
+ }
+ }
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -36392,9 +38534,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -36440,6 +38580,52 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
+static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
+
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
+ return SDValue();
+
+ SDValue N0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue N1 = peekThroughBitcasts(N->getOperand(1));
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // On XOP we'll lower to PCMOV so accept one use, otherwise only
+ // do this if either mask has multiple uses already.
+ if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() ||
+ !N1.getOperand(1).hasOneUse()))
+ return SDValue();
+
+ // Attempt to extract constant byte masks.
+ APInt UndefElts0, UndefElts1;
+ SmallVector<APInt, 32> EltBits0, EltBits1;
+ if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
+ false, false))
+ return SDValue();
+ if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
+ false, false))
+ return SDValue();
+
+ for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
+ // TODO - add UNDEF elts support.
+ if (UndefElts0[i] || UndefElts1[i]) + return SDValue(); + if (EltBits0[i] != ~EltBits1[i]) + return SDValue(); + } + + SDLoc DL(N); + SDValue X = N->getOperand(0); + SDValue Y = + DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)), + DAG.getBitcast(VT, N1.getOperand(0))); + return DAG.getNode(ISD::OR, DL, VT, X, Y); +} + // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern. static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { if (N->getOpcode() != ISD::OR) @@ -36472,6 +38658,68 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { return true; } +// Try to match: +// (or (and (M, (sub 0, X)), (pandn M, X))) +// which is a special case of vselect: +// (vselect M, (sub 0, X), X) +// Per: +// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate +// We know that, if fNegate is 0 or 1: +// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) +// +// Here, we have a mask, M (all 1s or 0), and, similarly, we know that: +// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) +// ( M ? -X : X) == ((X ^ M ) + (M & 1)) +// This lets us transform our vselect to: +// (add (xor X, M), (and M, 1)) +// And further to: +// (sub (xor X, M), M) +static SDValue combineLogicBlendIntoConditionalNegate( + EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { + EVT MaskVT = Mask.getValueType(); + assert(MaskVT.isInteger() && + DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && + "Mask must be zero/all-bits"); + + if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) + return SDValue(); + if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) + return SDValue(); + + auto IsNegV = [](SDNode *N, SDValue V) { + return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && + ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); + }; + + SDValue V; + if (IsNegV(Y.getNode(), X)) + V = X; + else if (IsNegV(X.getNode(), Y)) + V = Y; + else + return SDValue(); + + SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); + SDValue SubOp2 = Mask; + + // If the negate was on the false side of the select, then + // the operands of the SUB need to be swapped. PR 27251. + // This is because the pattern being matched above is + // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) + // but if the pattern matched was + // (vselect M, X, (sub (0, X))), that is really negation of the pattern + // above, -(vselect M, (sub 0, X), X), and therefore the replacement + // pattern also needs to be a negation of the replacement pattern above. + // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the + // sub accomplishes the negation of the replacement pattern. + if (V == Y) + std::swap(SubOp1, SubOp2); + + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); +} + // Try to fold: // (or (and (m, y), (pandn m, x))) // into: @@ -36507,55 +38755,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); - // Try to match: - // (or (and (M, (sub 0, X)), (pandn M, X))) - // which is a special case of vselect: - // (vselect M, (sub 0, X), X) - // Per: - // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate - // We know that, if fNegate is 0 or 1: - // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) - // - // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: - // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) - // ( M ? 
-X : X) == ((X ^ M ) + (M & 1)) - // This lets us transform our vselect to: - // (add (xor X, M), (and M, 1)) - // And further to: - // (sub (xor X, M), M) - if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && - DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { - auto IsNegV = [](SDNode *N, SDValue V) { - return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && - ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); - }; - SDValue V; - if (IsNegV(Y.getNode(), X)) - V = X; - else if (IsNegV(X.getNode(), Y)) - V = Y; - - if (V) { - SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); - SDValue SubOp2 = Mask; - - // If the negate was on the false side of the select, then - // the operands of the SUB need to be swapped. PR 27251. - // This is because the pattern being matched above is - // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) - // but if the pattern matched was - // (vselect M, X, (sub (0, X))), that is really negation of the pattern - // above, -(vselect M, (sub 0, X), X), and therefore the replacement - // pattern also needs to be a negation of the replacement pattern above. - // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the - // sub accomplishes the negation of the replacement pattern. - if (V == Y) - std::swap(SubOp1, SubOp2); - - SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); - return DAG.getBitcast(VT, Res); - } - } + // Attempt to combine to conditional negate: (sub (xor X, M), M) + if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL, + DAG, Subtarget)) + return Res; // PBLENDVB is only available on SSE 4.1. if (!Subtarget.hasSSE41()) @@ -36665,8 +38868,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). if (RHS->getOpcode() == ISD::OR) std::swap(LHS, RHS); - EVT VT = OR->getValueType(0); - SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); + NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); if (!NewRHS) return SDValue(); Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); @@ -36702,15 +38904,16 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; + if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) + return R; + if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; // Attempt to recursively combine an OR of shuffles. 
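The conditional-negate fold above rests on a pure bit identity that can be checked outside the DAG. A minimal scalar C++ sketch (condNegate is an illustrative name; plain integers stand in for vector lanes, with the mask m restricted to 0 or all-ones exactly as the assert in the combine requires):

    #include <cassert>
    #include <cstdint>

    // For m in {0, -1}: (m ? -x : x) == (x ^ m) - m,
    // the scalar form of the (sub (xor X, M), M) replacement.
    static int32_t condNegate(int32_t x, int32_t m) {
      return (x ^ m) - m;
    }

    int main() {
      for (int32_t x : {0, 1, -7, 123456}) {
        assert(condNegate(x, 0) == x);   // mask clear: unchanged
        assert(condNegate(x, -1) == -x); // mask set: negated
      }
      return 0;
    }

Swapping the SUB operands when the negate sat on the false side of the select is the same identity applied to the negated pattern, as the PR 27251 comment above explains.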
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } @@ -36718,7 +38921,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); unsigned Bits = VT.getScalarSizeInBits(); // SHLD/SHRD instructions have lower register pressure, but on some @@ -36747,14 +38950,14 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, SDValue ShMsk0; if (ShAmt0.getOpcode() == ISD::AND && isa(ShAmt0.getOperand(1)) && - ShAmt0.getConstantOperandVal(1) == (Bits - 1)) { + ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) { ShMsk0 = ShAmt0; ShAmt0 = ShAmt0.getOperand(0); } SDValue ShMsk1; if (ShAmt1.getOpcode() == ISD::AND && isa(ShAmt1.getOperand(1)) && - ShAmt1.getConstantOperandVal(1) == (Bits - 1)) { + ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) { ShMsk1 = ShAmt1; ShAmt1 = ShAmt1.getOperand(0); } @@ -36765,46 +38968,55 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, ShAmt1 = ShAmt1.getOperand(0); SDLoc DL(N); - unsigned Opc = X86ISD::SHLD; + unsigned Opc = ISD::FSHL; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); - if (ShAmt0.getOpcode() == ISD::SUB || - ShAmt0.getOpcode() == ISD::XOR) { - Opc = X86ISD::SHRD; + if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) { + Opc = ISD::FSHR; std::swap(Op0, Op1); std::swap(ShAmt0, ShAmt1); std::swap(ShMsk0, ShMsk1); } - // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C ) - // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C ) - // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C ) - // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C ) - // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> SHLD( X, Y, C ) - // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> SHRD( X, Y, C ) + auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1, + SDValue Amt) { + if (Opc == ISD::FSHR) + std::swap(Op0, Op1); + return DAG.getNode(Opc, DL, VT, Op0, Op1, + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt)); + }; + + // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C ) + // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C ) + // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C ) + // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C ) + // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C ) + // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C ) if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (auto *SumC = dyn_cast(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); + if (ShAmt1Op1.getOpcode() == ISD::AND && + isa(ShAmt1Op1.getOperand(1)) && + ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) { + ShMsk1 = ShAmt1Op1; + ShAmt1Op1 = ShAmt1Op1.getOperand(0); + } if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); if ((SumC->getAPIntValue() == Bits || (SumC->getAPIntValue() == 0 && ShMsk1)) && ShAmt1Op1 == ShAmt0) - return DAG.getNode(Opc, DL, VT, Op0, Op1, - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + return 
GetFunnelShift(Op0, Op1, ShAmt0); } } else if (auto *ShAmt1C = dyn_cast(ShAmt1)) { auto *ShAmt0C = dyn_cast(ShAmt0); if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) - return DAG.getNode(Opc, DL, VT, - N0.getOperand(0), N1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, - MVT::i8, ShAmt0)); + return GetFunnelShift(Op0, Op1, ShAmt0); } else if (ShAmt1.getOpcode() == ISD::XOR) { SDValue Mask = ShAmt1.getOperand(1); if (auto *MaskC = dyn_cast(Mask)) { - unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL); + unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL); SDValue ShAmt1Op0 = ShAmt1.getOperand(0); if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) ShAmt1Op0 = ShAmt1Op0.getOperand(0); @@ -36812,15 +39024,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) { if (Op1.getOpcode() == InnerShift && isa(Op1.getOperand(1)) && - Op1.getConstantOperandVal(1) == 1) { - return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + Op1.getConstantOperandAPInt(1) == 1) { + return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && Op1.getOperand(0) == Op1.getOperand(1)) { - return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } } } @@ -36862,7 +39072,7 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { // Make sure the shift amount extracts the sign bit. if (!isa(Shift.getOperand(1)) || - Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. @@ -36915,13 +39125,10 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return SDValue(); // The shift should be smearing the sign bit across each vector element. - auto *ShiftBV = dyn_cast(Shift.getOperand(1)); - if (!ShiftBV) - return SDValue(); - - EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); - auto *ShiftAmt = ShiftBV->getConstantSplatNode(); - if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) + auto *ShiftAmt = + isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true); + if (!ShiftAmt || + ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. We don't use the more obvious @@ -37203,15 +39410,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, AVGBuilder); } - if (Operands[0].getOpcode() == ISD::ADD) + // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)). + // Match the or case only if its 'add-like' - can be replaced by an add. 
+ auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) { + if (ISD::ADD == V.getOpcode()) { + Op0 = V.getOperand(0); + Op1 = V.getOperand(1); + return true; + } + if (ISD::ZERO_EXTEND != V.getOpcode()) + return false; + V = V.getOperand(0); + if (V.getValueType() != VT || ISD::OR != V.getOpcode() || + !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1))) + return false; + Op0 = V.getOperand(0); + Op1 = V.getOperand(1); + return true; + }; + + SDValue Op0, Op1; + if (FindAddLike(Operands[0], Op0, Op1)) std::swap(Operands[0], Operands[1]); - else if (Operands[1].getOpcode() != ISD::ADD) + else if (!FindAddLike(Operands[1], Op0, Op1)) return SDValue(); - Operands[2] = Operands[1].getOperand(0); - Operands[1] = Operands[1].getOperand(1); + Operands[2] = Op0; + Operands[1] = Op1; // Now we have three operands of two additions. Check that one of them is a - // constant vector with ones, and the other two are promoted from i8/i16. + // constant vector with ones, and the other two can be promoted from i8/i16. for (int i = 0; i < 3; ++i) { if (!IsConstVectorInRange(Operands[i], 1, 1)) continue; @@ -37219,14 +39446,16 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, // Check if Operands[0] and Operands[1] are results of type promotion. for (int j = 0; j < 2; ++j) - if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || - Operands[j].getOperand(0).getValueType() != VT) - return SDValue(); + if (Operands[j].getValueType() != VT) { + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + Operands[j] = Operands[j].getOperand(0); + } // The pattern is detected, emit X86ISD::AVG instruction(s). - return SplitOpsAndApply(DAG, Subtarget, DL, VT, - { Operands[0].getOperand(0), - Operands[1].getOperand(0) }, AVGBuilder); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]}, + AVGBuilder); } return SDValue(); @@ -37246,38 +39475,51 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. 
ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; - unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, - AddressSpace, Alignment, &Fast) && !Fast))) { + *Ld->getMemOperand(), &Fast) && + !Fast))) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); - SDValue Ptr = Ld->getBasePtr(); - + unsigned HalfAlign = 16; + SDValue Ptr1 = Ld->getBasePtr(); + SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - NumElems/2); + NumElems / 2); SDValue Load1 = - DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(), Alignment, Ld->getMemOperand()->getFlags()); - - Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl); - SDValue Load2 = - DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, - Ld->getPointerInfo().getWithOffset(16), - MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags()); + SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, + Ld->getPointerInfo().getWithOffset(HalfAlign), + MinAlign(Alignment, HalfAlign), + Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - Load1.getValue(1), - Load2.getValue(1)); + Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2); return DCI.CombineTo(N, NewVec, TF, true); } + // Bool vector load - attempt to cast to an integer, as we have good + // (vXiY *ext(vXi1 bitcast(iX))) handling. + if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() && + RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) { + unsigned NumElts = RegVT.getVectorNumElements(); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + if (TLI.isTypeLegal(IntVT)) { + SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Alignment, + Ld->getMemOperand()->getFlags()); + SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); + return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); + } + } + return SDValue(); } @@ -37404,6 +39646,9 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, if (ML->getPassThru().isUndef()) return SDValue(); + if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode())) + return SDValue(); + // The new masked load has an undef pass-through operand. The select uses the // original pass-through operand. SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), @@ -37434,7 +39679,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - if (Mld->getExtensionType() != ISD::SEXTLOAD) + if (Mld->getExtensionType() != ISD::EXTLOAD) return SDValue(); // Resolve extending loads. 
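The FindAddLike helper in detectAVGPattern above accepts zext(or(Op0,Op1)) as an add because an OR of values with no common set bits produces no carries. A minimal sketch of that identity, with a plain mask test standing in for DAG.haveNoCommonBitsSet():

    #include <cassert>
    #include <cstdint>

    // If (a & b) == 0 there are no carry bits, so a + b == a | b.
    int main() {
      uint8_t a = 0xA0, b = 0x05;
      assert((a & b) == 0);
      assert(uint8_t(a + b) == (a | b));
      return 0;
    }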
@@ -37504,8 +39749,20 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 Mld->getBasePtr(), NewMask, WidePassThru,
                                 Mld->getMemoryVT(), Mld->getMemOperand(),
                                 ISD::NON_EXTLOAD);
-  SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG);
-  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+
+  SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
+  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i != NumElems; ++i)
+    ShuffleVec[i * SizeRatio] = i;
+
+  // Can't shuffle using an illegal type.
+  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+         "WideVecVT should be legal");
+  SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                   DAG.getUNDEF(WideVecVT), ShuffleVec);
+  SlicedVec = DAG.getBitcast(VT, SlicedVec);
+
+  return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
 }
 
 /// If exactly one element of the mask is set for a non-truncating masked store,
@@ -37543,6 +39800,10 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   EVT VT = Mst->getValue().getValueType();
+  EVT StVT = Mst->getMemoryVT();
+  SDLoc dl(Mst);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
   if (!Mst->isTruncatingStore()) {
     if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
       return ScalarStore;
@@ -37551,7 +39812,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
     // simplify ops leading up to it. We only demand the MSB of each lane.
     SDValue Mask = Mst->getMask();
     if (Mask.getScalarValueSizeInBits() != 1) {
-      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
       APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
       if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
         return SDValue(N, 0);
@@ -37561,20 +39821,25 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
     // pattern above, but that pattern will be different. It will either need to
     // match setcc more generally or match PCMPGTM later (in tablegen?).
 
+    SDValue Value = Mst->getValue();
+    if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
+        TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+                              Mst->getMemoryVT())) {
+      return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
+                                Mst->getBasePtr(), Mask,
+                                Mst->getMemoryVT(), Mst->getMemOperand(), true);
+    }
+
     return SDValue();
   }
 
   // Resolve truncating stores.
   unsigned NumElems = VT.getVectorNumElements();
-  EVT StVT = Mst->getMemoryVT();
-  SDLoc dl(Mst);
 
   assert(StVT != VT && "Cannot truncate to the same type");
   unsigned FromSz = VT.getScalarSizeInBits();
   unsigned ToSz = StVT.getScalarSizeInBits();
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
   // The truncating store is legal in some cases. For example
   // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
   // are designated for truncate store.
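The SlicedVec shuffle above writes narrow element i into lane i * SizeRatio, so the final bitcast leaves each loaded value in the low bits of its widened element. A standalone model of that layout, assuming a little-endian target and using zeros in place of the undef lanes:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // v4i16 memory values widened to v4i32 (SizeRatio == 2).
      uint16_t narrow[4] = {1, 2, 3, 4};
      uint16_t lanes[8] = {};      // the WideVecVT shuffle result
      for (int i = 0; i != 4; ++i)
        lanes[i * 2] = narrow[i];  // ShuffleVec[i * SizeRatio] = i
      uint32_t wide[4];
      std::memcpy(wide, lanes, sizeof(wide)); // the bitcast to VT
      for (int i = 0; i != 4; ++i)
        assert(wide[i] == narrow[i]);         // extended in place
      return 0;
    }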
@@ -37644,11 +39909,13 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { StoreSDNode *St = cast(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); + unsigned Alignment = St->getAlignment(); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -37699,8 +39966,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoredVal->ops().slice(32, 32)); Hi = combinevXi1ConstantToInteger(Hi, DAG); - unsigned Alignment = St->getAlignment(); - SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl); @@ -37724,30 +39989,48 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. bool Fast; - unsigned AddressSpace = St->getAddressSpace(); - unsigned Alignment = St->getAlignment(); if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - AddressSpace, Alignment, &Fast) && + *St->getMemOperand(), &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); - SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl); - SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl); + return splitVectorStore(St, DAG); + } + + // Split under-aligned vector non-temporal stores. + if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) { + // ZMM/YMM nt-stores - either it can be stored as a series of shorter + // vectors or the legalizer can scalarize it to use MOVNTI. + if (VT.is256BitVector() || VT.is512BitVector()) { + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + return splitVectorStore(St, DAG); + } - SDValue Ptr0 = St->getBasePtr(); - SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl); + // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64 + // to use MOVNTI. + if (VT.is128BitVector() && Subtarget.hasSSE2()) { + MVT NTVT = Subtarget.hasSSE4A() + ? MVT::v2f64 + : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32); + return scalarizeVectorStore(St, NTVT, DAG); + } + } - SDValue Ch0 = - DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), - Alignment, St->getMemOperand()->getFlags()); - SDValue Ch1 = - DAG.getStore(St->getChain(), dl, Value1, Ptr1, - St->getPointerInfo().getWithOffset(16), - MinAlign(Alignment, 16U), St->getMemOperand()->getFlags()); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + // Try to optimize v16i16->v16i8 truncating stores when BWI is not + // supported, but avx512f is by extending to v16i32 and truncating. + if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() && + St->getValue().getOpcode() == ISD::TRUNCATE && + St->getValue().getOperand(0).getValueType() == MVT::v16i16 && + TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) && + !DCI.isBeforeLegalizeOps()) { + SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); + return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), + MVT::v16i8, St->getMemOperand()); } // Optimize trunc store (of multiple scalars) to shuffle and store. 
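Both the slow-32-byte-store path and the under-aligned non-temporal path above fall back to splitVectorStore, which amounts to two half-width stores with the second pointer bumped by 16 bytes and its alignment reduced to MinAlign(Alignment, 16). A sketch of the address arithmetic (storeSplit256 is an illustrative name; memcpy stands in for the store nodes):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Split one 32-byte store into two 16-byte stores.
    void storeSplit256(uint8_t *dst, const uint8_t *val) {
      std::memcpy(dst, val, 16);           // low half, original pointer
      std::memcpy(dst + 16, val + 16, 16); // high half, base + 16
    }

    int main() {
      uint8_t src[32], dst[32];
      for (int i = 0; i != 32; ++i) src[i] = uint8_t(i);
      storeSplit256(dst, src);
      assert(std::memcmp(dst, src, 32) == 0);
      return 0;
    }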
@@ -37763,7 +40046,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SDValue Val = detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget, TLI)) @@ -37867,7 +40149,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); - if ((VT.isVector() || + if (((VT.isVector() && !VT.isFloatingPoint()) || (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && isa(St->getValue()) && !cast(St->getValue())->isVolatile() && @@ -37890,8 +40172,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget.is64Bit() || F64IsLegal) { - MVT LdVT = (Subtarget.is64Bit() && - (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64; + MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); @@ -37965,7 +40246,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. -static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + bool IsCommutative) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; @@ -37979,51 +40262,83 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > // which is A horizontal-op B. - // At least one of the operands should be a vector shuffle. - if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && - RHS.getOpcode() != ISD::VECTOR_SHUFFLE) - return false; - MVT VT = LHS.getSimpleValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); + unsigned NumElts = VT.getVectorNumElements(); + + // TODO - can we make a general helper method that does all of this for us? + auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1, + SmallVectorImpl &ShuffleMask) { + if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (!Op.getOperand(0).isUndef()) + N0 = Op.getOperand(0); + if (!Op.getOperand(1).isUndef()) + N1 = Op.getOperand(1); + ArrayRef Mask = cast(Op)->getMask(); + ShuffleMask.append(Mask.begin(), Mask.end()); + return; + } + bool UseSubVector = false; + if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Op.getOperand(0).getValueType().is256BitVector() && + llvm::isNullConstant(Op.getOperand(1))) { + Op = Op.getOperand(0); + UseSubVector = true; + } + bool IsUnary; + SmallVector SrcOps; + SmallVector SrcShuffleMask; + SDValue BC = peekThroughBitcasts(Op); + if (isTargetShuffle(BC.getOpcode()) && + getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false, + SrcOps, SrcShuffleMask, IsUnary)) { + if (!UseSubVector && SrcShuffleMask.size() == NumElts && + SrcOps.size() <= 2) { + N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue(); + N1 = SrcOps.size() > 1 ? 
SrcOps[1] : SDValue(); + ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end()); + } + if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) && + SrcOps.size() == 1) { + N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op)); + N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op)); + ArrayRef Mask = ArrayRef(SrcShuffleMask).slice(0, NumElts); + ShuffleMask.append(Mask.begin(), Mask.end()); + } + } + }; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask // If LHS is not a shuffle, then pretend it is the identity shuffle: // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> // NOTE: A default initialized SDValue represents an UNDEF of type VT. - unsigned NumElts = VT.getVectorNumElements(); SDValue A, B; - SmallVector LMask(NumElts); - if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { - if (!LHS.getOperand(0).isUndef()) - A = LHS.getOperand(0); - if (!LHS.getOperand(1).isUndef()) - B = LHS.getOperand(1); - ArrayRef Mask = cast(LHS.getNode())->getMask(); - llvm::copy(Mask, LMask.begin()); - } else { - A = LHS; - for (unsigned i = 0; i != NumElts; ++i) - LMask[i] = i; - } + SmallVector LMask; + GetShuffle(LHS, A, B, LMask); // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; - SmallVector RMask(NumElts); - if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { - if (!RHS.getOperand(0).isUndef()) - C = RHS.getOperand(0); - if (!RHS.getOperand(1).isUndef()) - D = RHS.getOperand(1); - ArrayRef Mask = cast(RHS.getNode())->getMask(); - llvm::copy(Mask, RMask.begin()); - } else { + SmallVector RMask; + GetShuffle(RHS, C, D, RMask); + + // At least one of the operands should be a vector shuffle. + unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1); + if (NumShuffles == 0) + return false; + + if (LMask.empty()) { + A = LHS; + for (unsigned i = 0; i != NumElts; ++i) + LMask.push_back(i); + } + + if (RMask.empty()) { C = RHS; for (unsigned i = 0; i != NumElts; ++i) - RMask[i] = i; + RMask.push_back(i); } // If A and B occur in reverse order in RHS, then canonicalize by commuting @@ -38072,6 +40387,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. + + if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) + return false; + + LHS = DAG.getBitcast(VT, LHS); + RHS = DAG.getBitcast(VT, RHS); return true; } @@ -38088,8 +40409,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal add/sub from adds/subs of shuffles. if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, IsFadd) && - shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget)) + isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); return SDValue(); @@ -38105,7 +40425,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const SDLoc &DL) { assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode"); SDValue Src = N->getOperand(0); - unsigned Opcode = Src.getOpcode(); + unsigned SrcOpcode = Src.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); @@ -38123,14 +40443,17 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, return true; // See if this is a single use constant which can be constant folded. 
-    SDValue BC = peekThroughOneUseBitcasts(Op);
-    return ISD::isBuildVectorOfConstantSDNodes(BC.getNode());
+    // NOTE: We don't peek through bitcasts here because there is currently
+    // no support for constant folding truncate+bitcast+vector_of_constants. So
+    // we'll just end up with a truncate on both operands which will
+    // get turned back into (truncate (binop)) causing an infinite loop.
+    return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
   };
 
   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
     SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
     SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
-    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+    return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
   };
 
   // Don't combine if the operation has other uses.
@@ -38145,13 +40468,13 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
   // In most cases it's only worth pre-truncating if we're only facing the cost
   // of one truncation.
   // i.e. if one of the inputs will constant fold or the input is repeated.
-  switch (Opcode) {
+  switch (SrcOpcode) {
   case ISD::AND:
   case ISD::XOR:
   case ISD::OR: {
     SDValue Op0 = Src.getOperand(0);
     SDValue Op1 = Src.getOperand(1);
-    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+    if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
         (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
       return TruncateArithmetic(Op0, Op1);
     break;
   }
 
   case ISD::MUL:
     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
     // better to truncate if we have the chance.
-    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
-        !TLI.isOperationLegal(Opcode, SrcVT))
+    if (SrcVT.getScalarType() == MVT::i64 &&
+        TLI.isOperationLegal(SrcOpcode, VT) &&
+        !TLI.isOperationLegal(SrcOpcode, SrcVT))
       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
     LLVM_FALLTHROUGH;
   case ISD::ADD: {
     SDValue Op0 = Src.getOperand(0);
     SDValue Op1 = Src.getOperand(1);
-    if (TLI.isOperationLegal(Opcode, VT) &&
+    if (TLI.isOperationLegal(SrcOpcode, VT) &&
         (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
       return TruncateArithmetic(Op0, Op1);
     break;
   }
@@ -38177,7 +40501,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
     // truncatable to avoid interfering with combineSubToSubus.
     SDValue Op0 = Src.getOperand(0);
     SDValue Op1 = Src.getOperand(1);
-    if (TLI.isOperationLegal(Opcode, VT) &&
+    if (TLI.isOperationLegal(SrcOpcode, VT) &&
         (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
       return TruncateArithmetic(Op0, Op1);
     break;
@@ -38188,36 +40512,19 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
 }
 
 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
+/// e.g. trunc <8 x i32> X to <8 x i16> -->
+/// MaskX = X & 0xffff (clear high bits to prevent saturation)
+/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
   SDValue In = N->getOperand(0);
   EVT InVT = In.getValueType();
-  EVT InSVT = InVT.getVectorElementType();
   EVT OutVT = N->getValueType(0);
-  EVT OutSVT = OutVT.getVectorElementType();
-
-  // Split a long vector into vectors of legal type and mask to unset all bits
-  // that won't appear in the result to prevent saturation.
-  // TODO - we should be doing this at the maximum legal size but this is
-  // causing regressions where we're concatenating back to max width just to
-  // perform the AND and then extracting back again.....
-  unsigned NumSubRegs = InVT.getSizeInBits() / 128;
-  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
-  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
-  SmallVector<SDValue, 8> SubVecs(NumSubRegs);
-
-  APInt Mask =
-      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
-  SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
-
-  for (unsigned i = 0; i < NumSubRegs; i++) {
-    SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
-                              DAG.getIntPtrConstant(i * NumSubRegElts, DL));
-    SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
-  }
-  In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
+  APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
+                                    OutVT.getScalarSizeInBits());
+  In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
 
   return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
 }
@@ -38580,16 +40887,23 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
   if (N->getOpcode() == ISD::FNEG)
     return N->getOperand(0);
 
+  unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
+
   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
-  auto VT = Op->getValueType(0);
+  EVT VT = Op->getValueType(0);
+  // Make sure the element size doesn't change.
+  if (VT.getScalarSizeInBits() != ScalarSize)
+    return SDValue();
+
   if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
     // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
     // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
     if (!SVOp->getOperand(1).isUndef())
       return SDValue();
     if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode()))
-      return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
-                                  SVOp->getMask());
+      if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
+        return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
                                     SVOp->getMask());
     return SDValue();
   }
   unsigned Opc = Op.getOpcode();
@@ -38601,19 +40915,17 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
     if (!InsVector.isUndef())
       return SDValue();
     if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode()))
-      return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
-                         NegInsVal, Op.getOperand(2));
+      if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
+        return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
+                           NegInsVal, Op.getOperand(2));
     return SDValue();
   }
 
   if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
     return SDValue();
 
-  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
-  if (!Op1.getValueType().isFloatingPoint())
-    return SDValue();
-
-  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op0 = Op.getOperand(0);
 
   // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
   // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
@@ -38625,7 +40937,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
   SmallVector<APInt, 16> EltBits;
   // Extract constant bits and see if they are all sign bit masks. Ignore the
   // undef elements.
-  if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(),
+  if (getTargetConstantBitsFromNode(Op1, ScalarSize,
                                     UndefElts, EltBits,
                                     /* AllowWholeUndefs */ true,
                                     /* AllowPartialUndefs */ false)) {
@@ -38922,13 +41234,12 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   if (Subtarget.useSoftFloat())
     return SDValue();
 
-  // TODO: If an operand is already known to be a NaN or not a NaN, this
-  //       should be an optional swap and FMAX/FMIN.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   EVT VT = N->getValueType(0);
-  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
-        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
+        (Subtarget.hasSSE2() && VT == MVT::f64) ||
+        (VT.isVector() && TLI.isTypeLegal(VT))))
     return SDValue();
 
   SDValue Op0 = N->getOperand(0);
@@ -38941,13 +41252,20 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
 
+  // If one of the operands is known non-NaN use the native min/max instructions
+  // with the non-NaN input as second operand.
+  if (DAG.isKnownNeverNaN(Op1))
+    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+  if (DAG.isKnownNeverNaN(Op0))
+    return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
+
   // If we have to respect NaN inputs, this takes at least 3 instructions.
   // Favor a library call when operating on a scalar and minimizing code size.
-  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+  if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();
 
-  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
-      DAG.getDataLayout(), *DAG.getContext(), VT);
+  EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                         VT);
 
   // There are 4 possibilities involving NaN inputs, and these are the required
   // outputs:
@@ -38987,6 +41305,69 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
                                      KnownZero, DCI))
     return SDValue(N, 0);
 
+  // Convert a full vector load into vzload when not all bits are needed.
+  SDValue In = N->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+    // Unless the load is volatile.
+    if (!LN->isVolatile()) {
+      SDLoc dl(N);
+      unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+      MVT MemVT = MVT::getIntegerVT(NumBits);
+      MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+      SDValue VZLoad =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+                                  LN->getPointerInfo(),
+                                  LN->getAlignment(),
+                                  LN->getMemOperand()->getFlags());
+      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+                                    DAG.getBitcast(InVT, VZLoad));
+      DCI.CombineTo(N, Convert);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+      return SDValue(N, 0);
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+
+  // Convert a full vector load into vzload when not all bits are needed.
+  SDValue In = N->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+    // Unless the load is volatile.
+    if (!LN->isVolatile()) {
+      SDLoc dl(N);
+      unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+      MVT MemVT = MVT::getFloatingPointVT(NumBits);
+      MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+      SDValue VZLoad =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+                                  LN->getPointerInfo(),
+                                  LN->getAlignment(),
+                                  LN->getMemOperand()->getFlags());
+      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+                                    DAG.getBitcast(InVT, VZLoad));
+      DCI.CombineTo(N, Convert);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+      return SDValue(N, 0);
    }
  }
 
   return SDValue();
 }
 
@@ -39005,18 +41386,14 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
     return DAG.getConstant(0, SDLoc(N), VT);
 
   // Turn ANDNP back to AND if input is inverted.
-  if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) {
-    return DAG.getNode(ISD::AND, SDLoc(N), VT,
-                       N->getOperand(0).getOperand(0), N->getOperand(1));
-  }
+  if (SDValue Not = IsNOT(N->getOperand(0), DAG))
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
+                       N->getOperand(1));
 
   // Attempt to recursively combine a bitmask ANDNP with shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
     SDValue Op(N, 0);
-    if (SDValue Res = combineX86ShufflesRecursively(
-            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
-            /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
       return Res;
   }
 
@@ -39039,18 +41416,24 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
 
 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
+  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+  EVT DstVT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
 
-  if (ExtraVT != MVT::i16)
+  if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
     return SDValue();
 
-  // Look through single use any_extends.
- if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse()) + // Look through single use any_extends / truncs. + SDValue IntermediateBitwidthOp; + if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) && + N0.hasOneUse()) { + IntermediateBitwidthOp = N0; N0 = N0.getOperand(0); + } // See if we have a single use cmov. if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse()) @@ -39066,21 +41449,37 @@ static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); - // If we looked through an any_extend above, add one to the constants. - if (N0.getValueType() != VT) { - CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0); - CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1); + // If we looked through an any_extend/trunc above, add one to the constants. + if (IntermediateBitwidthOp) { + unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode(); + CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0); + CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1); } - CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1); - CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1); + CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1); + CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1); - return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1, - N0.getOperand(2), N0.getOperand(3)); + EVT CMovVT = DstVT; + // We do not want i16 CMOV's. Promote to i32 and truncate afterwards. + if (DstVT == MVT::i16) { + CMovVT = MVT::i32; + CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0); + CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1); + } + + SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1, + N0.getOperand(2), N0.getOperand(3)); + + if (CMovVT != DstVT) + CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov); + + return CMov; } static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); + if (SDValue V = combineSextInRegCmov(N, DAG)) return V; @@ -39336,6 +41735,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, return SDValue(); unsigned Opcode = N->getOpcode(); + // TODO - add ANY_EXTEND support. if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) return SDValue(); if (!DCI.isBeforeLegalizeOps()) @@ -39382,13 +41782,13 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { - EVT InVT = N.getValueType(); - EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), - Size / InVT.getScalarSizeInBits()); - SmallVector Opnds(Size / InVT.getSizeInBits(), - DAG.getUNDEF(InVT)); + EVT SrcVT = N.getValueType(); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), + Size / SrcVT.getScalarSizeInBits()); + SmallVector Opnds(Size / SrcVT.getSizeInBits(), + DAG.getUNDEF(SrcVT)); Opnds[0] = N; - return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds); }; // If target-size is less than 128-bits, extend to a type that would extend @@ -39410,8 +41810,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, (VT.is256BitVector() && Subtarget.hasAVX()) || (VT.is512BitVector() && Subtarget.useAVX512Regs())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); - Opcode = Opcode == ISD::SIGN_EXTEND ? 
ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG; + Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); return DAG.getNode(Opcode, DL, VT, ExOp); } @@ -39421,9 +41820,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); - unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG; - + unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode); SmallVector Opnds; for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, @@ -39457,7 +41854,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); // Only do this combine with AVX512 for vector extends. - if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC) + if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC) return SDValue(); // Only combine legal element types. @@ -39473,7 +41870,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since // that's the only integer compares with we have. - ISD::CondCode CC = cast(N0->getOperand(2))->get(); + ISD::CondCode CC = cast(N0.getOperand(2))->get(); if (ISD::isUnsignedIntSetCC(CC)) return SDValue(); @@ -39629,6 +42026,10 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, if (!NegVal) return SDValue(); + // FIXME: Should we bitcast instead? + if (NegVal.getValueType() != VT) + return SDValue(); + unsigned NewOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); @@ -39705,6 +42106,20 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) return R; + // TODO: Combine with any target/faux shuffle. + if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 && + VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + unsigned NumSrcElts = N00.getValueType().getVectorNumElements(); + unsigned NumSrcEltBits = N00.getScalarValueSizeInBits(); + APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2); + if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) && + (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) { + return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128); + } + } + return SDValue(); } @@ -39734,9 +42149,14 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, if (isNullConstant(Y) && !IsOrXorXorCCZero) return SDValue(); - // Bail out if we know that this is not really just an oversized integer. - if (peekThroughBitcasts(X).getValueType() == MVT::f128 || - peekThroughBitcasts(Y).getValueType() == MVT::f128) + // Don't perform this combine if constructing the vector will be expensive. + auto IsVectorBitCastCheap = [](SDValue X) { + X = peekThroughBitcasts(X); + return isa(X) || X.getValueType().isVector() || + X.getOpcode() == ISD::LOAD; + }; + if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) && + !IsOrXorXorCCZero) return SDValue(); // TODO: Use PXOR + PTEST for SSE4.1 or later? 
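The vector-sized equality combine above ends in a movmsk(pcmpeq)-against-all-ones test; written directly with SSE2 intrinsics the shape is easy to see. A sketch assuming an SSE2 host (equal16 is an illustrative name; this is the PCMPEQB+PMOVMSKB form, not the PTEST variant the TODO mentions):

    #include <emmintrin.h> // SSE2
    #include <cassert>
    #include <cstring>

    // True iff two 16-byte buffers are bitwise equal.
    static bool equal16(const void *a, const void *b) {
      __m128i va = _mm_loadu_si128(static_cast<const __m128i *>(a));
      __m128i vb = _mm_loadu_si128(static_cast<const __m128i *>(b));
      __m128i eq = _mm_cmpeq_epi8(va, vb);    // 0xFF per equal byte lane
      return _mm_movemask_epi8(eq) == 0xFFFF; // all 16 sign bits set
    }

    int main() {
      char x[16] = "fifteen chars!!", y[16];
      std::memcpy(y, x, 16);
      assert(equal16(x, y));
      y[7] ^= 1;
      assert(!equal16(x, y));
      return 0;
    }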
@@ -39873,66 +42293,44 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = N->getSimpleValueType(0); + unsigned NumBits = VT.getScalarSizeInBits(); + unsigned NumElts = SrcVT.getVectorNumElements(); // Perform constant folding. if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) { - assert(VT== MVT::i32 && "Unexpected result type"); + assert(VT == MVT::i32 && "Unexpected result type"); APInt Imm(32, 0); for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) { - SDValue In = Src.getOperand(Idx); - if (!In.isUndef() && - cast(In)->getAPIntValue().isNegative()) + if (!Src.getOperand(Idx).isUndef() && + Src.getConstantOperandAPInt(Idx).isNegative()) Imm.setBit(Idx); } return DAG.getConstant(Imm, SDLoc(N), VT); } // Look through int->fp bitcasts that don't change the element width. - if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse() && - SrcVT.isFloatingPoint() && - Src.getOperand(0).getValueType() == - EVT(SrcVT).changeVectorElementTypeToInteger()) - Src = Src.getOperand(0); + unsigned EltWidth = SrcVT.getScalarSizeInBits(); + if (Src.getOpcode() == ISD::BITCAST && + Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) + return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); + + // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results + // with scalar comparisons. + if (SDValue NotSrc = IsNOT(Src, DAG)) { + SDLoc DL(N); + APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts); + NotSrc = DAG.getBitcast(SrcVT, NotSrc); + return DAG.getNode(ISD::XOR, DL, VT, + DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc), + DAG.getConstant(NotMask, DL, VT)); + } // Simplify the inputs. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits())); + APInt DemandedMask(APInt::getAllOnesValue(NumBits)); if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); - // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (X << C)). - // Only do this when the setcc input and output types are the same and the - // setcc and the 'and' node have a single use. - // FIXME: Support 256-bits with AVX1. The movmsk is split, but the and isn't. - APInt SplatVal; - if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() && - Src.getOperand(0).getValueType() == Src.getValueType() && - cast(Src.getOperand(2))->get() == ISD::SETNE && - ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) && - Src.getOperand(0).getOpcode() == ISD::AND) { - SDValue And = Src.getOperand(0); - if (And.hasOneUse() && - ISD::isConstantSplatVector(And.getOperand(1).getNode(), SplatVal) && - SplatVal.isPowerOf2()) { - MVT VT = Src.getSimpleValueType(); - unsigned BitWidth = VT.getScalarSizeInBits(); - unsigned ShAmt = BitWidth - SplatVal.logBase2() - 1; - SDLoc DL(And); - SDValue X = And.getOperand(0); - // If the element type is i8, we need to bitcast to i16 to use a legal - // shift. If we wait until lowering we end up with an extra and to bits - // from crossing the 8-bit elements, but we don't care about that here. 
-      if (VT.getVectorElementType() == MVT::i8) {
-        VT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
-        X = DAG.getBitcast(VT, X);
-      }
-      SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
-                                DAG.getConstant(ShAmt, DL, VT));
-      SDValue Cast = DAG.getBitcast(SrcVT, Shl);
-      return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), N->getValueType(0), Cast);
-    }
-  }
-
   return SDValue();
 }
 
@@ -40065,8 +42463,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
   // make the transformation for non-constant splats as well, but it's unclear
   // that would be a benefit as it would not eliminate any operations, just
   // perform one more step in scalar code before moving to the vector unit.
-  if (BuildVectorSDNode *BV =
-          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+  if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
     // Bail out if the vector isn't a constant.
     if (!BV->isConstant())
       return SDValue();
@@ -40088,6 +42485,41 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
   return SDValue();
 }
 
+/// If we are converting a value to floating-point, try to replace scalar
+/// truncate of an extracted vector element with a bitcast. This tries to keep
+/// the sequence on XMM registers rather than moving between vector and GPRs.
+static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
+  // TODO: This is currently only used by combineSIntToFP, but it is generalized
+  // to allow being called by any similar cast opcode.
+  // TODO: Consider merging this into lowering: vectorizeExtractedCast().
+  SDValue Trunc = N->getOperand(0);
+  if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  SDValue ExtElt = Trunc.getOperand(0);
+  if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isNullConstant(ExtElt.getOperand(1)))
+    return SDValue();
+
+  EVT TruncVT = Trunc.getValueType();
+  EVT SrcVT = ExtElt.getValueType();
+  unsigned DestWidth = TruncVT.getSizeInBits();
+  unsigned SrcWidth = SrcVT.getSizeInBits();
+  if (SrcWidth % DestWidth != 0)
+    return SDValue();
+
+  // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
+  EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
+  unsigned VecWidth = SrcVecVT.getSizeInBits();
+  unsigned NumElts = VecWidth / DestWidth;
+  EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
+  SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
+  SDLoc DL(N);
+  SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
+                                  BitcastVec, ExtElt.getOperand(1));
+  return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
+}
+
 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
   SDValue Op0 = N->getOperand(0);
@@ -40181,6 +42613,10 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
         return FILDChain;
     }
   }
+
+  if (SDValue V = combineToFPTruncExtElt(N, DAG))
+    return V;
+
   return SDValue();
 }
 
@@ -40267,13 +42703,13 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
   if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
       Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
       onlyZeroFlagUsed(SDValue(N, 0))) {
-    EVT VT = Op.getValueType();
     unsigned BitWidth = VT.getSizeInBits();
-    unsigned ShAmt = Op.getConstantOperandVal(1);
-    if (ShAmt < BitWidth) { // Avoid undefined shifts.
+    const APInt &ShAmt = Op.getConstantOperandAPInt(1);
+    if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
+      unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
       APInt Mask = Op.getOpcode() == ISD::SRL
-                       ?
       APInt Mask = Op.getOpcode() == ISD::SRL
-                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
-                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+                       ? APInt::getHighBitsSet(BitWidth, MaskBits)
+                       : APInt::getLowBitsSet(BitWidth, MaskBits);
       if (Mask.isSignedIntN(32)) {
         Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
                          DAG.getConstant(Mask, dl, VT));
@@ -40283,7 +42719,6 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
     }
   }
 
-  // Look for a truncate with a single use.
   if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
     return SDValue();
@@ -40337,8 +42772,42 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
   return Op.getValue(1);
 }
 
+static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
+         "Expected X86ISD::ADD or X86ISD::SUB");
+
+  SDLoc DL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  MVT VT = LHS.getSimpleValueType();
+  unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
+
+  // If we don't use the flag result, simplify back to a generic ADD/SUB.
+  if (!N->hasAnyUseOfValue(1)) {
+    SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
+    return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
+  }
+
+  // Fold any similar generic ADD/SUB opcodes to reuse this node.
+  auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+    SDValue Ops[] = {N0, N1};
+    SDVTList VTs = DAG.getVTList(N->getValueType(0));
+    if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+      SDValue Op(N, 0);
+      if (Negate)
+        Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
+      DCI.CombineTo(GenericAddSub, Op);
+    }
+  };
+  MatchGeneric(LHS, RHS, false);
+  MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+  return SDValue();
+}
+
 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
-  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
     MVT VT = N->getSimpleValueType(0);
     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
     return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
@@ -40346,6 +42815,15 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
                        Flags);
   }
 
+  // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
+  // iff the flag result is dead.
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
+      !N->hasAnyUseOfValue(1))
+    return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
+                       Op0.getOperand(1), N->getOperand(2));
+
   return SDValue();
 }
 
@@ -40372,7 +42850,7 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
     return DCI.CombineTo(N, Res1, CarryOut);
   }
 
-  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
    MVT VT = N->getSimpleValueType(0);
     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
     return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
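
// [Illustrative aside, not part of the patch] The SBB(SUB(X,Y),0,Carry) ->
// SBB(X,Y,Carry) fold above is the value-level identity
// "(X - Y) - 0 - c == X - Y - c"; the DAG fold additionally requires the
// outer SBB's EFLAGS result to be dead. A standalone sketch:

#include <cassert>
#include <cstdint>

static uint32_t sbb32(uint32_t A, uint32_t B, bool Borrow) {
  return A - B - (Borrow ? 1u : 0u); // value result of SBB, flags ignored
}

int main() {
  for (bool C : {false, true})
    for (uint32_t X : {0u, 1u, 0x1234u, 0xFFFFFFFFu})
      for (uint32_t Y : {0u, 7u, 0x80000000u})
        assert(sbb32(X - Y, 0u, C) == sbb32(X, Y, C));
}
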
@@ -40468,7 +42946,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
   // Do not flip "e > c", where "c" is a constant, because Cmp instruction
   // cannot take an immediate as its first operand.
   //
-  if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+  if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
       EFLAGS.getValueType().isInteger() &&
       !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
     SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
@@ -40575,8 +43053,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
   // Madd vector size is half of the original vector size
   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                            ArrayRef<SDValue> Ops) {
-    MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
-    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+    MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+    return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
   };
 
   auto BuildPMADDWD = [&](SDValue Mul) {
@@ -40631,10 +43109,10 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // We know N is a reduction add, which means one of its operands is a phi.
-  // To match SAD, we need the other operand to be a vector select.
-  if (Op0.getOpcode() != ISD::VSELECT)
+  // To match SAD, we need the other operand to be a ABS.
+  if (Op0.getOpcode() != ISD::ABS)
     std::swap(Op0, Op1);
-  if (Op0.getOpcode() != ISD::VSELECT)
+  if (Op0.getOpcode() != ISD::ABS)
     return SDValue();
 
   auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
@@ -40673,7 +43151,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
   Op0 = BuildPSADBW(SadOp0, SadOp1);
 
   // It's possible we have a sad on the other side too.
-  if (Op1.getOpcode() == ISD::VSELECT &&
+  if (Op1.getOpcode() == ISD::ABS &&
       detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
     Op1 = BuildPSADBW(SadOp0, SadOp1);
   }
@@ -40815,39 +43293,6 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
                           PMADDBuilder);
 }
 
-// Try to turn (add (umax X, C), -C) into (psubus X, C)
-static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG,
-                                 const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasSSE2())
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // psubus is available in SSE2 for i8 and i16 vectors.
-  if (!VT.isVector() || VT.getVectorNumElements() < 2 ||
-      !isPowerOf2_32(VT.getVectorNumElements()) ||
-      !(VT.getVectorElementType() == MVT::i8 ||
-        VT.getVectorElementType() == MVT::i16))
-    return SDValue();
-
-  SDValue Op0 = N->getOperand(0);
-  SDValue Op1 = N->getOperand(1);
-  if (Op0.getOpcode() != ISD::UMAX)
-    return SDValue();
-
-  // The add should have a constant that is the negative of the max.
-  // TODO: Handle build_vectors with undef elements.
-  auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
-    return Max->getAPIntValue() == (-Op->getAPIntValue());
-  };
-  if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT))
-    return SDValue();
-
-  SDLoc DL(N);
-  return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0),
-                     Op0.getOperand(1));
-}
-
 // Attempt to turn this pattern into PMADDWD.
 // (mul (add (zext (build_vector)), (zext (build_vector))),
 //      (add (zext (build_vector)), (zext (build_vector)))
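
// [Illustrative aside, not part of the patch] The "X-Y -> X+~Y+1" rewrite in
// combineSub below is the two's-complement identity -Y == ~Y + 1. A
// standalone check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
    for (uint32_t Y : {0u, 42u, 0x80000000u, 0xFFFFFFFFu})
      assert(X - Y == X + ~Y + 1u);
}
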
@@ -40957,12 +43402,12 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
                            ArrayRef<SDValue> Ops) {
     // Shrink by adding truncate nodes and let DAGCombine fold with the
     // sources.
-    EVT InVT = Ops[0].getValueType();
-    assert(InVT.getScalarType() == MVT::i16 &&
+    EVT OpVT = Ops[0].getValueType();
+    assert(OpVT.getScalarType() == MVT::i16 &&
            "Unexpected scalar element type");
-    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+    assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
-                                 InVT.getVectorNumElements() / 2);
+                                 OpVT.getVectorNumElements() / 2);
     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
   };
   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
@@ -40990,8 +43435,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   // Try to synthesize horizontal adds from adds of shuffles.
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
-      shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+      Subtarget.hasSSSE3() &&
+      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
     auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
@@ -41003,9 +43448,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineIncDecVector(N, DAG))
     return V;
 
-  if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget))
-    return V;
-
   return combineAddOrSubToADCOrSBB(N, DAG);
 }
 
@@ -41110,7 +43552,7 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   // X-Y -> X+~Y+1, saving one register.
   if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
       isa<ConstantSDNode>(Op1.getOperand(1))) {
-    APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+    const APInt &XorC = Op1.getConstantOperandAPInt(1);
     EVT VT = Op0.getValueType();
     SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                  Op1.getOperand(0),
@@ -41124,8 +43566,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
-      shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+      Subtarget.hasSSSE3() &&
+      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
     auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
@@ -41159,6 +43601,149 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Helper that combines an array of subvector ops as if they were the operands
+/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
+/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
+static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
+                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const X86Subtarget &Subtarget) {
+  assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
+
+  if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+    return DAG.getUNDEF(VT);
+
+  if (llvm::all_of(Ops, [](SDValue Op) {
+        return ISD::isBuildVectorAllZeros(Op.getNode());
+      }))
+    return getZeroVector(VT, Subtarget, DAG, DL);
+
+  SDValue Op0 = Ops[0];
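
// [Illustrative aside, not part of the patch] The subvector-load fold just
// below merges adjacent narrow loads into one wide load. A byte-level model
// of why that is sound (the real code additionally checks that the wide
// memory access is legal and fast for the target):

#include <cassert>
#include <cstdint>
#include <cstring>

struct V128 { uint8_t B[16]; };
struct V256 { uint8_t B[32]; };

static V128 load128(const uint8_t *P) { V128 V; std::memcpy(V.B, P, 16); return V; }
static V256 load256(const uint8_t *P) { V256 V; std::memcpy(V.B, P, 32); return V; }

static V256 concat(V128 Lo, V128 Hi) { // concat_vectors of two halves
  V256 V;
  std::memcpy(V.B, Lo.B, 16);
  std::memcpy(V.B + 16, Hi.B, 16);
  return V;
}

int main() {
  uint8_t Buf[32];
  for (int I = 0; I != 32; ++I)
    Buf[I] = uint8_t(I * 7 + 1);
  V256 A = concat(load128(Buf), load128(Buf + 16));
  V256 B = load256(Buf);
  assert(std::memcmp(A.B, B.B, 32) == 0);
}
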
+
+  // Fold subvector loads into one.
+  // If needed, look through bitcasts to get to the load.
+  if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
+    bool Fast;
+    const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+    if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                                *FirstLd->getMemOperand(), &Fast) &&
+        Fast) {
+      if (SDValue Ld =
+              EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
+        return Ld;
+    }
+  }
+
+  // Repeated subvectors.
+  if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
+    // If this broadcast/subv_broadcast is inserted into both halves, use a
+    // larger broadcast/subv_broadcast.
+    if (Op0.getOpcode() == X86ISD::VBROADCAST ||
+        Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
+      return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
+
+    // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
+    if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
+        (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
+      return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
+                                     Op0.getOperand(0),
+                                     DAG.getIntPtrConstant(0, DL)));
+
+    // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
+    if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+        (Subtarget.hasAVX2() ||
+         (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+        Op0.getOperand(0).getValueType() == VT.getScalarType())
+      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
+  }
+
+  bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
+
+  // Repeated opcode.
+  // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
+  // but it currently struggles with different vector widths.
+  if (llvm::all_of(Ops, [Op0](SDValue Op) {
+        return Op.getOpcode() == Op0.getOpcode();
+      })) {
+    unsigned NumOps = Ops.size();
+    switch (Op0.getOpcode()) {
+    case X86ISD::PSHUFHW:
+    case X86ISD::PSHUFLW:
+    case X86ISD::PSHUFD:
+      if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
+          Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+        SmallVector<SDValue, 2> Src;
+        for (unsigned i = 0; i != NumOps; ++i)
+          Src.push_back(Ops[i].getOperand(0));
+        return DAG.getNode(Op0.getOpcode(), DL, VT,
+                           DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+                           Op0.getOperand(1));
+      }
+      LLVM_FALLTHROUGH;
+    case X86ISD::VPERMILPI:
+      // TODO - add support for vXf64/vXi64 shuffles.
+      if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
+          Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+        SmallVector<SDValue, 2> Src;
+        for (unsigned i = 0; i != NumOps; ++i)
+          Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
+        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
+        Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
+                          Op0.getOperand(1));
+        return DAG.getBitcast(VT, Res);
+      }
+      break;
+    case X86ISD::PACKUS:
+      if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
+        SmallVector<SDValue, 2> LHS, RHS;
+        for (unsigned i = 0; i != NumOps; ++i) {
+          LHS.push_back(Ops[i].getOperand(0));
+          RHS.push_back(Ops[i].getOperand(1));
+        }
+        MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
+        SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+                                 NumOps * SrcVT.getVectorNumElements());
+        return DAG.getNode(Op0.getOpcode(), DL, VT,
+                           DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
+                           DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+      }
+      break;
+    }
+  }
+
+  // If we're inserting all zeros into the upper half, change this to
+  // an insert into an all zeros vector.
We will match this to a move + // with implicit upper bit zeroing during isel. + if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode())) + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + getZeroVector(VT, Subtarget, DAG, DL), Ops[0], + DAG.getIntPtrConstant(0, DL)); + + return SDValue(); +} + +static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + EVT SrcVT = N->getOperand(0).getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Don't do anything for i1 vectors. + if (VT.getVectorElementType() == MVT::i1) + return SDValue(); + + if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) { + SmallVector Ops(N->op_begin(), N->op_end()); + if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG, + DCI, Subtarget)) + return R; + } + + return SDValue(); +} + static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -41173,19 +43758,23 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); - unsigned IdxVal = N->getConstantOperandVal(2); + uint64_t IdxVal = N->getConstantOperandVal(2); MVT SubVecVT = SubVec.getSimpleValueType(); - if (ISD::isBuildVectorAllZeros(Vec.getNode())) { - // Inserting zeros into zeros is a nop. - if (ISD::isBuildVectorAllZeros(SubVec.getNode())) - return getZeroVector(OpVT, Subtarget, DAG, dl); + if (Vec.isUndef() && SubVec.isUndef()) + return DAG.getUNDEF(OpVT); + + // Inserting undefs/zeros into zeros/undefs is a zero vector. + if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) && + (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode()))) + return getZeroVector(OpVT, Subtarget, DAG, dl); + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { // If we're inserting into a zero vector and then into a larger zero vector, // just insert into the larger zero vector directly. if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { - unsigned Idx2Val = SubVec.getConstantOperandVal(2); + uint64_t Idx2Val = SubVec.getConstantOperandVal(2); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVec.getOperand(1), @@ -41197,30 +43786,16 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // least as large as the original insertion. Just insert the original // subvector into a zero vector. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && - SubVec.getConstantOperandVal(1) == 0 && + SubVec.getConstantOperandAPInt(1) == 0 && SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Ins = SubVec.getOperand(0); - if (Ins.getConstantOperandVal(2) == 0 && + if (Ins.getConstantOperandAPInt(2) == 0 && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), Ins.getOperand(1), N->getOperand(2)); } - - // If we're inserting a bitcast into zeros, rewrite the insert and move the - // bitcast to the other side. This helps with detecting zero extending - // during isel. - // TODO: Is this useful for other indices than 0? 
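
// [Illustrative aside, not part of the patch] The "insert zeros into the
// upper half == insert into an all-zeros vector" rewrite above, modeled on
// plain arrays: concatenating X with zeros yields the same value as writing
// X into lane 0 of a zeroed wide vector (the form isel matches to a 128-bit
// move with implicit zeroing of the upper bits).

#include <array>
#include <cassert>

using V4 = std::array<float, 4>;
using V8 = std::array<float, 8>;

static V8 concat(V4 Lo, V4 Hi) {
  V8 R{};
  for (int I = 0; I != 4; ++I) {
    R[I] = Lo[I];
    R[I + 4] = Hi[I];
  }
  return R;
}

static V8 insertLow(V8 Base, V4 Sub) { // insert_subvector at index 0
  for (int I = 0; I != 4; ++I)
    Base[I] = Sub[I];
  return Base;
}

int main() {
  V4 X{1, 2, 3, 4}, Zero{};
  assert(concat(X, Zero) == insertLow(V8{}, X));
}
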
- if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) { - MVT CastVT = SubVec.getOperand(0).getSimpleValueType(); - unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits(); - MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems); - SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, - DAG.getBitcast(NewVT, Vec), - SubVec.getOperand(0), N->getOperand(2)); - return DAG.getBitcast(OpVT, Insert); - } } // Stop here if this is an i1 vector. @@ -41248,77 +43823,92 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, } } - // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte - // load: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr + 16), Elts/2) - // --> load32 addr - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr + 32), Elts/2) - // --> load64 addr - // or a 16-byte or 32-byte broadcast: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr), Elts/2) - // --> X86SubVBroadcast(load16 addr) - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr), Elts/2) - // --> X86SubVBroadcast(load32 addr) + // Match concat_vector style patterns. + SmallVector SubVectorOps; + if (collectConcatOps(N, SubVectorOps)) + if (SDValue Fold = + combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) + return Fold; + + // If we are inserting into both halves of the vector, the starting vector + // should be undef. If it isn't, make it so. Only do this if the early insert + // has no other uses. + // TODO: Should this be a generic DAG combine? + // TODO: Why doesn't SimplifyDemandedVectorElts catch this? if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { - if (isNullConstant(Vec.getOperand(2))) { - SDValue SubVec2 = Vec.getOperand(1); - // If needed, look through bitcasts to get to the load. - if (auto *FirstLd = dyn_cast(peekThroughBitcasts(SubVec2))) { - bool Fast; - unsigned Alignment = FirstLd->getAlignment(); - unsigned AS = FirstLd->getAddressSpace(); - const X86TargetLowering *TLI = Subtarget.getTargetLowering(); - if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - OpVT, AS, Alignment, &Fast) && Fast) { - SDValue Ops[] = {SubVec2, SubVec}; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, - Subtarget, false)) - return Ld; - } - } - // If lower/upper loads are the same and there's no other use of the lower - // load, then splat the loaded value with a broadcast. - if (auto *Ld = dyn_cast(peekThroughOneUseBitcasts(SubVec2))) - if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && Vec.hasOneUse()) - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); - - // If this is subv_broadcast insert into both halves, use a larger - // subv_broadcast. - if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, - SubVec.getOperand(0)); - - // If we're inserting all zeros into the upper half, change this to - // an insert into an all zeros vector. We will match this to a move - // with implicit upper bit zeroing during isel. 
- if (ISD::isBuildVectorAllZeros(SubVec.getNode())) - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, - getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2, - Vec.getOperand(2)); + OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 && + isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() && + Vec.hasOneUse()) { + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), + Vec.getOperand(1), Vec.getOperand(2)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, + N->getOperand(2)); + } - // If we are inserting into both halves of the vector, the starting - // vector should be undef. If it isn't, make it so. Only do this if the - // the early insert has no other uses. - // TODO: Should this be a generic DAG combine? - if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) { - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), - SubVec2, Vec.getOperand(2)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, - N->getOperand(2)); + // If this is a broadcast insert into an upper undef, use a larger broadcast. + if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) + return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); - } - } + return SDValue(); +} + +/// If we are extracting a subvector of a vector select and the select condition +/// is composed of concatenated vectors, try to narrow the select width. This +/// is a common pattern for AVX1 integer code because 256-bit selects may be +/// legal, but there is almost no integer math/logic available for 256-bit. +/// This function should only be called with legal types (otherwise, the calls +/// to get simple value types will assert). +static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { + SDValue Sel = peekThroughBitcasts(Ext->getOperand(0)); + SmallVector CatOps; + if (Sel.getOpcode() != ISD::VSELECT || + !collectConcatOps(Sel.getOperand(0).getNode(), CatOps)) + return SDValue(); + + // Note: We assume simple value types because this should only be called with + // legal operations/types. + // TODO: This can be extended to handle extraction to 256-bits. + MVT VT = Ext->getSimpleValueType(0); + if (!VT.is128BitVector()) + return SDValue(); + + MVT SelCondVT = Sel.getOperand(0).getSimpleValueType(); + if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector()) + return SDValue(); + + MVT WideVT = Ext->getOperand(0).getSimpleValueType(); + MVT SelVT = Sel.getSimpleValueType(); + assert((SelVT.is256BitVector() || SelVT.is512BitVector()) && + "Unexpected vector type with legal operations"); + + unsigned SelElts = SelVT.getVectorNumElements(); + unsigned CastedElts = WideVT.getVectorNumElements(); + unsigned ExtIdx = cast(Ext->getOperand(1))->getZExtValue(); + if (SelElts % CastedElts == 0) { + // The select has the same or more (narrower) elements than the extract + // operand. The extraction index gets scaled by that factor. + ExtIdx *= (SelElts / CastedElts); + } else if (CastedElts % SelElts == 0) { + // The select has less (wider) elements than the extract operand. Make sure + // that the extraction index can be divided evenly. 
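
// [Illustrative aside, not part of the patch] The index arithmetic used by
// narrowExtractedVectorSelect above, as a standalone function: an
// extract_subvector index counted in WideVT elements (CastedElts) is rescaled
// into select-type elements (SelElts), or rejected when it cannot be
// expressed exactly. Needs C++17 for std::optional.

#include <cassert>
#include <optional>

static std::optional<unsigned> rescaleExtractIndex(unsigned ExtIdx,
                                                   unsigned SelElts,
                                                   unsigned CastedElts) {
  if (SelElts % CastedElts == 0) // select has the same or narrower elements
    return ExtIdx * (SelElts / CastedElts);
  if (CastedElts % SelElts == 0) { // select has wider elements
    unsigned Div = CastedElts / SelElts;
    if (ExtIdx % Div != 0)
      return std::nullopt;
    return ExtIdx / Div;
  }
  return std::nullopt; // element counts of simple VTs always divide
}

int main() {
  // v8i32 select seen through a v4i64 extract: index 2 -> lane 4 of 8.
  assert(rescaleExtractIndex(2, 8, 4) == 4u);
  // v4i64 select seen through a v8i32 extract: index 4 -> lane 2 of 4.
  assert(rescaleExtractIndex(4, 4, 8) == 2u);
  // A misaligned index cannot be rescaled.
  assert(!rescaleExtractIndex(3, 4, 8).has_value());
}
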
+ unsigned IndexDivisor = CastedElts / SelElts; + if (ExtIdx % IndexDivisor != 0) + return SDValue(); + ExtIdx /= IndexDivisor; + } else { + llvm_unreachable("Element count of simple vector types are not divisible?"); } - return SDValue(); + unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits(); + unsigned NarrowElts = SelElts / NarrowingFactor; + MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts); + SDLoc DL(Ext); + SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL); + SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL); + SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL); + SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF); + return DAG.getBitcast(VT, NarrowSel); } static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, @@ -41334,7 +43924,10 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // Capture the original wide type in the likely case that we need to bitcast // back to this type. - EVT VT = N->getValueType(0); + if (!N->getValueType(0).isSimple()) + return SDValue(); + + MVT VT = N->getSimpleValueType(0); EVT WideVecVT = N->getOperand(0).getValueType(); SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -41360,65 +43953,102 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - MVT OpVT = N->getSimpleValueType(0); + if (SDValue V = narrowExtractedVectorSelect(N, DAG)) + return V; + SDValue InVec = N->getOperand(0); unsigned IdxVal = cast(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) - return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N)); + return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); if (ISD::isBuildVectorAllOnes(InVec.getNode())) { - if (OpVT.getScalarType() == MVT::i1) - return DAG.getConstant(1, SDLoc(N), OpVT); - return getOnesVector(OpVT, DAG, SDLoc(N)); + if (VT.getScalarType() == MVT::i1) + return DAG.getConstant(1, SDLoc(N), VT); + return getOnesVector(VT, DAG, SDLoc(N)); } if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector( - OpVT, SDLoc(N), - InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements())); + VT, SDLoc(N), + InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); + + // Try to move vector bitcast after extract_subv by scaling extraction index: + // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') + // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR + if (InVec.getOpcode() == ISD::BITCAST && + InVec.getOperand(0).getValueType().isVector()) { + SDValue SrcOp = InVec.getOperand(0); + EVT SrcVT = SrcOp.getValueType(); + unsigned SrcNumElts = SrcVT.getVectorNumElements(); + unsigned DestNumElts = InVec.getValueType().getVectorNumElements(); + if ((DestNumElts % SrcNumElts) == 0) { + unsigned DestSrcRatio = DestNumElts / SrcNumElts; + if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { + unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; + EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), + SrcVT.getScalarType(), NewExtNumElts); + if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { + unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; + SDLoc DL(N); + SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); + SDValue NewExtract = 
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, + SrcOp, NewIndex); + return DAG.getBitcast(VT, NewExtract); + } + } + } + } + + // If we're extracting from a broadcast then we're better off just + // broadcasting to the smaller type directly, assuming this is the only use. + // As its a broadcast we don't care about the extraction index. + if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() && + InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits()) + return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { unsigned InOpcode = InVec.getOpcode(); - if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { + if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { - return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0)); + return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0)); + } + // v2f64 CVTUDQ2PD(v4i32). + if (InOpcode == ISD::UINT_TO_FP && + InVec.getOperand(0).getValueType() == MVT::v4i32) { + return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0)); } // v2f64 CVTPS2PD(v4f32). if (InOpcode == ISD::FP_EXTEND && InVec.getOperand(0).getValueType() == MVT::v4f32) { - return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0)); + return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0)); } } - if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) && - OpVT.is128BitVector() && - InVec.getOperand(0).getSimpleValueType().is128BitVector()) { - unsigned ExtOp = - InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG - : ISD::SIGN_EXTEND_VECTOR_INREG; - return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0)); - } - if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || + if ((InOpcode == ISD::ANY_EXTEND || + InOpcode == ISD::ANY_EXTEND_VECTOR_INREG || + InOpcode == ISD::ZERO_EXTEND || + InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || + InOpcode == ISD::SIGN_EXTEND || InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && - OpVT.is128BitVector() && + VT.is128BitVector() && InVec.getOperand(0).getSimpleValueType().is128BitVector()) { - return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0)); + unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); + return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0)); } - if (InOpcode == ISD::BITCAST) { - // TODO - do this for target shuffles in general. 
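
// [Illustrative aside, not part of the patch] Why the broadcast combine above
// may ignore the extraction index: every aligned subvector of a broadcast is
// itself the same (narrower) broadcast. Array model:

#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t N> static std::array<int, N> broadcast(int X) {
  std::array<int, N> R{};
  R.fill(X);
  return R;
}

template <std::size_t N, std::size_t M>
static std::array<int, M> extractSub(const std::array<int, N> &V,
                                     std::size_t Idx) {
  std::array<int, M> R{};
  for (std::size_t I = 0; I != M; ++I)
    R[I] = V[Idx * M + I]; // Idx counts M-element subvectors
  return R;
}

int main() {
  auto Wide = broadcast<8>(42); // e.g. a v8i32 X86ISD::VBROADCAST
  assert((extractSub<8, 4>(Wide, 0) == broadcast<4>(42)));
  assert((extractSub<8, 4>(Wide, 1) == broadcast<4>(42)));
}
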
-      SDValue InVecBC = peekThroughOneUseBitcasts(InVec);
-      if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) {
-        SDLoc DL(N);
-        SDValue SubPSHUFB =
-            DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
-                        extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL),
-                        extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL));
-        return DAG.getBitcast(OpVT, SubPSHUFB);
-      }
+    if (InOpcode == ISD::VSELECT &&
+        InVec.getOperand(0).getValueType().is256BitVector() &&
+        InVec.getOperand(1).getValueType().is256BitVector() &&
+        InVec.getOperand(2).getValueType().is256BitVector()) {
+      SDLoc DL(N);
+      SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
+      SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
+      SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
+      return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
     }
   }
 
@@ -41428,6 +44058,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
   SDValue Src = N->getOperand(0);
+  SDLoc DL(N);
 
   // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
   // This occurs frequently in our masked scalar intrinsic code and our
@@ -41436,7 +44067,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
       if (C->getAPIntValue().isOneValue())
-        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
                            Src.getOperand(0));
 
   // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
@@ -41445,8 +44076,17 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
       Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
       if (C->isNullValue())
-        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
-                           Src.getOperand(0), Src.getOperand(1));
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
+                           Src.getOperand(1));
+
+  // Reduce v2i64 to v4i32 if we don't need the upper bits.
+  // TODO: Move to DAGCombine?
+  if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
+      Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
+      Src.getOperand(0).getScalarValueSizeInBits() <= 32)
+    return DAG.getBitcast(
+        VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+                        DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
 
   return SDValue();
 }
@@ -41483,6 +44123,56 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
+                               TargetLowering::DAGCombinerInfo &DCI,
+                               const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue In = N->getOperand(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Try to merge vector loads and extend_inreg to an extload.
+  if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
+      In.hasOneUse()) {
+    auto *Ld = cast<LoadSDNode>(In);
+    if (!Ld->isVolatile()) {
+      MVT SVT = In.getSimpleValueType().getVectorElementType();
+      ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
ISD::SEXTLOAD : ISD::ZEXTLOAD; + EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT, + VT.getVectorNumElements()); + if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { + SDValue Load = + DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + return Load; + } + } + } + + // Disabling for widening legalization for now. We can enable if we find a + // case that needs it. Otherwise it can be deleted when we switch to + // widening legalization. + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + // Combine (ext_invec (ext_invec X)) -> (ext_invec X) + if (In.getOpcode() == N->getOpcode() && + TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType())) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0)); + + // Attempt to combine as a shuffle. + // TODO: SSE41 support + if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { + SDValue Op(N, 0); + if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) + return Res; + } + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -41494,6 +44184,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PEXTRW: case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); + case ISD::CONCAT_VECTORS: + return combineConcatVectors(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: @@ -41506,19 +44198,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::CMP: return combineCMP(N, DAG); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); + case X86ISD::ADD: + case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI); case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); + case ISD::SHL: return combineShiftLeft(N, DAG); + case ISD::SRA: return combineShiftRightArithmetic(N, DAG); + case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); - case ISD::STORE: return combineStore(N, DAG, Subtarget); + case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); @@ -41535,13 +44229,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); - case X86ISD::CVTSI2P: + case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); + case X86ISD::CVTP2SI: + case X86ISD::CVTP2UI: + 
case X86ISD::CVTTP2SI: + case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI, + Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); @@ -41624,11 +44326,15 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8) return false; - // 8-bit multiply is probably not much cheaper than 32-bit multiply, and - // we have specializations to turn 32-bit multiply into LEA or other ops. + // TODO: Almost no 8-bit ops are desirable because they have no actual + // size/speed advantages vs. 32-bit ops, but they do have a major + // potential disadvantage by causing partial register stalls. + // + // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and + // we have specializations to turn 32-bit multiply/shl into LEA or other ops. // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally // check for a constant operand to the multiply. - if (Opc == ISD::MUL && VT == MVT::i8) + if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8) return false; // i16 instruction encodings are longer and some i16 instructions are slow, @@ -41642,6 +44348,7 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::SHL: + case ISD::SRA: case ISD::SRL: case ISD::SUB: case ISD::ADD: @@ -41717,6 +44424,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { case ISD::ANY_EXTEND: break; case ISD::SHL: + case ISD::SRA: case ISD::SRL: { SDValue N0 = Op.getOperand(0); // Look out for (store (shl (load), x)). @@ -41889,6 +44597,40 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { return false; } +static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) { + X86::CondCode Cond = StringSwitch(Constraint) + .Case("{@cca}", X86::COND_A) + .Case("{@ccae}", X86::COND_AE) + .Case("{@ccb}", X86::COND_B) + .Case("{@ccbe}", X86::COND_BE) + .Case("{@ccc}", X86::COND_B) + .Case("{@cce}", X86::COND_E) + .Case("{@ccz}", X86::COND_E) + .Case("{@ccg}", X86::COND_G) + .Case("{@ccge}", X86::COND_GE) + .Case("{@ccl}", X86::COND_L) + .Case("{@ccle}", X86::COND_LE) + .Case("{@ccna}", X86::COND_BE) + .Case("{@ccnae}", X86::COND_B) + .Case("{@ccnb}", X86::COND_AE) + .Case("{@ccnbe}", X86::COND_A) + .Case("{@ccnc}", X86::COND_AE) + .Case("{@ccne}", X86::COND_NE) + .Case("{@ccnz}", X86::COND_NE) + .Case("{@ccng}", X86::COND_LE) + .Case("{@ccnge}", X86::COND_L) + .Case("{@ccnl}", X86::COND_GE) + .Case("{@ccnle}", X86::COND_G) + .Case("{@ccno}", X86::COND_NO) + .Case("{@ccnp}", X86::COND_P) + .Case("{@ccns}", X86::COND_NS) + .Case("{@cco}", X86::COND_O) + .Case("{@ccp}", X86::COND_P) + .Case("{@ccs}", X86::COND_S) + .Default(X86::COND_INVALID); + return Cond; +} + /// Given a constraint letter, return the type of constraint for this target. 
 X86TargetLowering::ConstraintType
 X86TargetLowering::getConstraintType(StringRef Constraint) const {
@@ -41949,7 +44691,8 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
         return C_RegisterClass;
       }
     }
-  }
+  } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
+    return C_Other;
   return TargetLowering::getConstraintType(Constraint);
 }
 
@@ -42120,6 +44863,32 @@ LowerXConstraint(EVT ConstraintVT) const {
   return TargetLowering::LowerXConstraint(ConstraintVT);
 }
 
+// Lower @cc targets via setcc.
+SDValue X86TargetLowering::LowerAsmOutputForConstraint(
+    SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
+    SelectionDAG &DAG) const {
+  X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
+  if (Cond == X86::COND_INVALID)
+    return SDValue();
+  // Check that return type is valid.
+  if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
+      OpInfo.ConstraintVT.getSizeInBits() < 8)
+    report_fatal_error("Flag output operand is of invalid type");
+
+  // Get EFLAGS register. Only update chain when copyfrom is glued.
+  if (Flag.getNode()) {
+    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
+    Chain = Flag.getValue(1);
+  } else
+    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
+  // Extract CC code.
+  SDValue CC = getSETCC(Cond, Flag, DL, DAG);
+  // Extend to 32-bits
+  SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
+
+  return Result;
+}
+
 /// Lower the specified operand into the Ops vector.
 /// If it is invalid, don't add anything to Ops.
 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
@@ -42229,8 +44998,13 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   case 'i': {
     // Literal immediates are always ok.
     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
-      // Widen to 64 bits here to get it sign extended.
-      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
+      bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
+      BooleanContent BCont = getBooleanContents(MVT::i64);
+      ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
+                                    : ISD::SIGN_EXTEND;
+      int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
+                                                  : CST->getSExtValue();
+      Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
       break;
     }
 
@@ -42242,40 +45016,12 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
     // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
-    GlobalAddressSDNode *GA = nullptr;
-    int64_t Offset = 0;
-
-    // Match either (GA), (GA+C), (GA+C1+C2), etc.
-    while (1) {
-      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
-        Offset += GA->getOffset();
-        break;
-      } else if (Op.getOpcode() == ISD::ADD) {
-        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
-          Offset += C->getZExtValue();
-          Op = Op.getOperand(0);
-          continue;
-        }
-      } else if (Op.getOpcode() == ISD::SUB) {
-        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
-          Offset += -C->getZExtValue();
-          Op = Op.getOperand(0);
-          continue;
-        }
-      }
-
-      // Otherwise, this isn't something we can handle, reject it.
-      return;
-    }
-
-    const GlobalValue *GV = GA->getGlobal();
-    // If we require an extra load to get this address, as in PIC mode, we
-    // can't accept it.
-    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
-      return;
-
-    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
-                                        GA->getValueType(0), Offset);
+    if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
+      // If we require an extra load to get this address, as in PIC mode, we
+      // can't accept it.
+      if (isGlobalStubReference(
+              Subtarget.classifyGlobalReference(GA->getGlobal())))
+        return;
     break;
   }
   }
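
// [Illustrative aside, not part of the patch] What the "@cc" plumbing above
// enables at the source level: GCC-style flag-output constraints in inline
// asm, lowered here via an EFLAGS copy and SETcc. x86 GCC/Clang extended asm
// only:

static bool belowOrEqual(unsigned long A, unsigned long B) {
  bool R;
  // AT&T syntax: "cmp %2, %1" computes A - B and sets EFLAGS; "=@ccbe" asks
  // the compiler to materialize the below-or-equal condition into R.
  asm("cmp %2, %1" : "=@ccbe"(R) : "r"(A), "r"(B));
  return R; // true iff A <= B, unsigned
}

int main() { return belowOrEqual(1, 2) && !belowOrEqual(3, 2) ? 0 : 1; }
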
@@ -42307,6 +45053,18 @@ static bool isFRClass(const TargetRegisterClass &RC) {
          RC.hasSuperClassEq(&X86::VR512RegClass);
 }
 
+/// Check if \p RC is a mask register class.
+/// I.e., VK* or one of their variant.
+static bool isVKClass(const TargetRegisterClass &RC) {
+  return RC.hasSuperClassEq(&X86::VK1RegClass) ||
+         RC.hasSuperClassEq(&X86::VK2RegClass) ||
+         RC.hasSuperClassEq(&X86::VK4RegClass) ||
+         RC.hasSuperClassEq(&X86::VK8RegClass) ||
+         RC.hasSuperClassEq(&X86::VK16RegClass) ||
+         RC.hasSuperClassEq(&X86::VK32RegClass) ||
+         RC.hasSuperClassEq(&X86::VK64RegClass);
+}
+
 std::pair<unsigned, const TargetRegisterClass *>
 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                 StringRef Constraint,
@@ -42317,25 +45075,31 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     // GCC Constraint Letters
     switch (Constraint[0]) {
     default: break;
+    // 'A' means [ER]AX + [ER]DX.
+    case 'A':
+      if (Subtarget.is64Bit())
+        return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
+      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
+             "Expecting 64, 32 or 16 bit subtarget");
+      return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
+
       // TODO: Slight differences here in allocation order and leaving
       // RIP in the class. Do they matter any more here than they do
       // in the normal allocation?
     case 'k':
       if (Subtarget.hasAVX512()) {
-        // Only supported in AVX512 or later.
-        switch (VT.SimpleTy) {
-        default: break;
-        case MVT::i32:
-          return std::make_pair(0U, &X86::VK32RegClass);
-        case MVT::i16:
-          return std::make_pair(0U, &X86::VK16RegClass);
-        case MVT::i8:
-          return std::make_pair(0U, &X86::VK8RegClass);
-        case MVT::i1:
+        if (VT == MVT::i1)
           return std::make_pair(0U, &X86::VK1RegClass);
-        case MVT::i64:
+        if (VT == MVT::i8)
+          return std::make_pair(0U, &X86::VK8RegClass);
+        if (VT == MVT::i16)
+          return std::make_pair(0U, &X86::VK16RegClass);
+      }
+      if (Subtarget.hasBWI()) {
+        if (VT == MVT::i32)
+          return std::make_pair(0U, &X86::VK32RegClass);
+        if (VT == MVT::i64)
           return std::make_pair(0U, &X86::VK64RegClass);
-        }
       }
       break;
     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
@@ -42403,7 +45167,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       // Scalar SSE types.
case MVT::f32: case MVT::i32: - if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX()) + if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR32XRegClass); return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: @@ -42431,12 +45195,17 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::v4f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR256XRegClass); - return std::make_pair(0U, &X86::VR256RegClass); + if (Subtarget.hasAVX()) + return std::make_pair(0U, &X86::VR256RegClass); + break; case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: - return std::make_pair(0U, &X86::VR512RegClass); + if (!Subtarget.hasAVX512()) break; + if (VConstraint) + return std::make_pair(0U, &X86::VR512RegClass); + return std::make_pair(0U, &X86::VR512_0_15RegClass); } break; } @@ -42457,25 +45226,27 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(X86::XMM0, &X86::VR128RegClass); case 'k': // This register class doesn't allocate k0 for masked vector operation. - if (Subtarget.hasAVX512()) { // Only supported in AVX512. - switch (VT.SimpleTy) { - default: break; - case MVT::i32: - return std::make_pair(0U, &X86::VK32WMRegClass); - case MVT::i16: - return std::make_pair(0U, &X86::VK16WMRegClass); - case MVT::i8: - return std::make_pair(0U, &X86::VK8WMRegClass); - case MVT::i1: + if (Subtarget.hasAVX512()) { + if (VT == MVT::i1) return std::make_pair(0U, &X86::VK1WMRegClass); - case MVT::i64: + if (VT == MVT::i8) + return std::make_pair(0U, &X86::VK8WMRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::VK16WMRegClass); + } + if (Subtarget.hasBWI()) { + if (VT == MVT::i32) + return std::make_pair(0U, &X86::VK32WMRegClass); + if (VT == MVT::i64) return std::make_pair(0U, &X86::VK64WMRegClass); - } } break; } } + if (parseConstraintCode(Constraint) != X86::COND_INVALID) + return std::make_pair(0U, &X86::GR32RegClass); + // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; @@ -42505,14 +45276,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (StringRef("{flags}").equals_lower(Constraint)) return std::make_pair(X86::EFLAGS, &X86::CCRRegClass); - // 'A' means [ER]AX + [ER]DX. - if (Constraint == "A") { - if (Subtarget.is64Bit()) - return std::make_pair(X86::RAX, &X86::GR64_ADRegClass); - assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && - "Expecting 64, 32 or 16 bit subtarget"); - return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); - } + // dirflag -> DF + if (StringRef("{dirflag}").equals_lower(Constraint)) + return std::make_pair(X86::DF, &X86::DFCCRRegClass); + + // fpsr -> FPSW + if (StringRef("{fpsr}").equals_lower(Constraint)) + return std::make_pair(X86::FPSW, &X86::FPCCRRegClass); + return Res; } @@ -42561,20 +45332,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Size == 64 && !is64Bit) { // Model GCC's behavior here and select a fixed pair of 32-bit // registers. 
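
// [Illustrative aside, not part of the patch] The "A" constraint handled
// above pins a 64-bit value to the [ER]AX + [ER]DX pair (the GR32_AD /
// GR64_AD classes). The classic 32-bit idiom, where RDTSC returns the counter
// in EDX:EAX; on x86-64 the pairing differs, so this sketch assumes a 32-bit
// target:

static unsigned long long readTimestamp() {
  unsigned long long T;
  asm volatile("rdtsc" : "=A"(T)); // EDX:EAX -> T
  return T;
}

int main() {
  (void)readTimestamp();
  return 0;
}
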
- switch (Res.first) { - case X86::EAX: + switch (DestReg) { + case X86::RAX: return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); - case X86::EDX: + case X86::RDX: return std::make_pair(X86::EDX, &X86::GR32_DCRegClass); - case X86::ECX: + case X86::RCX: return std::make_pair(X86::ECX, &X86::GR32_CBRegClass); - case X86::EBX: + case X86::RBX: return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass); - case X86::ESI: + case X86::RSI: return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass); - case X86::EDI: + case X86::RDI: return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass); - case X86::EBP: + case X86::RBP: return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass); default: return std::make_pair(0, nullptr); @@ -42594,13 +45365,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) - Res.second = &X86::FR32RegClass; + Res.second = &X86::FR32XRegClass; else if (VT == MVT::f64 || VT == MVT::i64) - Res.second = &X86::FR64RegClass; - else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) - Res.second = &X86::VR128RegClass; - else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) - Res.second = &X86::VR256RegClass; + Res.second = &X86::FR64XRegClass; + else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT)) + Res.second = &X86::VR128XRegClass; + else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT)) + Res.second = &X86::VR256XRegClass; else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { @@ -42608,6 +45379,22 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.first = 0; Res.second = nullptr; } + } else if (isVKClass(*Class)) { + if (VT == MVT::i1) + Res.second = &X86::VK1RegClass; + else if (VT == MVT::i8) + Res.second = &X86::VK8RegClass; + else if (VT == MVT::i16) + Res.second = &X86::VK16RegClass; + else if (VT == MVT::i32) + Res.second = &X86::VK32RegClass; + else if (VT == MVT::i64) + Res.second = &X86::VK64RegClass; + else { + // Type mismatch and not a clobber: Return an error; + Res.first = 0; + Res.second = nullptr; + } } return Res; @@ -42660,7 +45447,7 @@ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in X86MachineFunctionInfo. X86MachineFunctionInfo *AFI = - Entry->getParent()->getInfo(); + Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } @@ -42688,9 +45475,9 @@ void X86TargetLowering::insertCopiesSplitCSR( // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. - assert(Entry->getParent()->getFunction().hasFnAttribute( - Attribute::NoUnwind) && - "Function should be nounwind in insertCopiesSplitCSR!"); + assert( + Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); @@ -42709,7 +45496,8 @@ bool X86TargetLowering::supportSwiftError() const { /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. -StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { +StringRef +X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { // If the function specifically requests stack probes, emit them. 
if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 910acd80e8b8..e0be03bc3f9d 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1,9 +1,8 @@ //===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -78,15 +77,6 @@ namespace llvm { /// Same as call except it adds the NoTrack prefix. NT_CALL, - /// This operation implements the lowering for readcyclecounter. - RDTSC_DAG, - - /// X86 Read Time-Stamp Counter and Processor ID. - RDTSCP_DAG, - - /// X86 Read Performance Monitoring Counters. - RDPMC_DAG, - /// X86 compare and logical compare instructions. CMP, COMI, UCOMI, @@ -110,13 +100,12 @@ namespace llvm { FSETCC, /// X86 FP SETCC, similar to above, but with output as an i1 mask and - /// with optional rounding mode. - FSETCCM, FSETCCM_RND, + /// and a version with SAE. + FSETCCM, FSETCCM_SAE, /// X86 conditional moves. Operand 0 and operand 1 are the two values /// to select from. Operand 2 is the condition code, and operand 3 is the - /// flag operand produced by a CMP or TEST instruction. It also writes a - /// flag result. + /// flag operand produced by a CMP or TEST instruction. CMOV, /// X86 conditional branches. Operand 0 is the chain operand, operand 1 @@ -204,28 +193,29 @@ namespace llvm { /// Dynamic (non-constant condition) vector blend where only the sign bits /// of the condition elements are used. This is used to enforce that the /// condition mask is not valid for generic VSELECT optimizations. This - /// can also be used to implement the intrinsics. + /// is also used to implement the intrinsics. + /// Operands are in VSELECT order: MASK, TRUE, FALSE BLENDV, /// Combined add and sub on an FP vector. ADDSUB, // FP vector ops with rounding mode. - FADD_RND, FADDS_RND, - FSUB_RND, FSUBS_RND, - FMUL_RND, FMULS_RND, - FDIV_RND, FDIVS_RND, - FMAX_RND, FMAXS_RND, - FMIN_RND, FMINS_RND, - FSQRT_RND, FSQRTS_RND, + FADD_RND, FADDS, FADDS_RND, + FSUB_RND, FSUBS, FSUBS_RND, + FMUL_RND, FMULS, FMULS_RND, + FDIV_RND, FDIVS, FDIVS_RND, + FMAX_SAE, FMAXS_SAE, + FMIN_SAE, FMINS_SAE, + FSQRT_RND, FSQRTS, FSQRTS_RND, // FP vector get exponent. - FGETEXP_RND, FGETEXPS_RND, + FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE, // Extract Normalized Mantissas. - VGETMANT, VGETMANT_RND, VGETMANTS, VGETMANTS_RND, + VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE, // FP Scale. - SCALEF, - SCALEFS, + SCALEF, SCALEF_RND, + SCALEFS, SCALEFS_RND, // Unsigned Integer average. AVG, @@ -300,10 +290,10 @@ namespace llvm { VMTRUNC, VMTRUNCUS, VMTRUNCS, // Vector FP extend. - VFPEXT, VFPEXT_RND, VFPEXTS_RND, + VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE, // Vector FP round. - VFPROUND, VFPROUND_RND, VFPROUNDS_RND, + VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND, // Masked version of above. Used for v2f64->v4f32. 
// SRC, PASSTHRU, MASK @@ -315,10 +305,8 @@ namespace llvm { // Vector shift elements VSHL, VSRL, VSRA, - // Vector variable shift right arithmetic. - // Unlike ISD::SRA, in case shift count greater then element size - // use sign bit to fill destination data element. - VSRAV, + // Vector variable shift + VSHLV, VSRLV, VSRAV, // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, @@ -343,8 +331,8 @@ namespace llvm { /// Vector comparison generating mask bits for fp and /// integer signed and unsigned data types. CMPM, - // Vector comparison with rounding mode for FP values - CMPM_RND, + // Vector comparison with SAE for FP values + CMPM_SAE, // Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, UMUL, @@ -419,16 +407,16 @@ namespace llvm { // Bitwise ternary logic. VPTERNLOG, // Fix Up Special Packed Float32/64 values. - VFIXUPIMM, - VFIXUPIMMS, + VFIXUPIMM, VFIXUPIMM_SAE, + VFIXUPIMMS, VFIXUPIMMS_SAE, // Range Restriction Calculation For Packed Pairs of Float32/64 values. - VRANGE, VRANGE_RND, VRANGES, VRANGES_RND, + VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE, // Reduce - Perform Reduction Transformation on scalar\packed FP. - VREDUCE, VREDUCE_RND, VREDUCES, VREDUCES_RND, + VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE, // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. // Also used by the legacy (V)ROUND intrinsics where we mask out the // scaling part of the immediate. - VRNDSCALE, VRNDSCALE_RND, VRNDSCALES, VRNDSCALES_RND, + VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE, // Tests Types Of a FP Values for packed types. VFPCLASS, // Tests Types Of a FP Values for scalar types. @@ -499,6 +487,7 @@ namespace llvm { // Convert Unsigned/Integer to Floating-Point Value with rounding mode. SINT_TO_FP_RND, UINT_TO_FP_RND, + SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP, SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND, // Vector float/double to signed/unsigned integer. @@ -507,9 +496,9 @@ namespace llvm { CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND, // Vector float/double to signed/unsigned integer with truncation. - CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND, + CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE, // Scalar float/double to signed/unsigned integer with truncation. - CVTTS2SI, CVTTS2UI, CVTTS2SI_RND, CVTTS2UI_RND, + CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE, // Vector signed/unsigned integer to float/double. CVTSI2P, CVTUI2P, @@ -517,6 +506,20 @@ namespace llvm { // Masked versions of above. Used for v2f64->v4f32. // SRC, PASSTHRU, MASK MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI, + MCVTSI2P, MCVTUI2P, + + // Vector float to bfloat16. + // Convert TWO packed single data to one packed BF16 data + CVTNE2PS2BF16, + // Convert packed single data to packed BF16 data + CVTNEPS2BF16, + // Masked version of above. + // SRC, PASSTHRU, MASK + MCVTNEPS2BF16, + + // Dot product of BF16 pairs to accumulated into + // packed single precision. + DPBF16PS, // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. @@ -547,6 +550,12 @@ namespace llvm { // indicate whether it is valid in CF. RDSEED, + // Protection keys + // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. + // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is + // value for ECX. + RDPKRU, WRPKRU, + // SSE42 string comparisons. // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG // will emit one or two instructions based on which results are used. 
If @@ -560,10 +569,11 @@ namespace llvm { XTEST, // ERI instructions. - RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2, + RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE, + RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE, // Conversions between float and half-float. - CVTPS2PH, CVTPH2PS, CVTPH2PS_RND, + CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE, // Masked version of above. // SRC, RND, PASSTHRU, MASK @@ -578,6 +588,12 @@ namespace llvm { // User level wait UMWAIT, TPAUSE, + // Enqueue Stores Instructions + ENQCMD, ENQCMDS, + + // For avx512-vp2intersect + VP2INTERSECT, + // Compare and swap. LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, @@ -592,6 +608,9 @@ namespace llvm { // Load, scalar_to_vector, and zero extend. VZEXT_LOAD, + // extract_vector_elt, store. + VEXTRACT_STORE, + // Store FP control world into i16 memory. FNSTCW16m, @@ -599,29 +618,33 @@ namespace llvm { /// integer destination in memory and a FP reg source. This corresponds /// to the X86::FIST*m instructions and the rounding mode change stuff. It /// has two inputs (token chain and address) and two outputs (int value - /// and token chain). - FP_TO_INT16_IN_MEM, - FP_TO_INT32_IN_MEM, - FP_TO_INT64_IN_MEM, + /// and token chain). Memory VT specifies the type to store to. + FP_TO_INT_IN_MEM, /// This instruction implements SINT_TO_FP with the /// integer source in memory and FP reg result. This corresponds to the - /// X86::FILD*m instructions. It has three inputs (token chain, address, - /// and source type) and two outputs (FP value and token chain). FILD_FLAG - /// also produces a flag). + /// X86::FILD*m instructions. It has two inputs (token chain and address) + /// and two outputs (FP value and token chain). FILD_FLAG also produces a + /// flag). The integer source type is specified by the memory VT. FILD, FILD_FLAG, + /// This instruction implements a fp->int store from FP stack + /// slots. This corresponds to the fist instruction. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FIST, + /// This instruction implements an extending load to FP stack slots. /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain - /// operand, ptr to load from, and a ValueType node indicating the type - /// to load to. + /// operand, and ptr to load from. The memory VT specifies the type to + /// load from. FLD, - /// This instruction implements a truncating store to FP stack + /// This instruction implements a truncating store from FP stack /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a - /// chain operand, value to store, address, and a ValueType to store it - /// as. + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. FST, /// This instruction grabs the address of the next argument @@ -708,7 +731,7 @@ namespace llvm { /// target-independent logic. EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; + const AttributeList &FuncAttributes) const override; /// Returns true if it's safe to use load / store of the /// specified type to expand memcpy / memset inline. This is mostly true @@ -721,7 +744,8 @@ namespace llvm { /// Returns true if the target allows unaligned memory accesses of the /// specified type. Returns whether it is "fast" in the last argument. 
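Among the nodes added above, DPBF16PS ("dot product of BF16 pairs to accumulated into packed single precision") is easy to pin down with a scalar reference model, since a bfloat16 is just the upper half of an IEEE f32 bit pattern. A hedged sketch of one lane; the hardware's internal rounding and accumulation order may differ:

#include <cstdint>
#include <cstring>

// Widen a bfloat16 (the upper 16 bits of an f32) back to f32.
static float bf16_to_f32(uint16_t b) {
  uint32_t bits = (uint32_t)b << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// One lane of DPBF16PS: each i32 lane of a and b holds two BF16 elements;
// multiply them pairwise as f32 and accumulate into the f32 lane.
float dpbf16ps_lane(float acc, uint32_t a, uint32_t b) {
  float lo = bf16_to_f32((uint16_t)a) * bf16_to_f32((uint16_t)b);
  float hi = bf16_to_f32((uint16_t)(a >> 16)) * bf16_to_f32((uint16_t)(b >> 16));
  return acc + lo + hi;
}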
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, - bool *Fast) const override; + MachineMemOperand::Flags Flags, + bool *Fast) const override; /// Provide custom lowering hooks for some operations. /// @@ -775,7 +799,11 @@ namespace llvm { /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; - bool mergeStoresAfterLegalization() const override { return true; } + /// Do not merge vector stores after legalization because that may conflict + /// with x86-specific store splitting optimizations. + bool mergeStoresAfterLegalization(EVT MemVT) const override { + return !MemVT.isVector(); + } bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const override; @@ -812,7 +840,10 @@ namespace llvm { bool hasAndNot(SDValue Y) const override; - bool preferShiftsToClearExtremeBits(SDValue Y) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N, + CombineLevel Level) const override; + + bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, @@ -832,6 +863,12 @@ namespace llvm { return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { + if (DAG.getMachineFunction().getFunction().hasMinSize()) + return false; + return true; + } + bool shouldSplatInsEltVarIndex(EVT VT) const override; bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { @@ -841,11 +878,6 @@ namespace llvm { /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST. MVT hasFastEqualityCompare(unsigned NumBits) const override; - /// Allow multiple load pairs per block for smaller and faster code. - unsigned getMemcmpEqZeroLoadsPerBlock() const override { - return 2; - } - /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -881,6 +913,8 @@ namespace llvm { TargetLoweringOpt &TLO, unsigned Depth) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; + SDValue unwrapAddress(SDValue N) const override; SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; @@ -918,6 +952,11 @@ namespace llvm { return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } + /// Handle Lowering flag assembly outputs. + SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL, + const AsmOperandInfo &Constraint, + SelectionDAG &DAG) const override; + /// Given a physical register constraint /// (e.g. {edx}), return the register number and the register class for the /// register. This should only be used for C_Register constraints. On @@ -956,6 +995,12 @@ namespace llvm { bool isVectorShiftByScalarCheap(Type *Ty) const override; + /// Add x86-specific opcodes to the default list. + bool isBinOp(unsigned Opcode) const override; + + /// Returns true if the opcode is a commutative binary operation. + bool isCommutativeBinOp(unsigned Opcode) const override; + /// Return true if it's free to truncate a value of /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in /// register EAX to i16 by referencing its sub-register AX. @@ -1001,7 +1046,8 @@ namespace llvm { /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. 
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a @@ -1063,6 +1109,17 @@ namespace llvm { /// supported. bool shouldScalarizeBinop(SDValue) const override; + /// Extract of a scalar FP value from index 0 of a vector is free. + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override { + EVT EltVT = VT.getScalarType(); + return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; + } + + /// Overflow nodes should get combined/lowered to optimal instructions + /// (they should allow eliminating explicit compares by getting flags from + /// math ops). + bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override; + bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AddrSpace) const override { // If we can replace more than 2 scalar stores, there will be a reduction @@ -1070,7 +1127,9 @@ namespace llvm { return NumElem > 2; } - bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override; + bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, + const SelectionDAG &DAG, + const MachineMemOperand &MMO) const override; /// Intel processors have a unified instruction and data cache const char * getClearCacheBuiltinName() const override { @@ -1105,7 +1164,7 @@ namespace llvm { bool useStackGuardXorFP() const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; - Value *getSSPStackGuardCheck(const Module &M) const override; + Function *getSSPStackGuardCheck(const Module &M) const override; SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override; @@ -1221,9 +1280,7 @@ namespace llvm { unsigned getAddressSpace(void) const; - std::pair FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool isSigned, - bool isReplace) const; + SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -1234,12 +1291,15 @@ namespace llvm { const unsigned char OpFlags = 0) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl, - int64_t Offset, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + /// Creates target global address or external symbol nodes for calls or + /// other uses. 
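The new shouldFormOverflowOp hook above is about reusing the flags the arithmetic already produces. In source terms, the pattern it wants to keep profitable looks like the overflow builtins, which on x86 typically compile to the math op plus a flag read rather than a separate compare (a sketch, not LLVM code):

#include <cstdio>

// __builtin_add_overflow usually lowers on x86 to "addl %esi, %edi" followed
// by "seto %al": the add sets OF itself, so no explicit compare is emitted.
static bool add_checked(int a, int b, int *out) {
  return __builtin_add_overflow(a, b, out);
}

int main() {
  int r;
  std::printf("overflow=%d r=%d\n", add_checked(0x7fffffff, 1, &r), r);
}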
+ SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, + bool ForCall) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; @@ -1568,10 +1628,10 @@ namespace llvm { void scaleShuffleMask(int Scale, ArrayRef Mask, SmallVectorImpl &ScaledMask) { assert(0 < Scale && "Unexpected scaling factor"); - int NumElts = Mask.size(); - ScaledMask.assign(static_cast(NumElts * Scale), -1); + size_t NumElts = Mask.size(); + ScaledMask.assign(NumElts * Scale, -1); - for (int i = 0; i != NumElts; ++i) { + for (int i = 0; i != (int)NumElts; ++i) { int M = Mask[i]; // Repeat sentinel values in every mask element. diff --git a/lib/Target/X86/X86IndirectBranchTracking.cpp b/lib/Target/X86/X86IndirectBranchTracking.cpp index 7c00c9260d15..04e8b2231fec 100644 --- a/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -1,9 +1,8 @@ //===---- X86IndirectBranchTracking.cpp - Enables CET IBT mechanism -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -58,7 +57,7 @@ private: /// The function will not add it if already exists. /// It will add ENDBR32 or ENDBR64 opcode, depending on the target. /// \returns true if the ENDBR was added and false otherwise. - bool addENDBR(MachineBasicBlock &MBB) const; + bool addENDBR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; }; } // end anonymous namespace @@ -69,20 +68,31 @@ FunctionPass *llvm::createX86IndirectBranchTrackingPass() { return new X86IndirectBranchTrackingPass(); } -bool X86IndirectBranchTrackingPass::addENDBR(MachineBasicBlock &MBB) const { +bool X86IndirectBranchTrackingPass::addENDBR( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { assert(TII && "Target instruction info was not initialized"); assert((X86::ENDBR64 == EndbrOpcode || X86::ENDBR32 == EndbrOpcode) && "Unexpected Endbr opcode"); - auto MI = MBB.begin(); - // If the MBB is empty or the first instruction is not ENDBR, - // add the ENDBR instruction to the beginning of the MBB. - if (MI == MBB.end() || EndbrOpcode != MI->getOpcode()) { - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(EndbrOpcode)); - NumEndBranchAdded++; + // If the MBB/I is empty or the current instruction is not ENDBR, + // insert ENDBR instruction to the location of I. 
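The scaleShuffleMask hunk above only swaps the index types; the transform itself is unchanged. A standalone sketch of what it computes, assuming the usual convention that negative mask entries are sentinels repeated verbatim:

#include <cassert>
#include <cstdio>
#include <vector>

// Widen a shuffle mask by Scale: element index M becomes the run
// M*Scale .. M*Scale+Scale-1, and sentinel (negative) entries are
// repeated across the whole run.
std::vector<int> scaleMask(int Scale, const std::vector<int> &Mask) {
  assert(0 < Scale && "Unexpected scaling factor");
  std::vector<int> Scaled(Mask.size() * Scale, -1);
  for (size_t i = 0; i != Mask.size(); ++i) {
    int M = Mask[i];
    for (int s = 0; s != Scale; ++s)
      Scaled[i * Scale + s] = M < 0 ? M : (int)(M * Scale + s);
  }
  return Scaled;
}

int main() {
  for (int v : scaleMask(2, {0, 3, -1})) // prints: 0 1 6 7 -1 -1
    std::printf("%d ", v);
}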
+ if (I == MBB.end() || I->getOpcode() != EndbrOpcode) { + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(EndbrOpcode)); + ++NumEndBranchAdded; return true; } + return false; +} +bool IsCallReturnTwice(llvm::MachineOperand &MOp) { + if (!MOp.isGlobal()) + return false; + auto *CalleeFn = dyn_cast(MOp.getGlobal()); + if (!CalleeFn) + return false; + AttributeList Attrs = CalleeFn->getAttributes(); + if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice)) + return true; return false; } @@ -108,14 +118,21 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { !MF.getFunction().hasLocalLinkage()) && !MF.getFunction().doesNoCfCheck()) { auto MBB = MF.begin(); - Changed |= addENDBR(*MBB); + Changed |= addENDBR(*MBB, MBB->begin()); } - for (auto &MBB : MF) + for (auto &MBB : MF) { // Find all basic blocks that their address was taken (for example // in the case of indirect jump) and add ENDBR instruction. if (MBB.hasAddressTaken()) - Changed |= addENDBR(MBB); - + Changed |= addENDBR(MBB, MBB.begin()); + + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + if (!I->isCall()) + continue; + if (IsCallReturnTwice(I->getOperand(0))) + Changed |= addENDBR(MBB, std::next(I)); + } + } return Changed; } diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp index 30b46a09ef0f..02ae73706a34 100644 --- a/lib/Target/X86/X86InsertPrefetch.cpp +++ b/lib/Target/X86/X86InsertPrefetch.cpp @@ -1,9 +1,8 @@ //===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -34,7 +33,8 @@ using namespace sampleprof; static cl::opt PrefetchHintsFile("prefetch-hints-file", - cl::desc("Path to the prefetch hints profile."), + cl::desc("Path to the prefetch hints profile. See also " + "-x86-discriminate-memops"), cl::Hidden); namespace { diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index 49e9e924887a..cd1b06365971 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -1,9 +1,8 @@ //===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
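The IsCallReturnTwice change above exists because a returns_twice callee such as setjmp comes back a second time through longjmp, which transfers control to the point just after the original call via an indirect branch. Under CET IBT that landing point must begin with ENDBR, hence addENDBR(MBB, std::next(I)). A small illustration of the control flow in question:

#include <csetjmp>
#include <cstdio>

static std::jmp_buf Env;

void bounce() { std::longjmp(Env, 1); } // "returns" to just after setjmp

int main() {
  if (setjmp(Env) == 0) // the pass plants an ENDBR right after this call
    bounce();
  std::puts("back via longjmp");
}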
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -74,7 +73,9 @@ defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>; defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>; defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>; -let SchedRW = [WriteEMMS] in +let SchedRW = [WriteEMMS], + Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>, TB; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 7423cb85acd2..54eddeacaa17 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1,9 +1,8 @@ //===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,6 +26,10 @@ class X86VectorVTInfo("VK" # NumElts); + // Corresponding mask register pair class. + RegisterOperand KRPC = !if (!gt(NumElts, 16), ?, + !cast("VK" # NumElts # "Pair")); + // Corresponding write-mask register class. RegisterClass KRCWM = !cast("VK" # NumElts # "WM"); @@ -95,10 +98,7 @@ class X86VectorVTInfo("v" # !srl(Size, 5) # "i32"); - dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); + dag ImmAllZerosV = (VT immAllZerosV); string ZSuffix = !if (!eq (Size, 128), "Z128", !if (!eq (Size, 256), "Z256", "Z")); @@ -277,10 +277,9 @@ multiclass AVX512_maskable O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_scalar O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, - bit IsCommutable = 0> : + dag RHS> : AVX512_maskable; + RHS, 0, 0, 0, X86selects>; // Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved @@ -365,7 +364,7 @@ multiclass AVX512_maskable_custom_cmp O, Format F, list Pattern, list MaskingPattern, bit IsCommutable = 0> { - let isCommutable = IsCommutable in + let isCommutable = IsCommutable in { def NAME: AVX512 O, Format F, OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# "$dst {${mask}}, "#IntelSrcAsm#"}", MaskingPattern>, EVEX_K; + } } multiclass AVX512_maskable_common_cmp O, Format F, X86VectorVTInfo _, @@ -392,38 +392,11 @@ multiclass AVX512_maskable_common_cmp O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_cmp O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, bit IsCommutable = 0> : + dag RHS, dag RHS_su, bit IsCommutable = 0> : AVX512_maskable_common_cmp; - -multiclass AVX512_maskable_cmp_alt O, Format F, X86VectorVTInfo _, - dag Outs, dag Ins, string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm> : - AVX512_maskable_custom_cmp; - -// This multiclass generates the unconditional/non-masking, the masking and -// the zero-masking variant of the vector instruction. In the masking case, the -// perserved vector elements come from a new dummy input operand tied to $dst. 
-multiclass AVX512_maskable_logic O, Format F, X86VectorVTInfo _, - dag Outs, dag Ins, string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm, - dag RHS, dag MaskedRHS, - bit IsCommutable = 0, SDNode Select = vselect> : - AVX512_maskable_custom; + (and _.KRCWM:$mask, RHS_su), IsCommutable>; // Alias instruction that maps zero vector to pxor / xorp* for AVX-512. @@ -451,8 +424,8 @@ def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst), def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst), (ins VK8WM:$mask), "", [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask), - (bc_v8i64 (v16i32 immAllOnesV)), - (bc_v8i64 (v16i32 immAllZerosV))))]>; + (v8i64 immAllOnesV), + (v8i64 immAllZerosV)))]>; } let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -753,6 +726,7 @@ defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v32i8x_info, v64i8_info, // vinsertps - insert f32 to XMM let ExeDomain = SSEPackedSingle in { +let isCommutable = 1 in def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -1378,15 +1352,15 @@ multiclass avx512_subvec_broadcast_rm_dq opc, string OpcodeStr, let Predicates = [HasAVX512] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. - def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))), + def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZm addr:$src)>; } let Predicates = [HasVLX] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. - def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), + def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZ128m addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), + def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZ256m addr:$src)>; } let Predicates = [HasVLX, HasBWI] in { @@ -1396,13 +1370,31 @@ let Predicates = [HasVLX, HasBWI] in { (VPBROADCASTWZ128m addr:$src)>; def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWZ128m addr:$src)>; def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWZ256m addr:$src)>; def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; } +let Predicates = [HasBWI] in { + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWZm addr:$src)>; + def : Pat<(v32i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWZm addr:$src)>; + def : Pat<(v32i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWZm addr:$src)>; +} //===----------------------------------------------------------------------===// // AVX-512 BROADCAST SUBVECTORS @@ -1464,7 +1456,7 @@ def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))), // Patterns for selects of bitcasted operations. 
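The extra VPBROADCASTW patterns above all chase one DAG shape: i16 loads get promoted to i32 loads during type legalization (the !isTypeDesirableForOp note in the hunk), so the broadcast sees a truncate of a widened load. A scalar model of why trunc(ext(load i16)) is just the original i16 (zero-extension shown; an any-extension truncates to the same value):

#include <cstdint>
#include <cstring>

uint16_t trunc_of_extload(const void *p) {
  uint16_t v;
  std::memcpy(&v, p, sizeof(v)); // the underlying 16-bit load
  uint32_t widened = v;          // extended to i32 by legalization
  return (uint16_t)widened;      // truncated back: identical to v
}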
def : Pat<(vselect VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), - (bc_v16f32 (v16i32 immAllZerosV))), + (v16f32 immAllZerosV)), (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), @@ -1481,7 +1473,7 @@ def : Pat<(vselect VK16WM:$mask, def : Pat<(vselect VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), - (bc_v8f64 (v16i32 immAllZerosV))), + (v8f64 immAllZerosV)), (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), @@ -1489,7 +1481,7 @@ def : Pat<(vselect VK8WM:$mask, (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), - (bc_v8i64 (v16i32 immAllZerosV))), + (v8i64 immAllZerosV)), (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), @@ -1517,7 +1509,7 @@ def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), // Patterns for selects of bitcasted operations. def : Pat<(vselect VK8WM:$mask, (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), - (bc_v8f32 (v8i32 immAllZerosV))), + (v8f32 immAllZerosV)), (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), @@ -1566,7 +1558,7 @@ defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2" // Patterns for selects of bitcasted operations. def : Pat<(vselect VK4WM:$mask, (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), - (bc_v4f64 (v8i32 immAllZerosV))), + (v4f64 immAllZerosV)), (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), @@ -1574,7 +1566,7 @@ def : Pat<(vselect VK4WM:$mask, (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), - (bc_v4i64 (v8i32 immAllZerosV))), + (v4i64 immAllZerosV)), (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), @@ -1599,7 +1591,7 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", // Patterns for selects of bitcasted operations. 
def : Pat<(vselect VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), - (bc_v16f32 (v16i32 immAllZerosV))), + (v16f32 immAllZerosV)), (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), @@ -1616,7 +1608,7 @@ def : Pat<(vselect VK16WM:$mask, def : Pat<(vselect VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), - (bc_v8f64 (v16i32 immAllZerosV))), + (v8f64 immAllZerosV)), (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), @@ -1624,7 +1616,7 @@ def : Pat<(vselect VK8WM:$mask, (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), - (bc_v8i64 (v16i32 immAllZerosV))), + (v8i64 immAllZerosV)), (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), @@ -2031,96 +2023,86 @@ defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend, // avx512_cmp_scalar - AVX512 CMPSS and CMPSD -multiclass avx512_cmp_scalar { defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc)>, EVEX_4V, Sched<[sched]>; + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), - (ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "$src2, $src1", "$src1, $src2", + (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, - imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + imm:$cc), + (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, + imm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (OpNodeRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc, - (i32 FROUND_NO_EXC))>, - EVEX_4V, EVEX_B, Sched<[sched]>; - // Accept explicit immediate argument form instead of comparison code. 
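The avx512_cmp_scalar rewrite above folds the per-predicate mnemonics (the old AVXCC "vcmp${cc}" forms plus their *_alt parser-only twins, deleted just below) into a single definition taking the predicate as an explicit u8imm $cc, which is also how the C intrinsics spell it. A usage sketch, assuming an AVX-512F target:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128 a = _mm_set_ss(1.0f), b = _mm_set_ss(2.0f);
  // Register form: assembles as "vcmpss $1, ...", the old vcmpltss.
  __mmask8 lt = _mm_cmp_ss_mask(a, b, _CMP_LT_OS);
  // {sae} form, matching the rrb_Int definition above.
  __mmask8 lt_sae = _mm_cmp_round_ss_mask(a, b, _CMP_LT_OS, _MM_FROUND_NO_EXC);
  std::printf("%u %u\n", (unsigned)(lt & 1), (unsigned)(lt_sae & 1));
}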
- let isAsmParserOnly = 1, hasSideEffects = 0 in { - defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, - (outs VK1:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V, - Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, - Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable; - - defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">, - EVEX_4V, EVEX_B, Sched<[sched]>, NotMemoryFoldable; - }// let isAsmParserOnly = 1, hasSideEffects = 0 + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", + (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc), + (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)>, + EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; let isCodeGenOnly = 1 in { let isCommutable = 1 in def rr : AVX512Ii8<0xC2, MRMSrcReg, - (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, u8imm:$cc), + !strconcat("vcmp", _.Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, imm:$cc))]>, - EVEX_4V, Sched<[sched]>; + EVEX_4V, VEX_LIG, Sched<[sched]>; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), - (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + !strconcat("vcmp", _.Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2), imm:$cc))]>, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } +def X86cmpms_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (X86cmpms node:$src1, node:$src2, node:$cc), [{ + return N->hasOneUse(); +}]>; +def X86cmpmsSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (X86cmpmsSAE node:$src1, node:$src2, node:$cc), [{ + return N->hasOneUse(); +}]>; + let Predicates = [HasAVX512] in { let ExeDomain = SSEPackedSingle in - defm VCMPSSZ : avx512_cmp_scalar, AVX512XSIi8Base; let ExeDomain = SSEPackedDouble in - defm VCMPSDZ : avx512_cmp_scalar, AVX512XDIi8Base, VEX_W; } multiclass avx512_icmp_packed opc, string OpcodeStr, PatFrag OpNode, - X86FoldableSchedWrite sched, X86VectorVTInfo _, - bit IsCommutable> { + PatFrag OpNode_su, X86FoldableSchedWrite sched, + X86VectorVTInfo _, bit IsCommutable> { let isCommutable = IsCommutable in def rr : AVX512BI opc, string OpcodeStr, PatFrag OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>, + (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>, EVEX_4V, EVEX_K, Sched<[sched]>; def rmk : AVX512BI, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, PatFrag OpNode, + 
PatFrag OpNode_su, X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> : - avx512_icmp_packed { + avx512_icmp_packed { def rmb : AVX512BI opc, string OpcodeStr, PatFrag OpNode, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), + (OpNode_su (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>, EVEX_4V, EVEX_K, EVEX_B, @@ -2177,33 +2160,34 @@ multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, PatFrag OpNode, } multiclass avx512_icmp_packed_vl opc, string OpcodeStr, PatFrag OpNode, - X86SchedWriteWidths sched, + PatFrag OpNode_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed, EVEX_V256; - defm Z128 : avx512_icmp_packed, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, - PatFrag OpNode, X86SchedWriteWidths sched, + PatFrag OpNode, PatFrag OpNode_su, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb, EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb, EVEX_V128; } } @@ -2216,59 +2200,69 @@ def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2), def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), (setcc node:$src1, node:$src2, SETGT)>; +def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2), + (X86pcmpeqm_c node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; +def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2), + (X86pcmpgtm node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. let AddedComplexity = 2 in { // FIXME: Is there a better scheduler class for VPCMP? 
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su, SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su, SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su, SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; -defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su, SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su, SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su, SchedWriteVecALU, avx512vl_i16_info, HasBWI>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su, SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su, SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; } multiclass avx512_icmp_cc opc, string Suffix, PatFrag Frag, - PatFrag CommFrag, X86FoldableSchedWrite sched, + PatFrag Frag_su, PatFrag CommFrag, PatFrag CommFrag_su, + X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> { let isCommutable = 1 in def rri : AVX512AIi8, EVEX_4V, Sched<[sched]>; def rmi : AVX512AIi8 opc, string Suffix, PatFrag Frag, let isCommutable = 1 in def rrik : AVX512AIi8, + (_.KVT (Frag_su:$cc (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + cond))))]>, EVEX_4V, EVEX_K, Sched<[sched]>; def rmik : AVX512AIi8, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; - // Accept explicit immediate argument form instead of comparison code. 
- let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rri_alt : AVX512AIi8, - EVEX_4V, Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - def rmi_alt : AVX512AIi8, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable; - def rrik_alt : AVX512AIi8, - EVEX_4V, EVEX_K, Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - def rmik_alt : AVX512AIi8, - EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - } - def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2), (_.VT _.RC:$src1), cond)), (!cast(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2), + (_.KVT (CommFrag_su:$cc (_.LdFrag addr:$src2), (_.VT _.RC:$src1), cond))), (!cast(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, @@ -2346,15 +2309,17 @@ multiclass avx512_icmp_cc opc, string Suffix, PatFrag Frag, } multiclass avx512_icmp_cc_rmb opc, string Suffix, PatFrag Frag, - PatFrag CommFrag, X86FoldableSchedWrite sched, + PatFrag Frag_su, PatFrag CommFrag, + PatFrag CommFrag_su, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> : - avx512_icmp_cc { + avx512_icmp_cc { def rmib : AVX512AIi8 opc, string Suffix, PatFrag Frag, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmibk : AVX512AIi8, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - // Accept explicit immediate argument form instead of comparison code. - let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in { - def rmib_alt : AVX512AIi8, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - def rmibk_alt : AVX512AIi8, - EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - } - def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), (_.VT _.RC:$src1), cond)), (!cast(Name#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag:$cc (X86VBroadcast + (_.KVT (CommFrag_su:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), (_.VT _.RC:$src1), cond))), (!cast(Name#_.ZSuffix#"rmibk") @@ -2410,32 +2355,34 @@ multiclass avx512_icmp_cc_rmb opc, string Suffix, PatFrag Frag, } multiclass avx512_icmp_cc_vl opc, string Suffix, PatFrag Frag, - PatFrag CommFrag, X86SchedWriteWidths sched, + PatFrag Frag_su, PatFrag CommFrag, + PatFrag CommFrag_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_icmp_cc, EVEX_V512; + defm Z : avx512_icmp_cc, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_cc, EVEX_V256; - defm Z128 : avx512_icmp_cc, EVEX_V128; + defm Z256 : avx512_icmp_cc, EVEX_V256; + defm Z128 : avx512_icmp_cc, EVEX_V128; } } multiclass avx512_icmp_cc_rmb_vl opc, string Suffix, PatFrag Frag, - PatFrag CommFrag, X86SchedWriteWidths sched, + PatFrag Frag_su, PatFrag CommFrag, + PatFrag CommFrag_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_icmp_cc_rmb, EVEX_V512; + defm Z : avx512_icmp_cc_rmb, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_cc_rmb, EVEX_V256; - defm Z128 : avx512_icmp_cc_rmb, EVEX_V128; + defm Z256 : avx512_icmp_cc_rmb, EVEX_V256; + defm Z128 : avx512_icmp_cc_rmb, EVEX_V128; } } @@ -2459,6 +2406,12 @@ def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc), return !ISD::isUnsignedIntSetCC(CC); }], 
X86pcmpm_imm>; +def X86pcmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast(N->getOperand(2))->get(); + return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm>; + // Same as above, but commutes immediate. Use for load folding. def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), (setcc node:$src1, node:$src2, node:$cc), [{ @@ -2466,12 +2419,24 @@ def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), return !ISD::isUnsignedIntSetCC(CC); }], X86pcmpm_imm_commute>; +def X86pcmpm_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast(N->getOperand(2))->get(); + return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm_commute>; + def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc), (setcc node:$src1, node:$src2, node:$cc), [{ ISD::CondCode CC = cast(N->getOperand(2))->get(); return ISD::isUnsignedIntSetCC(CC); }], X86pcmpm_imm>; +def X86pcmpum_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast(N->getOperand(2))->get(); + return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm>; + // Same as above, but commutes immediate. Use for load folding. def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), (setcc node:$src1, node:$src2, node:$cc), [{ @@ -2479,93 +2444,91 @@ def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), return ISD::isUnsignedIntSetCC(CC); }], X86pcmpm_imm_commute>; +def X86pcmpum_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast(N->getOperand(2))->get(); + return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm_commute>; + // FIXME: Is there a better scheduler class for VPCMP/VPCMPU? 
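Every *_su ("single use") PatFrag introduced in this file encodes the same folding rule: (and $mask, (cmp ...)) may be selected as one masked compare only when the compare has no other users, since otherwise the unmasked result would still have to be materialized and the comparison would run twice. A toy model of the predicate:

// Mirror of the [{ return N->hasOneUse(); }] checks: folding a compare
// into its masking AND is only a win when the AND is the sole user.
struct ToyNode {
  int NumUses = 0;
  bool hasOneUse() const { return NumUses == 1; }
};

bool canFoldIntoMaskedCompare(const ToyNode &Cmp) {
  return Cmp.hasOneUse();
}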
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute, +defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_su, + X86pcmpm_commute, X86pcmpm_commute_su, SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>; -defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute, +defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_su, + X86pcmpum_commute, X86pcmpum_commute_su, SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>; -defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute, +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_su, + X86pcmpm_commute, X86pcmpm_commute_su, SchedWriteVecALU, avx512vl_i16_info, HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute, +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_su, + X86pcmpum_commute, X86pcmpum_commute_su, SchedWriteVecALU, avx512vl_i16_info, HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute, +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_su, + X86pcmpm_commute, X86pcmpm_commute_su, SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute, +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_su, + X86pcmpum_commute, X86pcmpum_commute_su, SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute, +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_su, + X86pcmpm_commute, X86pcmpm_commute_su, SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute, +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_su, + X86pcmpum_commute, X86pcmpum_commute_su, SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +def X86cmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (X86cmpm node:$src1, node:$src2, node:$cc), [{ + return N->hasOneUse(); +}]>; +def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (X86cmpmSAE node:$src1, node:$src2, node:$cc), [{ + return N->hasOneUse(); +}]>; + multiclass avx512_vcmp_common { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "$src2, $src1", "$src1, $src2", - (X86cmpm (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc), 1>, - Sched<[sched]>; + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + 1>, Sched<[sched]>; defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, - (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "$src2, $src1", "$src1, $src2", - (X86cmpm (_.VT _.RC:$src1), - (_.VT (_.LdFrag addr:$src2)), - imm:$cc)>, + (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), + imm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), + imm:$cc)>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, 
_, (outs _.KRC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr, + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, ${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr#", $cc", (X86cmpm (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - imm:$cc)>, + imm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + imm:$cc)>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - // Accept explicit immediate argument form instead of comparison code. - let isAsmParserOnly = 1, hasSideEffects = 0 in { - defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, - Sched<[sched]>, NotMemoryFoldable; - - let mayLoad = 1 in { - defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, - Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - - defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, ${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr##", $cc">, - EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - } - } // Patterns for selecting with loads in other operand. def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), @@ -2573,9 +2536,9 @@ multiclass avx512_vcmp_common(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, imm:$cc)>; - def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2), - (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2), + (_.VT _.RC:$src1), + CommutableCMPCC:$cc)), (!cast(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, imm:$cc)>; @@ -2585,10 +2548,10 @@ multiclass avx512_vcmp_common(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, imm:$cc)>; - def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), - (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), + CommutableCMPCC:$cc)), (!cast(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, imm:$cc)>; @@ -2597,24 +2560,14 @@ multiclass avx512_vcmp_common { // comparison code form (VCMP[EQ/LT/LE/...] 
defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (X86cmpmRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc, - (i32 FROUND_NO_EXC))>, + (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $cc", + (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)>, EVEX_B, Sched<[sched]>; - - let isAsmParserOnly = 1, hasSideEffects = 0 in { - defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, {sae}, $src2, $src1", - "$src1, $src2, {sae}, $cc">, - EVEX_B, Sched<[sched]>, NotMemoryFoldable; - } } multiclass avx512_vcmp { @@ -2647,16 +2600,27 @@ let Predicates = [HasAVX512] in { // ---------------------------------------------------------------- // FPClass + +def X86Vfpclasss_su : PatFrag<(ops node:$src1, node:$src2), + (X86Vfpclasss node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + +def X86Vfpclass_su : PatFrag<(ops node:$src1, node:$src2), + (X86Vfpclass node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + //handle fpclass instruction mask = op(reg_scalar,imm) // op(mem_scalar,imm) -multiclass avx512_scalar_fpclass opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_scalar_fpclass opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, Predicate prd> { let Predicates = [prd], ExeDomain = _.ExeDomain in { def rr : AVX512, Sched<[sched]>; def rrk : AVX512 opc, string OpcodeStr, SDNode OpNode, OpcodeStr##_.Suffix# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), + (X86Vfpclasss_su (_.VT _.RC:$src1), (i32 imm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512 opc, string OpcodeStr, SDNode OpNode, OpcodeStr##_.Suffix## "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst, - (OpNode _.ScalarIntMemCPat:$src1, - (i32 imm:$src2)))]>, + (X86Vfpclasss _.ScalarIntMemCPat:$src1, + (i32 imm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2689,14 +2653,14 @@ multiclass avx512_scalar_fpclass opc, string OpcodeStr, SDNode OpNode, //handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm) // fpclass(reg_vec, mem_vec, imm) // fpclass(reg_vec, broadcast(eltVt), imm) -multiclass avx512_vector_fpclass opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_vector_fpclass opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, - string mem, string broadcast>{ + string mem>{ let ExeDomain = _.ExeDomain in { def rr : AVX512, Sched<[sched]>; def rrk : AVX512 opc, string OpcodeStr, SDNode OpNode, OpcodeStr##_.Suffix# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), + (X86Vfpclass_su (_.VT _.RC:$src1), (i32 imm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : AVX512, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512, EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } + + // Allow registers or broadcast with the x, y, z suffix we use to disambiguate + // the memory form. 
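For orientation before the alias definitions that follow: vfpclass tests each value against a set of categories selected by immediate bits (0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +Inf, 0x10 -Inf, 0x20 denormal, 0x40 negative finite, 0x80 SNaN). A usage sketch via the intrinsic, assuming an AVX-512DQ target:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128 inf = _mm_set_ss(__builtin_inff());
  // vfpclassss with imm 0x18: is the value +Inf or -Inf?
  __mmask8 m = _mm_fpclass_ss_mask(inf, 0x08 | 0x10);
  std::printf("is_inf=%u\n", (unsigned)(m & 1));
}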
+ def : InstAlias(NAME#"rr") + _.KRC:$dst, _.RC:$src1, i32u8imm:$src2), 0, "att">; + def : InstAlias(NAME#"rrk") + _.KRC:$dst, _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), 0, "att">; + def : InstAlias(NAME#"rmb") + _.KRC:$dst, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">; + def : InstAlias(NAME#"rmbk") + _.KRC:$dst, _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">; } multiclass avx512_vector_fpclass_all opc, SDNode OpNode, - X86SchedWriteWidths sched, Predicate prd, - string broadcast>{ + bits<8> opc, X86SchedWriteWidths sched, + Predicate prd>{ let Predicates = [prd] in { - defm Z : avx512_vector_fpclass, EVEX_V512; + defm Z : avx512_vector_fpclass, EVEX_V512; } let Predicates = [prd, HasVLX] in { - defm Z128 : avx512_vector_fpclass, EVEX_V128; - defm Z256 : avx512_vector_fpclass, EVEX_V256; + defm Z128 : avx512_vector_fpclass, EVEX_V128; + defm Z256 : avx512_vector_fpclass, EVEX_V256; } } multiclass avx512_fp_fpclass_all opcVec, - bits<8> opcScalar, SDNode VecOpNode, - SDNode ScalarOpNode, X86SchedWriteWidths sched, + bits<8> opcScalar, X86SchedWriteWidths sched, Predicate prd> { defm PS : avx512_vector_fpclass_all, + sched, prd>, EVEX_CD8<32, CD8VF>; defm PD : avx512_vector_fpclass_all, + sched, prd>, EVEX_CD8<64, CD8VF> , VEX_W; - defm SSZ : avx512_scalar_fpclass, + defm SSZ : avx512_scalar_fpclass, VEX_LIG, EVEX_CD8<32, CD8VT1>; - defm SDZ : avx512_scalar_fpclass, + defm SDZ : avx512_scalar_fpclass, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W; } -defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, - X86Vfpclasss, SchedWriteFCmp, HasDQI>, - AVX512AIi8Base, EVEX; +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp, + HasDQI>, AVX512AIi8Base, EVEX; //----------------------------------------------------------------- // Mask register copy, including @@ -3039,26 +3021,24 @@ defm : avx512_binop_pat; defm : avx512_binop_pat; // Mask unpacking -multiclass avx512_mask_unpck { let Predicates = [prd] in { let hasSideEffects = 0 in - def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), - (ins KRC:$src1, KRC:$src2), + def rr : I<0x4b, MRMSrcReg, (outs Dst.KRC:$dst), + (ins Src.KRC:$src1, Src.KRC:$src2), "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX_4V, VEX_L, Sched<[sched]>; - def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), - (!cast(NAME##rr) - (COPY_TO_REGCLASS KRCSrc:$src2, KRC), - (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>; + def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)), + (!cast(NAME##rr) Src.KRC:$src2, Src.KRC:$src1)>; } } -defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, WriteShuffle, HasAVX512>, PD; -defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, WriteShuffle, HasBWI>, PS; -defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, WriteShuffle, HasBWI>, PS, VEX_W; +defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, PS, VEX_W; // Mask bit testing multiclass avx512_mask_testop opc, string OpcodeStr, RegisterClass KRC, @@ -3118,7 +3098,8 @@ defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShu defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>; // Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. 
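The rewritten avx512_mask_unpck above selects KUNPCK* directly for a concat_vectors of two mask registers, swapping the operands because the instruction's first source supplies the high half. The data movement itself is a plain shift-and-or; for KUNPCKBW, a scalar model:

#include <cstdint>
#include <cstdio>

// concat_vectors(lo, hi) of two v8i1 masks is one 16-bit mask with lo in
// bits 7:0 and hi in bits 15:8; KUNPCKBW computes exactly this.
uint16_t kunpckbw(uint8_t hi, uint8_t lo) {
  return (uint16_t)(((uint16_t)hi << 8) | lo);
}

int main() {
  std::printf("0x%04x\n", kunpckbw(0xAB, 0xCD)); // prints 0xabcd
}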
-multiclass axv512_icmp_packed_no_vlx_lowering { def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), @@ -3130,8 +3111,8 @@ multiclass axv512_icmp_packed_no_vlx_lowering; def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Frag (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2)))), + (Frag_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2)))), (COPY_TO_REGCLASS (!cast(InstStr#"Zrrk") (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), @@ -3141,7 +3122,7 @@ multiclass axv512_icmp_packed_no_vlx_lowering { @@ -3154,9 +3135,9 @@ def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), (Frag.OperandTransform $cc)), Narrow.KRC)>; def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), - cond)))), + (Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), + cond)))), (COPY_TO_REGCLASS (!cast(InstStr##Zrrik) (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3165,7 +3146,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, } // Same as above, but for fp types which don't use PatFrags. -multiclass axv512_cmp_packed_cc_no_vlx_lowering { def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1), @@ -3177,8 +3159,8 @@ def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1), imm:$cc), Narrow.KRC)>; def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (OpNode (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), imm:$cc))), + (OpNode_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), imm:$cc))), (COPY_TO_REGCLASS (!cast(InstStr##Zrrik) (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3190,65 +3172,65 @@ let Predicates = [HasAVX512, NoVLX] in { // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. 
let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; } - defm : axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_cmp_packed_cc_no_vlx_lowering; - defm : axv512_cmp_packed_cc_no_vlx_lowering; - defm : axv512_cmp_packed_cc_no_vlx_lowering; - defm : axv512_cmp_packed_cc_no_vlx_lowering; + defm : axv512_cmp_packed_cc_no_vlx_lowering; + defm : axv512_cmp_packed_cc_no_vlx_lowering; + defm : axv512_cmp_packed_cc_no_vlx_lowering; + defm : axv512_cmp_packed_cc_no_vlx_lowering; } let Predicates = [HasBWI, NoVLX] in { // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. 
 let AddedComplexity = 2 in {
-  defm : axv512_icmp_packed_no_vlx_lowering;
-  defm : axv512_icmp_packed_no_vlx_lowering;
+  defm : axv512_icmp_packed_no_vlx_lowering;
+  defm : axv512_icmp_packed_no_vlx_lowering;

-  defm : axv512_icmp_packed_no_vlx_lowering;
-  defm : axv512_icmp_packed_no_vlx_lowering;
+  defm : axv512_icmp_packed_no_vlx_lowering;
+  defm : axv512_icmp_packed_no_vlx_lowering;

-  defm : axv512_icmp_packed_no_vlx_lowering;
-  defm : axv512_icmp_packed_no_vlx_lowering;
+  defm : axv512_icmp_packed_no_vlx_lowering;
+  defm : axv512_icmp_packed_no_vlx_lowering;

-  defm : axv512_icmp_packed_no_vlx_lowering;
-  defm : axv512_icmp_packed_no_vlx_lowering;
+  defm : axv512_icmp_packed_no_vlx_lowering;
+  defm : axv512_icmp_packed_no_vlx_lowering;
 }

-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;

-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;

-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;

-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
 }

 // Mask setting all 0s or 1s
@@ -3394,15 +3376,15 @@ multiclass avx512_alignedload_vl opc, string OpcodeStr,
                                  string EVEX2VEXOvrd, bit NoRMPattern = 0> {
   let Predicates = [prd] in
   defm Z : avx512_load, EVEX_V512;
   let Predicates = [prd, HasVLX] in {
   defm Z256 : avx512_load, EVEX_V256;
   defm Z128 : avx512_load, EVEX_V128;
   }
 }
@@ -3414,15 +3396,15 @@ multiclass avx512_load_vl opc, string OpcodeStr,
                           SDPatternOperator SelectOprr = vselect> {
   let Predicates = [prd] in
   defm Z : avx512_load, EVEX_V512;
   let Predicates = [prd, HasVLX] in {
   defm Z256 : avx512_load, EVEX_V256;
   defm Z128 : avx512_load, EVEX_V128;
   }
 }
@@ -3488,14 +3470,14 @@ multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
                            string EVEX2VEXOvrd, bit NoMRPattern = 0> {
   let Predicates = [prd] in
   defm Z : avx512_store, EVEX_V512;
   let Predicates = [prd, HasVLX] in {
   defm Z256 : avx512_store, EVEX_V256;
   defm Z128 : avx512_store, EVEX_V128;
   }
 }
@@ -3506,15 +3488,15 @@ multiclass avx512_alignedstore_vl opc, string OpcodeStr,
                                   string EVEX2VEXOvrd, bit NoMRPattern = 0> {
   let Predicates = [prd] in
   defm Z : avx512_store, EVEX_V512;
   let Predicates = [prd, HasVLX] in {
   defm Z256 : avx512_store, EVEX_V256;
   defm Z128 : avx512_store, EVEX_V128;
   }
 }
@@ -3609,7 +3591,7 @@ def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs),
                             (ins f256mem:$dst, VR256X:$src),
                             "", []>, Sched<[WriteFStoreY]>;
 }
-def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 immAllZerosV),
                  (v8i64 VR512:$src))),
           (VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), VK8),
                           VR512:$src)>;
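Reading aid for the pattern above: a vselect whose *true* side is the all-zeros
vector is a zero-masking move under the complemented mask, hence the inserted
KNOTWrr. Roughly, in AVX-512F intrinsics (a sketch, not the patch's own code):

#include <immintrin.h>

// vselect(k, 0, src): lanes where k is set become 0, the rest keep src --
// i.e. a maskz move with ~k (the KNOT the pattern inserts).
static inline __m512i select_zero_or_src(__mmask8 k, __m512i src) {
  return _mm512_maskz_mov_epi64((__mmask8)~k, src);
}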
@@ -3621,7 +3603,7 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),

 // These patterns exist to prevent the above patterns from introducing a second
 // mask inversion when one already exists.
 def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)),
-                          (bc_v8i64 (v16i32 immAllZerosV)),
+                          (v8i64 immAllZerosV),
                           (v8i64 VR512:$src))),
           (VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>;
 def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
@@ -3761,75 +3743,6 @@ let Predicates = [HasVLX] in {
             (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
 }

-multiclass masked_move_for_extract {
-  def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
-                              (bitconvert
-                               (To.VT (extract_subvector
-                                       (From.VT From.RC:$src), (iPTR 0)))),
-                              To.RC:$src0)),
-            (Cast.VT (!cast(InstrStr#"rrk")
-                      Cast.RC:$src0, Cast.KRCWM:$mask,
-                      (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
-
-  def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
-                              (bitconvert
-                               (To.VT (extract_subvector
-                                       (From.VT From.RC:$src), (iPTR 0)))),
-                              Cast.ImmAllZerosV)),
-            (Cast.VT (!cast(InstrStr#"rrkz")
-                      Cast.KRCWM:$mask,
-                      (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
-}
-
-
-let Predicates = [HasVLX] in {
-// A masked extract from the first 128-bits of a 256-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z128", v4i64x_info, v2i64x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v8i32x_info, v4i32x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v16i16x_info, v8i16x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v32i8x_info, v16i8x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v4i64x_info, v2i64x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v8i32x_info, v4i32x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v16i16x_info, v8i16x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v32i8x_info, v16i8x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v4f64x_info, v2f64x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v8f32x_info, v4f32x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v4f64x_info, v2f64x_info, v4f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v8f32x_info, v4f32x_info, v4f32x_info>;
-
-// A masked extract from the first 128-bits of a 512-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z128", v8i64_info, v2i64x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v16i32_info, v4i32x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v32i16_info, v8i16x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v64i8_info, v16i8x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v8i64_info, v2i64x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v16i32_info, v4i32x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v32i16_info, v8i16x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v64i8_info, v16i8x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v8f64_info, v2f64x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v16f32_info, v4f32x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v8f64_info, v2f64x_info, v4f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v16f32_info, v4f32x_info, v4f32x_info>;
-
-// A masked extract from the first 256-bits of a 512-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z256", v8i64_info, v4i64x_info, v4i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z256", v16i32_info, v8i32x_info, v4i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z256", v32i16_info, v16i16x_info, v4i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z256", v64i8_info, v32i8x_info, v4i64x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z256", v8i64_info, v4i64x_info, v8i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z256", v16i32_info, v8i32x_info, v8i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z256", v32i16_info, v16i16x_info, v8i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z256", v64i8_info, v32i8x_info, v8i32x_info>; -defm : masked_move_for_extract<"VMOVAPDZ256", v8f64_info, v4f64x_info, v4f64x_info>; -defm : masked_move_for_extract<"VMOVAPDZ256", v16f32_info, v8f32x_info, v4f64x_info>; -defm : masked_move_for_extract<"VMOVAPSZ256", v8f64_info, v4f64x_info, v8f32x_info>; -defm : masked_move_for_extract<"VMOVAPSZ256", v16f32_info, v8f32x_info, v8f32x_info>; -} - // Move Int Doubleword to Packed Double Int // let ExeDomain = SSEPackedInt in { @@ -3858,19 +3771,10 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src) "vmovq\t{$src, $dst|$dst, $src}", [(set FR64X:$dst, (bitconvert GR64:$src))]>, EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>; -def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>, - EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>; def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64X:$src))]>, EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>; -def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64X:$src)), addr:$dst)]>, - EVEX, VEX_W, Sched<[WriteVecStore]>, - EVEX_CD8<64, CD8VT1>; } } // ExeDomain = SSEPackedInt @@ -3881,11 +3785,6 @@ def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src) "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert GR32:$src))]>, EVEX, Sched<[WriteVecMoveFromGpr]>; - -def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), - "vmovd\t{$src, $dst|$dst, $src}", - [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))]>, - EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 // Move doubleword from xmm register to r/m32 @@ -3938,6 +3837,11 @@ def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", (VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>; +let Predicates = [HasAVX512] in { + def : Pat<(X86vextractstore64 (v2i64 VR128X:$src), addr:$dst), + (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>; +} + // Move Scalar Single to Double Int // let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { @@ -3946,11 +3850,6 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32X:$src))]>, EVEX, Sched<[WriteVecMoveToGpr]>; -def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), - (ins i32mem:$dst, FR32X:$src), - "vmovd\t{$src, $dst|$dst, $src}", - [(store (i32 (bitconvert FR32X:$src)), addr:$dst)]>, - EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 // Move 
 // Move Quadword Int to Packed Quadword Int
@@ -3974,7 +3873,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",

 //===----------------------------------------------------------------------===//
 // AVX-512 MOVSS, MOVSD
 //===----------------------------------------------------------------------===//

-multiclass avx512_move_scalar {
   let Predicates = [HasAVX512, OptForSize] in
   def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
@@ -3999,11 +3898,18 @@ multiclass avx512_move_scalar,
             EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
-  let canFoldAsLoad = 1, isReMaterializable = 1 in
-  def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+  let canFoldAsLoad = 1, isReMaterializable = 1 in {
+  def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src),
            !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
-           [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+           [(set _.RC:$dst, (_.VT (vzload_frag addr:$src)))],
            _.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
+  // _alt version uses FR32/FR64 register class.
+  let isCodeGenOnly = 1 in
+  def rm_alt : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+               !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+               [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+               _.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
+  }
   let mayLoad = 1, hasSideEffects = 0 in {
   let Constraints = "$src0 = $dst" in
   def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
@@ -4023,16 +3929,16 @@ multiclass avx512_move_scalar;
   let mayStore = 1, hasSideEffects = 0 in
   def mrk: AVX512PI<0x11, MRMDestMem, (outs),
-           (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
+           (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.RC:$src),
            !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
            [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>,
            NotMemoryFoldable;
 }

-defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
                VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;

-defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
                VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;

@@ -4070,7 +3976,7 @@ def : Pat<(masked_store
           (iPTR 0))), addr:$dst, Mask),
           (!cast(InstrStr#mrk) addr:$dst,
            (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
-           (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+           _.info128.RC:$src)>;
 }

@@ -4085,7 +3991,7 @@ def : Pat<(masked_store
           (iPTR 0))), addr:$dst, Mask),
           (!cast(InstrStr#mrk) addr:$dst,
            (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
-           (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+           _.info128.RC:$src)>;
 }

@@ -4105,13 +4011,13 @@ def : Pat<(masked_store
           (iPTR 0))), addr:$dst, Mask512),
           (!cast(InstrStr#mrk) addr:$dst,
            (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
-           (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+           _.info128.RC:$src)>;

 // AVX512VL pattern.
 def : Pat<(masked_store (_.info128.VT _.info128.RC:$src), addr:$dst, Mask128),
           (!cast(InstrStr#mrk) addr:$dst,
            (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
-           (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+           _.info128.RC:$src)>;
 }
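The notable change in avx512_move_scalar is the rm form: it now defines the
whole XMM register as a zero-extended scalar load (vzload_frag) instead of an
FR32X/FR64X scalar, with rm_alt keeping the old FRC flavor for isel. That is
the architectural vmovss/vmovsd-from-memory behavior, which the classic
intrinsic exposes directly (C++ sketch, illustrative):

#include <immintrin.h>

// vmovss from memory writes lane 0 and zeroes lanes 1..3 of the destination,
// i.e. a 128-bit zero-extending load -- what (_.VT (vzload_frag addr)) models.
static inline __m128 vmovss_load(const float *p) {
  return _mm_load_ss(p); // { *p, 0, 0, 0 }
}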
 multiclass avx512_load_scalar_lowering(InstrStr#rmkz)
            (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
@@ -4145,8 +4050,7 @@ multiclass avx512_load_scalar_lowering_subreg(InstrStr#rmkz)
            (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
@@ -4175,8 +4079,7 @@ multiclass avx512_load_scalar_lowering_subreg2(InstrStr#rmkz)
            (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
@@ -4194,7 +4097,7 @@ def : Pat<(_.info128.VT (extract_subvector

 // AVX512Vl patterns.
 def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
-                         (_.info128.VT (bitconvert (v4i32 immAllZerosV))))),
+                         _.info128.ImmAllZerosV)),
           (!cast(InstrStr#rmkz)
            (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
            addr:$srcAddr)>;

@@ -4383,15 +4286,6 @@ let Predicates = [HasAVX512, OptForSize] in {
                   (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
                   (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
-             (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
-             (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-
   def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
             (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
@@ -4400,17 +4294,6 @@ let Predicates = [HasAVX512, OptForSize] in {
             (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
             (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
-             (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
-             (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
 }

 // Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
@@ -4426,79 +4309,27 @@ let Predicates = [HasAVX512, OptForSpeed] in {
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
             (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
             (i8 3))), sub_xmm)>;
-
-  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
-             (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)),
-             (i8 1))), sub_xmm)>;
-  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
-             (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)),
-             (i8 0xf))), sub_xmm)>;
 }

 let Predicates = [HasAVX512] in {
-
-  // MOVSSrm zeros the high parts of the register; represent this
-  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
-  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
-            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
-  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
-            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
-  def : Pat<(v4f32 (X86vzload addr:$src)),
-            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
-
-  // MOVSDrm zeros the high parts of the register; represent this
-  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
-  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
-            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
-  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
-            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
-  def : Pat<(v2f64 (X86vzload addr:$src)),
-            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (VMOVSSZrm addr:$src)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (VMOVSDZrm addr:$src)>;

   // Represent the same patterns above but in the form they appear for
   // 256-bit types
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+  def : Pat<(v8f32 (X86vzload32 addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v8f32 (X86vzload addr:$src)),
-            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzload addr:$src)),
+  def : Pat<(v4f64 (X86vzload64 addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;

   // Represent the same patterns above but in the form they appear for
   // 512-bit types
-  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
-                    (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
-                    (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v16f32 (X86vzload addr:$src)),
+  def : Pat<(v16f32 (X86vzload32 addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+  def : Pat<(v8f64 (X86vzload64 addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-  def : Pat<(v8f64 (X86vzload addr:$src)),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
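All the removed X86vzmovl-of-load permutations collapse into the typed
X86vzload32/X86vzload64 nodes: load one scalar and zero every other lane of the
wide register. Semantically, for the 512-bit case (C++ sketch, assuming
AVX-512F intrinsics; illustration only):

#include <immintrin.h>

// A v16f32 X86vzload32: one float in lane 0, fifteen zero lanes. The
// SUBREG_TO_REG in the output pattern records that the 128-bit vmovss
// already cleared the upper bits for free.
static inline __m512 vzload32_512(const float *p) {
  return _mm512_insertf32x4(_mm512_setzero_ps(), _mm_load_ss(p), 0);
}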
-
-  // Extract and store.
-  def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
-                   addr:$dst),
-            (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
 }

 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
@@ -4517,47 +4348,47 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
             (VMOV64toPQIZrr GR64:$src)>;
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
-  def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
   // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                       (zextloadi64i32 addr:$src))))),
             (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+  def : Pat<(v4i32 (X86vzload32 addr:$src)),
             (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
-            (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzload addr:$src)),
-            (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v8i32 (X86vzload addr:$src)),
+  def : Pat<(v8i32 (X86vzload32 addr:$src)),
             (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-            (VMOVQI2PQIZrm addr:$src)>;
   def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
             (VMOVZPQILo2PQIZrr VR128X:$src)>;
-  def : Pat<(v2i64 (X86vzload addr:$src)),
+  def : Pat<(v2i64 (X86vzload64 addr:$src)),
             (VMOVQI2PQIZrm addr:$src)>;
-  def : Pat<(v4i64 (X86vzload addr:$src)),
+  def : Pat<(v4i64 (X86vzload64 addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;

-  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
-                    (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-
-  // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
-  def : Pat<(v16i32 (X86vzload addr:$src)),
+  def : Pat<(v16i32 (X86vzload32 addr:$src)),
             (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v8i64 (X86vzload addr:$src)),
+  def : Pat<(v8i64 (X86vzload64 addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
+
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIZrr
+                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIZrr
+                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))),
+             sub_xmm)>;
+
+  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIZrr
+                     (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIZrr
+                     (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))),
+             sub_xmm)>;
 }

 //===----------------------------------------------------------------------===//
@@ -4686,7 +4517,7 @@ multiclass avx512_binop_rm opc, string OpcodeStr, SDNode OpNode,
                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                    "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
-                   IsCommutable>, AVX512BIBase, EVEX_4V,
+                   IsCommutable, IsCommutable>, AVX512BIBase, EVEX_4V,
                    Sched<[sched]>;
   defm rm : AVX512_maskable opc, string OpcodeStr,
                    (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
                                     (_Src.VT _Src.RC:$src2))),
-                   IsCommutable>,
+                   IsCommutable, IsCommutable>,
                    EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>;
   defm rm : AVX512_maskable opc, string OpcodeStr,X86VectorVTInfo _,
   defm rr_Int : AVX512_maskable_scalar,
+                (_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
                Sched<[sched]>;
   defm rm_Int : AVX512_maskable_scalar,
+                              _.ScalarIntMemCPat:$src2))>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
   def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -5495,7 +5324,7 @@ multiclass avx512_fp_scalar_round opc, string OpcodeStr,X86VectorVTInfo _,
                    (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
                    "$rc, $src2, $src1", "$src1, $src2, $rc",
                    (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                   (i32 imm:$rc)), IsCommutable>,
+                   (i32 timm:$rc))>,
                    EVEX_B, EVEX_RC, Sched<[sched]>;
 }
 multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _,
@@ -5534,23 +5363,22 @@ multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _,
   defm rrb_Int : AVX512_maskable_scalar, EVEX_B,
-                 Sched<[sched]>;
+                 (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+                 EVEX_B, Sched<[sched]>;
   }
 }

 multiclass avx512_binop_s_round opc, string OpcodeStr, SDNode OpNode,
-                                SDNode VecNode, X86SchedWriteSizes sched,
-                                bit IsCommutable> {
+                                SDNode VecNode, SDNode RndNode,
+                                X86SchedWriteSizes sched, bit IsCommutable> {
   defm SSZ : avx512_fp_scalar,
-             avx512_fp_scalar_round,
              XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
   defm SDZ : avx512_fp_scalar,
-             avx512_fp_scalar_round,
              XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
 }
@@ -5565,17 +5393,17 @@ multiclass avx512_binop_s_sae opc, string OpcodeStr, SDNode OpNode,
                 VecNode, SaeNode, sched.PD.Scl, IsCommutable>,
                 XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
 }
-defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds,
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds,
                                  SchedWriteFAddSizes, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds,
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmuls, X86fmulRnds,
                                  SchedWriteFMulSizes, 1>;
-defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds,
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubs, X86fsubRnds,
                                  SchedWriteFAddSizes, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds,
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivs, X86fdivRnds,
                                  SchedWriteFDivSizes, 0>;
-defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
+defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs,
                                SchedWriteFCmpSizes, 0>;
-defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
+defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,
                                SchedWriteFCmpSizes, 0>;

 // MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
@@ -5618,13 +5446,13 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,

 multiclass avx512_fp_packed opc, string OpcodeStr, SDPatternOperator OpNode,
                             X86VectorVTInfo _, X86FoldableSchedWrite sched,
                             bit IsCommutable,
-                            bit IsKZCommutable = IsCommutable> {
+                            bit IsKCommutable = IsCommutable> {
   let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
   defm rr: AVX512_maskable,
+                  (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
+                  IsKCommutable, IsKCommutable>,
                   EVEX_4V, Sched<[sched]>;
   let mayLoad = 1 in {
   defm rm: AVX512_maskable opc, string OpcodeStr,
   defm rrb: AVX512_maskable,
+                   (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>,
                   EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
 }

 multiclass avx512_fp_sae_packed opc, string OpcodeStr,
-                                SDPatternOperator OpNodeRnd,
+                                SDPatternOperator OpNodeSAE,
                                 X86FoldableSchedWrite sched, X86VectorVTInfo _> {
   let ExeDomain = _.ExeDomain in
   defm rrb: AVX512_maskable,
+                   (_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>,
                   EVEX_4V, EVEX_B, Sched<[sched]>;
 }
@@ -5731,10 +5559,10 @@ defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512,
             avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
 defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
                               SchedWriteFCmpSizes, 0>,
-            avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SchedWriteFCmpSizes>;
+            avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>;
 defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
                               SchedWriteFCmpSizes, 0>,
-            avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SchedWriteFCmpSizes>;
+            avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>;
 let isCodeGenOnly = 1 in {
   defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
                                  SchedWriteFCmpSizes, 1>;
@@ -5750,71 +5578,25 @@ defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
 defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
                               SchedWriteFLogicSizes, 1>;
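The scalar binop multiclasses now take the plain intrinsic node (X86fadds and
friends) and the explicit-rounding node (X86faddRnds, carrying a timm rounding
control) separately, and MIN/MAX move from *Rnd to *SAE nodes since they only
suppress exceptions rather than round. The rounding form corresponds to the
_round scalar intrinsics (C++ sketch, AVX-512F, illustrative):

#include <immintrin.h>

// vaddss with embedded rounding (EVEX.b + $rc): the immediate overrides
// MXCSR's rounding mode for this single instruction.
static inline __m128 add_ss_toward_zero(__m128 a, __m128 b) {
  return _mm_add_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}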
-let Predicates = [HasVLX,HasDQI] in {
-  // Use packed logical operations for scalar ops.
-  def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
-                                  (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
-             FR64X)>;
-  def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
-                                 (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
-             FR64X)>;
-  def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
-                                  (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
-             FR64X)>;
-  def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
-                                   (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
-             FR64X)>;
-
-  def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
-                                  (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
-             FR32X)>;
-  def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
-                                 (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
-             FR32X)>;
-  def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
-                                  (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
-             FR32X)>;
-  def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
-                                   (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
-             FR32X)>;
-}
-
 multiclass avx512_fp_scalef_p opc, string OpcodeStr, SDNode OpNode,
                               X86FoldableSchedWrite sched, X86VectorVTInfo _> {
   let ExeDomain = _.ExeDomain in {
   defm rr: AVX512_maskable,
+                  (_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
                   EVEX_4V, Sched<[sched]>;
   defm rm: AVX512_maskable,
+                  (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
                   EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   defm rmb: AVX512_maskable,
+                   (_.ScalarLdFrag addr:$src2))))>,
                   EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
@@ -5825,332 +5607,139 @@ multiclass avx512_fp_scalef_scalar opc, string OpcodeStr, SDNode OpNode,
   defm rr: AVX512_maskable_scalar,
+                  (_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
                   Sched<[sched]>;
   defm rm: AVX512_maskable_scalar,
+                  (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2)>,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }

 multiclass avx512_fp_scalef_all opc, bits<8> opcScaler, string OpcodeStr,
-                                SDNode OpNode, SDNode OpNodeScal, X86SchedWriteWidths sched> {
-  defm PSZ : avx512_fp_scalef_p,
-             avx512_fp_round_packed,
+  defm PSZ : avx512_fp_scalef_p,
+             avx512_fp_round_packed,
              EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm PDZ : avx512_fp_scalef_p,
-             avx512_fp_round_packed,
+  defm PDZ : avx512_fp_scalef_p,
+             avx512_fp_round_packed,
              EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-  defm SSZ : avx512_fp_scalef_scalar,
-             avx512_fp_scalar_round,
-             EVEX_4V,EVEX_CD8<32, CD8VT1>;
-  defm SDZ : avx512_fp_scalef_scalar,
-             avx512_fp_scalar_round,
-             EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+  defm SSZ : avx512_fp_scalef_scalar,
+             avx512_fp_scalar_round,
+             EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+  defm SDZ : avx512_fp_scalef_scalar,
+             avx512_fp_scalar_round,
+             EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W;

 // Define only if AVX512VL feature is present.
   let Predicates = [HasVLX] in {
-  defm PSZ128 : avx512_fp_scalef_p,
+  defm PSZ128 : avx512_fp_scalef_p,
                 EVEX_V128, EVEX_CD8<32, CD8VF>;
-  defm PSZ256 : avx512_fp_scalef_p,
+  defm PSZ256 : avx512_fp_scalef_p,
                 EVEX_V256, EVEX_CD8<32, CD8VF>;
-  defm PDZ128 : avx512_fp_scalef_p,
+  defm PDZ128 : avx512_fp_scalef_p,
                 EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
-  defm PDZ256 : avx512_fp_scalef_p,
+  defm PDZ256 : avx512_fp_scalef_p,
                 EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
   }
 }
-defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs,
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef",
                                     SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible;

 //===----------------------------------------------------------------------===//
 // AVX-512 VPTESTM instructions
 //===----------------------------------------------------------------------===//

-multiclass avx512_vptest opc, string OpcodeStr, PatFrag OpNode,
+multiclass avx512_vptest opc, string OpcodeStr,
                          X86FoldableSchedWrite sched, X86VectorVTInfo _,
                          string Name> {
-  let ExeDomain = _.ExeDomain in {
-  let isCommutable = 1 in
+  // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
+  // There are just too many permutations due to commutability and bitcasts.
+  let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
   defm rr : AVX512_maskable_cmp,
+            (null_frag), (null_frag), 1>,
             EVEX_4V, Sched<[sched]>;
+  let mayLoad = 1 in
   defm rm : AVX512_maskable_cmp,
+            (null_frag), (null_frag)>,
             EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
-
-  // Patterns for compare with 0 that just use the same source twice.
-  def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
-            (_.KVT (!cast(Name # _.ZSuffix # "rr")
-                    _.RC:$src, _.RC:$src))>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
-            (_.KVT (!cast(Name # _.ZSuffix # "rrk")
-                    _.KRC:$mask, _.RC:$src, _.RC:$src))>;
 }

-multiclass avx512_vptest_mb opc, string OpcodeStr, PatFrag OpNode,
+multiclass avx512_vptest_mb opc, string OpcodeStr,
                             X86FoldableSchedWrite sched, X86VectorVTInfo _> {
-  let ExeDomain = _.ExeDomain in
+  let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
   defm rmb : AVX512_maskable_cmp,
+             (null_frag), (null_frag)>,
             EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
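Per the NOTE above, vptestm/vptestnm matching moves to hand-written selection
in X86ISelDAGToDAG because the (and x, y) operand commutes and hides behind
bitcasts in too many shapes for declarative patterns. What the instructions
compute is simple (C++ sketch, AVX-512F intrinsics, illustrative):

#include <immintrin.h>

// vptestmd:  k[i] = ((a[i] & b[i]) != 0)
// vptestnmd: k[i] = ((a[i] & b[i]) == 0)
static inline __mmask16 testm_epi32(__m512i a, __m512i b) {
  return _mm512_test_epi32_mask(a, b);
}
static inline __mmask16 testnm_epi32(__m512i a, __m512i b) {
  return _mm512_testn_epi32_mask(a, b);
}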
-// Use 512bit version to implement 128/256 bit in case NoVLX.
-multiclass avx512_vptest_lowering {
-  def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2),
-                           _.ImmAllZerosV)),
-            (_.KVT (COPY_TO_REGCLASS
-                    (!cast(Name # "Zrr")
-                     (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                    _.RC:$src1, _.SubRegIdx),
-                     (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                    _.RC:$src2, _.SubRegIdx)),
-                    _.KRC))>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask,
-                        (OpNode (and _.RC:$src1, _.RC:$src2),
-                                _.ImmAllZerosV))),
-            (COPY_TO_REGCLASS
-             (!cast(Name # "Zrrk")
-              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src1, _.SubRegIdx),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src2, _.SubRegIdx)),
-             _.KRC)>;
-
-  def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
-            (_.KVT (COPY_TO_REGCLASS
-                    (!cast(Name # "Zrr")
-                     (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                    _.RC:$src, _.SubRegIdx),
-                     (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                    _.RC:$src, _.SubRegIdx)),
-                    _.KRC))>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
-            (COPY_TO_REGCLASS
-             (!cast(Name # "Zrrk")
-              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src, _.SubRegIdx),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src, _.SubRegIdx)),
-             _.KRC)>;
-}
-
-multiclass avx512_vptest_dq_sizes opc, string OpcodeStr, PatFrag OpNode,
-                                  X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
+multiclass avx512_vptest_dq_sizes opc, string OpcodeStr,
+                                  X86SchedWriteWidths sched,
+                                  AVX512VLVectorVTInfo _> {
   let Predicates = [HasAVX512] in
-  defm Z : avx512_vptest,
-           avx512_vptest_mb, EVEX_V512;
+  defm Z : avx512_vptest,
+           avx512_vptest_mb, EVEX_V512;

   let Predicates = [HasAVX512, HasVLX] in {
-  defm Z256 : avx512_vptest,
-              avx512_vptest_mb, EVEX_V256;
-  defm Z128 : avx512_vptest,
-              avx512_vptest_mb, EVEX_V128;
-  }
-  let Predicates = [HasAVX512, NoVLX] in {
-  defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, NAME>;
-  defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, NAME>;
+  defm Z256 : avx512_vptest,
+              avx512_vptest_mb, EVEX_V256;
+  defm Z128 : avx512_vptest,
+              avx512_vptest_mb, EVEX_V128;
   }
 }

-multiclass avx512_vptest_dq opc, string OpcodeStr, PatFrag OpNode,
+multiclass avx512_vptest_dq opc, string OpcodeStr,
                             X86SchedWriteWidths sched> {
-  defm D : avx512_vptest_dq_sizes;
-  defm Q : avx512_vptest_dq_sizes, VEX_W;
 }

 multiclass avx512_vptest_wb opc, string OpcodeStr,
-                            PatFrag OpNode, X86SchedWriteWidths sched> {
+                            X86SchedWriteWidths sched> {
   let Predicates = [HasBWI] in {
-  defm WZ: avx512_vptest, EVEX_V512, VEX_W;
-  defm BZ: avx512_vptest, EVEX_V512;
   }
   let Predicates = [HasVLX, HasBWI] in {
-  defm WZ256: avx512_vptest, EVEX_V256, VEX_W;
-  defm WZ128: avx512_vptest, EVEX_V128, VEX_W;
-  defm BZ256: avx512_vptest, EVEX_V256;
-  defm BZ128: avx512_vptest, EVEX_V128;
   }
-
-  let Predicates = [HasBWI, NoVLX] in {
-  defm BZ256_Alt : avx512_vptest_lowering;
-  defm BZ128_Alt : avx512_vptest_lowering;
-  defm WZ256_Alt : avx512_vptest_lowering;
-  defm WZ128_Alt : avx512_vptest_lowering;
-  }
 }

-// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm
-// as commutable here because we already canonicalized all zeros vectors to the
-// RHS during lowering.
-def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
-                         (setcc node:$src1, node:$src2, SETEQ)>;
-def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
-                         (setcc node:$src1, node:$src2, SETNE)>;
-
 multiclass avx512_vptest_all_forms opc_wb, bits<8> opc_dq, string OpcodeStr,
-                                   PatFrag OpNode, X86SchedWriteWidths sched> :
-  avx512_vptest_wb,
-  avx512_vptest_dq;
+                                   X86SchedWriteWidths sched> :
+  avx512_vptest_wb,
+  avx512_vptest_dq;

-defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm",
                                        SchedWriteVecLogic>, T8PD;
-defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm",
                                         SchedWriteVecLogic>, T8XS;
-
-multiclass avx512_vptest_lowering_pats {
-  def : Pat<(_.KVT (OpNode (bitconvert
-                            (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                           _.ImmAllZerosV)),
-            (!cast(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask,
-                        (OpNode (bitconvert
-                                 (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                                _.ImmAllZerosV))),
-            (!cast(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1,
-             _.RC:$src2)>;
-
-  def : Pat<(_.KVT (OpNode (bitconvert
-                            (AndInfo.VT (and _.RC:$src1,
-                                             (AndInfo.LdFrag addr:$src2)))),
-                           _.ImmAllZerosV)),
-            (!cast(InstrStr # "rm") _.RC:$src1, addr:$src2)>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask,
-                        (OpNode (bitconvert
-                                 (AndInfo.VT (and _.RC:$src1,
-                                                  (AndInfo.LdFrag addr:$src2)))),
-                                _.ImmAllZerosV))),
-            (!cast(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1,
-             addr:$src2)>;
-}
-
-// Patterns to use 512-bit instructions when 128/256 are not available.
-multiclass avx512_vptest_lowering_wide_pats {
-  def : Pat<(_.KVT (OpNode (bitconvert
-                            (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                           _.ImmAllZerosV)),
-            (_.KVT (COPY_TO_REGCLASS
-                    (!cast(InstrStr#"rr")
-                     (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                    _.RC:$src1, _.SubRegIdx),
-                     (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                    _.RC:$src2, _.SubRegIdx)),
-                    _.KRC))>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask,
-                        (OpNode (bitconvert
-                                 (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                                _.ImmAllZerosV))),
-            (COPY_TO_REGCLASS
-             (!cast(InstrStr#"rrk")
-              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src1, _.SubRegIdx),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src2, _.SubRegIdx)),
-             _.KRC)>;
-}
-
-multiclass avx512_vptest_lowering_sizes {
-let Predicates = [prd, HasVLX] in {
-  defm : avx512_vptest_lowering_pats;
-  defm : avx512_vptest_lowering_pats;
-}
-let Predicates = [prd] in {
-  defm : avx512_vptest_lowering_pats;
-}
-
-let Predicates = [prd, NoVLX] in {
-  defm : avx512_vptest_lowering_wide_pats;
-  defm : avx512_vptest_lowering_wide_pats;
-}
-}
-
-multiclass avx512_vptest_lowering_types {
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-}
-
-defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem>;
-defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm>;

 //===----------------------------------------------------------------------===//
 // AVX-512 Shift instructions
 //===----------------------------------------------------------------------===//
@@ -6427,86 +6016,23 @@ multiclass avx512_var_shift_w opc, string OpcodeStr,
   }
 }

-defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SchedWriteVarVecShift>,
-              avx512_var_shift_w<0x12, "vpsllvw", shl, SchedWriteVarVecShift>;
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", X86vshlv, SchedWriteVarVecShift>,
+              avx512_var_shift_w<0x12, "vpsllvw", X86vshlv, SchedWriteVarVecShift>;

-defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SchedWriteVarVecShift>,
-              avx512_var_shift_w<0x11, "vpsravw", sra, SchedWriteVarVecShift>;
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", X86vsrav, SchedWriteVarVecShift>,
+              avx512_var_shift_w<0x11, "vpsravw", X86vsrav, SchedWriteVarVecShift>;

-defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SchedWriteVarVecShift>,
-              avx512_var_shift_w<0x10, "vpsrlvw", srl, SchedWriteVarVecShift>;
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecShift>,
+              avx512_var_shift_w<0x10, "vpsrlvw", X86vsrlv, SchedWriteVarVecShift>;

 defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
 defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;

-defm : avx512_var_shift_lowering;
-defm : avx512_var_shift_lowering;
-defm : avx512_var_shift_lowering;
-defm : avx512_var_shift_lowering;
+defm : avx512_var_shift_lowering;
+defm : avx512_var_shift_lowering;
+defm : avx512_var_shift_lowering;
+defm : avx512_var_shift_lowering;
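The generic shl/sra/srl nodes are swapped for the target nodes
X86vshlv/X86vsrav/X86vsrlv, presumably because the x86 variable shifts have
fully defined semantics for out-of-range counts (zero, or sign-fill for
arithmetic right shifts), which the generic ISD nodes treat as undefined.
Illustration (C++, AVX-512F intrinsics):

#include <immintrin.h>

// vpsllvd: every lane shifts by its own count, and counts >= 32 produce 0
// rather than undefined behavior -- safe to model only with a target node.
static inline __m512i shift_left_per_lane(__m512i v, __m512i counts) {
  return _mm512_sllv_epi32(v, counts);
}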
-// Special handing for handling VPSRAV intrinsics.
-multiclass avx512_var_shift_int_lowering p> {
-  let Predicates = p in {
-    def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
-              (!cast(InstrStr#_.ZSuffix#rr) _.RC:$src1,
-               _.RC:$src2)>;
-    def : Pat<(_.VT (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2))),
-              (!cast(InstrStr#_.ZSuffix##rm)
-               _.RC:$src1, addr:$src2)>;
-    def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
-              (!cast(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
-               _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
-    def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
-                     _.RC:$src0)),
-              (!cast(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
-               _.KRC:$mask, _.RC:$src1, addr:$src2)>;
-    def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
-              (!cast(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
-               _.RC:$src1, _.RC:$src2)>;
-    def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
-                     _.ImmAllZerosV)),
-              (!cast(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
-               _.RC:$src1, addr:$src2)>;
-  }
-}
-
-multiclass avx512_var_shift_int_lowering_mb p> :
-  avx512_var_shift_int_lowering {
-  let Predicates = p in {
-    def : Pat<(_.VT (X86vsrav _.RC:$src1,
-                     (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
-              (!cast(InstrStr#_.ZSuffix##rmb)
-               _.RC:$src1, addr:$src2)>;
-    def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1,
-                      (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
-                     _.RC:$src0)),
-              (!cast(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
-               _.KRC:$mask, _.RC:$src1, addr:$src2)>;
-    def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1,
-                      (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
-                     _.ImmAllZerosV)),
-              (!cast(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask,
-               _.RC:$src1, addr:$src2)>;
-  }
-}
-
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>;
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>;
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;

 // Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
 let Predicates = [HasAVX512, NoVLX] in {
@@ -6827,17 +6353,20 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
                     (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
             (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))),
+            (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+
+  // VMOVLPD patterns
+  def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload64 addr:$src2))),
+            (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
 }

 let SchedRW = [WriteFStore] in {
+let mayStore = 1, hasSideEffects = 0 in
 def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128X:$src),
                     "vmovhps\t{$src, $dst|$dst, $src}",
-                    [(store (f64 (extractelt
-                                  (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
-                                             (bc_v2f64 (v4f32 VR128X:$src))),
-                                  (iPTR 0))), addr:$dst)]>,
-                    EVEX, EVEX_CD8<32, CD8VT2>;
+                    []>, EVEX, EVEX_CD8<32, CD8VT2>;
 def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128X:$src),
                     "vmovhpd\t{$src, $dst|$dst, $src}",
@@ -6845,12 +6374,11 @@ def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
                     (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
                     (iPTR 0))), addr:$dst)]>,
                     EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+let mayStore = 1, hasSideEffects = 0 in
 def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128X:$src),
                     "vmovlps\t{$src, $dst|$dst, $src}",
-                    [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
-                                  (iPTR 0))), addr:$dst)]>,
-                    EVEX, EVEX_CD8<32, CD8VT2>;
+                    []>, EVEX, EVEX_CD8<32, CD8VT2>;
 def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128X:$src),
                     "vmovlpd\t{$src, $dst|$dst, $src}",
@@ -6903,7 +6431,7 @@ multiclass avx512_fma3_213_round opc, string OpcodeStr, SDNode OpNode,
   defm rb: AVX512_maskable_3src,
+           (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))),
            1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
 }
@@ -6978,7 +6506,7 @@ multiclass avx512_fma3_231_round opc, string OpcodeStr, SDNode OpNode,
   defm rb: AVX512_maskable_3src,
            AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
 }
@@ -7056,7 +6584,7 @@ multiclass avx512_fma3_132_round opc, string OpcodeStr, SDNode OpNode,
   defm rb: AVX512_maskable_3src,
            AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
 }
@@ -7132,7 +6660,7 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
   def rb : AVX512FMA3S,
            EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
   }// isCodeGenOnly = 1
@@ -7151,7 +6679,7 @@ multiclass avx512_fma3s_all opc213, bits<8> opc231, bits<8> opc132,
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
                                            (_.ScalarLdFrag addr:$src3)))),
                 (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src1,
-                                           _.FRC:$src3, (i32 imm:$rc)))), 0>;
+                                           _.FRC:$src3, (i32 timm:$rc)))), 0>;
   defm NAME#231#SUFF#Z: avx512_fma3s_common opc213, bits<8> opc231, bits<8> opc132,
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
                                            (_.ScalarLdFrag addr:$src3), _.FRC:$src1))),
                 (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2,
                                           _.FRC:$src3,
-                                          _.FRC:$src1, (i32 imm:$rc)))), 1>;
+                                          _.FRC:$src1, (i32 timm:$rc)))), 1>;

   // One pattern is 312 order so that the load is in a different place from the
   // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
@@ -7169,7 +6697,7 @@ multiclass avx512_fma3s_all opc213, bits<8> opc231, bits<8> opc132,
                 (set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3),
                                            _.FRC:$src1, _.FRC:$src2))),
                 (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src1, _.FRC:$src3,
-                                           _.FRC:$src2, (i32 imm:$rc)))), 1>;
+                                           _.FRC:$src2, (i32 timm:$rc)))), 1>;
   }
 }
@@ -7333,62 +6861,62 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zrb_Int")
              VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
-             (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;

   def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
                    (RndOp _.FRC:$src2, _.FRC:$src3,
                           (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
-                          (i32 imm:$rc)))))),
+                          (i32 timm:$rc)))))),
             (!cast(Prefix#"231"#Suffix#"Zrb_Int")
              VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
-             (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;

   def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
                    (X86selects VK1WM:$mask,
                     (RndOp _.FRC:$src2,
                            (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
-                           _.FRC:$src3, (i32 imm:$rc)),
+                           _.FRC:$src3, (i32 timm:$rc)),
                     (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
             (!cast(Prefix#"213"#Suffix#"Zrb_Intk")
              VR128X:$src1, VK1WM:$mask,
             (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
-            (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+            (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;

   def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
                    (X86selects VK1WM:$mask,
                     (RndOp _.FRC:$src2, _.FRC:$src3,
                            (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
-                           (i32 imm:$rc)),
+                           (i32 timm:$rc)),
                     (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
             (!cast(Prefix#"231"#Suffix#"Zrb_Intk")
              VR128X:$src1, VK1WM:$mask,
             (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
-            (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+            (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;

   def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
                    (X86selects VK1WM:$mask,
                     (RndOp _.FRC:$src2,
                            (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
-                           _.FRC:$src3, (i32 imm:$rc)),
+                           _.FRC:$src3, (i32 timm:$rc)),
                     (_.EltVT ZeroFP)))))),
             (!cast(Prefix#"213"#Suffix#"Zrb_Intkz")
              VR128X:$src1, VK1WM:$mask,
             (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
-            (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+            (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;

   def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
                    (X86selects VK1WM:$mask,
                     (RndOp _.FRC:$src2, _.FRC:$src3,
                            (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
-                           (i32 imm:$rc)),
+                           (i32 timm:$rc)),
                     (_.EltVT ZeroFP)))))),
             (!cast(Prefix#"231"#Suffix#"Zrb_Intkz")
              VR128X:$src1, VK1WM:$mask,
             (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
-            (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+            (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
   }
 }
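These scalar-FMA patterns keep the rounding immediate as a timm/AVX512RC
operand all the way into the Zrb_Int instructions. The operation being selected
is the _round scalar FMA (C++ sketch, AVX-512F, illustrative):

#include <immintrin.h>

// vfmadd213ss with embedded rounding: lane 0 gets a*b + c evaluated under
// the immediate rounding mode; lanes 1..3 pass through the first source.
static inline __m128 fmadd_ss_rne(__m128 a, __m128 b, __m128 c) {
  return _mm_fmadd_round_ss(a, b, c,
                            _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}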
@@ -7468,44 +6996,44 @@ defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,

 // AVX-512 Scalar convert from sign integer to float/double
 //===----------------------------------------------------------------------===//

-multiclass avx512_vcvtsi opc, SDNode OpNode, X86FoldableSchedWrite sched,
+multiclass avx512_vcvtsi opc, SDPatternOperator OpNode, X86FoldableSchedWrite sched,
                          RegisterClass SrcRC, X86VectorVTInfo DstVT,
-                         X86MemOperand x86memop, PatFrag ld_frag, string asm> {
-  let hasSideEffects = 0 in {
+                         X86MemOperand x86memop, PatFrag ld_frag, string asm,
+                         string mem> {
+  let hasSideEffects = 0, isCodeGenOnly = 1 in {
   def rr : SI,
-           EVEX_4V, Sched<[sched]>;
+           EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
   let mayLoad = 1 in
   def rm : SI,
+           asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
            EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   } // hasSideEffects = 0
-  let isCodeGenOnly = 1 in {
-  def rr_Int : SI,
-               EVEX_4V, Sched<[sched]>;
-
-  def rm_Int : SI,
-               EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
-  }//isCodeGenOnly = 1
+  def rr_Int : SI,
+               EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+
+  def rm_Int : SI,
+               EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+  def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (!cast(NAME#"rr_Int") DstVT.RC:$dst,
+                   DstVT.RC:$src1, SrcRC:$src2), 0, "att">;
 }

 multiclass avx512_vcvtsi_round opc, SDNode OpNode,
                                X86FoldableSchedWrite sched, RegisterClass SrcRC,
-                               X86VectorVTInfo DstVT, string asm> {
+                               X86VectorVTInfo DstVT, string asm,
+                               string mem> {
   def rrb_Int : SI opc, SDNode OpNode,
                 [(set DstVT.RC:$dst,
                       (OpNode (DstVT.VT DstVT.RC:$src1),
                               SrcRC:$src2,
-                              (i32 imm:$rc)))]>,
-                EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+                              (i32 timm:$rc)))]>,
+                EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+  def : InstAlias<"v"#asm#mem#"\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}",
+                  (!cast(NAME#"rrb_Int") DstVT.RC:$dst,
+                   DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), 0, "att">;
 }

-multiclass avx512_vcvtsi_common opc, SDNode OpNode,
+multiclass avx512_vcvtsi_common opc, SDNode OpNode, SDNode OpNodeRnd,
                                 X86FoldableSchedWrite sched,
                                 RegisterClass SrcRC, X86VectorVTInfo DstVT,
-                                X86MemOperand x86memop, PatFrag ld_frag, string asm> {
-  defm NAME : avx512_vcvtsi_round,
+                                X86MemOperand x86memop, PatFrag ld_frag,
+                                string asm, string mem> {
+  defm NAME : avx512_vcvtsi_round,
               avx512_vcvtsi, VEX_LIG;
+                            ld_frag, asm, mem>, VEX_LIG;
 }

 let Predicates = [HasAVX512] in {
-defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR32,
-                                       v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
+defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+                                       WriteCvtI2SS, GR32,
+                                       v4f32x_info, i32mem, loadi32, "cvtsi2ss", "l">,
                                        XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR64,
-                                        v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+                                        WriteCvtI2SS, GR64,
+                                        v4f32x_info, i64mem, loadi64, "cvtsi2ss", "q">,
                                         XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR32,
-                                       v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
-                                       XD, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR64,
-                                        v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
+defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32,
+                                v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l">,
+                                XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+                                        WriteCvtI2SD, GR64,
+                                        v2f64x_info, i64mem, loadi64, "cvtsi2sd", "q">,
                                         XD, VEX_W, EVEX_CD8<64, CD8VT1>;
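The reworked avx512_vcvtsi* multiclasses split the plain conversion node
(X86SintToFp) from its rounding variant and emit AT&T mnemonic aliases
(cvtsi2ssl/cvtsi2ssq and friends). What the instructions compute is just an
integer-to-float conversion into lane 0; a plain-C sketch (the intrinsic names
in the comment are the usual AVX-512 ones, cited from memory):

#include <stdint.h>

// vcvtsi2ss: signed int -> f32 in lane 0 (other lanes pass through $src1).
// vcvtusi2ss is the AVX-512-only unsigned variant (_mm_cvtu32_ss, if memory
// serves); before AVX-512 an unsigned convert needed extra fixup code.
static inline float cvtsi2ss(int32_t x)   { return (float)x; }
static inline float cvtusi2ss(uint32_t x) { return (float)x; }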
$src}", - (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + (VCVTSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + (VCVTSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; @@ -7563,23 +7098,26 @@ def : Pat<(f64 (sint_to_fp GR32:$src)), def : Pat<(f64 (sint_to_fp GR64:$src)), (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; -defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR32, +defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, + WriteCvtI2SS, GR32, v4f32x_info, i32mem, loadi32, - "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR64, - v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">, + "cvtusi2ss", "l">, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, + WriteCvtI2SS, GR64, + v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR32, v2f64x_info, - i32mem, loadi32, "cvtusi2sd{l}">, +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info, + i32mem, loadi32, "cvtusi2sd", "l">, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR64, - v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, +defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, + WriteCvtI2SD, GR64, + v2f64x_info, i64mem, loadi64, "cvtusi2sd", "q">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + (VCVTUSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + (VCVTUSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; @@ -7608,8 +7146,7 @@ multiclass avx512_cvt_s_int_round opc, X86VectorVTInfo SrcVT, X86VectorVTInfo DstVT, SDNode OpNode, SDNode OpNodeRnd, X86FoldableSchedWrite sched, string asm, - string aliasStr, - bit CodeGenOnly = 1> { + string aliasStr> { let Predicates = [HasAVX512] in { def rr_Int : SI opc, X86VectorVTInfo SrcVT, EVEX, VEX_LIG, Sched<[sched]>; def rrb_Int : SI, + [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 timm:$rc)))]>, EVEX, VEX_LIG, EVEX_B, EVEX_RC, Sched<[sched]>; - let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in def rm_Int : SI, EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; - - def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", - (!cast(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">; - def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}", - (!cast(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">; } // Predicates = [HasAVX512] -} -multiclass avx512_cvt_s_int_round_aliases opc, X86VectorVTInfo SrcVT, - X86VectorVTInfo DstVT, SDNode OpNode, - SDNode OpNodeRnd, - X86FoldableSchedWrite sched, string asm, - string aliasStr> : - avx512_cvt_s_int_round { - let 
Predicates = [HasAVX512] in { - def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", - (!cast(NAME # "rm_Int") DstVT.RC:$dst, - SrcVT.IntScalarMemOp:$src), 0, "att">; - } // Predicates = [HasAVX512] + def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">; + def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}", + (!cast(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">; + def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast(NAME # "rm_Int") DstVT.RC:$dst, + SrcVT.IntScalarMemOp:$src), 0, "att">; } // Convert float/double to signed/unsigned int 32/64 @@ -7654,10 +7180,10 @@ defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,X86cvts2si, defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info, X86cvts2si, X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{q}">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USIZ: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i32x_info, X86cvts2usi, +defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i64x_info, X86cvts2usi, +defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{q}">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si, @@ -7666,10 +7192,10 @@ defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si, defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info, X86cvts2si, X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USIZ: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i32x_info, X86cvts2usi, +defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{l}">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i64x_info, X86cvts2usi, +defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; @@ -7760,19 +7286,18 @@ def : Pat<(v2f64 (X86Movsd // Convert float/double to signed/unsigned int 32/64 with truncation multiclass avx512_cvt_s_all opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, - SDNode OpNodeInt, SDNode OpNodeRnd, - X86FoldableSchedWrite sched, string aliasStr, - bit CodeGenOnly = 1>{ + SDNode OpNodeInt, SDNode OpNodeSAE, + X86FoldableSchedWrite sched, string aliasStr>{ let Predicates = [HasAVX512] in { let isCodeGenOnly = 1 in { def rr : AVX512, - EVEX, Sched<[sched]>; + EVEX, VEX_LIG, Sched<[sched]>; def rm : AVX512, - EVEX, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; } def rr_Int : AVX512; def rrb_Int : AVX512, - EVEX,VEX_LIG , EVEX_B, Sched<[sched]>; - let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in + [(set _DstRC.RC:$dst, (OpNodeSAE (_SrcRC.VT _SrcRC.RC:$src)))]>, + EVEX, VEX_LIG, EVEX_B, Sched<[sched]>; def rm_Int : AVX512, EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; +} //HasAVX512 def : InstAlias(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">; def : InstAlias(NAME # "rrb_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 
0, "att">; -} //HasAVX512 -} - -multiclass avx512_cvt_s_all_unsigned opc, string asm, - X86VectorVTInfo _SrcRC, - X86VectorVTInfo _DstRC, SDNode OpNode, - SDNode OpNodeInt, SDNode OpNodeRnd, - X86FoldableSchedWrite sched, - string aliasStr> : - avx512_cvt_s_all { -let Predicates = [HasAVX512] in { def : InstAlias(NAME # "rm_Int") _DstRC.RC:$dst, _SrcRC.IntScalarMemOp:$src), 0, "att">; } -} defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSS2I, + fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, "{l}">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSS2I, + fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSD2I, + fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, "{l}">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSD2I, + fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSS2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i32x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSS2I, +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info, + fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, "{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i64x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSS2I, +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info, + fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, "{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i32x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSD2I, +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info, + fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, "{l}">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i64x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSD2I, +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info, + fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; //===----------------------------------------------------------------------===// @@ -7851,15 +7362,13 @@ multiclass avx512_cvt_fp_scalar opc, string OpcodeStr, X86VectorVTInfo _ (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), - (_Src.VT _Src.RC:$src2), - (i32 FROUND_CURRENT)))>, + (_Src.VT _Src.RC:$src2)))>, EVEX_4V, VEX_LIG, Sched<[sched]>; defm rm_Int : AVX512_maskable_scalar, + (_Src.VT _Src.ScalarIntMemCPat:$src2)))>, EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -7878,14 +7387,13 @@ multiclass avx512_cvt_fp_scalar opc, string OpcodeStr, X86VectorVTInfo _ // Scalar Coversion with SAE - suppress all exceptions multiclass avx512_cvt_fp_sae_scalar opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNodeRnd, + X86VectorVTInfo _Src, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { defm rrb_Int 
: AVX512_maskable_scalar, + (_.VT (OpNodeSAE (_.VT _.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; } @@ -7897,34 +7405,36 @@ multiclass avx512_cvt_fp_rc_scalar opc, string OpcodeStr, X86VectorVTInf (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", (_.VT (OpNodeRnd (_.VT _.RC:$src1), - (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, + (_Src.VT _Src.RC:$src2), (i32 timm:$rc)))>, EVEX_4V, VEX_LIG, Sched<[sched]>, EVEX_B, EVEX_RC; } multiclass avx512_cvt_fp_scalar_sd2ss opc, string OpcodeStr, - SDNode OpNodeRnd, X86FoldableSchedWrite sched, - X86VectorVTInfo _src, X86VectorVTInfo _dst> { + SDNode OpNode, SDNode OpNodeRnd, + X86FoldableSchedWrite sched, + X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { - defm Z : avx512_cvt_fp_scalar, + defm Z : avx512_cvt_fp_scalar, avx512_cvt_fp_rc_scalar, VEX_W, EVEX_CD8<64, CD8VT1>, XD; } } -multiclass avx512_cvt_fp_scalar_ss2sd opc, string OpcodeStr, SDNode OpNodeRnd, +multiclass avx512_cvt_fp_scalar_ss2sd opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched, X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { - defm Z : avx512_cvt_fp_scalar, - avx512_cvt_fp_sae_scalar, + defm Z : avx512_cvt_fp_scalar, + avx512_cvt_fp_sae_scalar, EVEX_CD8<32, CD8VT1>, XS; } } -defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", - X86froundRnd, WriteCvtSD2SS, f64x_info, +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86frounds, + X86froundsRnd, WriteCvtSD2SS, f64x_info, f32x_info>; -defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", - X86fpextRnd, WriteCvtSS2SD, f32x_info, +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpexts, + X86fpextsSAE, WriteCvtSS2SD, f32x_info, f64x_info>; def : Pat<(f64 (fpextend FR32X:$src)), @@ -7934,14 +7444,6 @@ def : Pat<(f64 (fpextend (loadf32 addr:$src))), (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>; -def : Pat<(f64 (extloadf32 addr:$src)), - (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX512, OptForSize]>; - -def : Pat<(f64 (extloadf32 addr:$src)), - (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>, - Requires<[HasAVX512, OptForSpeed]>; - def : Pat<(f32 (fpround FR64X:$src)), (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>, Requires<[HasAVX512]>; @@ -7970,7 +7472,8 @@ multiclass avx512_vcvt_fp opc, string OpcodeStr, X86VectorVTInfo _, X86FoldableSchedWrite sched, string Broadcast = _.BroadcastStr, string Alias = "", X86MemOperand MemOp = _Src.MemOp, - RegisterClass MaskRC = _.KRCWM> { + RegisterClass MaskRC = _.KRCWM, + dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> { defm rr : AVX512_maskable_common opc, string OpcodeStr, X86VectorVTInfo _, (ins _.RC:$src0, MaskRC:$mask, MemOp:$src), (ins MaskRC:$mask, MemOp:$src), OpcodeStr#Alias, "$src", "$src", - (_.VT (OpNode (_Src.VT - (_Src.LdFrag addr:$src)))), - (vselect MaskRC:$mask, - (_.VT (OpNode (_Src.VT - (_Src.LdFrag addr:$src)))), - _.RC:$src0), + LdDAG, + (vselect MaskRC:$mask, LdDAG, _.RC:$src0), vselect, "$src0 = $dst">, EVEX, Sched<[sched.Folded]>; @@ -8019,13 +7518,12 @@ multiclass avx512_vcvt_fp opc, string OpcodeStr, X86VectorVTInfo _, } // Coversion with SAE - suppress all exceptions multiclass avx512_vcvt_fp_sae opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNodeRnd, + X86VectorVTInfo _Src, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { 
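For context on the split just above, and outside the patch itself: vcvtsd2ss narrows, so it takes an embedded rounding mode (avx512_cvt_fp_rc_scalar with timm:$rc), while vcvtss2sd widens exactly and therefore only offers the exception-suppressing {sae} form (avx512_cvt_fp_sae_scalar). A minimal C++ sketch of the same distinction using Intel's AVX-512F intrinsic names, illustrative only:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128d d = _mm_set_sd(1.0000000001);   // not exactly representable in f32
  __m128  f = _mm_setzero_ps();
  // Narrowing carries a rounding mode in the instruction ({ru-sae} here),
  // which is what the rrb form from avx512_cvt_fp_rc_scalar encodes.
  f = _mm_cvt_roundsd_ss(f, d, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
  // Widening f32 -> f64 is exact, so only the {sae} form exists for it.
  __m128d w = _mm_cvt_roundss_sd(d, f, _MM_FROUND_NO_EXC);
  printf("%.10f %.10f\n", _mm_cvtss_f32(f), _mm_cvtsd_f64(w));
  return 0;
}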
defm rrb : AVX512_maskable, + (_.VT (OpNodeSAE (_Src.VT _Src.RC:$src)))>, EVEX, EVEX_B, Sched<[sched]>; } @@ -8036,23 +7534,34 @@ multiclass avx512_vcvt_fp_rc opc, string OpcodeStr, X86VectorVTInfo _, defm rrb : AVX512_maskable, + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 timm:$rc)))>, EVEX, EVEX_B, EVEX_RC, Sched<[sched]>; } +// Similar to avx512_vcvt_fp, but uses an extload for the memory form. +multiclass avx512_vcvt_fpextend opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode, + X86FoldableSchedWrite sched, + string Broadcast = _.BroadcastStr, + string Alias = "", X86MemOperand MemOp = _Src.MemOp, + RegisterClass MaskRC = _.KRCWM> + : avx512_vcvt_fp("extload"#_Src.VTName) addr:$src))>; + // Extend Float to Double multiclass avx512_cvtps2pd opc, string OpcodeStr, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp, avx512_vcvt_fp_sae, EVEX_V512; + X86vfpextSAE, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { - defm Z128 : avx512_vcvt_fp, EVEX_V128; - defm Z256 : avx512_vcvt_fp, EVEX_V256; } } @@ -8060,7 +7569,7 @@ multiclass avx512_cvtps2pd opc, string OpcodeStr, // Truncate Double to Float multiclass avx512_cvtpd2ps opc, string OpcodeStr, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp, + defm Z : avx512_vcvt_fp, avx512_vcvt_fp_rc, EVEX_V512; } @@ -8068,18 +7577,49 @@ multiclass avx512_cvtpd2ps opc, string OpcodeStr, X86SchedWriteWidths sc defm Z128 : avx512_vcvt_fp, EVEX_V128; - defm Z256 : avx512_vcvt_fp, EVEX_V256; - - def : InstAlias(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; - def : InstAlias(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">; - def : InstAlias(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; - def : InstAlias(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">; } + + def : InstAlias(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + + def : InstAlias(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbk") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; } defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>, @@ -8087,20 +7627,66 @@ defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>, defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>, PS, EVEX_CD8<32, CD8VH>; -def : Pat<(v8f64 (extloadv8f32 addr:$src)), - (VCVTPS2PDZrm addr:$src)>; +let Predicates = [HasAVX512] in { + def : Pat<(v8f32 (fpround (v8f64 VR512:$src))), + (VCVTPD2PSZrr VR512:$src)>; + def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))), + VR256X:$src0), + (VCVTPD2PSZrrk VR256X:$src0, VK8WM:$mask, VR512:$src)>; + def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))), + 
v8f32x_info.ImmAllZerosV), + (VCVTPD2PSZrrkz VK8WM:$mask, VR512:$src)>; -let Predicates = [HasVLX] in { - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128X:$src)))))), - (VCVTPD2PSZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (loadv2f64 addr:$src)))))), - (VCVTPD2PSZ128rm addr:$src)>; - def : Pat<(v2f64 (extloadv2f32 addr:$src)), - (VCVTPS2PDZ128rm addr:$src)>; - def : Pat<(v4f64 (extloadv4f32 addr:$src)), - (VCVTPS2PDZ256rm addr:$src)>; + def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))), + (VCVTPD2PSZrm addr:$src)>; + def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))), + VR256X:$src0), + (VCVTPD2PSZrmk VR256X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))), + v8f32x_info.ImmAllZerosV), + (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>; + + def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))), + (VCVTPD2PSZrmb addr:$src)>; + def : Pat<(vselect VK8WM:$mask, + (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (v8f32 VR256X:$src0)), + (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(vselect VK8WM:$mask, + (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + v8f32x_info.ImmAllZerosV), + (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>; +} + +let Predicates = [HasVLX] in { + def : Pat<(v4f32 (fpround (v4f64 VR256X:$src))), + (VCVTPD2PSZ256rr VR256X:$src)>; + def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))), + VR128X:$src0), + (VCVTPD2PSZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>; + def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))), + v4f32x_info.ImmAllZerosV), + (VCVTPD2PSZ256rrkz VK4WM:$mask, VR256X:$src)>; + + def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))), + (VCVTPD2PSZ256rm addr:$src)>; + def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))), + VR128X:$src0), + (VCVTPD2PSZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))), + v4f32x_info.ImmAllZerosV), + (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (VCVTPD2PSZ256rmb addr:$src)>; + def : Pat<(vselect VK4WM:$mask, + (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + VR128X:$src0), + (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(vselect VK4WM:$mask, + (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + v4f32x_info.ImmAllZerosV), + (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>; // Special patterns to allow use of X86vmfpround for masking. Instruction // patterns have been disabled with null_frag. 
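The vselect-with-mask patterns above are what let ordinary masked conversions in source code select the rrk (merge) and rrkz (zero) forms. An intrinsic-level C++ illustration, assuming an AVX512F+AVX512VL target; the intrinsics shown are Intel's standard names, not anything introduced by this patch:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m256d a   = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);   // lanes: 1 2 3 4
  __m128  src = _mm_set1_ps(-9.0f);
  __mmask8 k  = 0x5;                                  // select lanes 0 and 2
  // Merge masking: unselected lanes keep src (the VCVTPD2PSZ256rrk pattern).
  __m128 merged = _mm256_mask_cvtpd_ps(src, k, a);    // {1, -9, 3, -9}
  // Zero masking: unselected lanes become 0 (the VCVTPD2PSZ256rrkz pattern).
  __m128 zeroed = _mm256_maskz_cvtpd_ps(k, a);        // {1, 0, 3, 0}
  float m[4], z[4];
  _mm_storeu_ps(m, merged);
  _mm_storeu_ps(z, zeroed);
  printf("merged: %g %g %g %g  zeroed: %g %g %g %g\n",
         m[0], m[1], m[2], m[3], z[0], z[1], z[2], z[3]);
  return 0;
}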
@@ -8142,7 +7728,11 @@ multiclass avx512_cvtdq2pd opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp, EVEX_V128; + OpNode128, sched.XMM, "{1to2}", "", i64mem, VK2WM, + (v2f64 (OpNode128 (bc_v4i32 + (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))>, + EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256; } @@ -8167,12 +7757,12 @@ multiclass avx512_cvtdq2ps opc, string OpcodeStr, SDNode OpNode, // Convert Float to Signed/Unsigned Doubleword with truncation multiclass avx512_cvttps2dq opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode OpNodeSAE, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp, avx512_vcvt_fp_sae, EVEX_V512; + OpNodeSAE, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp opc, string OpcodeStr, SDNode OpNode, // Convert Double to Signed/Unsigned Doubleword with truncation multiclass avx512_cvttpd2dq opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode OpNodeSAE, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp, avx512_vcvt_fp_sae, EVEX_V512; + OpNodeSAE, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { // we need "x"/"y" suffixes in order to distinguish between 128 and 256 @@ -8218,16 +7808,49 @@ multiclass avx512_cvttpd2dq opc, string OpcodeStr, SDNode OpNode, VK2WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256; - - def : InstAlias(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; - def : InstAlias(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">; - def : InstAlias(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; - def : InstAlias(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">; } + + def : InstAlias(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmb") VR128X:$dst, + f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + + def : InstAlias(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmb") VR128X:$dst, + f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbk") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; } // Convert Double to Signed/Unsigned Doubleword @@ -8249,16 +7872,47 @@ multiclass avx512_cvtpd2dq opc, string OpcodeStr, SDNode OpNode, VK2WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256; - - def : InstAlias(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; - def : InstAlias(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">; - def : InstAlias(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; - def : InstAlias(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">; } + + def : InstAlias(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + 
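The rmb/rmbk/rmbkz aliases added in this region name the {1to2} embedded-broadcast memory forms. There is no dedicated broadcast intrinsic for these; a splatted scalar load is simply folded into the broadcast operand during instruction selection. A hedged C++ sketch (AVX512VL assumed; whether the compiler actually picks the EVEX broadcast encoding is its choice):

#include <immintrin.h>
#include <cstdio>

int main() {
  double s = -7.9;
  // A splat of a scalar load is the shape the ...Z128rmb patterns fold
  // into an embedded-broadcast operand such as "(%rax){1to2}".
  __m128d v = _mm_set1_pd(s);
  __m128i r = _mm_cvttpd_epi32(v);        // truncating f64 -> i32, two lanes
  int out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d\n", out[0], out[1]);      // -7 -7
  return 0;
}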
def : InstAlias(NAME # "Z128rmb") VR128X:$dst, + f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + + def : InstAlias(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmb") VR128X:$dst, + f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbk") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; } // Convert Double to Signed/Unsigned Quardword @@ -8325,7 +7979,11 @@ multiclass avx512_cvtps2qq opc, string OpcodeStr, SDNode OpNode, // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source defm Z128 : avx512_vcvt_fp, EVEX_V128; + sched.XMM, "{1to2}", "", f64mem, VK2WM, + (v2i64 (OpNode (bc_v4f32 + (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))>, + EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256; } @@ -8343,7 +8001,11 @@ multiclass avx512_cvttps2qq opc, string OpcodeStr, SDNode OpNode, // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source defm Z128 : avx512_vcvt_fp, EVEX_V128; + sched.XMM, "{1to2}", "", f64mem, VK2WM, + (v2i64 (OpNode (bc_v4f32 + (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))>, + EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256; } @@ -8351,8 +8013,7 @@ multiclass avx512_cvttps2qq opc, string OpcodeStr, SDNode OpNode, // Convert Signed/Unsigned Quardword to Float multiclass avx512_cvtqq2ps opc, string OpcodeStr, SDNode OpNode, - SDNode OpNode128, SDNode OpNodeRnd, - X86SchedWriteWidths sched> { + SDNode OpNodeRnd, X86SchedWriteWidths sched> { let Predicates = [HasDQI] in { defm Z : avx512_vcvt_fp, @@ -8364,22 +8025,57 @@ multiclass avx512_cvtqq2ps opc, string OpcodeStr, SDNode OpNode, // memory forms of these instructions in Asm Parcer. They have the same // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. 
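As the comments above note, the Z128 forms of cvtps2qq/cvttps2qq consume only two of the four source floats, so their memory patterns are written against a 64-bit load. A C++ sketch of that shape, assuming an AVX512DQ+AVX512VL target and Intel's _mm_cvtps_epi64 intrinsic name:

#include <immintrin.h>
#include <cstdio>

int main() {
  float src[2] = {1.5f, -2.5f};                  // exactly 8 bytes are read
  __m128  v = _mm_castsi128_ps(_mm_loadu_si64(src));
  __m128i q = _mm_cvtps_epi64(v);                // the VCVTPS2QQZ128rm shape
  long long out[2];
  _mm_storeu_si128((__m128i *)out, q);
  printf("%lld %lld\n", out[0], out[1]);         // 2 -2 (round-to-nearest-even)
  return 0;
}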
- defm Z128 : avx512_vcvt_fp, EVEX_V128, - NotEVEX2VEXConvertible; + defm Z128 : avx512_vcvt_fp, + EVEX_V128, NotEVEX2VEXConvertible; defm Z256 : avx512_vcvt_fp, EVEX_V256, NotEVEX2VEXConvertible; - - def : InstAlias(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; - def : InstAlias(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">; - def : InstAlias(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; - def : InstAlias(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">; } + + def : InstAlias(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, i64mem:$src), 0, "att">; + + def : InstAlias(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbk") VR128X:$dst, + VK4WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, i64mem:$src), 0, "att">; } defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP, @@ -8390,19 +8086,19 @@ defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86cvttp2si, - X86cvttp2siRnd, SchedWriteCvtPS2DQ>, + X86cvttp2siSAE, SchedWriteCvtPS2DQ>, XS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86cvttp2si, - X86cvttp2siRnd, SchedWriteCvtPD2DQ>, + X86cvttp2siSAE, SchedWriteCvtPD2DQ>, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86cvttp2ui, - X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PS, + X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86cvttp2ui, - X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, + X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, PS, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, @@ -8446,19 +8142,19 @@ defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, EVEX_CD8<32, CD8VH>; defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86cvttp2si, - X86cvttp2siRnd, SchedWriteCvtPD2DQ>, VEX_W, + X86cvttp2siSAE, SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86cvttp2si, - X86cvttp2siRnd, SchedWriteCvtPS2DQ>, PD, + X86cvttp2siSAE, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86cvttp2ui, - X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, VEX_W, + X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86cvttp2ui, - X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PD, + X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, @@ -8469,67 +8165,15 @@ defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>; -defm VCVTQQ2PS : 
avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP, +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS, EVEX_CD8<64, CD8VF>; -defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP, +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD, EVEX_CD8<64, CD8VF>; -let Predicates = [HasAVX512] in { - def : Pat<(v16i32 (fp_to_sint (v16f32 VR512:$src))), - (VCVTTPS2DQZrr VR512:$src)>; - def : Pat<(v16i32 (fp_to_sint (loadv16f32 addr:$src))), - (VCVTTPS2DQZrm addr:$src)>; - - def : Pat<(v16i32 (fp_to_uint (v16f32 VR512:$src))), - (VCVTTPS2UDQZrr VR512:$src)>; - def : Pat<(v16i32 (fp_to_uint (loadv16f32 addr:$src))), - (VCVTTPS2UDQZrm addr:$src)>; - - def : Pat<(v8i32 (fp_to_sint (v8f64 VR512:$src))), - (VCVTTPD2DQZrr VR512:$src)>; - def : Pat<(v8i32 (fp_to_sint (loadv8f64 addr:$src))), - (VCVTTPD2DQZrm addr:$src)>; - - def : Pat<(v8i32 (fp_to_uint (v8f64 VR512:$src))), - (VCVTTPD2UDQZrr VR512:$src)>; - def : Pat<(v8i32 (fp_to_uint (loadv8f64 addr:$src))), - (VCVTTPD2UDQZrm addr:$src)>; -} - let Predicates = [HasVLX] in { - def : Pat<(v4i32 (fp_to_sint (v4f32 VR128X:$src))), - (VCVTTPS2DQZ128rr VR128X:$src)>; - def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), - (VCVTTPS2DQZ128rm addr:$src)>; - - def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src))), - (VCVTTPS2UDQZ128rr VR128X:$src)>; - def : Pat<(v4i32 (fp_to_uint (loadv4f32 addr:$src))), - (VCVTTPS2UDQZ128rm addr:$src)>; - - def : Pat<(v8i32 (fp_to_sint (v8f32 VR256X:$src))), - (VCVTTPS2DQZ256rr VR256X:$src)>; - def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), - (VCVTTPS2DQZ256rm addr:$src)>; - - def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src))), - (VCVTTPS2UDQZ256rr VR256X:$src)>; - def : Pat<(v8i32 (fp_to_uint (loadv8f32 addr:$src))), - (VCVTTPS2UDQZ256rm addr:$src)>; - - def : Pat<(v4i32 (fp_to_sint (v4f64 VR256X:$src))), - (VCVTTPD2DQZ256rr VR256X:$src)>; - def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), - (VCVTTPD2DQZ256rm addr:$src)>; - - def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src))), - (VCVTTPD2UDQZ256rr VR256X:$src)>; - def : Pat<(v4i32 (fp_to_uint (loadv4f64 addr:$src))), - (VCVTTPD2UDQZ256rm addr:$src)>; - // Special patterns to allow use of X86mcvtp2Int for masking. Instruction // patterns have been disabled with null_frag. 
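On the X86cvttp2siRnd -> X86cvttp2siSAE renames used for the VCVTT* definitions above: truncating conversions always round toward zero, so their EVEX.b form carries no rounding-mode payload and only requests suppress-all-exceptions. Illustrative C++, assuming AVX512F:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m512 x = _mm512_set1_ps(3.9f);
  // Plain truncation (VCVTTPS2DQZrr): chops toward zero.
  __m512i a = _mm512_cvttps_epi32(x);
  // {sae} form (the rrb variant): same numeric result, exceptions suppressed.
  __m512i b = _mm512_cvtt_roundps_epi32(x, _MM_FROUND_NO_EXC);
  int out[16];
  _mm512_storeu_si512(out, a);
  printf("%d ", out[0]);                         // 3
  _mm512_storeu_si512(out, b);
  printf("%d\n", out[0]);                        // 3
  return 0;
}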
def : Pat<(v4i32 (X86cvtp2Int (v2f64 VR128X:$src))), @@ -8647,72 +8291,64 @@ let Predicates = [HasVLX] in { (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; } -let Predicates = [HasDQI] in { - def : Pat<(v8i64 (fp_to_sint (v8f32 VR256X:$src))), - (VCVTTPS2QQZrr VR256X:$src)>; - def : Pat<(v8i64 (fp_to_sint (loadv8f32 addr:$src))), - (VCVTTPS2QQZrm addr:$src)>; - - def : Pat<(v8i64 (fp_to_uint (v8f32 VR256X:$src))), - (VCVTTPS2UQQZrr VR256X:$src)>; - def : Pat<(v8i64 (fp_to_uint (loadv8f32 addr:$src))), - (VCVTTPS2UQQZrm addr:$src)>; - - def : Pat<(v8i64 (fp_to_sint (v8f64 VR512:$src))), - (VCVTTPD2QQZrr VR512:$src)>; - def : Pat<(v8i64 (fp_to_sint (loadv8f64 addr:$src))), - (VCVTTPD2QQZrm addr:$src)>; - - def : Pat<(v8i64 (fp_to_uint (v8f64 VR512:$src))), - (VCVTTPD2UQQZrr VR512:$src)>; - def : Pat<(v8i64 (fp_to_uint (loadv8f64 addr:$src))), - (VCVTTPD2UQQZrm addr:$src)>; -} - let Predicates = [HasDQI, HasVLX] in { - def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src))), - (VCVTTPS2QQZ256rr VR128X:$src)>; - def : Pat<(v4i64 (fp_to_sint (loadv4f32 addr:$src))), - (VCVTTPS2QQZ256rm addr:$src)>; - - def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src))), - (VCVTTPS2UQQZ256rr VR128X:$src)>; - def : Pat<(v4i64 (fp_to_uint (loadv4f32 addr:$src))), - (VCVTTPS2UQQZ256rm addr:$src)>; - - def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src))), - (VCVTTPD2QQZ128rr VR128X:$src)>; - def : Pat<(v2i64 (fp_to_sint (loadv2f64 addr:$src))), - (VCVTTPD2QQZ128rm addr:$src)>; - - def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src))), - (VCVTTPD2UQQZ128rr VR128X:$src)>; - def : Pat<(v2i64 (fp_to_uint (loadv2f64 addr:$src))), - (VCVTTPD2UQQZ128rm addr:$src)>; - - def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src))), - (VCVTTPD2QQZ256rr VR256X:$src)>; - def : Pat<(v4i64 (fp_to_sint (loadv4f64 addr:$src))), - (VCVTTPD2QQZ256rm addr:$src)>; - - def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src))), - (VCVTTPD2UQQZ256rr VR256X:$src)>; - def : Pat<(v4i64 (fp_to_uint (loadv4f64 addr:$src))), - (VCVTTPD2UQQZ256rm addr:$src)>; + def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + (VCVTPS2QQZ128rm addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), + (VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + (VCVTPS2UQQZ128rm addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), + (VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + (VCVTTPS2QQZ128rm addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), + (VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + (VCVTTPS2UQQZ128rm addr:$src)>; + def : 
Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), + (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>; } let Predicates = [HasAVX512, NoVLX] in { -def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), +def : Pat<(v8i32 (X86cvttp2ui (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; -def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), +def : Pat<(v4i32 (X86cvttp2ui (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_xmm)>; -def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))), +def : Pat<(v4i32 (X86cvttp2ui (v4f64 VR256X:$src1))), (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_xmm)>; @@ -8738,80 +8374,117 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))), VR128X:$src1, sub_xmm)))), sub_xmm)>; } -let Predicates = [HasAVX512, HasVLX] in { - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))), - (VCVTPD2DQZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))), - (VCVTPD2DQZ128rm addr:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))), - (VCVTPD2UDQZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))), - (VCVTTPD2DQZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))), - (VCVTTPD2DQZ128rm addr:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))), - (VCVTTPD2UDQZ128rr VR128X:$src)>; - - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), - (VCVTDQ2PDZ128rm addr:$src)>; - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), +let Predicates = [HasVLX] in { + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTDQ2PDZ128rm addr:$src)>; - - def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), - (VCVTUDQ2PDZ128rm addr:$src)>; - def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + def : Pat<(v2f64 (vselect VK2WM:$mask, + (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2f64 (vselect VK2WM:$mask, + (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + v2f64x_info.ImmAllZerosV)), + (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTUDQ2PDZ128rm addr:$src)>; -} - -let Predicates = [HasAVX512] in { - def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))), - (VCVTPD2PSZrm addr:$src)>; - def : Pat<(v8f64 (extloadv8f32 addr:$src)), - (VCVTPS2PDZrm addr:$src)>; + def : Pat<(v2f64 (vselect VK2WM:$mask, + (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2f64 (vselect VK2WM:$mask, + (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + v2f64x_info.ImmAllZerosV)), + (VCVTUDQ2PDZ128rmkz VK2WM:$mask, 
addr:$src)>; } let Predicates = [HasDQI, HasVLX] in { - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))), + // Special patterns to allow use of X86VMSintToFP for masking. Instruction + // patterns have been disabled with null_frag. + def : Pat<(v4f32 (X86VSintToFP (v2i64 VR128X:$src))), (VCVTQQ2PSZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))), + def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0), + VK2WM:$mask), + (VCVTQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; + def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; + + def : Pat<(v4f32 (X86VSintToFP (loadv2i64 addr:$src))), + (VCVTQQ2PSZ128rm addr:$src)>; + def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0), + VK2WM:$mask), + (VCVTQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + (VCVTQQ2PSZ128rmb addr:$src)>; + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + (v4f32 VR128X:$src0), VK2WM:$mask), + (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + v4f32x_info.ImmAllZerosV, VK2WM:$mask), + (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; + + // Special patterns to allow use of X86VMUintToFP for masking. Instruction + // patterns have been disabled with null_frag. + def : Pat<(v4f32 (X86VUintToFP (v2i64 VR128X:$src))), (VCVTUQQ2PSZ128rr VR128X:$src)>; + def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0), + VK2WM:$mask), + (VCVTUQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; + def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTUQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; + + def : Pat<(v4f32 (X86VUintToFP (loadv2i64 addr:$src))), + (VCVTUQQ2PSZ128rm addr:$src)>; + def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0), + VK2WM:$mask), + (VCVTUQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + (VCVTUQQ2PSZ128rmb addr:$src)>; + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + (v4f32 VR128X:$src0), VK2WM:$mask), + (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + v4f32x_info.ImmAllZerosV, VK2WM:$mask), + (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } let Predicates = [HasDQI, NoVLX] in { -def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))), +def : Pat<(v2i64 (X86cvttp2si (v2f64 VR128X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_xmm)>; -def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))), +def : Pat<(v4i64 (X86cvttp2si (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_ymm)>; -def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))), +def : Pat<(v4i64 (X86cvttp2si (v4f64 VR256X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr (v8f64 
(INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; -def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))), +def : Pat<(v2i64 (X86cvttp2ui (v2f64 VR128X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_xmm)>; -def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))), +def : Pat<(v4i64 (X86cvttp2ui (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_ymm)>; -def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))), +def : Pat<(v4i64 (X86cvttp2ui (v4f64 VR256X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; @@ -8870,8 +8543,7 @@ multiclass avx512_cvtph2ps_sae, + (X86cvtph2psSAE (_src.VT _src.RC:$src))>, T8PD, EVEX_B, Sched<[sched]>; } @@ -8890,9 +8562,7 @@ let Predicates = [HasVLX] in { EVEX_CD8<32, CD8VH>; // Pattern match vcvtph2ps of a scalar i64 load. - def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), - (VCVTPH2PSZ128rm addr:$src)>; - def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (VCVTPH2PSZ128rm addr:$src)>; def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), @@ -9055,12 +8725,12 @@ multiclass avx512_fp14_s opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, - EVEX_4V, Sched<[sched]>; + EVEX_4V, VEX_LIG, Sched<[sched]>; defm rm : AVX512_maskable_scalar, EVEX_4V, + _.ScalarIntMemCPat:$src2)>, EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -9129,47 +8799,45 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>; /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s opc, string OpcodeStr,X86VectorVTInfo _, - SDNode OpNode, X86FoldableSchedWrite sched> { + SDNode OpNode, SDNode OpNodeSAE, + X86FoldableSchedWrite sched> { let ExeDomain = _.ExeDomain in { defm r : AVX512_maskable_scalar, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, Sched<[sched]>; defm rb : AVX512_maskable_scalar, EVEX_B, - Sched<[sched]>; + (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, + EVEX_B, Sched<[sched]>; defm m : AVX512_maskable_scalar, + (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } multiclass avx512_eri_s opc, string OpcodeStr, SDNode OpNode, - X86FoldableSchedWrite sched> { - defm SSZ : avx512_fp28_s, - EVEX_CD8<32, CD8VT1>; - defm SDZ : avx512_fp28_s, - EVEX_CD8<64, CD8VT1>, VEX_W; + SDNode OpNodeSAE, X86FoldableSchedWrite sched> { + defm SSZ : avx512_fp28_s, EVEX_CD8<32, CD8VT1>, VEX_LIG; + defm SDZ : avx512_fp28_s, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; } let Predicates = [HasERI] in { - defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SchedWriteFRcp.Scl>, - T8PD, EVEX_4V; - defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs, + SchedWriteFRcp.Scl>, T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs, SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V; } -defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds, +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs, SchedWriteFRnd.Scl>, T8PD, EVEX_4V; /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, 
rsqrt28pd @@ -9178,42 +8846,40 @@ multiclass avx512_fp28_p opc, string OpcodeStr, X86VectorVTInfo _, let ExeDomain = _.ExeDomain in { defm r : AVX512_maskable, + (OpNode (_.VT _.RC:$src))>, Sched<[sched]>; defm m : AVX512_maskable, + (bitconvert (_.LdFrag addr:$src))))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb : AVX512_maskable, EVEX_B, - Sched<[sched.Folded, sched.ReadAfterFold]>; + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } -multiclass avx512_fp28_p_round opc, string OpcodeStr, X86VectorVTInfo _, +multiclass avx512_fp28_p_sae opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, X86FoldableSchedWrite sched> { let ExeDomain = _.ExeDomain in defm rb : AVX512_maskable, + (OpNode (_.VT _.RC:$src))>, EVEX_B, Sched<[sched]>; } multiclass avx512_eri opc, string OpcodeStr, SDNode OpNode, - X86SchedWriteWidths sched> { + SDNode OpNodeSAE, X86SchedWriteWidths sched> { defm PSZ : avx512_fp28_p, - avx512_fp28_p_round, + avx512_fp28_p_sae, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp28_p, - avx512_fp28_p_round, + avx512_fp28_p_sae, T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -9221,24 +8887,32 @@ multiclass avx512_fp_unaryop_packed opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched> { // Define only if AVX512VL feature is present. let Predicates = [HasVLX] in { - defm PSZ128 : avx512_fp28_p, - EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>; - defm PSZ256 : avx512_fp28_p, - EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>; - defm PDZ128 : avx512_fp28_p, - EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; - defm PDZ256 : avx512_fp28_p, - EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + defm PSZ128 : avx512_fp28_p, + EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp28_p, + EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp28_p, + EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp28_p, + EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; } } let Predicates = [HasERI] in { - defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SchedWriteFRsqrt>, EVEX; - defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SchedWriteFRcp>, EVEX; - defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SchedWriteFAdd>, EVEX; -} -defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SchedWriteFRnd>, - avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd, + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE, + SchedWriteFRsqrt>, EVEX; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, X86rcp28SAE, + SchedWriteFRcp>, EVEX; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, X86exp2SAE, + SchedWriteFAdd>, EVEX; +} +defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE, + SchedWriteFRnd>, + avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>, EVEX; multiclass avx512_sqrt_packed_round opc, string OpcodeStr, @@ -9246,7 +8920,7 @@ multiclass avx512_sqrt_packed_round opc, string OpcodeStr, let ExeDomain = _.ExeDomain in defm rb: AVX512_maskable, + (_.VT (X86fsqrtRnd _.RC:$src, (i32 timm:$rc)))>, EVEX, EVEX_B, EVEX_RC, Sched<[sched]>; } @@ -9312,23 +8986,21 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, X86FoldableSchedWri defm r_Int : AVX512_maskable_scalar, + (X86fsqrts (_.VT _.RC:$src1), + (_.VT _.RC:$src2))>, Sched<[sched]>; defm m_Int : AVX512_maskable_scalar, + (X86fsqrts (_.VT _.RC:$src1), + _.ScalarIntMemCPat:$src2)>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rb_Int : AVX512_maskable_scalar, + (i32 timm:$rc))>, EVEX_B, EVEX_RC, 
Sched<[sched]>; let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in { @@ -9383,8 +9055,8 @@ multiclass avx512_rndscale_scalar opc, string OpcodeStr, defm rb_Int : AVX512_maskable_scalar, EVEX_B, + (_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3)))>, EVEX_B, Sched<[sched]>; defm m_Int : AVX512_maskable_scalar opc, string OpcodeStr, } let Predicates = [HasAVX512] in { - def : Pat<(ffloor _.FRC:$src), - (_.EltVT (!cast(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0x9)))>; - def : Pat<(fceil _.FRC:$src), - (_.EltVT (!cast(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0xa)))>; - def : Pat<(ftrunc _.FRC:$src), - (_.EltVT (!cast(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0xb)))>; - def : Pat<(frint _.FRC:$src), + def : Pat<(X86VRndScale _.FRC:$src1, imm:$src2), (_.EltVT (!cast(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0x4)))>; - def : Pat<(fnearbyint _.FRC:$src), - (_.EltVT (!cast(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0xc)))>; + _.FRC:$src1, imm:$src2))>; } let Predicates = [HasAVX512, OptForSize] in { - def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), - (_.EltVT (!cast(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0x9)))>; - def : Pat<(fceil (_.ScalarLdFrag addr:$src)), - (_.EltVT (!cast(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0xa)))>; - def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), - (_.EltVT (!cast(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0xb)))>; - def : Pat<(frint (_.ScalarLdFrag addr:$src)), + def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), imm:$src2), (_.EltVT (!cast(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0x4)))>; - def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), - (_.EltVT (!cast(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0xc)))>; + addr:$src1, imm:$src2))>; } } defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless", SchedWriteFRnd.Scl, f32x_info>, - AVX512AIi8Base, EVEX_4V, + AVX512AIi8Base, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd", SchedWriteFRnd.Scl, f64x_info>, - VEX_W, AVX512AIi8Base, EVEX_4V, + VEX_W, AVX512AIi8Base, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; multiclass avx512_masked_scalar; -multiclass avx512_masked_scalar_imm ImmV, Predicate BasePredicate> { - let Predicates = [BasePredicate] in { - def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask, - (OpNode (extractelt _.VT:$src2, (iPTR 0))), - (extractelt _.VT:$dst, (iPTR 0))))), - (!cast("V"#OpcPrefix#Zr_Intk) - _.VT:$dst, VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; - - def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask, - (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))), - (!cast("V"#OpcPrefix#Zr_Intkz) - VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; - } -} - -defm : avx512_masked_scalar_imm; -defm : avx512_masked_scalar_imm; -defm : avx512_masked_scalar_imm; -defm : avx512_masked_scalar_imm; - //------------------------------------------------- // Integer truncate and extend operations @@ -9966,26 +9588,14 @@ multiclass AVX512_pmovx_patterns_base { let Predicates = [HasVLX, HasBWI] in { def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BWZ256rm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWZ256rm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWZ256rm) addr:$src)>; } let Predicates = [HasVLX] in { def : Pat<(v8i32 (ExtOp 
(loadv8i16 addr:$src))), (!cast(OpcPrefix#WDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDZ256rm) addr:$src)>; def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), (!cast(OpcPrefix#DQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQZ256rm) addr:$src)>; } // 512-bit patterns @@ -10007,41 +9617,6 @@ multiclass AVX512_pmovx_patterns_base { } } -multiclass AVX512_pmovx_patterns_aext : - AVX512_pmovx_patterns_base { - let Predicates = [HasVLX, HasBWI] in { - def : Pat<(v16i16 (ExtOp (v16i8 VR128X:$src))), - (!cast(OpcPrefix#BWZ256rr) VR128X:$src)>; - } - - let Predicates = [HasVLX] in { - def : Pat<(v8i32 (ExtOp (v8i16 VR128X:$src))), - (!cast(OpcPrefix#WDZ256rr) VR128X:$src)>; - - def : Pat<(v4i64 (ExtOp (v4i32 VR128X:$src))), - (!cast(OpcPrefix#DQZ256rr) VR128X:$src)>; - } - - // 512-bit patterns - let Predicates = [HasBWI] in { - def : Pat<(v32i16 (ExtOp (v32i8 VR256X:$src))), - (!cast(OpcPrefix#BWZrr) VR256X:$src)>; - } - let Predicates = [HasAVX512] in { - def : Pat<(v16i32 (ExtOp (v16i8 VR128X:$src))), - (!cast(OpcPrefix#BDZrr) VR128X:$src)>; - def : Pat<(v16i32 (ExtOp (v16i16 VR256X:$src))), - (!cast(OpcPrefix#WDZrr) VR256X:$src)>; - - def : Pat<(v8i64 (ExtOp (v8i16 VR128X:$src))), - (!cast(OpcPrefix#WQZrr) VR128X:$src)>; - - def : Pat<(v8i64 (ExtOp (v8i32 VR256X:$src))), - (!cast(OpcPrefix#DQZrr) VR256X:$src)>; - } -} - - multiclass AVX512_pmovx_patterns : AVX512_pmovx_patterns_base { @@ -10051,103 +9626,62 @@ multiclass AVX512_pmovx_patterns(OpcPrefix#BWZ128rm) addr:$src)>; def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), (!cast(OpcPrefix#BWZ128rm) addr:$src)>; } let Predicates = [HasVLX] in { def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))), (!cast(OpcPrefix#BDZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), (!cast(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))), - (!cast(OpcPrefix#BQZ128rm) addr:$src)>; def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#WDZ128rm) addr:$src)>; def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast(OpcPrefix#WDZ128rm) 
addr:$src)>; - def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))), + def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (!cast(OpcPrefix#WDZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))), - (!cast(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))), + def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))), (!cast(OpcPrefix#WQZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))), + def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; } let Predicates = [HasVLX] in { def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))), + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; } // 512-bit patterns let Predicates = [HasAVX512] in { def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BQZrm) addr:$src)>; - def : Pat<(v8i64 (InVecOp (loadv16i8 addr:$src))), - (!cast(OpcPrefix#BQZrm) addr:$src)>; } } defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>; defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>; -defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>; // Without BWI we can't 
do a trunc from v16i16 to v16i8. DAG combine can merge // ext+trunc aggresively making it impossible to legalize the DAG to this @@ -10155,22 +9689,8 @@ defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>; let Predicates = [HasAVX512, NoBWI] in { def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; -def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))), +def: Pat<(v16i8 (trunc (loadv16i16 addr:$src))), (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>; -def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst), - (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; -} - -// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge -// ext+trunc aggresively making it impossible to legalize the DAG to this -// pattern directly. -let Predicates = [HasAVX512, NoBWI] in { -def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), - (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; -def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))), - (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>; -def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst), - (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; } //===----------------------------------------------------------------------===// @@ -10457,7 +9977,7 @@ multiclass compress_by_vec_width_common opc, X86VectorVTInfo _, string OpcodeStr, X86FoldableSchedWrite sched> { defm rr : AVX512_maskable, AVX5128IBase, + (null_frag)>, AVX5128IBase, Sched<[sched]>; let mayStore = 1, hasSideEffects = 0 in @@ -10479,6 +9999,13 @@ multiclass compress_by_vec_width_lowering { def : Pat<(X86mCompressingStore (_.VT _.RC:$src), addr:$dst, _.KRCWM:$mask), (!cast(Name#_.ZSuffix##mrk) addr:$dst, _.KRCWM:$mask, _.RC:$src)>; + + def : Pat<(X86compress (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask), + (!cast(Name#_.ZSuffix##rrk) + _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>; + def : Pat<(X86compress (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask), + (!cast(Name#_.ZSuffix##rrkz) + _.KRCWM:$mask, _.RC:$src)>; } multiclass compress_by_elt_width opc, string OpcodeStr, @@ -10512,13 +10039,12 @@ multiclass expand_by_vec_width opc, X86VectorVTInfo _, string OpcodeStr, X86FoldableSchedWrite sched> { defm rr : AVX512_maskable, AVX5128IBase, + (null_frag)>, AVX5128IBase, Sched<[sched]>; defm rm : AVX512_maskable, + (null_frag)>, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10537,6 +10063,13 @@ multiclass expand_by_vec_width_lowering { (_.VT _.RC:$src0))), (!cast(Name#_.ZSuffix##rmk) _.RC:$src0, _.KRCWM:$mask, addr:$src)>; + + def : Pat<(X86expand (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask), + (!cast(Name#_.ZSuffix##rrk) + _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>; + def : Pat<(X86expand (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask), + (!cast(Name#_.ZSuffix##rrkz) + _.KRCWM:$mask, _.RC:$src)>; } multiclass expand_by_elt_width opc, string OpcodeStr, @@ -10603,18 +10136,17 @@ multiclass avx512_unary_fp_sae_packed_imm opc, string OpcodeStr, OpcodeStr##_.Suffix, "$src2, {sae}, $src1", "$src1, {sae}, $src2", (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2), - (i32 FROUND_NO_EXC))>, + (i32 imm:$src2))>, EVEX_B, Sched<[sched]>; } multiclass avx512_common_unary_fp_sae_packed_imm opc, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{ + SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{ let Predicates = [prd] in { defm Z : avx512_unary_fp_packed_imm, - avx512_unary_fp_sae_packed_imm, EVEX_V512; } let Predicates = [prd, HasVLX] 
in { @@ -10733,8 +10265,7 @@ multiclass avx512_fp_sae_packed_imm opc, string OpcodeStr, "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), - (i32 FROUND_NO_EXC))>, + (i32 imm:$src3))>, EVEX_B, Sched<[sched]>; } @@ -10748,17 +10279,16 @@ multiclass avx512_fp_sae_scalar_imm opc, string OpcodeStr, SDNode OpNode "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), - (i32 FROUND_NO_EXC))>, + (i32 imm:$src3))>, EVEX_B, Sched<[sched]>; } multiclass avx512_common_fp_sae_packed_imm opc, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{ + SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{ let Predicates = [prd] in { defm Z : avx512_fp_packed_imm, - avx512_fp_sae_packed_imm, + avx512_fp_sae_packed_imm, EVEX_V512; } @@ -10802,267 +10332,64 @@ multiclass avx512_common_3Op_imm8 opc, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd> { + SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd> { let Predicates = [prd] in { defm Z : avx512_fp_scalar_imm, - avx512_fp_sae_scalar_imm; + avx512_fp_sae_scalar_imm; } } multiclass avx512_common_unary_fp_sae_packed_imm_all opcPs, bits<8> opcPd, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{ + SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{ defm PS : avx512_common_unary_fp_sae_packed_imm, + opcPs, OpNode, OpNodeSAE, sched, prd>, EVEX_CD8<32, CD8VF>; defm PD : avx512_common_unary_fp_sae_packed_imm, + opcPd, OpNode, OpNodeSAE, sched, prd>, EVEX_CD8<64, CD8VF>, VEX_W; } defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, - X86VReduce, X86VReduceRnd, SchedWriteFRnd, HasDQI>, + X86VReduce, X86VReduceSAE, SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX; defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, - X86VRndScale, X86VRndScaleRnd, SchedWriteFRnd, HasAVX512>, + X86VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX; defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, - X86VGetMant, X86VGetMantRnd, SchedWriteFRnd, HasAVX512>, + X86VGetMant, X86VGetMantSAE, SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX; defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, - 0x50, X86VRange, X86VRangeRnd, + 0x50, X86VRange, X86VRangeSAE, SchedWriteFAdd, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info, - 0x50, X86VRange, X86VRangeRnd, + 0x50, X86VRange, X86VRangeSAE, SchedWriteFAdd, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", - f64x_info, 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>, + f64x_info, 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, - 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>, + 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, - 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>, + 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, - 0x57, 
X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>, + 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, - 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>, + 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, - 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>, + 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; - -multiclass AVX512_rndscale_lowering { - // Register - def : Pat<(_.VT (ffloor _.RC:$src)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0x9))>; - def : Pat<(_.VT (fnearbyint _.RC:$src)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0xC))>; - def : Pat<(_.VT (fceil _.RC:$src)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0xA))>; - def : Pat<(_.VT (frint _.RC:$src)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0x4))>; - def : Pat<(_.VT (ftrunc _.RC:$src)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0xB))>; - - // Merge-masking - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xB))>; - - // Zero-masking - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, (i32 0xB))>; - - // Load - def : Pat<(_.VT (ffloor (_.LdFrag addr:$src))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 0x9))>; - def : Pat<(_.VT (fnearbyint (_.LdFrag addr:$src))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 0xC))>; - def : Pat<(_.VT (fceil (_.LdFrag addr:$src))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 0xA))>; - def : Pat<(_.VT (frint (_.LdFrag addr:$src))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 
0x4))>; - def : Pat<(_.VT (ftrunc (_.LdFrag addr:$src))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 0xB))>; - - // Merge-masking + load - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>; - - // Zero-masking + load - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0xB))>; - - // Broadcast load - def : Pat<(_.VT (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0x9))>; - def : Pat<(_.VT (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0xC))>; - def : Pat<(_.VT (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0xA))>; - def : Pat<(_.VT (frint (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0x4))>; - def : Pat<(_.VT (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0xB))>; - - // Merge-masking + broadcast load - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - 
(!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>; - - // Zero-masking + broadcast load - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0xB))>; -} - -let Predicates = [HasAVX512] in { - defm : AVX512_rndscale_lowering; - defm : AVX512_rndscale_lowering; -} - -let Predicates = [HasVLX] in { - defm : AVX512_rndscale_lowering; - defm : AVX512_rndscale_lowering; - defm : AVX512_rndscale_lowering; - defm : AVX512_rndscale_lowering; -} - multiclass avx512_shuff_packed_128_common opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, @@ -11544,9 +10871,9 @@ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), +def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; -def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))), +def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), @@ -11554,21 +10881,21 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), - (bitconvert (v4i32 immAllZerosV))), + immAllZerosV), (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), - (bitconvert (v4i32 immAllZerosV))), + immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))), - (bitconvert (v4i32 immAllZerosV))), +def : 
Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), + immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; } @@ -12067,39 +11394,39 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU, // TODO: We should maybe have a more generalized algorithm for folding to // vpternlog. let Predicates = [HasAVX512] in { - def : Pat<(xor VR512:$src, (bc_v64i8 (v16i32 immAllOnesV))), + def : Pat<(xor VR512:$src, (v64i8 immAllOnesV)), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; - def : Pat<(xor VR512:$src, (bc_v32i16 (v16i32 immAllOnesV))), + def : Pat<(xor VR512:$src, (v32i16 immAllOnesV)), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; - def : Pat<(xor VR512:$src, (bc_v16i32 (v16i32 immAllOnesV))), + def : Pat<(xor VR512:$src, (v16i32 immAllOnesV)), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; - def : Pat<(xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV))), + def : Pat<(xor VR512:$src, (v8i64 immAllOnesV)), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; } let Predicates = [HasAVX512, NoVLX] in { - def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), @@ -12107,28 +11434,28 @@ let Predicates = [HasAVX512, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (i8 15)), sub_ymm)>; - def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (i8 15)), sub_ymm)>; - def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG 
(v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (i8 15)), sub_ymm)>; - def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), @@ -12138,22 +11465,22 @@ let Predicates = [HasAVX512, NoVLX] in { } let Predicates = [HasVLX] in { - def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)), (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)), (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)), (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)), (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; - def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; - def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; - def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; } @@ -12161,58 +11488,55 @@ let Predicates = [HasVLX] in { // AVX-512 - FixupImm //===----------------------------------------------------------------------===// -multiclass avx512_fixupimm_packed opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fixupimm_packed opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo TblVT>{ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src, Sched<[sched]>; + (X86VFixupimm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT _.RC:$src3), + (i32 imm:$src4))>, Sched<[sched]>; defm rmi : AVX512_maskable_3src, + (X86VFixupimm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))), + (i32 imm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src, + (X86VFixupimm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))), + (i32 imm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Constraints = "$src1 = $dst" } multiclass avx512_fixupimm_packed_sae opc, string OpcodeStr, - SDNode OpNode, X86FoldableSchedWrite sched, - X86VectorVTInfo _, X86VectorVTInfo TblVT>{ + X86FoldableSchedWrite sched, + X86VectorVTInfo _, X86VectorVTInfo TblVT> + : avx512_fixupimm_packed { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rrib : AVX512_maskable_3src, + (X86VFixupimmSAE (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT _.RC:$src3), + (i32 imm:$src4))>, EVEX_B, Sched<[sched]>; } } -multiclass avx512_fixupimm_scalar opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fixupimm_scalar opc, string OpcodeStr, X86FoldableSchedWrite 
sched, X86VectorVTInfo _, X86VectorVTInfo _src3VT> { let Constraints = "$src1 = $dst" , Predicates = [HasAVX512], @@ -12220,30 +11544,27 @@ multiclass avx512_fixupimm_scalar opc, string OpcodeStr, SDNode OpNode, defm rri : AVX512_maskable_3src_scalar, Sched<[sched]>; + (X86VFixupimms (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_src3VT.VT _src3VT.RC:$src3), + (i32 imm:$src4))>, Sched<[sched]>; defm rrib : AVX512_maskable_3src_scalar, + (X86VFixupimmSAEs (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_src3VT.VT _src3VT.RC:$src3), + (i32 imm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmi : AVX512_maskable_3src_scalar, + (X86VFixupimms (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_src3VT.VT (scalar_to_vector + (_src3VT.ScalarLdFrag addr:$src3))), + (i32 imm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -12252,25 +11573,23 @@ multiclass avx512_fixupimm_packed_all { let Predicates = [HasAVX512] in - defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.ZMM, - _Vec.info512, _Tbl.info512>, - avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, sched.ZMM, + defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM, _Vec.info512, _Tbl.info512>, AVX512AIi8Base, EVEX_4V, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.XMM, + defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.XMM, _Vec.info128, _Tbl.info128>, AVX512AIi8Base, EVEX_4V, EVEX_V128; - defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.YMM, + defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.YMM, _Vec.info256, _Tbl.info256>, AVX512AIi8Base, EVEX_4V, EVEX_V256; } } -defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, +defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", SchedWriteFAdd.Scl, f32x_info, v4i32x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; -defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, +defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", SchedWriteFAdd.Scl, f64x_info, v2i64x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VFIXUPIMMPS : avx512_fixupimm_packed_all("V"#OpcPrefix#Zrr_Int) _.VT:$dst, (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>; + def : Pat<(MoveNode + (_.VT VR128X:$dst), + (_.VT (scalar_to_vector + (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))), + (_.ScalarLdFrag addr:$src))))), + (!cast("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>; // extracted masked scalar math op with insert via movss def : Pat<(MoveNode (_.VT VR128X:$src1), @@ -12344,6 +11669,16 @@ multiclass AVX512_scalar_math_fp_patterns; + def : Pat<(MoveNode (_.VT VR128X:$src1), + (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src2)), + _.FRC:$src0))), + (!cast("V"#OpcPrefix#Zrm_Intk) + (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)), + VK1WM:$mask, _.VT:$src1, addr:$src2)>; // extracted masked scalar math op with insert via movss def : Pat<(MoveNode (_.VT VR128X:$src1), @@ -12355,6 +11690,13 @@ multiclass AVX512_scalar_math_fp_patterns("V"#OpcPrefix#Zrr_Intkz) VK1WM:$mask, _.VT:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; + def : Pat<(MoveNode (_.VT VR128X:$src1), + (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))), + 
(!cast("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>; } } @@ -12380,26 +11722,6 @@ multiclass AVX512_scalar_unary_math_patterns; defm : AVX512_scalar_unary_math_patterns; -multiclass AVX512_scalar_unary_math_imm_patterns ImmV> { - let Predicates = [HasAVX512] in { - def : Pat<(_.VT (Move _.VT:$dst, - (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))), - (!cast("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src, - (i32 ImmV))>; - } -} - -defm : AVX512_scalar_unary_math_imm_patterns; -defm : AVX512_scalar_unary_math_imm_patterns; -defm : AVX512_scalar_unary_math_imm_patterns; -defm : AVX512_scalar_unary_math_imm_patterns; - //===----------------------------------------------------------------------===// // AES instructions //===----------------------------------------------------------------------===// @@ -12612,12 +11934,19 @@ defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SchedWriteVecALU, defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>; defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>; +def X86Vpshufbitqmb_su : PatFrag<(ops node:$src1, node:$src2), + (X86Vpshufbitqmb node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + multiclass VPSHUFBITQMB_rm { defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst), (ins VTI.RC:$src1, VTI.RC:$src2), "vpshufbitqmb", "$src2, $src1", "$src1, $src2", (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), + (VTI.VT VTI.RC:$src2)), + (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD, Sched<[sched]>; defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst), @@ -12625,6 +11954,8 @@ multiclass VPSHUFBITQMB_rm { "vpshufbitqmb", "$src2, $src1", "$src1, $src2", (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), + (VTI.VT (VTI.LdFrag addr:$src2))), + (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src2)))>, EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -12720,13 +12051,13 @@ defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info, defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info, (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), "v4fmaddss", "$src3, $src2", "$src2, $src3", - []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, + []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, Sched<[SchedWriteFMA.Scl.Folded]>; defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info, (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), "v4fnmaddss", "$src3, $src2", "$src2, $src3", - []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, + []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, Sched<[SchedWriteFMA.Scl.Folded]>; } @@ -12749,3 +12080,196 @@ defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info, Sched<[SchedWriteFMA.ZMM.Folded]>; } +let hasSideEffects = 0 in { + let mayStore = 1 in + def MASKPAIR16STORE : PseudoI<(outs), (ins anymem:$dst, VK16PAIR:$src), []>; + let mayLoad = 1 in + def MASKPAIR16LOAD : PseudoI<(outs VK16PAIR:$dst), (ins anymem:$src), []>; +} + +//===----------------------------------------------------------------------===// +// VP2INTERSECT +//===----------------------------------------------------------------------===// + +multiclass avx512_vp2intersect_modes { + def rr : I<0x68, MRMSrcReg, + (outs _.KRPC:$dst), + (ins _.RC:$src1, _.RC:$src2), + !strconcat("vp2intersect", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRPC:$dst, (X86vp2intersect + _.RC:$src1, 
(_.VT _.RC:$src2)))]>, + EVEX_4V, T8XD; + + def rm : I<0x68, MRMSrcMem, + (outs _.KRPC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), + !strconcat("vp2intersect", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRPC:$dst, (X86vp2intersect + _.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, + EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>; + + def rmb : I<0x68, MRMSrcMem, + (outs _.KRPC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr, + ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), + [(set _.KRPC:$dst, (X86vp2intersect + _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>, + EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_vp2intersect { + let Predicates = [HasAVX512, HasVP2INTERSECT] in + defm Z : avx512_vp2intersect_modes<_.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVP2INTERSECT, HasVLX] in { + defm Z256 : avx512_vp2intersect_modes<_.info256>, EVEX_V256; + defm Z128 : avx512_vp2intersect_modes<_.info128>, EVEX_V128; + } +} + +defm VP2INTERSECTD : avx512_vp2intersect; +defm VP2INTERSECTQ : avx512_vp2intersect, VEX_W; + +multiclass avx512_binop_all2 opc, string OpcodeStr, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _SrcVTInfo, + AVX512VLVectorVTInfo _DstVTInfo, + SDNode OpNode, Predicate prd, + bit IsCommutable = 0> { + let Predicates = [prd] in + defm NAME#Z : avx512_binop_rm2, + EVEX_V512, EVEX_CD8<32, CD8VF>; + let Predicates = [HasVLX, prd] in { + defm NAME#Z256 : avx512_binop_rm2, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm NAME#Z128 : avx512_binop_rm2, + EVEX_V128, EVEX_CD8<32, CD8VF>; + } +} + +defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16", + SchedWriteCvtPD2PS, //FIXME: Shoulod be SchedWriteCvtPS2BF + avx512vl_f32_info, avx512vl_i16_info, + X86cvtne2ps2bf16, HasBF16, 0>, T8XD; + +// Truncate Float to BFloat16 +multiclass avx512_cvtps2bf16 opc, string OpcodeStr, + X86SchedWriteWidths sched> { + let Predicates = [HasBF16] in { + defm Z : avx512_vcvt_fp, EVEX_V512; + } + let Predicates = [HasBF16, HasVLX] in { + defm Z128 : avx512_vcvt_fp, EVEX_V128; + defm Z256 : avx512_vcvt_fp, EVEX_V256; + + def : InstAlias(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0>; + def : InstAlias(NAME # "Z128rm") VR128X:$dst, + f128mem:$src), 0, "intel">; + def : InstAlias(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0>; + def : InstAlias(NAME # "Z256rm") VR128X:$dst, + f256mem:$src), 0, "intel">; + } +} + +defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16", + SchedWriteCvtPD2PS>, T8XS, + EVEX_CD8<32, CD8VF>; + +let Predicates = [HasBF16, HasVLX] in { + // Special patterns to allow use of X86mcvtneps2bf16 for masking. Instruction + // patterns have been disabled with null_frag. 
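To make the null_frag note above concrete: each lane of vcvtneps2bf16 is a round-to-nearest-even truncation of an f32 to its high 16 bits, and the masked node picks per lane between that result and the passthru operand. A minimal scalar sketch in C++ (illustrative only; the helper names are ours and NaN quieting is ignored):

#include <cstdint>
#include <cstdio>
#include <cstring>

// One lane of vcvtneps2bf16: f32 -> bf16 with round-to-nearest-even,
// done by biasing before truncating to the top 16 bits.
static uint16_t cvtneF32ToBF16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint32_t lsb = (bits >> 16) & 1; // round-to-nearest-even bias
  bits += 0x7FFFu + lsb;
  return static_cast<uint16_t>(bits >> 16);
}

// Merge-masked form: a clear mask bit keeps the src0 lane. This is the
// src0 / ImmAllZerosV split in the patterns that follow.
static void maskedCvt(const float *src, const uint16_t *src0, uint8_t mask,
                      uint16_t *dst, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = ((mask >> i) & 1) ? cvtneF32ToBF16(src[i]) : src0[i];
}

int main() {
  float in[4] = {1.0f, -2.5f, 0.1f, 3.0f};
  uint16_t pass[4] = {0xAAAA, 0xAAAA, 0xAAAA, 0xAAAA}, out[4];
  maskedCvt(in, pass, 0x5, out, 4); // convert lanes 0 and 2 only
  for (int i = 0; i < 4; ++i)
    std::printf("%04x ", out[i]);   // 3f80 aaaa c020 aaaa
  std::printf("\n");
}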
+ def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 VR128X:$src))), + (VCVTNEPS2BF16Z128rr VR128X:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8i16 VR128X:$src0), + VK4WM:$mask), + (VCVTNEPS2BF16Z128rrk VR128X:$src0, VK4WM:$mask, VR128X:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8i16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTNEPS2BF16Z128rrkz VK4WM:$mask, VR128X:$src)>; + + def : Pat<(v8i16 (X86cvtneps2bf16 (loadv4f32 addr:$src))), + (VCVTNEPS2BF16Z128rm addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8i16 VR128X:$src0), + VK4WM:$mask), + (VCVTNEPS2BF16Z128rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8i16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 + (X86VBroadcast (loadf32 addr:$src))))), + (VCVTNEPS2BF16Z128rmb addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + (v8i16 VR128X:$src0), VK4WM:$mask), + (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + v8i16x_info.ImmAllZerosV, VK4WM:$mask), + (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; +} + +let Constraints = "$src1 = $dst" in { +multiclass avx512_dpbf16ps_rm opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86VectorVTInfo src_v> { + defm r: AVX512_maskable_3src, + EVEX_4V; + + defm m: AVX512_maskable_3src, EVEX_4V; + + defm mb: AVX512_maskable_3src, + EVEX_B, EVEX_4V; + +} +} // Constraints = "$src1 = $dst" + +multiclass avx512_dpbf16ps_sizes opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _, + AVX512VLVectorVTInfo src_v, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_dpbf16ps_rm, EVEX_V512; + } + let Predicates = [HasVLX, prd] in { + defm Z256 : avx512_dpbf16ps_rm, EVEX_V256; + defm Z128 : avx512_dpbf16ps_rm, EVEX_V128; + } +} + +defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, + avx512vl_f32_info, avx512vl_i32_info, + HasBF16>, T8XS, EVEX_CD8<32, CD8VF>; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index cb5a4e5b5d41..e52635f8d48b 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1,9 +1,8 @@ //===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -195,19 +194,22 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), // Surprisingly enough, these are not two address instructions! let Defs = [EFLAGS] in { +// NOTE: These are order specific, we want the ri8 forms to be listed +// first so that they are slightly preferred to the ri forms. 
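The NOTE above matters because the TableGen isel emitter breaks ties between equally complex patterns by source order, so listing the ri8/mi8 forms first makes the matcher prefer the one-byte immediate encoding (opcode 0x6B for imul) whenever the value survives sign extension from 8 bits, which is what the i16immSExt8/i32immSExt8/i64immSExt8 leaves test. A stand-alone sketch of that predicate (not LLVM code):

#include <cstdint>
#include <cstdio>

// Model of the i*immSExt8 predicate: the immediate must equal its own
// low 8 bits after sign extension.
static bool fitsSExt8(int64_t imm) {
  return imm == static_cast<int8_t>(imm);
}

int main() {
  const int64_t tests[] = {1, 127, 128, -128, -129, 1000};
  for (int64_t imm : tests)
    std::printf("imul $%-5lld -> %s\n", (long long)imm,
                fitsSExt8(imm) ? "imm8 form (0x6B)"
                               : "full-width form (0x69)");
}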
+ // Register-Integer Signed Integer Multiply -def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 - (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), - "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, EFLAGS, - (X86smul_flag GR16:$src1, imm:$src2))]>, - Sched<[WriteIMul16Imm]>, OpSize16; def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR16:$dst, EFLAGS, (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>, Sched<[WriteIMul16Imm]>, OpSize16; +def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 + (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))]>, + Sched<[WriteIMul16Imm]>, OpSize16; def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -220,26 +222,20 @@ def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 [(set GR32:$dst, EFLAGS, (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>, Sched<[WriteIMul32Imm]>, OpSize32; -def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32 - (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), - "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, EFLAGS, - (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>, - Sched<[WriteIMul64Imm]>; def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR64:$dst, EFLAGS, (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>, Sched<[WriteIMul64Imm]>; +def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32 + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>, + Sched<[WriteIMul64Imm]>; // Memory-Integer Signed Integer Multiply -def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 - (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), - "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, EFLAGS, - (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>, - Sched<[WriteIMul16Imm.Folded]>, OpSize16; def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -247,12 +243,12 @@ def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 (X86smul_flag (loadi16 addr:$src1), i16immSExt8:$src2))]>, Sched<[WriteIMul16Imm.Folded]>, OpSize16; -def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 - (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), - "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, EFLAGS, - (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>, - Sched<[WriteIMul32Imm.Folded]>, OpSize32; +def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 + (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>, + Sched<[WriteIMul16Imm.Folded]>, OpSize16; def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -260,13 +256,12 @@ def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 (X86smul_flag 
(loadi32 addr:$src1), i32immSExt8:$src2))]>, Sched<[WriteIMul32Imm.Folded]>, OpSize32; -def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32 - (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), - "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, EFLAGS, - (X86smul_flag (loadi64 addr:$src1), - i64immSExt32:$src2))]>, - Sched<[WriteIMul64Imm.Folded]>; +def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 + (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>, + Sched<[WriteIMul32Imm.Folded]>, OpSize32; def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -274,6 +269,13 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 (X86smul_flag (loadi64 addr:$src1), i64immSExt8:$src2))]>, Sched<[WriteIMul64Imm.Folded]>; +def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32 + (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (loadi64 addr:$src1), + i64immSExt32:$src2))]>, + Sched<[WriteIMul64Imm.Folded]>; } // Defs = [EFLAGS] // unsigned division/remainder @@ -436,11 +438,10 @@ def X86sub_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs), // TODO: inc/dec is slow for P4, but fast for Pentium-M. let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { -let CodeSize = 2 in +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), "inc{b}\t$dst", [(set GR8:$dst, EFLAGS, (X86add_flag_nocf GR8:$src1, 1))]>; -let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), "inc{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86add_flag_nocf GR16:$src1, 1))]>, @@ -484,11 +485,10 @@ let Predicates = [UseIncDec, In64BitMode] in { } // CodeSize = 2, SchedRW let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { -let CodeSize = 2 in +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), "dec{b}\t$dst", [(set GR8:$dst, EFLAGS, (X86sub_flag_nocf GR8:$src1, 1))]>; -let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "dec{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86sub_flag_nocf GR16:$src1, 1))]>, @@ -605,16 +605,16 @@ def invalid_node : SDNode<"<>", SDTIntLeaf,[],"<>">; def Xi8 : X86TypeInfo; def Xi16 : X86TypeInfo; def Xi32 : X86TypeInfo; def Xi64 : X86TypeInfo; /// ITy - This instruction base class takes the type info for the instruction. 
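In the ArithBinOp_RF hunk that follows, the single ConvertibleToThreeAddress bit is split so register-register and register-immediate forms can be flagged separately: add rr can still become lea (reg1,reg2), but sub gains a three-address form only for immediates, where the LEA displacement absorbs the negated constant; there is no register-subtract addressing mode. A hedged sketch of the immediate rewrite:

#include <cstdint>
#include <cstdio>

// Sketch: "sub $imm, %rax" rewritten into the three-address
// "lea -imm(%rax), %rcx", legal whenever -imm fits the signed 32-bit
// LEA displacement.
static bool subImmToLea(int64_t imm, int64_t &disp) {
  if (imm == INT64_MIN)
    return false;                    // -imm would overflow
  int64_t neg = -imm;
  if (neg < INT32_MIN || neg > INT32_MAX)
    return false;                    // does not fit the displacement
  disp = neg;
  return true;
}

int main() {
  int64_t disp;
  if (subImmToLea(40, disp))
    std::printf("sub $40, %%rax  =>  lea %lld(%%rax), %%rcx\n",
                (long long)disp);
}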
@@ -924,11 +924,12 @@ class BinOpAI_F opcode, string mnemonic, X86TypeInfo typeinfo, multiclass ArithBinOp_RF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, string mnemonic, Format RegMRM, Format MemMRM, SDNode opnodeflag, SDNode opnode, - bit CommutableRR, bit ConvertibleToThreeAddress> { + bit CommutableRR, bit ConvertibleToThreeAddress, + bit ConvertibleToThreeAddressRR> { let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { let isCommutable = CommutableRR in { - let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + let isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in { def NAME#8rr : BinOpRR_RF; def NAME#16rr : BinOpRR_RF; def NAME#32rr : BinOpRR_RF; @@ -1169,16 +1170,16 @@ multiclass ArithBinOp_F BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m, - X86and_flag, and, 1, 0>; + X86and_flag, and, 1, 0, 0>; defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m, - X86or_flag, or, 1, 0>; + X86or_flag, or, 1, 0, 0>; defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m, - X86xor_flag, xor, 1, 0>; + X86xor_flag, xor, 1, 0, 0>; defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m, - X86add_flag, add, 1, 1>; + X86add_flag, add, 1, 1, 1>; let isCompare = 1 in { defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, - X86sub_flag, sub, 0, 0>; + X86sub_flag, sub, 0, 1, 0>; } // Arithmetic. diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index dcce7b9951f2..50aed98112c3 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -1,9 +1,8 @@ //===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td index f5494fc0b13f..099f6aa8d8bb 100644 --- a/lib/Target/X86/X86InstrCMovSetCC.td +++ b/lib/Target/X86/X86InstrCMovSetCC.td @@ -1,9 +1,8 @@ //===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,99 +13,94 @@ // CMOV instructions. 
-multiclass CMOV opc, string Mnemonic, X86FoldableSchedWrite Sched, - PatLeaf CondNode> { - let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", - isCommutable = 1, SchedRW = [Sched] in { - def NAME#16rr - : I, - TB, OpSize16; - def NAME#32rr - : I, - TB, OpSize32; - def NAME#64rr - :RI, TB; - } - - let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", - SchedRW = [Sched.Folded, Sched.ReadAfterFold] in { - def NAME#16rm - : I, TB, OpSize16; - def NAME#32rm - : I, TB, OpSize32; - def NAME#64rm - :RI, TB; - } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" -} // end multiclass +let isCodeGenOnly = 1, ForceDisassemble = 1 in { +let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + isCommutable = 1, SchedRW = [WriteCMOV] in { + def CMOV16rr + : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond), + "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, + (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>, + TB, OpSize16; + def CMOV32rr + : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond), + "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, + (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>, + TB, OpSize32; + def CMOV64rr + :RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond), + "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, + (X86cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>, TB; +} +let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in { + def CMOV16rm + : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond), + "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), + imm:$cond, EFLAGS))]>, TB, OpSize16; + def CMOV32rm + : I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond), + "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), + imm:$cond, EFLAGS))]>, TB, OpSize32; + def CMOV64rm + :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond), + "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + imm:$cond, EFLAGS))]>, TB; +} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" +} // isCodeGenOnly = 1, ForceDisassemble = 1 -// Conditional Moves. -defm CMOVO : CMOV<0x40, "cmovo" , WriteCMOV, X86_COND_O>; -defm CMOVNO : CMOV<0x41, "cmovno", WriteCMOV, X86_COND_NO>; -defm CMOVB : CMOV<0x42, "cmovb" , WriteCMOV, X86_COND_B>; -defm CMOVAE : CMOV<0x43, "cmovae", WriteCMOV, X86_COND_AE>; -defm CMOVE : CMOV<0x44, "cmove" , WriteCMOV, X86_COND_E>; -defm CMOVNE : CMOV<0x45, "cmovne", WriteCMOV, X86_COND_NE>; -defm CMOVBE : CMOV<0x46, "cmovbe", WriteCMOV2, X86_COND_BE>; -defm CMOVA : CMOV<0x47, "cmova" , WriteCMOV2, X86_COND_A>; -defm CMOVS : CMOV<0x48, "cmovs" , WriteCMOV, X86_COND_S>; -defm CMOVNS : CMOV<0x49, "cmovns", WriteCMOV, X86_COND_NS>; -defm CMOVP : CMOV<0x4A, "cmovp" , WriteCMOV, X86_COND_P>; -defm CMOVNP : CMOV<0x4B, "cmovnp", WriteCMOV, X86_COND_NP>; -defm CMOVL : CMOV<0x4C, "cmovl" , WriteCMOV, X86_COND_L>; -defm CMOVGE : CMOV<0x4D, "cmovge", WriteCMOV, X86_COND_GE>; -defm CMOVLE : CMOV<0x4E, "cmovle", WriteCMOV, X86_COND_LE>; -defm CMOVG : CMOV<0x4F, "cmovg" , WriteCMOV, X86_COND_G>; +// SetCC instructions. 
+let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in { + def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond), + "set${cond}\t$dst", + [(set GR8:$dst, (X86setcc imm:$cond, EFLAGS))]>, + TB, Sched<[WriteSETCC]>; + def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond), + "set${cond}\t$dst", + [(store (X86setcc imm:$cond, EFLAGS), addr:$dst)]>, + TB, Sched<[WriteSETCCStore]>; +} // Uses = [EFLAGS] +multiclass CMOV_SETCC_Aliases { + def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}", + (CMOV16rr GR16:$dst, GR16:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}", + (CMOV16rm GR16:$dst, i16mem:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}", + (CMOV32rr GR32:$dst, GR32:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}", + (CMOV32rm GR32:$dst, i32mem:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}", + (CMOV64rr GR64:$dst, GR64:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}", + (CMOV64rm GR64:$dst, i64mem:$src, CC), 0>; -// SetCC instructions. -multiclass SETCC opc, string Mnemonic, PatLeaf OpNode> { - let Uses = [EFLAGS] in { - def r : I, - TB, Sched<[WriteSETCC]>; - def m : I, - TB, Sched<[WriteSETCCStore]>; - } // Uses = [EFLAGS] + def : InstAlias<"set"#Cond#"\t$dst", (SETCCr GR8:$dst, CC), 0>; + def : InstAlias<"set"#Cond#"\t$dst", (SETCCm i8mem:$dst, CC), 0>; } -defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set -defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set -defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than -defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal -defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to -defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to -defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal -defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than -defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set -defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed -defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set -defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set -defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than -defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal -defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal -defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than +defm : CMOV_SETCC_Aliases<"o" , 0>; +defm : CMOV_SETCC_Aliases<"no", 1>; +defm : CMOV_SETCC_Aliases<"b" , 2>; +defm : CMOV_SETCC_Aliases<"ae", 3>; +defm : CMOV_SETCC_Aliases<"e" , 4>; +defm : CMOV_SETCC_Aliases<"ne", 5>; +defm : CMOV_SETCC_Aliases<"be", 6>; +defm : CMOV_SETCC_Aliases<"a" , 7>; +defm : CMOV_SETCC_Aliases<"s" , 8>; +defm : CMOV_SETCC_Aliases<"ns", 9>; +defm : CMOV_SETCC_Aliases<"p" , 10>; +defm : CMOV_SETCC_Aliases<"np", 11>; +defm : CMOV_SETCC_Aliases<"l" , 12>; +defm : CMOV_SETCC_Aliases<"ge", 13>; +defm : CMOV_SETCC_Aliases<"le", 14>; +defm : CMOV_SETCC_Aliases<"g" , 15>; // SALC is an undocumented instruction. 
Information for this instruction can be found // here http://www.rcollins.org/secrets/opcodes/SALC.html diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 394dca8e7817..efaccdc9ee96 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -1,9 +1,8 @@ //===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,11 +19,6 @@ def GetLo32XForm : SDNodeXFormgetZExtValue(), SDLoc(N)); }]>; -def GetLo8XForm : SDNodeXFormgetZExtValue(), SDLoc(N)); -}]>; - //===----------------------------------------------------------------------===// // Random Pseudo Instructions. @@ -360,7 +354,7 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), // this happens, it is great. However, if we are left with an 8-bit sbb and an // and, we might as well just match it as a setb. def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), - (SETBr)>; + (SETCCr (i8 2))>; // Patterns to give priority when both inputs are zero so that we don't use // an immediate for the RHS. @@ -574,8 +568,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { defm _RFP80 : CMOVrr_PSEUDO; - defm _FR32 : CMOVrr_PSEUDO; - defm _FR64 : CMOVrr_PSEUDO; + let Predicates = [NoAVX512] in { + defm _FR32 : CMOVrr_PSEUDO; + defm _FR64 : CMOVrr_PSEUDO; + } + let Predicates = [HasAVX512] in { + defm _FR32X : CMOVrr_PSEUDO; + defm _FR64X : CMOVrr_PSEUDO; + } let Predicates = [NoVLX] in { defm _VR128 : CMOVrr_PSEUDO; defm _VR256 : CMOVrr_PSEUDO; @@ -712,6 +712,32 @@ def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, "{$src2, $dst|$dst, $src2}"), [(set EFLAGS, (Op addr:$dst, GR64:$src2))]>, LOCK; +// NOTE: These are order specific, we want the mi8 forms to be listed +// first so that they are slightly preferred to the mi forms. 
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>, + OpSize16, LOCK; + +def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>, + OpSize32, LOCK; + +def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>, + LOCK; + def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), @@ -742,30 +768,6 @@ def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, "{$src2, $dst|$dst, $src2}"), [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))]>, LOCK; - -def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, - ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, - ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), - !strconcat(mnemonic, "{w}\t", - "{$src2, $dst|$dst, $src2}"), - [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>, - OpSize16, LOCK; - -def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, - ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, - ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), - !strconcat(mnemonic, "{l}\t", - "{$src2, $dst|$dst, $src2}"), - [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>, - OpSize32, LOCK; - -def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, - ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, - ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), - !strconcat(mnemonic, "{q}\t", - "{$src2, $dst|$dst, $src2}"), - [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>, - LOCK; } } @@ -868,7 +870,7 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in { } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], - SchedRW = [WriteCMPXCHGRMW] in { + Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW] in { defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>; } @@ -892,8 +894,9 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>; // the instruction and we are sure we will have a valid register to restore // the value of RBX. let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX], - SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1, - Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in { + Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, isPseudo = 1, Constraints = "$ebx_save = $dst", + usesCustomInserter = 1 in { def LCMPXCHG8B_SAVE_EBX : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save), @@ -904,14 +907,14 @@ def LCMPXCHG8B_SAVE_EBX : let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], - Predicates = [HasCmpxchg16b], SchedRW = [WriteCMPXCHGRMW] in { + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW] in { defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", X86cas16, i128mem>, REX_W; } // Same as LCMPXCHG8B_SAVE_RBX but for the 16 Bytes variant. 
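For context on the HasCmpxchg8b/In64BitMode predicate changes here: cmpxchg8b compares EDX:EAX with the 8-byte memory operand and, on a match, stores ECX:EBX and sets ZF; otherwise it loads the current value into EDX:EAX. That fixed use of EBX is why the SAVE_EBX/SAVE_RBX pseudos must keep the register alive. A non-atomic scalar model (illustrative C++ only; the real instruction is a single atomic operation):

#include <cstdint>
#include <cstdio>

// Scalar model of cmpxchg8b's compare-exchange semantics. EDX:EAX holds
// the expected value, ECX:EBX the replacement. Returns the ZF result.
static bool cmpxchg8b(uint64_t *mem, uint32_t &edx, uint32_t &eax,
                      uint32_t ecx, uint32_t ebx) {
  uint64_t expected = (uint64_t(edx) << 32) | eax;
  if (*mem == expected) {
    *mem = (uint64_t(ecx) << 32) | ebx; // success: ZF = 1
    return true;
  }
  edx = uint32_t(*mem >> 32);           // failure: observe current value
  eax = uint32_t(*mem);
  return false;                         // ZF = 0
}

int main() {
  uint64_t m = 0x0000000100000002ULL;
  uint32_t edx = 1, eax = 2;
  bool ok = cmpxchg8b(&m, edx, eax, /*ecx=*/3, /*ebx=*/4);
  std::printf("swapped=%d mem=%016llx\n", ok, (unsigned long long)m);
}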
let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], - Predicates = [HasCmpxchg16b], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst", usesCustomInserter = 1 in { def LCMPXCHG16B_SAVE_RBX : @@ -1001,28 +1004,31 @@ defm : RELEASE_BINOP_MI<"OR", or>; defm : RELEASE_BINOP_MI<"XOR", xor>; defm : RELEASE_BINOP_MI<"SUB", sub>; -// Same as above, but for floating-point. -// FIXME: imm version. -// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// Atomic load + floating point patterns. // FIXME: This could also handle SIMD operations with *ps and *pd instructions. -let usesCustomInserter = 1, SchedRW = [WriteMicrocoded] in { -multiclass RELEASE_FP_BINOP_MI { - def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), - "#BINOP "#NAME#"32mr PSEUDO!", - [(atomic_store_32 addr:$dst, - (i32 (bitconvert (op - (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), - FR32:$src))))]>, Requires<[HasSSE1]>; - def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), - "#BINOP "#NAME#"64mr PSEUDO!", - [(atomic_store_64 addr:$dst, - (i64 (bitconvert (op - (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), - FR64:$src))))]>, Requires<[HasSSE2]>; +multiclass ATOMIC_LOAD_FP_BINOP_MI { + def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))), + (!cast(Name#"SSrm") FR32:$src1, addr:$src2)>, + Requires<[UseSSE1]>; + def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))), + (!cast("V"#Name#"SSrm") FR32:$src1, addr:$src2)>, + Requires<[UseAVX]>; + def : Pat<(op FR32X:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))), + (!cast("V"#Name#"SSZrm") FR32X:$src1, addr:$src2)>, + Requires<[HasAVX512]>; + + def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))), + (!cast(Name#"SDrm") FR64:$src1, addr:$src2)>, + Requires<[UseSSE1]>; + def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))), + (!cast("V"#Name#"SDrm") FR64:$src1, addr:$src2)>, + Requires<[UseAVX]>; + def : Pat<(op FR64X:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))), + (!cast("V"#Name#"SDZrm") FR64X:$src1, addr:$src2)>, + Requires<[HasAVX512]>; } -defm RELEASE_FADD : RELEASE_FP_BINOP_MI; +defm : ATOMIC_LOAD_FP_BINOP_MI<"ADD", fadd>; // FIXME: Add fsub, fmul, fdiv, ... -} multiclass RELEASE_UNOP { @@ -1083,6 +1089,35 @@ def : Pat<(i16 (atomic_load_16 addr:$src)), (MOV16rm addr:$src)>; def : Pat<(i32 (atomic_load_32 addr:$src)), (MOV32rm addr:$src)>; def : Pat<(i64 (atomic_load_64 addr:$src)), (MOV64rm addr:$src)>; +// Floating point loads/stores. 
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))), + (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; +def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))), + (VMOVSSmr addr:$dst, FR32:$src)>, Requires<[UseAVX]>; +def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))), + (VMOVSSZmr addr:$dst, FR32:$src)>, Requires<[HasAVX512]>; + +def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))), + (MOVSDmr addr:$dst, FR64:$src)>, Requires<[UseSSE2]>; +def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))), + (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[UseAVX]>; +def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))), + (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[HasAVX512]>; + +def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))), + (MOVSSrm_alt addr:$src)>, Requires<[UseSSE1]>; +def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))), + (VMOVSSrm_alt addr:$src)>, Requires<[UseAVX]>; +def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))), + (VMOVSSZrm_alt addr:$src)>, Requires<[HasAVX512]>; + +def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))), + (MOVSDrm_alt addr:$src)>, Requires<[UseSSE2]>; +def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))), + (VMOVSDrm_alt addr:$src)>, Requires<[UseAVX]>; +def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))), + (VMOVSDZrm_alt addr:$src)>, Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // DAG Pattern Matching Rules //===----------------------------------------------------------------------===// @@ -1241,37 +1276,23 @@ def : Pat<(X86cmp GR32:$src1, 0), def : Pat<(X86cmp GR64:$src1, 0), (TEST64rr GR64:$src1, GR64:$src1)>; +def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); + return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC), + SDLoc(N), MVT::i8); +}]>; + // Conditional moves with folded loads with operands swapped and conditions // inverted. -multiclass CMOVmr { - let Predicates = [HasCMov] in { - def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), - (Inst16 GR16:$src2, addr:$src1)>; - def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), - (Inst32 GR32:$src2, addr:$src1)>; - def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), - (Inst64 GR64:$src2, addr:$src1)>; - } +let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, imm:$cond, EFLAGS), + (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, imm:$cond, EFLAGS), + (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, imm:$cond, EFLAGS), + (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; } -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; -defm : CMOVmr; - // zextload bool -> zextload byte // i1 stored in one byte in zero-extended form. // Upper bits cleanup should be executed before Store. @@ -1298,14 +1319,16 @@ def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; // For other extloads, use subregs, since the high contents of the register are // defined after an extload. 
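The floating-point atomic load/store patterns above rely on an aligned scalar FP move (MOVSS/MOVSD and their AVX/AVX-512 forms) being a single memory access, so an f32/f64 atomic access only needs its bit pattern to round-trip through the integer atomic operation. A self-contained C++ sketch of the same idea, with memcpy standing in for bitconvert; the helper names are illustrative:

    #include <atomic>
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    float atomicLoadFloat(const std::atomic<uint32_t> &A) {
      uint32_t Bits = A.load();          // atomic_load_32
      float F;
      std::memcpy(&F, &Bits, sizeof(F)); // bitconvert i32 -> f32
      return F;
    }

    void atomicStoreFloat(std::atomic<uint32_t> &A, float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits)); // bitconvert f32 -> i32
      A.store(Bits);                        // atomic_store_32
    }

    int main() {
      std::atomic<uint32_t> A{0};
      atomicStoreFloat(A, 3.5f);
      assert(atomicLoadFloat(A) == 3.5f);
      return 0;
    }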
+// NOTE: The extloadi64i32 pattern needs to be first as it will try to form +// 32-bit loads for 4 byte aligned i8/i16 loads. +def : Pat<(extloadi64i32 addr:$src), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; def : Pat<(extloadi64i1 addr:$src), (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; def : Pat<(extloadi64i8 addr:$src), (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; def : Pat<(extloadi64i16 addr:$src), (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; -def : Pat<(extloadi64i32 addr:$src), - (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; // anyext. Define these to do an explicit zero-extend to // avoid partial-register updates. @@ -1351,6 +1374,8 @@ def def32 : PatLeaf<(i32 GR32:$src), [{ // we can use a SUBREG_TO_REG. def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; +def : Pat<(i64 (and (anyext def32:$src), 0x00000000FFFFFFFF)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; //===----------------------------------------------------------------------===// // Pattern match OR as ADD @@ -1377,9 +1402,12 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ // Try this before the selecting to OR. let SchedRW = [WriteALU] in { -let isConvertibleToThreeAddress = 1, +let isConvertibleToThreeAddress = 1, isPseudo = 1, Constraints = "$src1 = $dst", Defs = [EFLAGS] in { let isCommutable = 1 in { +def ADD8rr_DB : I<0, Pseudo, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2), + "", // orb/addb REG, REG + [(set GR8:$dst, (or_is_add GR8:$src1, GR8:$src2))]>; def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "", // orw/addw REG, REG [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>; @@ -1394,6 +1422,10 @@ def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. +def ADD8ri_DB : I<0, Pseudo, + (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), + "", // orb/addb REG, imm8 + [(set GR8:$dst, (or_is_add GR8:$src1, imm:$src2))]>; def ADD16ri8_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "", // orw/addw REG, imm8 @@ -1483,6 +1515,13 @@ def : Pat<(add GR64:$src1, 128), def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst), (SUB64mi8 addr:$dst, -128)>; +def : Pat<(X86add_flag_nocf GR16:$src1, 128), + (SUB16ri8 GR16:$src1, -128)>; +def : Pat<(X86add_flag_nocf GR32:$src1, 128), + (SUB32ri8 GR32:$src1, -128)>; +def : Pat<(X86add_flag_nocf GR64:$src1, 128), + (SUB64ri8 GR64:$src1, -128)>; + // The same trick applies for 32-bit immediate fields in 64-bit // instructions. def : Pat<(add GR64:$src1, 0x0000000080000000), @@ -1490,6 +1529,9 @@ def : Pat<(add GR64:$src1, 0x0000000080000000), def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst), (SUB64mi32 addr:$dst, 0xffffffff80000000)>; +def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000), + (SUB64ri32 GR64:$src1, 0xffffffff80000000)>; + // To avoid needing to materialize an immediate in a register, use a 32-bit and // with implicit zero-extension instead of a 64-bit and if the immediate has at // least 32 bits of leading zeros. 
If in addition the last 32 bits can be @@ -1504,7 +1546,7 @@ def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), (i64 0), (AND32ri8 (EXTRACT_SUBREG GR64:$src, sub_32bit), - (i32 (GetLo8XForm imm:$imm))), + (i32 (GetLo32XForm imm:$imm))), sub_32bit)>; def : Pat<(and GR64:$src, i64immZExt32:$imm), @@ -1714,40 +1756,43 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; -// Helper imms to check if a mask doesn't change significant shift/rotate bits. -def immShift8 : ImmLeaf(Imm) >= 3; +def shiftMask8 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{ + return isUnneededShiftMask(N, 3); }]>; -def immShift16 : ImmLeaf(Imm) >= 4; + +def shiftMask16 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{ + return isUnneededShiftMask(N, 4); }]>; -def immShift32 : ImmLeaf(Imm) >= 5; + +def shiftMask32 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{ + return isUnneededShiftMask(N, 5); }]>; -def immShift64 : ImmLeaf(Imm) >= 6; + +def shiftMask64 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{ + return isUnneededShiftMask(N, 6); }]>; + // Shift amount is implicitly masked. multiclass MaskedShiftAmountPats { // (shift x (and y, 31)) ==> (shift x, y) - def : Pat<(frag GR8:$src1, (and CL, immShift32)), + def : Pat<(frag GR8:$src1, (shiftMask32 CL)), (!cast(name # "8rCL") GR8:$src1)>; - def : Pat<(frag GR16:$src1, (and CL, immShift32)), + def : Pat<(frag GR16:$src1, (shiftMask32 CL)), (!cast(name # "16rCL") GR16:$src1)>; - def : Pat<(frag GR32:$src1, (and CL, immShift32)), + def : Pat<(frag GR32:$src1, (shiftMask32 CL)), (!cast(name # "32rCL") GR32:$src1)>; - def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), + def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast(name # "8mCL") addr:$dst)>; - def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), + def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast(name # "16mCL") addr:$dst)>; - def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast(name # "32mCL") addr:$dst)>; // (shift x (and y, 63)) ==> (shift x, y) - def : Pat<(frag GR64:$src1, (and CL, immShift64)), + def : Pat<(frag GR64:$src1, (shiftMask64 CL)), (!cast(name # "64rCL") GR64:$src1)>; - def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst), + def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst), (!cast(name # "64mCL") addr:$dst)>; } @@ -1763,23 +1808,23 @@ defm : MaskedShiftAmountPats; // not tracking flags for these nodes. 
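On the shiftMask8/16/32/64 fragments introduced above: x86 shifts and rotates read only the low five bits of CL for 32-bit operations (six for 64-bit), so an explicit AND that keeps at least those bits set changes nothing, which is presumably what isUnneededShiftMask(N, k) verifies for the respective k. A C++ model of the 32-bit case; hwShl32 is an illustrative stand-in for the hardware behavior:

    #include <cassert>
    #include <cstdint>

    // Hardware model: a 32-bit shift uses only the low 5 bits of CL.
    uint32_t hwShl32(uint32_t X, uint8_t Count) { return X << (Count & 31); }

    int main() {
      // Pre-masking the count with 31 never changes the result, so the
      // patterns above can drop the AND and use the plain CL form.
      for (unsigned C = 0; C < 256; ++C)
        assert(hwShl32(0x1234u, C & 31) ==
               hwShl32(0x1234u, static_cast<uint8_t>(C)));
      return 0;
    }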
multiclass MaskedRotateAmountPats { // (rot x (and y, BitWidth - 1)) ==> (rot x, y) - def : Pat<(frag GR8:$src1, (and CL, immShift8)), + def : Pat<(frag GR8:$src1, (shiftMask8 CL)), (!cast(name # "8rCL") GR8:$src1)>; - def : Pat<(frag GR16:$src1, (and CL, immShift16)), + def : Pat<(frag GR16:$src1, (shiftMask16 CL)), (!cast(name # "16rCL") GR16:$src1)>; - def : Pat<(frag GR32:$src1, (and CL, immShift32)), + def : Pat<(frag GR32:$src1, (shiftMask32 CL)), (!cast(name # "32rCL") GR32:$src1)>; - def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift8)), addr:$dst), + def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask8 CL)), addr:$dst), (!cast(name # "8mCL") addr:$dst)>; - def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift16)), addr:$dst), + def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask16 CL)), addr:$dst), (!cast(name # "16mCL") addr:$dst)>; - def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast(name # "32mCL") addr:$dst)>; // (rot x (and y, 63)) ==> (rot x, y) - def : Pat<(frag GR64:$src1, (and CL, immShift64)), + def : Pat<(frag GR64:$src1, (shiftMask64 CL)), (!cast(name # "64rCL") GR64:$src1)>; - def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst), + def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst), (!cast(name # "64mCL") addr:$dst)>; } @@ -1790,13 +1835,13 @@ defm : MaskedRotateAmountPats; // Double shift amount is implicitly masked. multiclass MaskedDoubleShiftAmountPats { // (shift x (and y, 31)) ==> (shift x, y) - def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)), + def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)), (!cast(name # "16rrCL") GR16:$src1, GR16:$src2)>; - def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)), + def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)), (!cast(name # "32rrCL") GR32:$src1, GR32:$src2)>; // (shift x (and y, 63)) ==> (shift x, y) - def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)), + def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask32 CL)), (!cast(name # "64rrCL") GR64:$src1, GR64:$src2)>; } @@ -1805,57 +1850,57 @@ defm : MaskedDoubleShiftAmountPats; let Predicates = [HasBMI2] in { let AddedComplexity = 1 in { - def : Pat<(sra GR32:$src1, (and GR8:$src2, immShift32)), + def : Pat<(sra GR32:$src1, (shiftMask32 GR8:$src2)), (SARX32rr GR32:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(sra GR64:$src1, (and GR8:$src2, immShift64)), + def : Pat<(sra GR64:$src1, (shiftMask64 GR8:$src2)), (SARX64rr GR64:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(srl GR32:$src1, (and GR8:$src2, immShift32)), + def : Pat<(srl GR32:$src1, (shiftMask32 GR8:$src2)), (SHRX32rr GR32:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(srl GR64:$src1, (and GR8:$src2, immShift64)), + def : Pat<(srl GR64:$src1, (shiftMask64 GR8:$src2)), (SHRX64rr GR64:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(shl GR32:$src1, (and GR8:$src2, immShift32)), + def : Pat<(shl GR32:$src1, (shiftMask32 GR8:$src2)), (SHLX32rr GR32:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(shl GR64:$src1, (and GR8:$src2, immShift64)), + def : Pat<(shl GR64:$src1, (shiftMask64 GR8:$src2)), (SHLX64rr GR64:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; } - def : Pat<(sra (loadi32 addr:$src1), (and 
GR8:$src2, immShift32)), + def : Pat<(sra (loadi32 addr:$src1), (shiftMask32 GR8:$src2)), (SARX32rm addr:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + def : Pat<(sra (loadi64 addr:$src1), (shiftMask64 GR8:$src2)), (SARX64rm addr:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + def : Pat<(srl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)), (SHRX32rm addr:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + def : Pat<(srl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)), (SHRX64rm addr:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + def : Pat<(shl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)), (SHLX32rm addr:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + def : Pat<(shl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)), (SHLX64rm addr:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; @@ -1864,7 +1909,7 @@ let Predicates = [HasBMI2] in { // Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location. multiclass one_bit_patterns { + PatFrag ShiftMask> { def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)), (BTR RC:$src1, (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; @@ -1876,20 +1921,20 @@ multiclass one_bit_patterns; // Similar to above, but removing unneeded masking of the shift amount. - def : Pat<(and RC:$src1, (rotl -2, (and GR8:$src2, ImmShift))), + def : Pat<(and RC:$src1, (rotl -2, (ShiftMask GR8:$src2))), (BTR RC:$src1, (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(or RC:$src1, (shl 1, (and GR8:$src2, ImmShift))), + def : Pat<(or RC:$src1, (shl 1, (ShiftMask GR8:$src2))), (BTS RC:$src1, (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(xor RC:$src1, (shl 1, (and GR8:$src2, ImmShift))), + def : Pat<(xor RC:$src1, (shl 1, (ShiftMask GR8:$src2))), (BTC RC:$src1, (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; } -defm : one_bit_patterns; -defm : one_bit_patterns; -defm : one_bit_patterns; +defm : one_bit_patterns; +defm : one_bit_patterns; +defm : one_bit_patterns; // (anyext (setcc_carry)) -> (setcc_carry) @@ -1974,8 +2019,6 @@ def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; // sub reg, relocImm def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2), (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>; -def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2), - (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>; // mul reg, reg def : Pat<(mul GR16:$src1, GR16:$src2), diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index a7c7aaab2285..f82e80965b7c 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -1,9 +1,8 @@ //===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -71,35 +70,40 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { } // Conditional Branches. -let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in { - multiclass ICBr opc1, bits<8> opc4, string asm, PatFrag Cond> { - def _1 : Ii8PCRel ; - let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { - def _2 : Ii16PCRel, OpSize16, TB; - def _4 : Ii32PCRel, TB, OpSize32; - } +let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump], + isCodeGenOnly = 1, ForceDisassemble = 1 in { + def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs), + (ins brtarget8:$dst, ccode:$cond), + "j${cond}\t$dst", + [(X86brcond bb:$dst, imm:$cond, EFLAGS)]>; + let hasSideEffects = 0 in { + def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs), + (ins brtarget16:$dst, ccode:$cond), + "j${cond}\t$dst", + []>, OpSize16, TB; + def JCC_4 : Ii32PCRel<0x80, AddCCFrm, (outs), + (ins brtarget32:$dst, ccode:$cond), + "j${cond}\t$dst", + []>, TB, OpSize32; } } -defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>; -defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>; -defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>; -defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>; -defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>; -defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>; -defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>; -defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>; -defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>; -defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>; -defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>; -defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>; -defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>; -defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>; -defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>; -defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>; +def : InstAlias<"jo\t$dst", (JCC_1 brtarget8:$dst, 0), 0>; +def : InstAlias<"jno\t$dst", (JCC_1 brtarget8:$dst, 1), 0>; +def : InstAlias<"jb\t$dst", (JCC_1 brtarget8:$dst, 2), 0>; +def : InstAlias<"jae\t$dst", (JCC_1 brtarget8:$dst, 3), 0>; +def : InstAlias<"je\t$dst", (JCC_1 brtarget8:$dst, 4), 0>; +def : InstAlias<"jne\t$dst", (JCC_1 brtarget8:$dst, 5), 0>; +def : InstAlias<"jbe\t$dst", (JCC_1 brtarget8:$dst, 6), 0>; +def : InstAlias<"ja\t$dst", (JCC_1 brtarget8:$dst, 7), 0>; +def : InstAlias<"js\t$dst", (JCC_1 brtarget8:$dst, 8), 0>; +def : InstAlias<"jns\t$dst", (JCC_1 brtarget8:$dst, 9), 0>; +def : InstAlias<"jp\t$dst", (JCC_1 brtarget8:$dst, 10), 0>; +def : InstAlias<"jnp\t$dst", (JCC_1 brtarget8:$dst, 11), 0>; +def : InstAlias<"jl\t$dst", (JCC_1 brtarget8:$dst, 12), 0>; +def : InstAlias<"jge\t$dst", (JCC_1 brtarget8:$dst, 13), 0>; +def : InstAlias<"jle\t$dst", (JCC_1 brtarget8:$dst, 14), 0>; +def : InstAlias<"jg\t$dst", (JCC_1 brtarget8:$dst, 15), 0>; // jcx/jecx/jrcx instructions. 
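A note on the ccode numbering visible in the alias table above: complementary conditions occupy even/odd pairs (jo=0/jno=1, jb=2/jae=3, je=4/jne=5, ...), and the same numbering explains the (SETCCr (i8 2)) rewrite near the top of the X86InstrCompiler.td diff, since 2 is the "below" condition of setb. It also means the inversion that inv_cond_XFORM performs there via X86::GetOppositeBranchCondition is observably equivalent to flipping bit 0; whether LLVM implements it that way is not shown in this patch, so the sketch below only checks the pairing:

    #include <cassert>
    #include <cstdint>

    // Illustrative helper: invert a condition code under the even/odd
    // pairing assumed from the alias table.
    uint8_t invertCond(uint8_t CC) { return CC ^ 1; }

    int main() {
      assert(invertCond(2) == 3); // jb  <-> jae
      assert(invertCond(4) == 5); // je  <-> jne
      assert(invertCond(9) == 8); // jns <-> js
      return 0;
    }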
let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in { diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index c24d6d5b8df1..06e605fe5db2 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -1,9 +1,8 @@ //===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -29,11 +28,11 @@ let hasSideEffects = 0 in { let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) def CDQE : RI<0x98, RawFrm, (outs), (ins), - "{cltq|cdqe}", []>, Sched<[WriteALU]>; + "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX) def CQO : RI<0x99, RawFrm, (outs), (ins), - "{cqto|cqo}", []>, Sched<[WriteALU]>; + "{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; } // Sign/Zero extenders diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 1a8e529431af..0cca71bdc431 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -1,9 +1,8 @@ //===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -237,7 +236,8 @@ multiclass fma3s_rm_132 opc, string OpcodeStr, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } -let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, + hasSideEffects = 0 in multiclass fma3s_forms opc132, bits<8> opc213, bits<8> opc231, string OpStr, string PackTy, string Suff, SDNode OpNode, RegisterClass RC, @@ -263,8 +263,7 @@ multiclass fma3s_forms opc132, bits<8> opc213, bits<8> opc231, // the lowest element of the FMA*_Int instruction. Even though such analysis // may be not implemented yet we allow the routines doing the actual commute // transformation to decide if one or another instruction is commutable or not. -let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, - hasSideEffects = 0 in +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in multiclass fma3s_rm_int opc, string OpcodeStr, Operand memopr, RegisterClass RC, X86FoldableSchedWrite sched> { diff --git a/lib/Target/X86/X86InstrFMA3Info.cpp b/lib/Target/X86/X86InstrFMA3Info.cpp index def732a2dd00..25bbdddb7a21 100644 --- a/lib/Target/X86/X86InstrFMA3Info.cpp +++ b/lib/Target/X86/X86InstrFMA3Info.cpp @@ -1,9 +1,8 @@ //===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -57,7 +56,7 @@ using namespace llvm; #define FMA3GROUP_SCALAR(Name, Attrs) \ FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \ - FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs) \ + FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs) #define FMA3GROUP_FULL(Name, Attrs) \ FMA3GROUP_PACKED(Name, Attrs) \ @@ -159,11 +158,9 @@ const X86InstrFMA3Group *llvm::getFMA3Group(unsigned Opcode, uint64_t TSFlags) { // FMA 231 instructions have an opcode of 0xB6-0xBF unsigned FormIndex = ((BaseOpcode - 0x90) >> 4) & 0x3; - auto I = std::lower_bound(Table.begin(), Table.end(), Opcode, - [FormIndex](const X86InstrFMA3Group &Group, - unsigned Opcode) { - return Group.Opcodes[FormIndex] < Opcode; - }); + auto I = partition_point(Table, [=](const X86InstrFMA3Group &Group) { + return Group.Opcodes[FormIndex] < Opcode; + }); assert(I != Table.end() && I->Opcodes[FormIndex] == Opcode && "Couldn't find FMA3 opcode!"); return I; diff --git a/lib/Target/X86/X86InstrFMA3Info.h b/lib/Target/X86/X86InstrFMA3Info.h index 6eec1db98bf8..7fa6f5917862 100644 --- a/lib/Target/X86/X86InstrFMA3Info.h +++ b/lib/Target/X86/X86InstrFMA3Info.h @@ -1,9 +1,8 @@ //===- X86InstrFMA3Info.h - X86 FMA3 Instruction Information ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 5912a3199613..2ec6d50f9702 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -1,9 +1,8 @@ //===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,18 +16,13 @@ // FPStack specific DAG Nodes. 
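Backing up to the X86InstrFMA3Info.cpp hunk above: swapping std::lower_bound plus a comparator for llvm::partition_point preserves behavior, because "Group.Opcodes[FormIndex] < Opcode" partitions the sorted table and both calls return the first element for which the predicate fails (llvm::partition_point is, to my understanding, a range-based wrapper over std::partition_point). A self-contained demonstration of the equivalence on plain unsigned values:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      std::vector<unsigned> Opcodes = {0x90, 0x98, 0xA6, 0xB6};
      unsigned Target = 0xA6;

      auto LB = std::lower_bound(
          Opcodes.begin(), Opcodes.end(), Target,
          [](unsigned Op, unsigned T) { return Op < T; });
      auto PP = std::partition_point(
          Opcodes.begin(), Opcodes.end(),
          [&](unsigned Op) { return Op < Target; });

      // Both find the first element not less than Target.
      assert(LB == PP && *LB == 0xA6);
      return 0;
    }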
//===----------------------------------------------------------------------===// -def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, - SDTCisVT<1, f80>]>; -def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, - SDTCisPtrTy<1>, - SDTCisVT<2, OtherVT>]>; -def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, - SDTCisPtrTy<1>, - SDTCisVT<2, OtherVT>]>; -def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, - SDTCisVT<2, OtherVT>]>; +def SDTX86Fld : SDTypeProfile<1, 1, [SDTCisFP<0>, + SDTCisPtrTy<1>]>; +def SDTX86Fst : SDTypeProfile<0, 2, [SDTCisFP<0>, + SDTCisPtrTy<1>]>; +def SDTX86Fild : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>; +def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; -def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; @@ -42,17 +36,71 @@ def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, SDNPMemOperand]>; +def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist, + [SDNPHasChain, SDNPInGlue, SDNPMayStore, + SDNPMemOperand]>; def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; -def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, [SDNPHasChain, SDNPMayStore, SDNPSideEffect, SDNPMemOperand]>; +def X86fstf32 : PatFrag<(ops node:$val, node:$ptr), + (X86fst node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::f32; +}]>; +def X86fstf64 : PatFrag<(ops node:$val, node:$ptr), + (X86fst node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::f64; +}]>; +def X86fstf80 : PatFrag<(ops node:$val, node:$ptr), + (X86fst node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::f80; +}]>; + +def X86fldf32 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::f32; +}]>; +def X86fldf64 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::f64; +}]>; +def X86fldf80 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::f80; +}]>; + +def X86fild16 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; +def X86fild32 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; +def X86fild64 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def X86fildflag64 : PatFrag<(ops node:$ptr), (X86fildflag node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def X86fist64 : PatFrag<(ops node:$val, node:$ptr), + (X86fist node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def X86fp_to_i16mem : PatFrag<(ops node:$val, node:$ptr), + (X86fp_to_mem node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; +def X86fp_to_i32mem : PatFrag<(ops node:$val, node:$ptr), + (X86fp_to_mem node:$val, 
node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; +def X86fp_to_i64mem : PatFrag<(ops node:$val, node:$ptr), + (X86fp_to_mem node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + //===----------------------------------------------------------------------===// // FPStack pattern fragments //===----------------------------------------------------------------------===// @@ -74,7 +122,9 @@ def fpimmneg1 : FPImmLeaf; // Some 'special' instructions - expanded after instruction selection. -let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { +// Clobbers EFLAGS due to OR instruction used internally. +// FIXME: Can we model this in SelectionDAG? +let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in { def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), @@ -139,7 +189,6 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // These instructions cannot address 80-bit memory. multiclass FPBinary { -let mayLoad = 1, hasSideEffects = 1 in { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, @@ -176,8 +225,10 @@ def _Fp80m64: FpI_<(outs RFP80:$dst), (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), (set RFP80:$dst, (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] @@ -185,52 +236,53 @@ def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, [!if(Forward, (set RFP32:$dst, - (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (OpNode RFP32:$src1, (X86fild16 addr:$src2))), (set RFP32:$dst, - (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; + (OpNode (X86fild16 addr:$src2), RFP32:$src1)))]>; def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP32:$dst, - (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (OpNode RFP32:$src1, (X86fild32 addr:$src2))), (set RFP32:$dst, - (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; + (OpNode (X86fild32 addr:$src2), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, [!if(Forward, (set RFP64:$dst, - (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (OpNode RFP64:$src1, (X86fild16 addr:$src2))), (set RFP64:$dst, - (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; + (OpNode (X86fild16 addr:$src2), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP64:$dst, - (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (OpNode RFP64:$src1, (X86fild32 addr:$src2))), (set RFP64:$dst, - (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; + (OpNode (X86fild32 addr:$src2), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), OneArgFPRW, [!if(Forward, (set RFP80:$dst, - (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (OpNode RFP80:$src1, (X86fild16 addr:$src2))), (set RFP80:$dst, - (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; + (OpNode (X86fild16 addr:$src2), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), OneArgFPRW, 
[!if(Forward, (set RFP80:$dst, - (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (OpNode RFP80:$src1, (X86fild32 addr:$src2))), (set RFP80:$dst, - (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; + (OpNode (X86fild32 addr:$src2), RFP80:$src1)))]>; +let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), !strconcat("fi", asmstring, "{l}\t$src")>; -} // mayLoad = 1, hasSideEffects = 1 } -let Defs = [FPSW] in { +let Defs = [FPSW], Uses = [FPCW] in { // FPBinary_rr just defines pseudo-instructions, no need to set a scheduling // resources. let hasNoSchedulingInfo = 1 in { @@ -258,42 +310,42 @@ defm DIVR: FPBinary; } // Defs = [FPSW] class FPST0rInst - : FPI<0xD8, fp, (outs), (ins RST:$op), asm>; + : FPI<0xD8, fp, (outs), (ins RSTi:$op), asm>; class FPrST0Inst - : FPI<0xDC, fp, (outs), (ins RST:$op), asm>; + : FPI<0xDC, fp, (outs), (ins RSTi:$op), asm>; class FPrST0PInst - : FPI<0xDE, fp, (outs), (ins RST:$op), asm>; + : FPI<0xDE, fp, (outs), (ins RSTi:$op), asm>; // NOTE: GAS and apparently all other AT&T style assemblers have a broken notion // of some of the 'reverse' forms of the fsub and fdiv instructions. As such, // we have to put some 'r's in and take them out of weird places. -let SchedRW = [WriteFAdd] in { -def ADD_FST0r : FPST0rInst ; -def ADD_FrST0 : FPrST0Inst ; -def ADD_FPrST0 : FPrST0PInst; -def SUBR_FST0r : FPST0rInst ; -def SUB_FrST0 : FPrST0Inst ; -def SUB_FPrST0 : FPrST0PInst; -def SUB_FST0r : FPST0rInst ; -def SUBR_FrST0 : FPrST0Inst ; -def SUBR_FPrST0 : FPrST0PInst; +let SchedRW = [WriteFAdd], Defs = [FPSW], Uses = [FPCW] in { +def ADD_FST0r : FPST0rInst ; +def ADD_FrST0 : FPrST0Inst ; +def ADD_FPrST0 : FPrST0PInst; +def SUBR_FST0r : FPST0rInst ; +def SUB_FrST0 : FPrST0Inst ; +def SUB_FPrST0 : FPrST0PInst; +def SUB_FST0r : FPST0rInst ; +def SUBR_FrST0 : FPrST0Inst ; +def SUBR_FPrST0 : FPrST0PInst; } // SchedRW -let SchedRW = [WriteFCom] in { +let SchedRW = [WriteFCom], Defs = [FPSW], Uses = [FPCW] in { def COM_FST0r : FPST0rInst ; def COMP_FST0r : FPST0rInst ; } // SchedRW -let SchedRW = [WriteFMul] in { -def MUL_FST0r : FPST0rInst ; -def MUL_FrST0 : FPrST0Inst ; -def MUL_FPrST0 : FPrST0PInst; +let SchedRW = [WriteFMul], Defs = [FPSW], Uses = [FPCW] in { +def MUL_FST0r : FPST0rInst ; +def MUL_FrST0 : FPrST0Inst ; +def MUL_FPrST0 : FPrST0PInst; } // SchedRW -let SchedRW = [WriteFDiv] in { -def DIVR_FST0r : FPST0rInst ; -def DIV_FrST0 : FPrST0Inst ; -def DIV_FPrST0 : FPrST0PInst; -def DIV_FST0r : FPST0rInst ; -def DIVR_FrST0 : FPrST0Inst ; -def DIVR_FPrST0 : FPrST0PInst; +let SchedRW = [WriteFDiv], Defs = [FPSW], Uses = [FPCW] in { +def DIVR_FST0r : FPST0rInst ; +def DIV_FrST0 : FPrST0Inst ; +def DIV_FPrST0 : FPrST0PInst; +def DIV_FST0r : FPST0rInst ; +def DIVR_FrST0 : FPrST0Inst ; +def DIVR_FPrST0 : FPrST0PInst; } // SchedRW // Unary operations. @@ -307,7 +359,7 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, def _F : FPI<0xD9, fp, (outs), (ins), asmstring>; } -let Defs = [FPSW] in { +let Defs = [FPSW], Uses = [FPCW] in { let SchedRW = [WriteFSign] in { defm CHS : FPUnary; @@ -335,7 +387,7 @@ def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">; // Versions of FP instructions that take a single memory operand. Added for the // disassembler; remove as they are included with patterns elsewhere. 
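The recurring "Uses = [FPCW]" additions in these X86InstrFPStack.td hunks model the fact that x87 results depend on the control word's rounding and precision fields, so the scheduler must not move these instructions across an FLDCW that rewrites it (note FLDCW's new "Defs = [FPSW,FPCW]" later in this file's diff). A source-level illustration of that dependence; it requires a toolchain that honors FENV_ACCESS:

    #include <cfenv>
    #include <cstdio>

    #pragma STDC FENV_ACCESS ON

    int main() {
      volatile double A = 1.0, B = 3.0;
      std::fesetround(FE_DOWNWARD);
      double Down = A / B;           // result depends on the control word
      std::fesetround(FE_UPWARD);
      double Up = A / B;             // same operands, different bits
      std::fesetround(FE_TONEAREST);
      std::printf(Up > Down ? "rounding mode changed the quotient\n"
                            : "unexpected\n");
      return 0;
    }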
-let SchedRW = [WriteFComLd] in { +let SchedRW = [WriteFComLd], Defs = [FPSW], Uses = [FPCW] in { def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; @@ -398,32 +450,31 @@ defm CMOVNP : FPCMov; let Predicates = [HasCMov] in { // These are not factored because there's no clean way to pass DA/DB. -def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op), - "fcmovb\t{$op, %st(0)|st(0), $op}">; -def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op), - "fcmovbe\t{$op, %st(0)|st(0), $op}">; -def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op), - "fcmove\t{$op, %st(0)|st(0), $op}">; -def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op), - "fcmovu\t{$op, %st(0)|st(0), $op}">; -def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op), - "fcmovnb\t{$op, %st(0)|st(0), $op}">; -def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op), - "fcmovnbe\t{$op, %st(0)|st(0), $op}">; -def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op), - "fcmovne\t{$op, %st(0)|st(0), $op}">; -def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op), - "fcmovnu\t{$op, %st(0)|st(0), $op}">; +def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RSTi:$op), + "fcmovb\t{$op, %st|st, $op}">; +def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RSTi:$op), + "fcmovbe\t{$op, %st|st, $op}">; +def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RSTi:$op), + "fcmove\t{$op, %st|st, $op}">; +def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RSTi:$op), + "fcmovu\t{$op, %st|st, $op}">; +def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RSTi:$op), + "fcmovnb\t{$op, %st|st, $op}">; +def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RSTi:$op), + "fcmovnbe\t{$op, %st|st, $op}">; +def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RSTi:$op), + "fcmovne\t{$op, %st|st, $op}">; +def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op), + "fcmovnu\t{$op, %st|st, $op}">; } // Predicates = [HasCMov] } // SchedRW // Floating point loads & stores. 
-let SchedRW = [WriteLoad] in { +let SchedRW = [WriteLoad], Uses = [FPCW] in { let canFoldAsLoad = 1 in { def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP32:$dst, (loadf32 addr:$src))]>; -let isReMaterializable = 1 in - def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, +def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, [(set RFP64:$dst, (loadf64 addr:$src))]>; def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, [(set RFP80:$dst, (loadf80 addr:$src))]>; @@ -435,26 +486,26 @@ def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>; def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP, - [(set RFP32:$dst, (X86fild addr:$src, i16))]>; + [(set RFP32:$dst, (X86fild16 addr:$src))]>; def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP, - [(set RFP32:$dst, (X86fild addr:$src, i32))]>; + [(set RFP32:$dst, (X86fild32 addr:$src))]>; def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP, - [(set RFP32:$dst, (X86fild addr:$src, i64))]>; + [(set RFP32:$dst, (X86fild64 addr:$src))]>; def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP, - [(set RFP64:$dst, (X86fild addr:$src, i16))]>; + [(set RFP64:$dst, (X86fild16 addr:$src))]>; def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP, - [(set RFP64:$dst, (X86fild addr:$src, i32))]>; + [(set RFP64:$dst, (X86fild32 addr:$src))]>; def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP, - [(set RFP64:$dst, (X86fild addr:$src, i64))]>; + [(set RFP64:$dst, (X86fild64 addr:$src))]>; def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP, - [(set RFP80:$dst, (X86fild addr:$src, i16))]>; + [(set RFP80:$dst, (X86fild16 addr:$src))]>; def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, - [(set RFP80:$dst, (X86fild addr:$src, i32))]>; + [(set RFP80:$dst, (X86fild32 addr:$src))]>; def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, - [(set RFP80:$dst, (X86fild addr:$src, i64))]>; + [(set RFP80:$dst, (X86fild64 addr:$src))]>; } // SchedRW -let SchedRW = [WriteStore] in { +let SchedRW = [WriteStore], Uses = [FPCW] in { def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, [(store RFP32:$src, addr:$op)]>; def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, @@ -489,9 +540,9 @@ def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>; def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; } // mayStore -} // SchedRW +} // SchedRW, Uses = [FPCW] -let mayLoad = 1, SchedRW = [WriteLoad] in { +let mayLoad = 1, SchedRW = [WriteLoad], Uses = [FPCW] in { def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">; def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">; def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">; @@ -499,7 +550,7 @@ def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">; def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">; def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">; } -let mayStore = 1, SchedRW = [WriteStore] in { +let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in { def ST_F32m : 
FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">; def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">; def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">; @@ -513,7 +564,7 @@ def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">; } // FISTTP requires SSE3 even though it's a FPStack op. -let Predicates = [HasSSE3], SchedRW = [WriteStore] in { +let Predicates = [HasSSE3], SchedRW = [WriteStore], Uses = [FPCW] in { def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, [(X86fp_to_i16mem RFP32:$src, addr:$op)]>; def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, @@ -534,22 +585,22 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, [(X86fp_to_i64mem RFP80:$src, addr:$op)]>; } // Predicates = [HasSSE3] -let mayStore = 1, SchedRW = [WriteStore] in { +let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in { def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">; def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">; def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">; } // FP Stack manipulation instructions. -let SchedRW = [WriteMove] in { -def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op">; -def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op">; -def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op">; -def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op">; +let SchedRW = [WriteMove], Uses = [FPCW] in { +def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RSTi:$op), "fld\t$op">; +def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RSTi:$op), "fst\t$op">; +def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RSTi:$op), "fstp\t$op">; +def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RSTi:$op), "fxch\t$op">; } // Floating point constant loads. -let isReMaterializable = 1, SchedRW = [WriteZero] in { +let SchedRW = [WriteZero], Uses = [FPCW] in { def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, [(set RFP32:$dst, fpimm0)]>; def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, @@ -564,13 +615,13 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, [(set RFP80:$dst, fpimm1)]>; } -let SchedRW = [WriteFLD0] in +let SchedRW = [WriteFLD0], Uses = [FPCW] in def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">; -let SchedRW = [WriteFLD1] in +let SchedRW = [WriteFLD1], Uses = [FPCW] in def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">; -let SchedRW = [WriteFLDC], Defs = [FPSW] in { +let SchedRW = [WriteFLDC], Uses = [FPCW] in { def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>; def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>; def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>; @@ -579,7 +630,7 @@ def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>; } // SchedRW // Floating point compares. 
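Context for the FISTTP block above: FISTTP (SSE3) stores a truncating float-to-integer conversion regardless of the current rounding mode, which matches C++ cast semantics exactly; without SSE3, the FP*_TO_INT*_IN_MEM pseudos seen earlier emulate that by temporarily rewriting FPCW (hence the comment in that hunk about the internal OR clobbering EFLAGS). A small sketch of the truncating semantics, again assuming FENV_ACCESS support:

    #include <cassert>
    #include <cfenv>

    #pragma STDC FENV_ACCESS ON

    int main() {
      std::fesetround(FE_UPWARD);
      volatile double D = 2.9;
      // Truncation toward zero, unaffected by the rounding mode: this
      // is what FISTTP provides in a single instruction.
      assert(static_cast<int>(D) == 2);
      std::fesetround(FE_TONEAREST);
      return 0;
    }

(The floating-point compare hunk that the preceding comment heads resumes below.)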
-let SchedRW = [WriteFCom] in { +let SchedRW = [WriteFCom], Uses = [FPCW] in { def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>; def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, @@ -591,37 +642,37 @@ def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, let SchedRW = [WriteFCom] in { // CC = ST(0) cmp ST(i) -let Defs = [EFLAGS, FPSW] in { -let Predicates = [FPStackf32, HasCMov] in -def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, - [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>; -let Predicates = [FPStackf64, HasCMov] in -def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, - [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>; -let Predicates = [HasCMov] in +let Defs = [EFLAGS, FPSW], Uses = [FPCW] in { +def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>, + Requires<[FPStackf32, HasCMov]>; +def UCOM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>, + Requires<[FPStackf64, HasCMov]>; def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, - [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>; + [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>, + Requires<[HasCMov]>; } -let Defs = [FPSW], Uses = [ST0] in { +let Defs = [FPSW], Uses = [ST0, FPCW] in { def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i) - (outs), (ins RST:$reg), "fucom\t$reg">; + (outs), (ins RSTi:$reg), "fucom\t$reg">; def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop - (outs), (ins RST:$reg), "fucomp\t$reg">; + (outs), (ins RSTi:$reg), "fucomp\t$reg">; def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop (outs), (ins), "fucompp">; } -let Defs = [EFLAGS, FPSW], Uses = [ST0] in { +let Defs = [EFLAGS, FPSW], Uses = [ST0, FPCW] in { def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i) - (outs), (ins RST:$reg), "fucomi\t$reg">; + (outs), (ins RSTi:$reg), "fucomi\t{$reg, %st|st, $reg}">; def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop - (outs), (ins RST:$reg), "fucompi\t$reg">; -} + (outs), (ins RSTi:$reg), "fucompi\t{$reg, %st|st, $reg}">; -let Defs = [EFLAGS, FPSW] in { -def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg), "fcomi\t$reg">; -def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg), "fcompi\t$reg">; +def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RSTi:$reg), + "fcomi\t{$reg, %st|st, $reg}">; +def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RSTi:$reg), + "fcompi\t{$reg, %st|st, $reg}">; } } // SchedRW @@ -631,12 +682,12 @@ let Defs = [AX], Uses = [FPSW] in def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags (outs), (ins), "fnstsw\t{%ax|ax}", [(set AX, (X86fp_stsw FPSW))]>; -let Defs = [FPSW] in +let Defs = [FPSW], Uses = [FPCW] in def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world (outs), (ins i16mem:$dst), "fnstcw\t$dst", [(X86fp_cwd_get16 addr:$dst)]>; } // SchedRW -let Defs = [FPSW], mayLoad = 1 in +let Defs = [FPSW,FPCW], mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] (outs), (ins i16mem:$dst), "fldcw\t$dst", []>, Sched<[WriteLoad]>; @@ -645,8 +696,8 @@ def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] let SchedRW = [WriteMicrocoded] in { let Defs = [FPSW] in { def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>; -def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), "ffree\t$reg">; -def 
FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg), "ffreep\t$reg">; +def FFREE : FPI<0xDD, MRM0r, (outs), (ins RSTi:$reg), "ffree\t$reg">; +def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RSTi:$reg), "ffreep\t$reg">; // Clear exceptions def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", []>; @@ -695,21 +746,17 @@ def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src), //===----------------------------------------------------------------------===// // Required for RET of f32 / f64 / f80 values. -def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>; -def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>; -def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>; +def : Pat<(X86fldf32 addr:$src), (LD_Fp32m addr:$src)>; +def : Pat<(X86fldf64 addr:$src), (LD_Fp64m addr:$src)>; +def : Pat<(X86fldf80 addr:$src), (LD_Fp80m addr:$src)>; // Required for CALL which return f32 / f64 / f80 values. -def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>; -def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, - RFP64:$src)>; -def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, - RFP80:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, - RFP80:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, - RFP80:$src)>; +def : Pat<(X86fstf32 RFP32:$src, addr:$op), (ST_Fp32m addr:$op, RFP32:$src)>; +def : Pat<(X86fstf32 RFP64:$src, addr:$op), (ST_Fp64m32 addr:$op, RFP64:$src)>; +def : Pat<(X86fstf64 RFP64:$src, addr:$op), (ST_Fp64m addr:$op, RFP64:$src)>; +def : Pat<(X86fstf32 RFP80:$src, addr:$op), (ST_Fp80m32 addr:$op, RFP80:$src)>; +def : Pat<(X86fstf64 RFP80:$src, addr:$op), (ST_Fp80m64 addr:$op, RFP80:$src)>; +def : Pat<(X86fstf80 RFP80:$src, addr:$op), (ST_FpP80m addr:$op, RFP80:$src)>; // Floating point constant -0.0 and -1.0 def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>; @@ -720,7 +767,11 @@ def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>; def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>; // Used to conv. i64 to f64 since there isn't a SSE version. -def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>; +def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m64 addr:$src)>; + +// Used to conv. between f80 and i64 for i64 atomic loads. +def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m80 addr:$src)>; +def : Pat<(X86fist64 RFP80:$src, addr:$op), (IST_Fp64m80 addr:$op, RFP80:$src)>; // FP extensions map onto simple pseudo-value conversions if they are to/from // the FP stack. diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp index 7d31cfab4137..d42fec3770c7 100644 --- a/lib/Target/X86/X86InstrFoldTables.cpp +++ b/lib/Target/X86/X86InstrFoldTables.cpp @@ -1,9 +1,8 @@ //===-- X86InstrFoldTables.cpp - X86 Instruction Folding Tables -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -34,6 +33,17 @@ using namespace llvm; // tables that would be incorrect. 
The manual review process allows us a chance // to catch these before they become observable bugs. static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { + { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, + { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, + { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, + { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, + { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, + { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, + { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, + { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, + { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, + { X86::ADD8ri_DB, X86::ADD8mi, TB_NO_REVERSE }, + { X86::ADD8rr_DB, X86::ADD8mr, TB_NO_REVERSE }, { X86::ADC16ri, X86::ADC16mi, 0 }, { X86::ADC16ri8, X86::ADC16mi8, 0 }, { X86::ADC16rr, X86::ADC16mr, 0 }, @@ -48,22 +58,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { { X86::ADC8rr, X86::ADC8mr, 0 }, { X86::ADD16ri, X86::ADD16mi, 0 }, { X86::ADD16ri8, X86::ADD16mi8, 0 }, - { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, - { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, { X86::ADD16rr, X86::ADD16mr, 0 }, - { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, { X86::ADD32ri, X86::ADD32mi, 0 }, { X86::ADD32ri8, X86::ADD32mi8, 0 }, - { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, - { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, { X86::ADD32rr, X86::ADD32mr, 0 }, - { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, { X86::ADD64ri32, X86::ADD64mi32, 0 }, - { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, { X86::ADD64ri8, X86::ADD64mi8, 0 }, - { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, { X86::ADD64rr, X86::ADD64mr, 0 }, - { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, { X86::ADD8ri, X86::ADD8mi, 0 }, { X86::ADD8ri8, X86::ADD8mi8, 0 }, { X86::ADD8rr, X86::ADD8mr, 0 }, @@ -247,7 +248,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { { X86::XOR64rr, X86::XOR64mr, 0 }, { X86::XOR8ri, X86::XOR8mi, 0 }, { X86::XOR8ri8, X86::XOR8mi8, 0 }, - { X86::XOR8rr, X86::XOR8mr, 0 } + { X86::XOR8rr, X86::XOR8mr, 0 }, }; static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { @@ -305,9 +306,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE }, { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, - { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE }, - { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, - { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, + { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::MOVSDto64rr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::MOVSS2DIrr, X86::MOVSSmr, TB_FOLDED_STORE }, { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, @@ -321,22 +322,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, - { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, - { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, - { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, - { X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, - { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, - { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, - { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, - { X86::SETLEr, 
X86::SETLEm, TB_FOLDED_STORE }, - { X86::SETLr, X86::SETLm, TB_FOLDED_STORE }, - { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE }, - { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, - { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, - { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, - { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, - { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, - { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, + { X86::SETCCr, X86::SETCCm, TB_FOLDED_STORE }, { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, @@ -403,12 +389,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE }, { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, { X86::VMOVPDI2DIrr, X86::VMOVPDI2DImr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE }, - { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE }, - { X86::VMOVSDto64rr, X86::VMOVSDto64mr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, + { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVSDto64Zrr, X86::VMOVSDZmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVSDto64rr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVSS2DIZrr, X86::VMOVSSZmr, TB_FOLDED_STORE }, + { X86::VMOVSS2DIrr, X86::VMOVSSmr, TB_FOLDED_STORE }, { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, @@ -544,14 +530,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::MOV16rr, X86::MOV16rm, 0 }, { X86::MOV32rr, X86::MOV32rm, 0 }, { X86::MOV64rr, X86::MOV64rm, 0 }, - { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, - { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, + { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE }, + { X86::MOV64toSDrr, X86::MOVSDrm_alt, TB_NO_REVERSE }, { X86::MOV8rr, X86::MOV8rm, 0 }, { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE }, { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, - { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, + { X86::MOVDI2SSrr, X86::MOVSSrm_alt, 0 }, { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, { X86::MOVDQUrr, X86::MOVDQUrm, 0 }, { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, @@ -628,7 +614,6 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::SQRTSSr, X86::SQRTSSm, 0 }, { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 }, { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 }, - // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, @@ -663,7 +648,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE }, { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, { X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE }, - { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 }, + { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 }, { X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 }, { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE }, { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, @@ 
-671,6 +656,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rm, 0 }, { X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrm, 0 }, { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, + { X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rm, 0 }, + { X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rm, 0 }, + { X86::VCVTNEPS2BF16Zrr, X86::VCVTNEPS2BF16Zrm, 0 }, { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, { X86::VCVTPD2DQZ128rr, X86::VCVTPD2DQZ128rm, 0 }, { X86::VCVTPD2DQZ256rr, X86::VCVTPD2DQZ256rm, 0 }, @@ -830,10 +818,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 }, { X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 }, { X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 }, - { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, - { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, - { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 }, - { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, + { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, TB_NO_REVERSE }, + { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, TB_NO_REVERSE }, + { X86::VMOV64toSDZrr, X86::VMOVSDZrm_alt, TB_NO_REVERSE }, + { X86::VMOV64toSDrr, X86::VMOVSDrm_alt, TB_NO_REVERSE }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, @@ -851,8 +839,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE }, { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 }, { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, - { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, - { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, + { X86::VMOVDI2SSZrr, X86::VMOVSSZrm_alt, 0 }, + { X86::VMOVDI2SSrr, X86::VMOVSSrm_alt, 0 }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, @@ -1206,6 +1194,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { }; static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { + { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, + { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, + { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, + { X86::ADD8rr_DB, X86::ADD8rm, TB_NO_REVERSE }, { X86::ADC16rr, X86::ADC16rm, 0 }, { X86::ADC32rr, X86::ADC32rm, 0 }, { X86::ADC64rr, X86::ADC64rm, 0 }, @@ -1213,11 +1205,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::ADCX32rr, X86::ADCX32rm, 0 }, { X86::ADCX64rr, X86::ADCX64rm, 0 }, { X86::ADD16rr, X86::ADD16rm, 0 }, - { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, { X86::ADD32rr, X86::ADD32rm, 0 }, - { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, { X86::ADD64rr, X86::ADD64rm, 0 }, - { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, { X86::ADD8rr, X86::ADD8rm, 0 }, { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, @@ -1247,54 +1236,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, - { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, - { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, - { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, - { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, - { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, - { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, - { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, - { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, - { 
X86::CMOVB64rr, X86::CMOVB64rm, 0 }, - { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, - { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 }, - { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, - { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, - { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, - { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, - { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, - { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, - { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, - { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, - { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, - { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, - { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, - { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, - { X86::CMOVL64rr, X86::CMOVL64rm, 0 }, - { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, - { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, - { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, - { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, - { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, - { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, - { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, - { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 }, - { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, - { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, - { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, - { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, - { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, - { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, - { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, - { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, - { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, - { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, - { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, - { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, - { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, - { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, - { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, - { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, + { X86::CMOV16rr, X86::CMOV16rm, 0 }, + { X86::CMOV32rr, X86::CMOV32rm, 0 }, + { X86::CMOV64rr, X86::CMOV64rm, 0 }, { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, { X86::CMPSDrr, X86::CMPSDrm, 0 }, @@ -1421,6 +1365,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, TB_NO_REVERSE }, { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, + { X86::MOVSDrr, X86::MOVLPDrm, TB_NO_REVERSE }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, @@ -1576,7 +1521,6 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE }, - // FIXME: TEST*rr -> swapped operand of TEST *mr. 
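// Note on the CMOV hunk above: the forty-eight per-condition CMOVcc fold-table
// rows collapse into three CMOV{16,32,64}rr rows because, after this patch,
// the condition code travels as an immediate operand instead of being baked
// into the opcode, so one sorted row per width covers all sixteen conditions.
// A minimal standalone sketch of that scheme (toy opcodes and table, not
// LLVM's actual definitions):
#include <algorithm>
#include <cassert>
#include <iterator>

enum ToyOpcode : unsigned { CMOV32rr = 10, CMOV32rm, CMOV64rr = 20, CMOV64rm };

struct FoldEntry {
  unsigned KeyOp; // register form; the table is sorted on this
  unsigned MemOp; // load-folded form
};
bool operator<(const FoldEntry &E, unsigned Op) { return E.KeyOp < Op; }

// One row per width; the condition-code immediate is simply copied across
// when the register operand is folded into a memory operand.
static const FoldEntry Table[] = {{CMOV32rr, CMOV32rm}, {CMOV64rr, CMOV64rm}};

const FoldEntry *lookupFold(unsigned RegOp) {
  // Same shape as the llvm::lower_bound() lookup this file switches to below.
  auto *I = std::lower_bound(std::begin(Table), std::end(Table), RegOp);
  return (I != std::end(Table) && I->KeyOp == RegOp) ? I : nullptr;
}

int main() {
  assert(lookupFold(CMOV64rr)->MemOp == CMOV64rm);
  assert(lookupFold(12345) == nullptr);
  return 0;
}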
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, @@ -1697,6 +1641,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmkz, 0 }, { X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmkz, 0 }, { X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmkz, 0 }, + { X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rm, 0 }, + { X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rm, 0 }, + { X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrm, 0 }, + { X86::VCVTNEPS2BF16Z128rrkz, X86::VCVTNEPS2BF16Z128rmkz, 0 }, + { X86::VCVTNEPS2BF16Z256rrkz, X86::VCVTNEPS2BF16Z256rmkz, 0 }, + { X86::VCVTNEPS2BF16Zrrkz, X86::VCVTNEPS2BF16Zrmkz, 0 }, { X86::VCVTPD2DQZ128rrkz, X86::VCVTPD2DQZ128rmkz, 0 }, { X86::VCVTPD2DQZ256rrkz, X86::VCVTPD2DQZ256rmkz, 0 }, { X86::VCVTPD2DQZrrkz, X86::VCVTPD2DQZrmkz, 0 }, @@ -2030,6 +1980,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMOVDQU8Zrrkz, X86::VMOVDQU8Zrmkz, TB_NO_REVERSE }, { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE }, { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE }, + { X86::VMOVSDZrr, X86::VMOVLPDZ128rm, TB_NO_REVERSE }, + { X86::VMOVSDrr, X86::VMOVLPDrm, TB_NO_REVERSE }, { X86::VMOVSHDUPZ128rrkz, X86::VMOVSHDUPZ128rmkz, 0 }, { X86::VMOVSHDUPZ256rrkz, X86::VMOVSHDUPZ256rmkz, 0 }, { X86::VMOVSHDUPZrrkz, X86::VMOVSHDUPZrmkz, 0 }, @@ -2072,6 +2024,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 }, { X86::VORPSZrr, X86::VORPSZrm, 0 }, { X86::VORPSrr, X86::VORPSrm, 0 }, + { X86::VP2INTERSECTDZ128rr, X86::VP2INTERSECTDZ128rm, 0 }, + { X86::VP2INTERSECTDZ256rr, X86::VP2INTERSECTDZ256rm, 0 }, + { X86::VP2INTERSECTDZrr, X86::VP2INTERSECTDZrm, 0 }, + { X86::VP2INTERSECTQZ128rr, X86::VP2INTERSECTQZ128rm, 0 }, + { X86::VP2INTERSECTQZ256rr, X86::VP2INTERSECTQZ256rm, 0 }, + { X86::VP2INTERSECTQZrr, X86::VP2INTERSECTQZrm, 0 }, { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 }, { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 }, { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 }, @@ -3074,6 +3032,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmk, 0 }, { X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmk, 0 }, { X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmk, 0 }, + { X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmkz, 0 }, + { X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmkz, 0 }, + { X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmkz, 0 }, + { X86::VCVTNEPS2BF16Z128rrk, X86::VCVTNEPS2BF16Z128rmk, 0 }, + { X86::VCVTNEPS2BF16Z256rrk, X86::VCVTNEPS2BF16Z256rmk, 0 }, + { X86::VCVTNEPS2BF16Zrrk, X86::VCVTNEPS2BF16Zrmk, 0 }, { X86::VCVTPD2DQZ128rrk, X86::VCVTPD2DQZ128rmk, 0 }, { X86::VCVTPD2DQZ256rrk, X86::VCVTPD2DQZ256rmk, 0 }, { X86::VCVTPD2DQZrrk, X86::VCVTPD2DQZrmk, 0 }, @@ -3162,6 +3126,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE }, { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE }, + { X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0 }, + { X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0 }, + { X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0 }, { X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0 }, { X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0 }, { X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE }, @@ -4376,6 +4343,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VANDPSZ128rrk, 
X86::VANDPSZ128rmk, 0 }, { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 }, { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 }, + { X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0 }, + { X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0 }, + { X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0 }, { X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE }, { X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE }, { X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0 }, @@ -4389,6 +4359,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE }, { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE }, + { X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0 }, + { X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0 }, + { X86::VDPBF16PSZ256rk, X86::VDPBF16PSZ256mk, 0 }, + { X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0 }, + { X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0 }, + { X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0 }, { X86::VFIXUPIMMPDZ128rrik, X86::VFIXUPIMMPDZ128rmik, 0 }, { X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 }, { X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 }, @@ -5315,9 +5291,7 @@ lookupFoldTableImpl(ArrayRef Table, unsigned RegOp) { } #endif - const X86MemoryFoldTableEntry *Data = std::lower_bound(Table.begin(), - Table.end(), - RegOp); + const X86MemoryFoldTableEntry *Data = llvm::lower_bound(Table, RegOp); if (Data != Table.end() && Data->KeyOp == RegOp && !(Data->Flags & TB_NO_FORWARD)) return Data; @@ -5404,7 +5378,7 @@ static ManagedStatic MemUnfoldTable; const X86MemoryFoldTableEntry * llvm::lookupUnfoldTable(unsigned MemOp) { auto &Table = MemUnfoldTable->Table; - auto I = std::lower_bound(Table.begin(), Table.end(), MemOp); + auto I = llvm::lower_bound(Table, MemOp); if (I != Table.end() && I->KeyOp == MemOp) return &*I; return nullptr; diff --git a/lib/Target/X86/X86InstrFoldTables.h b/lib/Target/X86/X86InstrFoldTables.h index 90016baead96..419baf98f61d 100644 --- a/lib/Target/X86/X86InstrFoldTables.h +++ b/lib/Target/X86/X86InstrFoldTables.h @@ -1,9 +1,8 @@ //===-- X86InstrFoldTables.h - X86 Instruction Folding Tables ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 47d4719d3060..e8f0d937dff4 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -1,9 +1,8 @@ //===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -27,10 +26,13 @@ def RawFrmDst : Format<5>; def RawFrmDstSrc : Format<6>; def RawFrmImm8 : Format<7>; def RawFrmImm16 : Format<8>; +def AddCCFrm : Format<9>; def MRMDestMem : Format<32>; def MRMSrcMem : Format<33>; def MRMSrcMem4VOp3 : Format<34>; def MRMSrcMemOp4 : Format<35>; +def MRMSrcMemCC : Format<36>; +def MRMXmCC: Format<38>; def MRMXm : Format<39>; def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>; def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>; @@ -39,6 +41,8 @@ def MRMDestReg : Format<48>; def MRMSrcReg : Format<49>; def MRMSrcReg4VOp3 : Format<50>; def MRMSrcRegOp4 : Format<51>; +def MRMSrcRegCC : Format<52>; +def MRMXrCC: Format<54>; def MRMXr : Format<55>; def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>; def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>; @@ -206,13 +210,10 @@ class TAPS : TA { Prefix OpPrefix = PS; } class TAPD : TA { Prefix OpPrefix = PD; } class TAXD : TA { Prefix OpPrefix = XD; } class VEX { Encoding OpEnc = EncVEX; } -class VEX_W { bits<2> VEX_WPrefix = 1; } -class VEX_WIG { bits<2> VEX_WPrefix = 2; } +class VEX_W { bit HasVEX_W = 1; } +class VEX_WIG { bit IgnoresVEX_W = 1; } // Special version of VEX_W that can be changed to VEX.W==0 for EVEX2VEX. -// FIXME: We should consider adding separate bits for VEX_WIG and the extra -// part of W1X. This would probably simplify the tablegen emitters and -// the TSFlags creation below. -class VEX_W1X { bits<2> VEX_WPrefix = 3; } +class VEX_W1X { bit HasVEX_W = 1; bit EVEX_W1_VEX_W0 = 1; } class VEX_4V : VEX { bit hasVEX_4V = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } @@ -296,7 +297,10 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, bit hasREPPrefix = 0; // Does this inst have a REP prefix? Encoding OpEnc = EncNormal; // Encoding used by this instruction bits<2> OpEncBits = OpEnc.Value; - bits<2> VEX_WPrefix = 0; // Does this inst set the VEX_W field? + bit HasVEX_W = 0; // Does this inst set the VEX_W field? + bit IgnoresVEX_W = 0; // Does this inst ignore VEX_W field? + bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX + // instruction with VEX.W == 0. bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field? bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit @@ -311,11 +315,8 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction. bit hasNoTrackPrefix = 0; // Does this inst has 0x3E (NoTrack) prefix? - bits<2> EVEX_LL; - let EVEX_LL{0} = hasVEX_L; - let EVEX_LL{1} = hasEVEX_L2; // Vector size in bytes. - bits<7> VectSize = !shl(16, EVEX_LL); + bits<7> VectSize = !if(hasEVEX_L2, 64, !if(hasVEX_L, 32, 16)); // The scaling factor for AVX512's compressed displacement is either // - the size of a power-of-two number of elements or @@ -355,7 +356,7 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{29-28} = OpEncBits; let TSFlags{37-30} = Opcode; // Currently no need for second bit in TSFlags - W Ignore is equivalent to 0. 
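// Note: the TSFlags change just below is the payload of the VEX_WPrefix
// rework above. The packed two-bit field is split into independent flags
// (HasVEX_W, IgnoresVEX_W, EVEX_W1_VEX_W0), and only HasVEX_W still needs a
// TSFlags bit, because "W ignored" encodes identically to W == 0. A hedged
// C++ picture of the repacking (the struct is illustrative, not LLVM's
// actual representation):
struct WPrefixBits {
  unsigned HasVEX_W : 1;       // instruction encodes VEX.W == 1
  unsigned IgnoresVEX_W : 1;   // W has no semantic effect; either value is legal
  unsigned EVEX_W1_VEX_W0 : 1; // EVEX form uses W==1 but may compress to VEX W==0
};
// Only the first bit is materialized in TSFlags{38}; the other two stay on the
// TableGen side (e.g. for the EVEX-to-VEX compression tables).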
- let TSFlags{38} = VEX_WPrefix{0}; + let TSFlags{38} = HasVEX_W; let TSFlags{39} = hasVEX_4V; let TSFlags{40} = hasVEX_L; let TSFlags{41} = hasEVEX_K; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 11a27ba90586..096cc27861ca 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -1,9 +1,8 @@ //===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -100,8 +99,10 @@ def X86insertps : SDNode<"X86ISD::INSERTPS", def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; -def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, @@ -127,21 +128,31 @@ def X86vfpext : SDNode<"X86ISD::VFPEXT", def X86vfpround: SDNode<"X86ISD::VFPROUND", SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, - SDTCisSameSizeAs<0, 1>]>>; + SDTCisOpSmallerThanOp<0, 1>]>>; -def X86froundRnd: SDNode<"X86ISD::VFPROUNDS_RND", +def X86frounds : SDNode<"X86ISD::VFPROUNDS", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, + SDTCisSameAs<0, 1>, + SDTCVecEltisVT<2, f64>, + SDTCisSameSizeAs<0, 2>]>>; + +def X86froundsRnd: SDNode<"X86ISD::VFPROUNDS_RND", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, SDTCisSameAs<0, 1>, SDTCVecEltisVT<2, f64>, SDTCisSameSizeAs<0, 2>, SDTCisVT<3, i32>]>>; -def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND", - SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>, +def X86fpexts : SDNode<"X86ISD::VFPEXTS", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, SDTCisSameAs<0, 1>, SDTCVecEltisVT<2, f32>, - SDTCisSameSizeAs<0, 2>, - SDTCisVT<3, i32>]>>; + SDTCisSameSizeAs<0, 2>]>>; +def X86fpextsSAE : SDNode<"X86ISD::VFPEXTS_SAE", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, + SDTCisSameAs<0, 1>, + SDTCVecEltisVT<2, f32>, + SDTCisSameSizeAs<0, 2>]>>; def X86vmfpround: SDNode<"X86ISD::VMFPROUND", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, @@ -164,25 +175,14 @@ def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; -def X86CmpMaskCCRound : - SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>, - SDTCisVec<1>, SDTCisFP<1>, SDTCisSameAs<2, 1>, - SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, - SDTCisVT<4, i32>]>; def X86CmpMaskCCScalar : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -def X86CmpMaskCCScalarRound : - SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, - SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; - def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; -// Hack to make CMPM commutable in tablegen patterns for load folding. 
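// Note on the renames in this hunk and those that follow: *_RND nodes that
// only ever took FROUND_NO_EXC become *_SAE ("suppress all exceptions") and
// lose the trailing i32 rounding-mode operand from their SDTypeProfiles
// (e.g. X86cmpmRnd -> X86cmpmSAE here); nodes that genuinely consume a
// rounding mode keep an explicit _RND variant with the extra operand
// (e.g. X86froundsRnd, X86scalefRnd), usually next to a new operand-free
// sibling for the default-rounding case.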
-def X86cmpm_c : SDNode<"X86ISD::CMPM", X86CmpMaskCC, [SDNPCommutative]>; -def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpmSAE : SDNode<"X86ISD::CMPM_SAE", X86CmpMaskCC>; def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>; -def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>; +def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>; def X86phminpos: SDNode<"X86ISD::PHMINPOS", SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>; @@ -198,6 +198,8 @@ def X86vsra : SDNode<"X86ISD::VSRA", X86vshiftuniform>; def X86vshiftvariable : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<0>]>; +def X86vshlv : SDNode<"X86ISD::VSHLV", X86vshiftvariable>; +def X86vsrlv : SDNode<"X86ISD::VSRLV", X86vshiftvariable>; def X86vsrav : SDNode<"X86ISD::VSRAV", X86vshiftvariable>; def X86vshli : SDNode<"X86ISD::VSHLI", X86vshiftimm>; @@ -299,25 +301,15 @@ def SDTFPBinOpImm: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisVT<3, i32>]>; -def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisVT<3, i32>, - SDTCisVT<4, i32>]>; -def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisInt<3>, - SDTCisSameSizeAs<0, 3>, - SDTCisSameNumEltsAs<0, 3>, - SDTCisVT<4, i32>, - SDTCisVT<5, i32>]>; -def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>, +def SDTFPTernaryOpImm: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisInt<3>, + SDTCisSameSizeAs<0, 3>, + SDTCisSameNumEltsAs<0, 3>, + SDTCisVT<4, i32>]>; +def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisSameAs<0,1>, SDTCisVT<2, i32>]>; -def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisVT<2, i32>, - SDTCisVT<3, i32>]>; def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, @@ -373,11 +365,23 @@ def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>; def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>; def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>; -def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>; -def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>; - -def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>; -def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>; +def X86Movsd : SDNode<"X86ISD::MOVSD", + SDTypeProfile<1, 2, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v2f64>, + SDTCisVT<2, v2f64>]>>; +def X86Movss : SDNode<"X86ISD::MOVSS", + SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>>; + +def X86Movlhps : SDNode<"X86ISD::MOVLHPS", + SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>>; +def X86Movhlps : SDNode<"X86ISD::MOVHLPS", + SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>>; def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>, SDTCisVec<1>, SDTCisInt<1>, @@ -421,16 +425,18 @@ def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; -def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>; -def X86VFixupimmScalar : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>; +def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImm>; +def X86VFixupimmSAE : SDNode<"X86ISD::VFIXUPIMM_SAE", 
SDTFPTernaryOpImm>; +def X86VFixupimms : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImm>; +def X86VFixupimmSAEs : SDNode<"X86ISD::VFIXUPIMMS_SAE", SDTFPTernaryOpImm>; def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImm>; -def X86VRangeRnd : SDNode<"X86ISD::VRANGE_RND", SDTFPBinOpImmRound>; +def X86VRangeSAE : SDNode<"X86ISD::VRANGE_SAE", SDTFPBinOpImm>; def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>; -def X86VReduceRnd : SDNode<"X86ISD::VREDUCE_RND", SDTFPUnaryOpImmRound>; +def X86VReduceSAE : SDNode<"X86ISD::VREDUCE_SAE", SDTFPUnaryOpImm>; def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>; -def X86VRndScaleRnd: SDNode<"X86ISD::VRNDSCALE_RND", SDTFPUnaryOpImmRound>; +def X86VRndScaleSAE: SDNode<"X86ISD::VRNDSCALE_SAE", SDTFPUnaryOpImm>; def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>; -def X86VGetMantRnd : SDNode<"X86ISD::VGETMANT_RND", SDTFPUnaryOpImmRound>; +def X86VGetMantSAE : SDNode<"X86ISD::VGETMANT_SAE", SDTFPUnaryOpImm>; def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>, SDTCisFP<1>, @@ -448,27 +454,42 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; +def X86Blendv : SDNode<"X86ISD::BLENDV", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<2, 3>, + SDTCisSameNumEltsAs<0, 1>, + SDTCisSameSizeAs<0, 1>]>>; def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; +def X86fadds : SDNode<"X86ISD::FADDS", SDTFPBinOp>; def X86faddRnds : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>; def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; +def X86fsubs : SDNode<"X86ISD::FSUBS", SDTFPBinOp>; def X86fsubRnds : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; +def X86fmuls : SDNode<"X86ISD::FMULS", SDTFPBinOp>; def X86fmulRnds : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; +def X86fdivs : SDNode<"X86ISD::FDIVS", SDTFPBinOp>; def X86fdivRnds : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>; -def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; -def X86fmaxRnds : SDNode<"X86ISD::FMAXS_RND", SDTFPBinOpRound>; -def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; -def X86fminRnds : SDNode<"X86ISD::FMINS_RND", SDTFPBinOpRound>; -def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; -def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>; +def X86fmaxSAE : SDNode<"X86ISD::FMAX_SAE", SDTFPBinOp>; +def X86fmaxSAEs : SDNode<"X86ISD::FMAXS_SAE", SDTFPBinOp>; +def X86fminSAE : SDNode<"X86ISD::FMIN_SAE", SDTFPBinOp>; +def X86fminSAEs : SDNode<"X86ISD::FMINS_SAE", SDTFPBinOp>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOp>; +def X86scalefRnd : SDNode<"X86ISD::SCALEF_RND", SDTFPBinOpRound>; +def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOp>; +def X86scalefsRnd: SDNode<"X86ISD::SCALEFS_RND", SDTFPBinOpRound>; def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrts : SDNode<"X86ISD::FSQRTS", SDTFPBinOp>; def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>; -def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; -def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>; +def X86fgetexp : SDNode<"X86ISD::FGETEXP", SDTFPUnaryOp>; +def X86fgetexpSAE : 
SDNode<"X86ISD::FGETEXP_SAE", SDTFPUnaryOp>; +def X86fgetexps : SDNode<"X86ISD::FGETEXPS", SDTFPBinOp>; +def X86fgetexpSAEs : SDNode<"X86ISD::FGETEXPS_SAE", SDTFPBinOp>; def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>; @@ -484,6 +505,10 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutat def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>; def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>; +def X86vp2intersect : SDNode<"X86ISD::VP2INTERSECT", + SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, + SDTCisVec<1>, SDTCisSameAs<1, 2>]>>; + def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>; @@ -500,27 +525,36 @@ def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>; def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>; def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>; -def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>; -def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>; -def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>; +def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOp>; +def X86rsqrt28SAE: SDNode<"X86ISD::RSQRT28_SAE", SDTFPUnaryOp>; +def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOp>; +def X86rcp28SAE : SDNode<"X86ISD::RCP28_SAE", SDTFPUnaryOp>; +def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOp>; +def X86exp2SAE : SDNode<"X86ISD::EXP2_SAE", SDTFPUnaryOp>; def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>; def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>; -def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>; -def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>; +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOp>; +def X86rsqrt28SAEs : SDNode<"X86ISD::RSQRT28S_SAE", SDTFPBinOp>; +def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOp>; +def X86rcp28SAEs : SDNode<"X86ISD::RCP28S_SAE", SDTFPBinOp>; def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>; def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>; def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>; def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImm>; -def X86RangesRnd : SDNode<"X86ISD::VRANGES_RND", SDTFPBinOpImmRound>; -def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>; -def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>; -def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>; - -def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1, - [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; -def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, - [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; +def X86RangesSAE : SDNode<"X86ISD::VRANGES_SAE", SDTFPBinOpImm>; +def X86RndScalesSAE : SDNode<"X86ISD::VRNDSCALES_SAE", SDTFPBinOpImm>; +def X86ReducesSAE : SDNode<"X86ISD::VREDUCES_SAE", SDTFPBinOpImm>; +def X86GetMantsSAE : SDNode<"X86ISD::VGETMANTS_SAE", SDTFPBinOpImm>; + +def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisVec<1>, + SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<0, 3>]>, []>; +def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisVec<1>, + SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<0, 3>]>, []>; // vpshufbitqmb def X86Vpshufbitqmb : 
SDNode<"X86ISD::VPSHUFBITQMB", @@ -529,6 +563,8 @@ def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB", SDTCVecEltisVT<0,i1>, SDTCisSameNumEltsAs<0,1>]>>; +def SDTintToFP: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>]>; def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisVT<3, i32>]>; @@ -550,13 +586,15 @@ def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i32>]>; // Scalar +def X86SintToFp : SDNode<"X86ISD::SCALAR_SINT_TO_FP", SDTintToFP>; def X86SintToFpRnd : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND", SDTintToFPRound>; +def X86UintToFp : SDNode<"X86ISD::SCALAR_UINT_TO_FP", SDTintToFP>; def X86UintToFpRnd : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND", SDTintToFPRound>; def X86cvtts2Int : SDNode<"X86ISD::CVTTS2SI", SDTSFloatToInt>; def X86cvtts2UInt : SDNode<"X86ISD::CVTTS2UI", SDTSFloatToInt>; -def X86cvtts2IntRnd : SDNode<"X86ISD::CVTTS2SI_RND", SDTSFloatToIntRnd>; -def X86cvtts2UIntRnd : SDNode<"X86ISD::CVTTS2UI_RND", SDTSFloatToIntRnd>; +def X86cvtts2IntSAE : SDNode<"X86ISD::CVTTS2SI_SAE", SDTSFloatToInt>; +def X86cvtts2UIntSAE : SDNode<"X86ISD::CVTTS2UI_SAE", SDTSFloatToInt>; def X86cvts2si : SDNode<"X86ISD::CVTS2SI", SDTSFloatToInt>; def X86cvts2usi : SDNode<"X86ISD::CVTS2UI", SDTSFloatToInt>; @@ -566,8 +604,8 @@ def X86cvts2usiRnd : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>; // Vector with rounding mode // cvtt fp-to-int staff -def X86cvttp2siRnd : SDNode<"X86ISD::CVTTP2SI_RND", SDTFloatToIntRnd>; -def X86cvttp2uiRnd : SDNode<"X86ISD::CVTTP2UI_RND", SDTFloatToIntRnd>; +def X86cvttp2siSAE : SDNode<"X86ISD::CVTTP2SI_SAE", SDTFloatToInt>; +def X86cvttp2uiSAE : SDNode<"X86ISD::CVTTP2UI_SAE", SDTFloatToInt>; def X86VSintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTVintToFPRound>; def X86VUintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTVintToFPRound>; @@ -590,6 +628,13 @@ def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>; def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>; +// Masked versions of above +def SDTMVintToFP: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisInt<1>, + SDTCisSameSizeAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<1, 3>]>; def SDTMFloatToInt: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisFP<1>, SDTCisSameSizeAs<0, 1>, @@ -597,6 +642,9 @@ def SDTMFloatToInt: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<1, 3>]>; +def X86VMSintToFP : SDNode<"X86ISD::MCVTSI2P", SDTMVintToFP>; +def X86VMUintToFP : SDNode<"X86ISD::MCVTUI2P", SDTMVintToFP>; + def X86mcvtp2Int : SDNode<"X86ISD::MCVTP2SI", SDTMFloatToInt>; def X86mcvtp2UInt : SDNode<"X86ISD::MCVTP2UI", SDTMFloatToInt>; def X86mcvttp2si : SDNode<"X86ISD::MCVTTP2SI", SDTMFloatToInt>; @@ -607,10 +655,9 @@ def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, i16>]> >; -def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, i16>, - SDTCisVT<2, i32>]> >; +def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>]> >; def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, @@ -623,17 +670,35 @@ def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH", SDTCisSameAs<0, 3>, SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<1, 4>]> >; -def X86vfpextRnd : 
SDNode<"X86ISD::VFPEXT_RND", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, +def X86vfpextSAE : SDNode<"X86ISD::VFPEXT_SAE", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>, SDTCVecEltisVT<1, f32>, - SDTCisOpSmallerThanOp<1, 0>, - SDTCisVT<2, i32>]>>; + SDTCisOpSmallerThanOp<1, 0>]>>; def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, SDTCisOpSmallerThanOp<0, 1>, SDTCisVT<2, i32>]>>; +// cvt fp to bfloat16 +def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; +def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisSameAs<0, 2>, + SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<1, 3>]>>; +def X86cvtneps2bf16 : SDNode<"X86ISD::CVTNEPS2BF16", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>]>>; +def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, + SDTCisSameAs<0,1>, + SDTCVecEltisVT<2, i32>, + SDTCisSameAs<2,3>]>>; + // galois field arithmetic def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>; def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>; @@ -653,18 +718,8 @@ def sse_load_f64 : ComplexPattern; -def ssmem : Operand { - let PrintMethod = "printf32mem"; - let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); - let ParserMatchClass = X86Mem32AsmOperand; - let OperandType = "OPERAND_MEMORY"; -} -def sdmem : Operand { - let PrintMethod = "printf64mem"; - let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); - let ParserMatchClass = X86Mem64AsmOperand; - let OperandType = "OPERAND_MEMORY"; -} +def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; +def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; //===----------------------------------------------------------------------===// // SSE pattern fragments @@ -695,9 +750,9 @@ def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>; def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>; // 128-/256-/512-bit extload pattern fragments -def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; -def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; -def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; +def extloadv2f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; +def extloadv4f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; +def extloadv8f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; // Like 'store', but always requires vector size alignment. 
def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -884,15 +939,20 @@ def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>; -def vzmovl_v2i64 : PatFrag<(ops node:$src), - (bitconvert (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; -def vzmovl_v4i32 : PatFrag<(ops node:$src), - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; +def X86vzload32 : PatFrag<(ops node:$src), + (X86vzld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 4; +}]>; -def vzload_v2i64 : PatFrag<(ops node:$src), - (bitconvert (v2i64 (X86vzload node:$src)))>; +def X86vzload64 : PatFrag<(ops node:$src), + (X86vzld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 8; +}]>; + +def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr), + (X86vextractst node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT().getStoreSize() == 8; +}]>; def fp32imm0 : PatLeaf<(f32 fpimm), [{ @@ -903,20 +963,6 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{ return N->isExactlyValue(+0.0); }]>; -def I8Imm : SDNodeXFormgetZExtValue(), SDLoc(N)); -}]>; - -def FROUND_NO_EXC : PatLeaf<(i32 8)>; -def FROUND_CURRENT : PatLeaf<(i32 4)>; - -// BYTE_imm - Transform bit immediates into byte immediates. -def BYTE_imm : SDNodeXForm> 3 - return getI32Imm(N->getZExtValue() >> 3, SDLoc(N)); -}]>; - // EXTRACT_get_vextract128_imm xform function: convert extract_subvector index // to VEXTRACTF128/VEXTRACTI128 imm. def EXTRACT_get_vextract128_imm : SDNodeXForm; + node:$index), [{ + // Index 0 can be handled via extract_subreg. + return !isNullConstant(N->getOperand(1)); +}], EXTRACT_get_vextract128_imm>; def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), @@ -954,8 +1002,10 @@ def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index), (extract_subvector node:$bigvec, - node:$index), [{}], - EXTRACT_get_vextract256_imm>; + node:$index), [{ + // Index 0 can be handled via extract_subreg. 
+ return !isNullConstant(N->getOperand(1)); +}], EXTRACT_get_vextract256_imm>; def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), @@ -963,70 +1013,46 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), [{}], INSERT_get_vinsert256_imm>; -def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_load node:$src1, node:$src2, node:$src3), [{ +def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_ld node:$src1, node:$src2, node:$src3), [{ return !cast(N)->isExpandingLoad() && cast(N)->getExtensionType() == ISD::NON_EXTLOAD; }]>; -def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mload node:$src1, node:$src2, node:$src3), [{ - return cast(N)->getAlignment() >= 16; -}]>; - -def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mload node:$src1, node:$src2, node:$src3), [{ - return cast(N)->getAlignment() >= 32; -}]>; - -def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mload node:$src1, node:$src2, node:$src3), [{ - return cast(N)->getAlignment() >= 64; -}]>; - -def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), +def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_load node:$src1, node:$src2, node:$src3), [{ - return !cast(N)->isExpandingLoad() && - cast(N)->getExtensionType() == ISD::NON_EXTLOAD; + // Use the node type to determine the size the alignment needs to match. + // We can't use memory VT because type widening changes the node VT, but + // not the memory VT. + auto *Ld = cast(N); + return Ld->getAlignment() >= Ld->getValueType(0).getStoreSize(); }]>; def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_load node:$src1, node:$src2, node:$src3), [{ + (masked_ld node:$src1, node:$src2, node:$src3), [{ return cast(N)->isExpandingLoad(); }]>; // Masked store fragments. // X86mstore can't be implemented in core DAG files because some targets // do not support vector types (llvm-tblgen will fail). -def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ +def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_st node:$src1, node:$src2, node:$src3), [{ return (!cast(N)->isTruncatingStore()) && (!cast(N)->isCompressingStore()); }]>; -def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mstore node:$src1, node:$src2, node:$src3), [{ - return cast(N)->getAlignment() >= 16; -}]>; - -def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mstore node:$src1, node:$src2, node:$src3), [{ - return cast(N)->getAlignment() >= 32; -}]>; - -def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mstore node:$src1, node:$src2, node:$src3), [{ - return cast(N)->getAlignment() >= 64; -}]>; - -def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), +def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_store node:$src1, node:$src2, node:$src3), [{ - return (!cast(N)->isTruncatingStore()) && - (!cast(N)->isCompressingStore()); + // Use the node type to determine the size the alignment needs to match. + // We can't use memory VT because type widening changes the node VT, but + // not the memory VT. 
+ auto *St = cast(N); + return St->getAlignment() >= St->getOperand(1).getValueType().getStoreSize(); }]>; def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (masked_st node:$src1, node:$src2, node:$src3), [{ return cast(N)->isCompressingStore(); }]>; @@ -1034,7 +1060,7 @@ def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3), // X86mtruncstore can't be implemented in core DAG files because some targets // doesn't support vector type ( llvm-tblgen will fail) def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (masked_st node:$src1, node:$src2, node:$src3), [{ return cast(N)->isTruncatingStore(); }]>; def masked_truncstorevi8 : diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index ab14ee7fadf2..dbe45356c42b 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1,9 +1,8 @@ //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -220,16 +219,22 @@ static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) { return true; case X86::MOV32rm: case X86::MOVSSrm: - case X86::VMOVSSZrm: + case X86::MOVSSrm_alt: case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: + case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: case X86::KMOVDkm: MemBytes = 4; return true; case X86::MOV64rm: case X86::LD_Fp64m: case X86::MOVSDrm: + case X86::MOVSDrm_alt: case X86::VMOVSDrm: + case X86::VMOVSDrm_alt: case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::KMOVQkm: @@ -483,9 +488,10 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: - case X86::LD_Fp64m: case X86::MOVSSrm: + case X86::MOVSSrm_alt: case X86::MOVSDrm: + case X86::MOVSDrm_alt: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: @@ -493,7 +499,9 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::MOVDQArm: case X86::MOVDQUrm: case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: case X86::VMOVSDrm: + case X86::VMOVSDrm_alt: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -510,7 +518,9 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::MMX_MOVQ64rm: // AVX-512 case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: case X86::VMOVAPDZ128rm: case X86::VMOVAPDZ256rm: case X86::VMOVAPDZrm: @@ -590,96 +600,12 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, return true; } -bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - MachineBasicBlock::iterator E = MBB.end(); - - // For compile time consideration, if we are not able to determine the - // safety after visiting 4 instructions in each direction, we will assume - // it's not safe. 
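// Note: the function being deleted here hand-rolled a bounded EFLAGS liveness
// scan: walk at most four instructions forward looking for a def (safe) or a
// use (unsafe) of EFLAGS, then at most four backward, giving up conservatively
// otherwise. A standalone sketch of the forward half of that scan over a toy
// instruction list (Inst and its flags are illustrative, not LLVM types):
#include <cassert>
#include <vector>

struct Inst {
  bool UsesEFLAGS;
  bool DefsEFLAGS;
};

// True if the flags are provably dead at position I, looking forward only.
bool safeToClobberForward(const std::vector<Inst> &Body, size_t I) {
  for (size_t N = 0; N < 4 && I + N < Body.size(); ++N) {
    if (Body[I + N].UsesEFLAGS)
      return false; // a later reader still needs the current flags
    if (Body[I + N].DefsEFLAGS)
      return true;  // flags are rewritten before anyone reads them
  }
  return false;     // scan budget exhausted: assume unsafe
}

int main() {
  std::vector<Inst> Body = {{false, false}, {false, true}, {true, false}};
  assert(safeToClobberForward(Body, 0));  // the def at index 1 shields the use
  assert(!safeToClobberForward(Body, 2)); // the use itself reads EFLAGS
  return 0;
}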
- MachineBasicBlock::iterator Iter = I; - for (unsigned i = 0; Iter != E && i < 4; ++i) { - bool SeenDef = false; - for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { - MachineOperand &MO = Iter->getOperand(j); - if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) - SeenDef = true; - if (!MO.isReg()) - continue; - if (MO.getReg() == X86::EFLAGS) { - if (MO.isUse()) - return false; - SeenDef = true; - } - } - - if (SeenDef) - // This instruction defines EFLAGS, no need to look any further. - return true; - ++Iter; - // Skip over debug instructions. - while (Iter != E && Iter->isDebugInstr()) - ++Iter; - } - - // It is safe to clobber EFLAGS at the end of a block of no successor has it - // live in. - if (Iter == E) { - for (MachineBasicBlock *S : MBB.successors()) - if (S->isLiveIn(X86::EFLAGS)) - return false; - return true; - } - - MachineBasicBlock::iterator B = MBB.begin(); - Iter = I; - for (unsigned i = 0; i < 4; ++i) { - // If we make it to the beginning of the block, it's safe to clobber - // EFLAGS iff EFLAGS is not live-in. - if (Iter == B) - return !MBB.isLiveIn(X86::EFLAGS); - - --Iter; - // Skip over debug instructions. - while (Iter != B && Iter->isDebugInstr()) - --Iter; - - bool SawKill = false; - for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { - MachineOperand &MO = Iter->getOperand(j); - // A register mask may clobber EFLAGS, but we should still look for a - // live EFLAGS def. - if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) - SawKill = true; - if (MO.isReg() && MO.getReg() == X86::EFLAGS) { - if (MO.isDef()) return MO.isDead(); - if (MO.isKill()) SawKill = true; - } - } - - if (SawKill) - // This instruction kills EFLAGS and doesn't redefine it, so - // there's no need to look further. - return true; - } - - // Conservative answer. - return false; -} - void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const { - bool ClobbersEFLAGS = false; - for (const MachineOperand &MO : Orig.operands()) { - if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { - ClobbersEFLAGS = true; - break; - } - } - + bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side // effects. @@ -796,11 +722,10 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI, - LiveVariables *LV) const { + LiveVariables *LV, bool Is8BitOp) const { // We handle 8-bit adds and various 16-bit opcodes in the switch below. - bool Is16BitOp = !(MIOpc == X86::ADD8rr || MIOpc == X86::ADD8ri); MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); - assert((!Is16BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits( + assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits( *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) && "Unexpected type for LEA transform"); @@ -830,7 +755,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( unsigned Src = MI.getOperand(1).getReg(); bool IsDead = MI.getOperand(0).isDead(); bool IsKill = MI.getOperand(1).isKill(); - unsigned SubReg = Is16BitOp ? X86::sub_16bit : X86::sub_8bit; + unsigned SubReg = Is8BitOp ? 
X86::sub_8bit : X86::sub_16bit; assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization"); BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA); MachineInstr *InsMI = @@ -842,19 +767,23 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA); switch (MIOpc) { default: llvm_unreachable("Unreachable!"); + case X86::SHL8ri: case X86::SHL16ri: { unsigned ShAmt = MI.getOperand(2).getImm(); MIB.addReg(0).addImm(1ULL << ShAmt) .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0); break; } + case X86::INC8r: case X86::INC16r: addRegOffset(MIB, InRegLEA, true, 1); break; + case X86::DEC8r: case X86::DEC16r: addRegOffset(MIB, InRegLEA, true, -1); break; case X86::ADD8ri: + case X86::ADD8ri_DB: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: @@ -862,6 +791,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm()); break; case X86::ADD8rr: + case X86::ADD8rr_DB: case X86::ADD16rr: case X86::ADD16rr_DB: { unsigned Src2 = MI.getOperand(2).getReg(); @@ -948,9 +878,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr *NewMI = nullptr; bool Is64Bit = Subtarget.is64Bit(); + bool Is8BitOp = false; unsigned MIOpc = MI.getOpcode(); switch (MIOpc) { - default: return nullptr; + default: llvm_unreachable("Unreachable!"); case X86::SHL64ri: { assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); @@ -1000,12 +931,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } + case X86::SHL8ri: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::SHL16ri: { assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); } case X86::INC64r: case X86::INC32r: { @@ -1029,8 +963,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, NewMI = addOffset(MIB, 1); break; } - case X86::INC16r: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); case X86::DEC64r: case X86::DEC32r: { assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); @@ -1054,8 +986,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } + case X86::DEC8r: + case X86::INC8r: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::DEC16r: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + case X86::INC16r: + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD32rr: @@ -1094,9 +1031,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::ADD8rr: + case X86::ADD8rr_DB: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::ADD16rr: case X86::ADD16rr_DB: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64ri32_DB: @@ -1130,11 +1070,59 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::ADD8ri: + case X86::ADD8ri_DB: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: case X86::ADD16ri8_DB: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return 
convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); + case X86::SUB8ri: + case X86::SUB16ri8: + case X86::SUB16ri: + /// FIXME: Support these similar to ADD8ri/ADD16ri*. + return nullptr; + case X86::SUB32ri8: + case X86::SUB32ri: { + int64_t Imm = MI.getOperand(2).getImm(); + if (!isInt<32>(-Imm)) + return nullptr; + + assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; + + bool isKill; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, ImplicitOp, LV)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .addReg(SrcReg, getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.add(ImplicitOp); + + NewMI = addOffset(MIB, -Imm); + break; + } + + case X86::SUB64ri8: + case X86::SUB64ri32: { + int64_t Imm = MI.getOperand(2).getImm(); + if (!isInt<32>(-Imm)) + return nullptr; + + assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!"); + + MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), + get(X86::LEA64r)).add(Dest).add(Src); + NewMI = addOffset(MIB, -Imm); + break; + } + case X86::VMOVDQU8Z128rmk: case X86::VMOVDQU8Z256rmk: case X86::VMOVDQU8Zrmk: @@ -1522,7 +1510,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VBLENDPDrri: case X86::VBLENDPSrri: // If we're optimizing for size, try to use MOVSD/MOVSS. - if (MI.getParent()->getParent()->getFunction().optForSize()) { + if (MI.getParent()->getParent()->getFunction().hasOptSize()) { unsigned Mask, Opc; switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); @@ -1548,47 +1536,90 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VPBLENDWrri: case X86::VPBLENDDYrri: case X86::VPBLENDWYrri:{ - unsigned Mask; + int8_t Mask; switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); - case X86::BLENDPDrri: Mask = 0x03; break; - case X86::BLENDPSrri: Mask = 0x0F; break; - case X86::PBLENDWrri: Mask = 0xFF; break; - case X86::VBLENDPDrri: Mask = 0x03; break; - case X86::VBLENDPSrri: Mask = 0x0F; break; - case X86::VBLENDPDYrri: Mask = 0x0F; break; - case X86::VBLENDPSYrri: Mask = 0xFF; break; - case X86::VPBLENDDrri: Mask = 0x0F; break; - case X86::VPBLENDWrri: Mask = 0xFF; break; - case X86::VPBLENDDYrri: Mask = 0xFF; break; - case X86::VPBLENDWYrri: Mask = 0xFF; break; + case X86::BLENDPDrri: Mask = (int8_t)0x03; break; + case X86::BLENDPSrri: Mask = (int8_t)0x0F; break; + case X86::PBLENDWrri: Mask = (int8_t)0xFF; break; + case X86::VBLENDPDrri: Mask = (int8_t)0x03; break; + case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break; + case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break; + case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break; + case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break; + case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break; + case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break; + case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break; } // Only the least significant bits of Imm are used. - unsigned Imm = MI.getOperand(3).getImm() & Mask; + // Using int8_t to ensure it will be sign extended to the int64_t that + // setImm takes in order to match isel behavior. 
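// Note: this is why the switch above casts every blend mask to int8_t.
// setImm() stores an int64_t, and isel emits these immediates sign-extended,
// so 0xFF must round-trip as -1 to match what isel produces. A self-contained
// check of that arithmetic:
#include <cassert>
#include <cstdint>

int main() {
  int8_t Mask = (int8_t)0xFF;   // the widest blend mask in the switch above
  int8_t Imm = Mask & 0x0F;     // some masked immediate, still positive
  assert((int64_t)Imm == 15);   // small values are unaffected...
  assert((int64_t)Mask == -1);  // ...but 0xFF must become -1, matching isel
  return 0;
}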
+ int8_t Imm = MI.getOperand(3).getImm() & Mask; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Mask ^ Imm); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + case X86::VINSERTPSZrr: { + unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + // We can commute insertps if we zero 2 of the elements, the insertion is + // "inline" and we don't override the insertion with a zero. + if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 && + countPopulation(ZMask) == 2) { + unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15); + assert(AltIdx < 4 && "Illegal insertion index"); + unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask; + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + return nullptr; + } case X86::MOVSDrr: case X86::MOVSSrr: case X86::VMOVSDrr: case X86::VMOVSSrr:{ // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD. - assert(Subtarget.hasSSE41() && "Commuting MOVSD/MOVSS requires SSE41!"); + if (Subtarget.hasSSE41()) { + unsigned Mask, Opc; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; + case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; + case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; + case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; + } - unsigned Mask, Opc; - switch (MI.getOpcode()) { - default: llvm_unreachable("Unreachable!"); - case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; - case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; - case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; - case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); } + // Convert to SHUFPD. + assert(MI.getOpcode() == X86::MOVSDrr && + "Can only commute MOVSDrr without SSE4.1"); + auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); + WorkingMI.setDesc(get(X86::SHUFPDrri)); + WorkingMI.addOperand(MachineOperand::CreateImm(0x02)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + case X86::SHUFPDrri: { + // Commute to MOVSD. + assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!"); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(X86::MOVSDrr)); + WorkingMI.RemoveOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -1657,7 +1688,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // Flip permute source immediate. // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi. // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi. 
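// Aside: a standalone sketch of the immediate flip performed just below
// (hypothetical helper name). Imm[1:0] selects the source half for the low
// 128-bit lane and Imm[5:4] for the high lane; bits 1 and 5 choose between
// the two source operands, so swapping the operands toggles exactly those
// two bits.
#include <cassert>
#include <cstdint>
static uint8_t commutePerm2Imm(uint8_t Imm) { return Imm ^ 0x22; }
int main() {
  // 0x20 (lo = Op0.lo, hi = Op1.lo) becomes 0x02 (lo = Op1.lo, hi = Op0.lo),
  // which reads the same halves once Op0 and Op1 trade places.
  assert(commutePerm2Imm(0x20) == 0x02);
  return 0;
}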
- unsigned Imm = MI.getOperand(3).getImm() & 0xFF; + int8_t Imm = MI.getOperand(3).getImm() & 0xFF; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Imm ^ 0x22); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, @@ -1686,76 +1717,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } - case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: - case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: - case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: - case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr: - case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr: - case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr: - case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr: - case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr: - case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr: - case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr: - case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr: - case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr: - case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr: - case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr: - case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr: - case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: { - unsigned Opc; - switch (MI.getOpcode()) { - default: llvm_unreachable("Unreachable!"); - case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break; - case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break; - case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break; - case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break; - case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break; - case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break; - case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break; - case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break; - case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break; - case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break; - case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break; - case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break; - case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break; - case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break; - case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break; - case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break; - case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break; - case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break; - case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break; - case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break; - case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break; - case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break; - case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break; - case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break; - case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break; - case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break; - case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break; - case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break; - case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break; - case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break; - case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break; - case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break; - case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break; - case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break; - case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break; - case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break; - case X86::CMOVP16rr: Opc = 
X86::CMOVNP16rr; break; - case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break; - case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break; - case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break; - case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break; - case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break; - case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break; - case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break; - case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break; - case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break; - case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break; - case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break; - } + case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: { auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); + unsigned OpNo = MI.getDesc().getNumOperands() - 1; + X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm()); + WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -1879,7 +1845,6 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, // regardless of the FMA opcode. The FMA opcode is adjusted later. if (SrcOpIdx1 == CommuteAnyOperandIndex || SrcOpIdx2 == CommuteAnyOperandIndex) { - unsigned CommutableOpIdx1 = SrcOpIdx1; unsigned CommutableOpIdx2 = SrcOpIdx2; // At least one of operands to be commuted is not specified and @@ -1895,6 +1860,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); + + unsigned CommutableOpIdx1; for (CommutableOpIdx1 = LastCommutableVecOp; CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) { // Just ignore and skip the k-mask operand. @@ -1946,28 +1913,43 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, case X86::VCMPPDZ128rri: case X86::VCMPPSZ128rri: case X86::VCMPPDZ256rri: - case X86::VCMPPSZ256rri: { + case X86::VCMPPSZ256rri: + case X86::VCMPPDZrrik: + case X86::VCMPPSZrrik: + case X86::VCMPPDZ128rrik: + case X86::VCMPPSZ128rrik: + case X86::VCMPPDZ256rrik: + case X86::VCMPPSZ256rrik: { + unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0; + // Float comparison can be safely commuted for // Ordered/Unordered/Equal/NotEqual tests - unsigned Imm = MI.getOperand(3).getImm() & 0x7; + unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7; switch (Imm) { case 0x00: // EQUAL case 0x03: // UNORDERED case 0x04: // NOT EQUAL case 0x07: // ORDERED - // The indices of the commutable operands are 1 and 2. + // The indices of the commutable operands are 1 and 2 (or 2 and 3 + // when masked). // Assign them to the returned operand indices here. - return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, + 2 + OpOffset); } return false; } - case X86::MOVSDrr: case X86::MOVSSrr: - case X86::VMOVSDrr: - case X86::VMOVSSrr: + // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can + // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since + // AVX implies sse4.1. if (Subtarget.hasSSE41()) return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return false; + case X86::SHUFPDrri: + // We can commute this to MOVSD.
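// Aside: a plain-C++ model (not from the patch) of why only immediate 0x02
// is commutable here. shufpd dst, a, b, 0b10 builds {a[0], b[1]}, while
// movsd dst, a, b builds {b[0], a[1]}; swapping a and b turns one into the
// other.
#include <cassert>
static void shufpd02(const double A[2], const double B[2], double R[2]) {
  R[0] = A[0]; // imm bit 0 clear: low lane from the first source
  R[1] = B[1]; // imm bit 1 set: high lane from the second source
}
static void movsd_rr(const double A[2], const double B[2], double R[2]) {
  R[0] = B[0]; // movsd replaces only the low lane
  R[1] = A[1];
}
int main() {
  double A[2] = {1, 2}, B[2] = {3, 4}, X[2], Y[2];
  shufpd02(A, B, X);  // {1, 4}
  movsd_rr(B, A, Y);  // operands commuted: also {1, 4}
  assert(X[0] == Y[0] && X[1] == Y[1]);
  return 0;
}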
+ if (MI.getOperand(3).getImm() == 0x02) + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + return false; case X86::MOVHLPSrr: case X86::UNPCKHPDrr: case X86::VMOVHLPSrr: @@ -2089,125 +2071,33 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, return false; } -X86::CondCode X86::getCondFromBranchOpc(unsigned BrOpc) { - switch (BrOpc) { +X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return X86::COND_INVALID; - case X86::JE_1: return X86::COND_E; - case X86::JNE_1: return X86::COND_NE; - case X86::JL_1: return X86::COND_L; - case X86::JLE_1: return X86::COND_LE; - case X86::JG_1: return X86::COND_G; - case X86::JGE_1: return X86::COND_GE; - case X86::JB_1: return X86::COND_B; - case X86::JBE_1: return X86::COND_BE; - case X86::JA_1: return X86::COND_A; - case X86::JAE_1: return X86::COND_AE; - case X86::JS_1: return X86::COND_S; - case X86::JNS_1: return X86::COND_NS; - case X86::JP_1: return X86::COND_P; - case X86::JNP_1: return X86::COND_NP; - case X86::JO_1: return X86::COND_O; - case X86::JNO_1: return X86::COND_NO; - } -} - -/// Return condition code of a SET opcode. -X86::CondCode X86::getCondFromSETOpc(unsigned Opc) { - switch (Opc) { + case X86::JCC_1: + return static_cast<X86::CondCode>( + MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); + } +} + +/// Return condition code of a SETCC opcode. +X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return X86::COND_INVALID; - case X86::SETAr: case X86::SETAm: return X86::COND_A; - case X86::SETAEr: case X86::SETAEm: return X86::COND_AE; - case X86::SETBr: case X86::SETBm: return X86::COND_B; - case X86::SETBEr: case X86::SETBEm: return X86::COND_BE; - case X86::SETEr: case X86::SETEm: return X86::COND_E; - case X86::SETGr: case X86::SETGm: return X86::COND_G; - case X86::SETGEr: case X86::SETGEm: return X86::COND_GE; - case X86::SETLr: case X86::SETLm: return X86::COND_L; - case X86::SETLEr: case X86::SETLEm: return X86::COND_LE; - case X86::SETNEr: case X86::SETNEm: return X86::COND_NE; - case X86::SETNOr: case X86::SETNOm: return X86::COND_NO; - case X86::SETNPr: case X86::SETNPm: return X86::COND_NP; - case X86::SETNSr: case X86::SETNSm: return X86::COND_NS; - case X86::SETOr: case X86::SETOm: return X86::COND_O; - case X86::SETPr: case X86::SETPm: return X86::COND_P; - case X86::SETSr: case X86::SETSm: return X86::COND_S; + case X86::SETCCr: case X86::SETCCm: + return static_cast<X86::CondCode>( + MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); } } /// Return condition code of a CMov opcode.
-X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) { - switch (Opc) { +X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return X86::COND_INVALID; - case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm: - case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr: - return X86::COND_A; - case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm: - case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr: - return X86::COND_AE; - case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm: - case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr: - return X86::COND_B; - case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm: - case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr: - return X86::COND_BE; - case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm: - case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr: - return X86::COND_E; - case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm: - case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr: - return X86::COND_G; - case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm: - case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr: - return X86::COND_GE; - case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm: - case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr: - return X86::COND_L; - case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm: - case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr: - return X86::COND_LE; - case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm: - case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr: - return X86::COND_NE; - case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm: - case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr: - return X86::COND_NO; - case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm: - case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr: - return X86::COND_NP; - case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm: - case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr: - return X86::COND_NS; - case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm: - case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr: - return X86::COND_O; - case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm: - case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr: - return X86::COND_P; - case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm: - case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr: - return X86::COND_S; - } -} - -unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { - switch (CC) { - default: llvm_unreachable("Illegal condition code!"); - case X86::COND_E: return X86::JE_1; - case X86::COND_NE: return X86::JNE_1; - case X86::COND_L: return X86::JL_1; - case X86::COND_LE: return X86::JLE_1; - case X86::COND_G: return X86::JG_1; - case X86::COND_GE: return X86::JGE_1; - case X86::COND_B: return X86::JB_1; - case X86::COND_BE: return X86::JBE_1; - case X86::COND_A: return X86::JA_1; - case X86::COND_AE: return X86::JAE_1; - case X86::COND_S: return X86::JS_1; - case X86::COND_NS: return X86::JNS_1; - case X86::COND_P: return X86::JP_1; - case X86::COND_NP: return X86::JNP_1; - case X86::COND_O: return X86::JO_1; - case X86::COND_NO: return X86::JNO_1; + case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: + case X86::CMOV16rm: case X86::CMOV32rm: 
case X86::CMOV64rm: + return static_cast<X86::CondCode>( + MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); } } @@ -2293,78 +2183,18 @@ X86::getX86ConditionCode(CmpInst::Predicate Predicate) { return std::make_pair(CC, NeedSwap); } -/// Return a set opcode for the given condition and -/// whether it has memory operand. -unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { - static const uint16_t Opc[16][2] = { - { X86::SETAr, X86::SETAm }, - { X86::SETAEr, X86::SETAEm }, - { X86::SETBr, X86::SETBm }, - { X86::SETBEr, X86::SETBEm }, - { X86::SETEr, X86::SETEm }, - { X86::SETGr, X86::SETGm }, - { X86::SETGEr, X86::SETGEm }, - { X86::SETLr, X86::SETLm }, - { X86::SETLEr, X86::SETLEm }, - { X86::SETNEr, X86::SETNEm }, - { X86::SETNOr, X86::SETNOm }, - { X86::SETNPr, X86::SETNPm }, - { X86::SETNSr, X86::SETNSm }, - { X86::SETOr, X86::SETOm }, - { X86::SETPr, X86::SETPm }, - { X86::SETSr, X86::SETSm } - }; - - assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes"); - return Opc[CC][HasMemoryOperand ? 1 : 0]; -} - -/// Return a cmov opcode for the given condition, -/// register size in bytes, and operand type. -unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, - bool HasMemoryOperand) { - static const uint16_t Opc[32][3] = { - { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, - { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, - { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr }, - { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr }, - { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr }, - { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr }, - { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr }, - { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr }, - { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr }, - { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr }, - { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr }, - { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr }, - { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr }, - { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr }, - { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr }, - { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr }, - { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm }, - { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm }, - { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm }, - { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm }, - { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm }, - { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm }, - { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm }, - { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm }, - { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm }, - { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm }, - { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm }, - { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm }, - { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm }, - { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm }, - { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm }, - { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm } - }; +/// Return a setcc opcode based on whether it has memory operand. +unsigned X86::getSETOpc(bool HasMemoryOperand) { + return HasMemoryOperand ? X86::SETCCr : X86::SETCCm; +} - assert(CC < 16 && "Can only handle standard cond codes"); - unsigned Idx = HasMemoryOperand ? 16+CC : CC; +/// Return a cmov opcode for the given register size in bytes, and operand type.
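/// A usage sketch (illustrative, mirroring insertSelect below): the opcode no
/// longer encodes the condition, so callers attach it as the trailing
/// immediate operand:
///   unsigned Opc = X86::getCMovOpcode(4, /*HasMemoryOperand=*/false);
///   BuildMI(MBB, I, DL, get(Opc), DstReg)
///       .addReg(FalseReg).addReg(TrueReg).addImm(X86::COND_E);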
+unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) { switch(RegBytes) { default: llvm_unreachable("Illegal register size!"); - case 2: return Opc[Idx][0]; - case 4: return Opc[Idx][1]; - case 8: return Opc[Idx][2]; + case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr; + case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr; + case 8: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV64rr; } } @@ -2490,7 +2320,7 @@ void X86InstrInfo::replaceBranchWithTailCall( if (!I->isBranch()) assert(0 && "Can't find the branch to replace!"); - X86::CondCode CC = X86::getCondFromBranchOpc(I->getOpcode()); + X86::CondCode CC = X86::getCondFromBranch(*I); assert(BranchCond.size() == 1); if (CC != BranchCond[0].getImm()) continue; @@ -2597,13 +2427,13 @@ bool X86InstrInfo::AnalyzeBranchImpl( } // Handle conditional branches. - X86::CondCode BranchCode = X86::getCondFromBranchOpc(I->getOpcode()); + X86::CondCode BranchCode = X86::getCondFromBranch(*I); if (BranchCode == X86::COND_INVALID) return true; // Can't handle indirect branch. // In practice we should never have an undef eflags operand, if we do // abort here as we are not prepared to preserve the flag. - if (I->getOperand(1).isUndef()) + if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef()) return true; // Working from the bottom, handle the first conditional branch. @@ -2629,11 +2459,11 @@ bool X86InstrInfo::AnalyzeBranchImpl( // Which is a bit more efficient. // We conditionally jump to the fall-through block. BranchCode = GetOppositeBranchCondition(BranchCode); - unsigned JNCC = GetCondBranchFromCond(BranchCode); MachineBasicBlock::iterator OldInst = I; - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) - .addMBB(UnCondBrIter->getOperand(0).getMBB()); + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1)) + .addMBB(UnCondBrIter->getOperand(0).getMBB()) + .addImm(BranchCode); BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) .addMBB(TargetBB); @@ -2798,7 +2628,7 @@ unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB, if (I->isDebugInstr()) continue; if (I->getOpcode() != X86::JMP_1 && - X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + X86::getCondFromBranch(*I) == X86::COND_INVALID) break; // Remove the branch. I->eraseFromParent(); @@ -2837,9 +2667,9 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB, switch (CC) { case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. - BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE); ++Count; - BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P); ++Count; break; case X86::COND_E_AND_NP: @@ -2850,14 +2680,13 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB, "body is a fall-through."); } // Synthesize COND_E_AND_NP with two branches. - BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE); ++Count; - BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP); ++Count; break; default: { - unsigned Opc = GetCondBranchFromCond(CC); - BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC); ++Count; } } @@ -2880,7 +2709,7 @@ canInsertSelect(const MachineBasicBlock &MBB, if (Cond.size() != 1) return false; // We cannot do the composite conditions, at least not in SSA form. 
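// (The composite codes are COND_NE_OR_P and COND_E_AND_NP, which AnalyzeBranch
// synthesizes for FCMP_OEQ/FCMP_UNE; they take two branches to realize, so no
// single CMOV immediate can express them.)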
- if ((X86::CondCode)Cond[0].getImm() > X86::COND_S) + if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND) return false; // Check register classes. @@ -2915,10 +2744,12 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); const TargetRegisterClass &RC = *MRI.getRegClass(DstReg); assert(Cond.size() == 1 && "Invalid Cond array"); - unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), - TRI.getRegSizeInBits(RC) / 8, - false /*HasMemoryOperand*/); - BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); + unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8, + false /*HasMemoryOperand*/); + BuildMI(MBB, I, DL, get(Opc), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addImm(Cond[0].getImm()); } /// Test if the given register is a physical h register. @@ -2984,22 +2815,22 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return X86::MMX_MOVD64to64rr; } - // SrcReg(FR32) -> DestReg(GR32) - // SrcReg(GR32) -> DestReg(FR32) + // SrcReg(VR128) -> DestReg(GR32) + // SrcReg(GR32) -> DestReg(VR128) if (X86::GR32RegClass.contains(DestReg) && - X86::FR32XRegClass.contains(SrcReg)) - // Copy from a FR32 register to a GR32 register. - return HasAVX512 ? X86::VMOVSS2DIZrr : - HasAVX ? X86::VMOVSS2DIrr : - X86::MOVSS2DIrr; + X86::VR128XRegClass.contains(SrcReg)) + // Copy from a VR128 register to a GR32 register. + return HasAVX512 ? X86::VMOVPDI2DIZrr : + HasAVX ? X86::VMOVPDI2DIrr : + X86::MOVPDI2DIrr; - if (X86::FR32XRegClass.contains(DestReg) && + if (X86::VR128XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) - // Copy from a GR32 register to a FR32 register. - return HasAVX512 ? X86::VMOVDI2SSZrr : - HasAVX ? X86::VMOVDI2SSrr : - X86::MOVDI2SSrr; + // Copy from a VR128 register to a VR128 register. + return HasAVX512 ? X86::VMOVDI2PDIZrr : + HasAVX ? X86::VMOVDI2PDIrr : + X86::MOVDI2PDIrr; return 0; } @@ -3129,22 +2960,38 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, return load ? X86::MOV32rm : X86::MOV32mr; if (X86::FR32XRegClass.hasSubClassEq(RC)) return load ? - (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : - (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + (HasAVX512 ? X86::VMOVSSZrm_alt : + HasAVX ? X86::VMOVSSrm_alt : + X86::MOVSSrm_alt) : + (HasAVX512 ? X86::VMOVSSZmr : + HasAVX ? X86::VMOVSSmr : + X86::MOVSSmr); if (X86::RFP32RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp32m : X86::ST_Fp32m; if (X86::VK32RegClass.hasSubClassEq(RC)) { assert(STI.hasBWI() && "KMOVD requires BWI"); return load ? X86::KMOVDkm : X86::KMOVDmk; } + // All of these mask pair classes have the same spill size, the same kind + // of kmov instructions can be used with all of them. + if (X86::VK1PAIRRegClass.hasSubClassEq(RC) || + X86::VK2PAIRRegClass.hasSubClassEq(RC) || + X86::VK4PAIRRegClass.hasSubClassEq(RC) || + X86::VK8PAIRRegClass.hasSubClassEq(RC) || + X86::VK16PAIRRegClass.hasSubClassEq(RC)) + return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) return load ? X86::MOV64rm : X86::MOV64mr; if (X86::FR64XRegClass.hasSubClassEq(RC)) return load ? - (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : - (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + (HasAVX512 ? X86::VMOVSDZrm_alt : + HasAVX ? X86::VMOVSDrm_alt : + X86::MOVSDrm_alt) : + (HasAVX512 ? 
X86::VMOVSDZmr : + HasAVX ? X86::VMOVSDmr : + X86::MOVSDmr); if (X86::VR64RegClass.hasSubClassEq(RC)) return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; if (X86::RFP64RegClass.hasSubClassEq(RC)) @@ -3219,7 +3066,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, } bool X86InstrInfo::getMemOperandWithOffset( - MachineInstr &MemOp, MachineOperand *&BaseOp, int64_t &Offset, + const MachineInstr &MemOp, const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const { const MCInstrDesc &Desc = MemOp.getDesc(); int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); @@ -3572,25 +3419,39 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) { static X86::CondCode isUseDefConvertible(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return X86::COND_INVALID; - case X86::LZCNT16rr: case X86::LZCNT16rm: - case X86::LZCNT32rr: case X86::LZCNT32rm: - case X86::LZCNT64rr: case X86::LZCNT64rm: + case X86::NEG8r: + case X86::NEG16r: + case X86::NEG32r: + case X86::NEG64r: + return X86::COND_AE; + case X86::LZCNT16rr: + case X86::LZCNT32rr: + case X86::LZCNT64rr: return X86::COND_B; - case X86::POPCNT16rr:case X86::POPCNT16rm: - case X86::POPCNT32rr:case X86::POPCNT32rm: - case X86::POPCNT64rr:case X86::POPCNT64rm: + case X86::POPCNT16rr: + case X86::POPCNT32rr: + case X86::POPCNT64rr: return X86::COND_E; - case X86::TZCNT16rr: case X86::TZCNT16rm: - case X86::TZCNT32rr: case X86::TZCNT32rm: - case X86::TZCNT64rr: case X86::TZCNT64rm: + case X86::TZCNT16rr: + case X86::TZCNT32rr: + case X86::TZCNT64rr: return X86::COND_B; - case X86::BSF16rr: case X86::BSF16rm: - case X86::BSF32rr: case X86::BSF32rm: - case X86::BSF64rr: case X86::BSF64rm: - case X86::BSR16rr: case X86::BSR16rm: - case X86::BSR32rr: case X86::BSR32rm: - case X86::BSR64rr: case X86::BSR64rm: + case X86::BSF16rr: + case X86::BSF32rr: + case X86::BSF64rr: + case X86::BSR16rr: + case X86::BSR32rr: + case X86::BSR64rr: return X86::COND_E; + case X86::BLSI32rr: + case X86::BLSI64rr: + return X86::COND_AE; + case X86::BLSR32rr: + case X86::BLSR64rr: + case X86::BLSMSK32rr: + case X86::BLSMSK64rr: + return X86::COND_B; + // TODO: TBM instructions. } } @@ -3602,7 +3463,6 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, const MachineRegisterInfo *MRI) const { // Check whether we can replace SUB with CMP. - unsigned NewOpcode = 0; switch (CmpInstr.getOpcode()) { default: break; case X86::SUB64ri32: @@ -3623,6 +3483,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; // There is no use of the destination register, we can replace SUB with CMP. + unsigned NewOpcode = 0; switch (CmpInstr.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::SUB64rm: NewOpcode = X86::CMP64rm; break; @@ -3746,7 +3607,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // If we are done with the basic block, we need to check whether EFLAGS is // live-out. bool IsSafe = false; - SmallVector<std::pair<MachineInstr *, unsigned>, 4> OpsToUpdate; + SmallVector<std::pair<MachineInstr *, X86::CondCode>, 4> OpsToUpdate; MachineBasicBlock::iterator E = CmpInstr.getParent()->end(); for (++I; I != E; ++I) { const MachineInstr &Instr = *I; @@ -3763,17 +3624,14 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // EFLAGS is used by this instruction.
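// (Each such user is paired with its replacement condition in OpsToUpdate and
// rewritten only after the scan proves the flags reach it unchanged; with the
// unified JCC/SETCC/CMOV opcodes the rewrite is a setImm on the condition
// operand rather than an opcode swap.)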
X86::CondCode OldCC = X86::COND_INVALID; - bool OpcIsSET = false; if (IsCmpZero || IsSwapped) { // We decode the condition code from opcode. if (Instr.isBranch()) - OldCC = X86::getCondFromBranchOpc(Instr.getOpcode()); + OldCC = X86::getCondFromBranch(Instr); else { - OldCC = X86::getCondFromSETOpc(Instr.getOpcode()); - if (OldCC != X86::COND_INVALID) - OpcIsSET = true; - else - OldCC = X86::getCondFromCMovOpc(Instr.getOpcode()); + OldCC = X86::getCondFromSETCC(Instr); + if (OldCC == X86::COND_INVALID) + OldCC = X86::getCondFromCMov(Instr); } if (OldCC == X86::COND_INVALID) return false; } @@ -3818,24 +3676,10 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, } if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) { - // Synthesize the new opcode. - bool HasMemoryOperand = Instr.hasOneMemOperand(); - unsigned NewOpc; - if (Instr.isBranch()) - NewOpc = GetCondBranchFromCond(ReplacementCC); - else if(OpcIsSET) - NewOpc = getSETFromCond(ReplacementCC, HasMemoryOperand); - else { - unsigned DstReg = Instr.getOperand(0).getReg(); - const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); - NewOpc = getCMovFromCond(ReplacementCC, TRI->getRegSizeInBits(*DstRC)/8, - HasMemoryOperand); - } - // Push the MachineInstr to OpsToUpdate. // If it is safe to remove CmpInstr, the condition code of these // instructions will be modified. - OpsToUpdate.push_back(std::make_pair(&*I, NewOpc)); + OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC)); } if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) { // It is safe to remove CmpInstr if EFLAGS is updated again or killed. @@ -3876,21 +3720,17 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, } // Make sure Sub instruction defines EFLAGS and mark the def live. - unsigned i = 0, e = Sub->getNumOperands(); - for (; i != e; ++i) { - MachineOperand &MO = Sub->getOperand(i); - if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { - MO.setIsDead(false); - break; - } - } - assert(i != e && "Unable to locate a def EFLAGS operand"); + MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS); + assert(FlagDef && "Unable to locate a def EFLAGS operand"); + FlagDef->setIsDead(false); CmpInstr.eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. - for (auto &Op : OpsToUpdate) - Op.first->setDesc(get(Op.second)); + for (auto &Op : OpsToUpdate) { + Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1) + .setImm(Op.second); + } return true; } @@ -4128,6 +3968,20 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB, return true; } + +static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { + MIB->setDesc(Desc); + int64_t ShiftAmt = MIB->getOperand(2).getImm(); + // Temporarily remove the immediate so we can add another source register. + MIB->RemoveOperand(2); + // Add the register. Don't copy the kill flag if there is one. + MIB.addReg(MIB->getOperand(1).getReg(), + getUndefRegState(MIB->getOperand(1).isUndef())); + // Add back the immediate. + MIB.addImm(ShiftAmt); + return true; +} + bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); @@ -4193,6 +4047,12 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MIB.addReg(SrcReg, RegState::ImplicitDefine); return true; } + if (MI.getOpcode() == X86::AVX512_256_SET0) { + // No VLX so we must reference a zmm. 
+ unsigned ZReg = + TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass); + MIB->getOperand(0).setReg(ZReg); + } return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); } case X86::V_SETALLONES: @@ -4282,6 +4142,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::XOR64_FP: case X86::XOR32_FP: return expandXorFP(MIB, *this); + case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8)); + case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8)); + case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8)); + case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8)); + case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break; + case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break; + case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break; + case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break; + case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break; + case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break; + case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break; + case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break; + case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break; + case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break; + case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break; } return false; } @@ -4303,7 +4178,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { /// FIXME: This should be turned into a TSFlags. /// static bool hasPartialRegUpdate(unsigned Opcode, - const X86Subtarget &Subtarget) { + const X86Subtarget &Subtarget, + bool ForLoadFold = false) { switch (Opcode) { case X86::CVTSI2SSrr: case X86::CVTSI2SSrm: @@ -4313,6 +4189,9 @@ static bool hasPartialRegUpdate(unsigned Opcode, case X86::CVTSI2SDrm: case X86::CVTSI642SDrr: case X86::CVTSI642SDrm: + // Load folding won't effect the undef register update since the input is + // a GPR. + return !ForLoadFold; case X86::CVTSD2SSrr: case X86::CVTSD2SSrm: case X86::CVTSS2SDrr: @@ -4389,7 +4268,7 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // Return true for any instruction the copies the high bits of the first source // operand into the unused high bits of the destination operand. 
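// e.g. VCVTSD2SSrr dst, src1, src2 converts src2 into dst's low lane and
// copies src1's upper bits into dst's upper bits, so an undef src1 is only a
// false dependency, one the BreakFalseDeps pass may redirect to a register it
// knows is ready.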
-static bool hasUndefRegUpdate(unsigned Opcode) { +static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { switch (Opcode) { case X86::VCVTSI2SSrr: case X86::VCVTSI2SSrm: @@ -4407,38 +4286,6 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::VCVTSI642SDrm: case X86::VCVTSI642SDrr_Int: case X86::VCVTSI642SDrm_Int: - case X86::VCVTSD2SSrr: - case X86::VCVTSD2SSrm: - case X86::VCVTSD2SSrr_Int: - case X86::VCVTSD2SSrm_Int: - case X86::VCVTSS2SDrr: - case X86::VCVTSS2SDrm: - case X86::VCVTSS2SDrr_Int: - case X86::VCVTSS2SDrm_Int: - case X86::VRCPSSr: - case X86::VRCPSSr_Int: - case X86::VRCPSSm: - case X86::VRCPSSm_Int: - case X86::VROUNDSDr: - case X86::VROUNDSDm: - case X86::VROUNDSDr_Int: - case X86::VROUNDSDm_Int: - case X86::VROUNDSSr: - case X86::VROUNDSSm: - case X86::VROUNDSSr_Int: - case X86::VROUNDSSm_Int: - case X86::VRSQRTSSr: - case X86::VRSQRTSSr_Int: - case X86::VRSQRTSSm: - case X86::VRSQRTSSm_Int: - case X86::VSQRTSSr: - case X86::VSQRTSSr_Int: - case X86::VSQRTSSm: - case X86::VSQRTSSm_Int: - case X86::VSQRTSDr: - case X86::VSQRTSDr_Int: - case X86::VSQRTSDm: - case X86::VSQRTSDm_Int: // AVX-512 case X86::VCVTSI2SSZrr: case X86::VCVTSI2SSZrm: @@ -4453,7 +4300,6 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::VCVTSI2SDZrr: case X86::VCVTSI2SDZrm: case X86::VCVTSI2SDZrr_Int: - case X86::VCVTSI2SDZrrb_Int: case X86::VCVTSI2SDZrm_Int: case X86::VCVTSI642SDZrr: case X86::VCVTSI642SDZrm: @@ -4479,6 +4325,42 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::VCVTUSI642SDZrr_Int: case X86::VCVTUSI642SDZrrb_Int: case X86::VCVTUSI642SDZrm_Int: + // Load folding won't effect the undef register update since the input is + // a GPR. + return !ForLoadFold; + case X86::VCVTSD2SSrr: + case X86::VCVTSD2SSrm: + case X86::VCVTSD2SSrr_Int: + case X86::VCVTSD2SSrm_Int: + case X86::VCVTSS2SDrr: + case X86::VCVTSS2SDrm: + case X86::VCVTSS2SDrr_Int: + case X86::VCVTSS2SDrm_Int: + case X86::VRCPSSr: + case X86::VRCPSSr_Int: + case X86::VRCPSSm: + case X86::VRCPSSm_Int: + case X86::VROUNDSDr: + case X86::VROUNDSDm: + case X86::VROUNDSDr_Int: + case X86::VROUNDSDm_Int: + case X86::VROUNDSSr: + case X86::VROUNDSSm: + case X86::VROUNDSSr_Int: + case X86::VROUNDSSm_Int: + case X86::VRSQRTSSr: + case X86::VRSQRTSSr_Int: + case X86::VRSQRTSSm: + case X86::VRSQRTSSm_Int: + case X86::VSQRTSSr: + case X86::VSQRTSSr_Int: + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSDr_Int: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + // AVX-512 case X86::VCVTSD2SSZrr: case X86::VCVTSD2SSZrr_Int: case X86::VCVTSD2SSZrrb_Int: @@ -4759,7 +4641,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if (Size <= RCSize && 4 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; unsigned NewOpCode = @@ -4783,7 +4665,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if (Size <= RCSize && 8 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) { unsigned NewOpCode = (MI.getOpcode() == X86::VMOVHLPSZrr) ? 
X86::VMOVLPSZ128rm : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm : @@ -4794,13 +4676,29 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( } } break; - }; + case X86::UNPCKLPDrr: + // If we won't be able to fold this to the memory form of UNPCKL, use + // MOVHPD instead. Done as custom because we can't have this in the load + // table twice. + if (OpNum == 2) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) { + MachineInstr *NewMI = + FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this); + return NewMI; + } + } + break; + } return nullptr; } -static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) { - if (MF.getFunction().optForSize() || !hasUndefRegUpdate(MI.getOpcode()) || +static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, + MachineInstr &MI) { + if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) || !MI.getOperand(1).isReg()) return false; @@ -4828,15 +4726,15 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // For CPUs that favor the register form of a call or push, // do not fold loads into calls or pushes, unless optimizing for size // aggressively. - if (isSlowTwoMemOps && !MF.getFunction().optForMinSize() && + if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() && (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r || MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r || MI.getOpcode() == X86::PUSH64r)) return nullptr; // Avoid partial and undef register update stalls unless optimizing for size. - if (!MF.getFunction().optForSize() && - (hasPartialRegUpdate(MI.getOpcode(), Subtarget) || + if (!MF.getFunction().hasOptSize() && + (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) || shouldPreventUndefRegUpdateMemFold(MF, MI))) return nullptr; @@ -4899,6 +4797,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size < RCSize) { + // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int. // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) @@ -4937,9 +4836,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI.getDesc().getNumDefs(); - unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0; - unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg(); - unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg(); + Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register(); + Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg(); + Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg(); bool Tied1 = 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied2 = @@ -4997,14 +4896,15 @@ MachineInstr * X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, - int FrameIndex, LiveIntervals *LIS) const { + int FrameIndex, LiveIntervals *LIS, + VirtRegMap *VRM) const { // Check switch flag if (NoFusing) return nullptr; // Avoid partial and undef register update stalls unless optimizing for size.
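// (e.g. CVTSD2SSrr writes only the low lane of its XMM destination, so a
// folded load still merges with stale upper bits and stalls; the GPR-input
// converts are exempted via ForLoadFold above because folding their load
// cannot change how the undef update is handled.)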
- if (!MF.getFunction().optForSize() && - (hasPartialRegUpdate(MI.getOpcode(), Subtarget) || + if (!MF.getFunction().hasOptSize() && + (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) || shouldPreventUndefRegUpdateMemFold(MF, MI))) return nullptr; @@ -5073,7 +4973,9 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); unsigned RegSize = TRI.getRegSizeInBits(*RC); - if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) && + if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm || + Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt || + Opc == X86::VMOVSSZrm_alt) && RegSize > 32) { // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes), and its user @@ -5087,6 +4989,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int: case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz: + case X86::VCMPSSZrr_Intk: case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz: case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz: case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz: @@ -5124,7 +5027,9 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, } } - if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) && + if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm || + Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt || + Opc == X86::VMOVSDZrm_alt) && RegSize > 64) { // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes), and its user @@ -5138,6 +5043,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int: case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz: + case X86::VCMPSDZrr_Intk: case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz: case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz: case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz: @@ -5203,8 +5109,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (NoFusing) return nullptr; // Avoid partial and undef register update stalls unless optimizing for size. - if (!MF.getFunction().optForSize() && - (hasPartialRegUpdate(MI.getOpcode(), Subtarget) || + if (!MF.getFunction().hasOptSize() && + (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) || shouldPreventUndefRegUpdateMemFold(MF, MI))) return nullptr; @@ -5359,10 +5265,7 @@ extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) { } else { // Clone the MMO and unset the store flag. LoadMMOs.push_back(MF.getMachineMemOperand( - MMO->getPointerInfo(), MMO->getFlags() & ~MachineMemOperand::MOStore, - MMO->getSize(), MMO->getBaseAlignment(), MMO->getAAInfo(), nullptr, - MMO->getSyncScopeID(), MMO->getOrdering(), - MMO->getFailureOrdering())); + MMO, MMO->getFlags() & ~MachineMemOperand::MOStore)); } } @@ -5383,10 +5286,7 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) { } else { // Clone the MMO and unset the load flag.
StoreMMOs.push_back(MF.getMachineMemOperand( - MMO->getPointerInfo(), MMO->getFlags() & ~MachineMemOperand::MOLoad, - MMO->getSize(), MMO->getBaseAlignment(), MMO->getAAInfo(), nullptr, - MMO->getSyncScopeID(), MMO->getOrdering(), - MMO->getFailureOrdering())); + MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad)); } } @@ -5668,7 +5568,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::LD_Fp64m: case X86::LD_Fp80m: case X86::MOVSSrm: + case X86::MOVSSrm_alt: case X86::MOVSDrm: + case X86::MOVSDrm_alt: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::MOVAPSrm: @@ -5679,7 +5581,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVDQUrm: // AVX load instructions case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: case X86::VMOVSDrm: + case X86::VMOVSDrm_alt: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -5694,7 +5598,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVDQUYrm: // AVX512 load instructions case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: case X86::VMOVAPSZ128rm_NOVLX: @@ -5745,7 +5651,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::LD_Fp64m: case X86::LD_Fp80m: case X86::MOVSSrm: + case X86::MOVSSrm_alt: case X86::MOVSDrm: + case X86::MOVSDrm_alt: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::MOVAPSrm: @@ -5756,7 +5664,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVDQUrm: // AVX load instructions case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: case X86::VMOVSDrm: + case X86::VMOVSDrm_alt: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -5771,7 +5681,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVDQUYrm: // AVX512 load instructions case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: case X86::VMOVAPSZ128rm_NOVLX: @@ -5943,7 +5855,9 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::MOVSDmr, X86::MOVSDmr, X86::MOVPQI2QImr }, { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr }, { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm }, + { X86::MOVSDrm_alt,X86::MOVSDrm_alt,X86::MOVQI2PQIrm }, { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm }, + { X86::MOVSSrm_alt,X86::MOVSSrm_alt,X86::MOVDI2PDIrm }, { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, @@ -5973,7 +5887,9 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVSDmr, X86::VMOVSDmr, X86::VMOVPQI2QImr }, { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr }, { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm }, + { X86::VMOVSDrm_alt,X86::VMOVSDrm_alt,X86::VMOVQI2PQIrm }, { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm }, + { X86::VMOVSSrm_alt,X86::VMOVSSrm_alt,X86::VMOVDI2PDIrm }, { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, @@ -6012,13 +5928,17 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr }, { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr }, { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm }, + { X86::VMOVSDZrm_alt, X86::VMOVSDZrm_alt, X86::VMOVQI2PQIZrm }, { X86::VMOVSSZrm, X86::VMOVSSZrm, 
X86::VMOVDI2PDIZrm }, + { X86::VMOVSSZrm_alt, X86::VMOVSSZrm_alt, X86::VMOVDI2PDIZrm }, { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r }, { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m }, { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r }, { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m }, { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr }, { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm }, + { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128r }, + { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128m }, { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r }, { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m }, { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr }, @@ -6109,6 +6029,8 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }, { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm}, { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr}, + { X86::VMOVDDUPrm, X86::VMOVDDUPrm, X86::VPBROADCASTQrm}, + { X86::VMOVDDUPrr, X86::VMOVDDUPrr, X86::VPBROADCASTQrr}, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr}, { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm}, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, @@ -6128,6 +6050,19 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr }, }; +static const uint16_t ReplaceableInstrsFP[][3] = { + //PackedSingle PackedDouble + { X86::MOVLPSrm, X86::MOVLPDrm, X86::INSTRUCTION_LIST_END }, + { X86::MOVHPSrm, X86::MOVHPDrm, X86::INSTRUCTION_LIST_END }, + { X86::MOVHPSmr, X86::MOVHPDmr, X86::INSTRUCTION_LIST_END }, + { X86::VMOVLPSrm, X86::VMOVLPDrm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSrm, X86::VMOVHPDrm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSmr, X86::VMOVHPDmr, X86::INSTRUCTION_LIST_END }, + { X86::VMOVLPSZ128rm, X86::VMOVLPDZ128rm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSZ128rm, X86::VMOVHPDZ128rm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSZ128mr, X86::VMOVHPDZ128mr, X86::INSTRUCTION_LIST_END }, +}; + static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = { //PackedSingle PackedDouble PackedInt { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, @@ -6368,7 +6303,7 @@ static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = { }; // NOTE: These should only be used by the custom domain methods. 
-static const uint16_t ReplaceableCustomInstrs[][3] = { +static const uint16_t ReplaceableBlendInstrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi }, { X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri }, @@ -6377,7 +6312,7 @@ static const uint16_t ReplaceableCustomInstrs[][3] = { { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi }, { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri }, }; -static const uint16_t ReplaceableCustomAVX2Instrs[][3] = { +static const uint16_t ReplaceableBlendAVX2Instrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi }, { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri }, @@ -6552,6 +6487,8 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const { MI.getOperand(2).getSubReg() == 0) return 0x6; return 0; + case X86::SHUFPDrri: + return 0x6; } return 0; } @@ -6571,9 +6508,9 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm); unsigned NewImm = Imm; - const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs); + const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs); if (!table) - table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs); + table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); if (Domain == 1) { // PackedSingle AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); @@ -6583,7 +6520,7 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, if (Subtarget.hasAVX2()) { // If we are already VPBLENDW use that, else use VPBLENDD. if ((ImmWidth / (Is256 ? 2 : 1)) != 8) { - table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs); + table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); } } else { @@ -6672,6 +6609,18 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, // We must always return true for MOVHLPSrr. if (Opcode == X86::MOVHLPSrr) return true; + break; + case X86::SHUFPDrri: { + if (Domain == 1) { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned NewImm = 0x44; + if (Imm & 1) NewImm |= 0x0a; + if (Imm & 2) NewImm |= 0xa0; + MI.getOperand(3).setImm(NewImm); + MI.setDesc(get(X86::SHUFPSrri)); + } + return true; + } } return false; } @@ -6691,6 +6640,8 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const { validDomains = 0xe; } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) { validDomains = Subtarget.hasAVX2() ? 0xe : 0x6; + } else if (lookup(opcode, domain, ReplaceableInstrsFP)) { + validDomains = 0x6; } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) { // Insert/extract instructions should only effect domain if AVX2 // is enabled. 
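// Aside: a standalone sketch of the mask scaling AdjustBlendMask performs for
// the domain switch above (hypothetical helper name). Each source lane of the
// old immediate widens to NewWidth/OldWidth bits, e.g. one BLENDPD double
// lane becomes four PBLENDW word lanes.
#include <cassert>
#include <cstdint>
static uint8_t widenBlendMask(uint8_t Imm, unsigned OldWidth,
                              unsigned NewWidth) {
  unsigned Scale = NewWidth / OldWidth;
  uint8_t NewImm = 0;
  for (unsigned I = 0; I != OldWidth; ++I)
    if (Imm & (1u << I))
      NewImm |= ((1u << Scale) - 1) << (I * Scale);
  return NewImm;
}
int main() {
  assert(widenBlendMask(0x2, 2, 8) == 0xF0); // blendpd 0b10 -> pblendw 0xF0
  return 0;
}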
@@ -6730,6 +6681,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { "256-bit vector operations only available in AVX2"); table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2); } + if (!table) { // try the FP table + table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP); + assert((!table || Domain < 3) && + "Can only select PackedSingle or PackedDouble"); + } if (!table) { // try the other table assert(Subtarget.hasAVX2() && "256-bit insert/extract only available in AVX2"); @@ -7140,6 +7096,20 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::PADDWrr: case X86::PADDDrr: case X86::PADDQrr: + case X86::PMULLWrr: + case X86::PMULLDrr: + case X86::PMAXSBrr: + case X86::PMAXSDrr: + case X86::PMAXSWrr: + case X86::PMAXUBrr: + case X86::PMAXUDrr: + case X86::PMAXUWrr: + case X86::PMINSBrr: + case X86::PMINSDrr: + case X86::PMINSWrr: + case X86::PMINUBrr: + case X86::PMINUDrr: + case X86::PMINUWrr: case X86::VPANDrr: case X86::VPANDYrr: case X86::VPANDDZ128rr: @@ -7243,6 +7213,78 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::VPMULLQZ128rr: case X86::VPMULLQZ256rr: case X86::VPMULLQZrr: + case X86::VPMAXSBrr: + case X86::VPMAXSBYrr: + case X86::VPMAXSBZ128rr: + case X86::VPMAXSBZ256rr: + case X86::VPMAXSBZrr: + case X86::VPMAXSDrr: + case X86::VPMAXSDYrr: + case X86::VPMAXSDZ128rr: + case X86::VPMAXSDZ256rr: + case X86::VPMAXSDZrr: + case X86::VPMAXSQZ128rr: + case X86::VPMAXSQZ256rr: + case X86::VPMAXSQZrr: + case X86::VPMAXSWrr: + case X86::VPMAXSWYrr: + case X86::VPMAXSWZ128rr: + case X86::VPMAXSWZ256rr: + case X86::VPMAXSWZrr: + case X86::VPMAXUBrr: + case X86::VPMAXUBYrr: + case X86::VPMAXUBZ128rr: + case X86::VPMAXUBZ256rr: + case X86::VPMAXUBZrr: + case X86::VPMAXUDrr: + case X86::VPMAXUDYrr: + case X86::VPMAXUDZ128rr: + case X86::VPMAXUDZ256rr: + case X86::VPMAXUDZrr: + case X86::VPMAXUQZ128rr: + case X86::VPMAXUQZ256rr: + case X86::VPMAXUQZrr: + case X86::VPMAXUWrr: + case X86::VPMAXUWYrr: + case X86::VPMAXUWZ128rr: + case X86::VPMAXUWZ256rr: + case X86::VPMAXUWZrr: + case X86::VPMINSBrr: + case X86::VPMINSBYrr: + case X86::VPMINSBZ128rr: + case X86::VPMINSBZ256rr: + case X86::VPMINSBZrr: + case X86::VPMINSDrr: + case X86::VPMINSDYrr: + case X86::VPMINSDZ128rr: + case X86::VPMINSDZ256rr: + case X86::VPMINSDZrr: + case X86::VPMINSQZ128rr: + case X86::VPMINSQZ256rr: + case X86::VPMINSQZrr: + case X86::VPMINSWrr: + case X86::VPMINSWYrr: + case X86::VPMINSWZ128rr: + case X86::VPMINSWZ256rr: + case X86::VPMINSWZrr: + case X86::VPMINUBrr: + case X86::VPMINUBYrr: + case X86::VPMINUBZ128rr: + case X86::VPMINUBZ256rr: + case X86::VPMINUBZrr: + case X86::VPMINUDrr: + case X86::VPMINUDYrr: + case X86::VPMINUDZ128rr: + case X86::VPMINUDZ256rr: + case X86::VPMINUDZrr: + case X86::VPMINUQZ128rr: + case X86::VPMINUQZ256rr: + case X86::VPMINUQZrr: + case X86::VPMINUWrr: + case X86::VPMINUWYrr: + case X86::VPMINUWZ128rr: + case X86::VPMINUWZ256rr: + case X86::VPMINUWZrr: // Normal min/max instructions are not commutative because of NaN and signed // zero semantics, but these are. Thus, there's no need to check for global // relaxed math; the instructions themselves have the properties we need. @@ -7698,7 +7740,7 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF, // Does the function use a red zone? If it does, then we can't risk messing // with the stack. 
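// (Outlining would replace straight-line code with a CALL, and the pushed
// return address lands in the 128 bytes below RSP that a leaf function may
// already be using as its red zone; has128ByteRedZone is the precise query
// for that, where the old NoRedZone-attribute test was only a conservative
// proxy.)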
-  if (!F.hasFnAttribute(Attribute::NoRedZone)) {
+  if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
     // It could have a red zone. If it does, then we don't want to touch it.
     const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
     if (!X86FI || X86FI->getUsesRedZone())
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 159cb50afc5c..13ca17139494 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -1,9 +1,8 @@
 //===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -36,62 +35,24 @@ enum AsmComments {
   AC_EVEX_2_VEX = MachineInstr::TAsmComments
 };
 
-// X86 specific condition code. These correspond to X86_*_COND in
-// X86InstrInfo.td. They must be kept in sync.
-enum CondCode {
-  COND_A = 0,
-  COND_AE = 1,
-  COND_B = 2,
-  COND_BE = 3,
-  COND_E = 4,
-  COND_G = 5,
-  COND_GE = 6,
-  COND_L = 7,
-  COND_LE = 8,
-  COND_NE = 9,
-  COND_NO = 10,
-  COND_NP = 11,
-  COND_NS = 12,
-  COND_O = 13,
-  COND_P = 14,
-  COND_S = 15,
-  LAST_VALID_COND = COND_S,
-
-  // Artificial condition codes. These are used by AnalyzeBranch
-  // to indicate a block terminated with two conditional branches that together
-  // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
-  // which can't be represented on x86 with a single condition. These
-  // are never used in MachineInstrs and are inverses of one another.
-  COND_NE_OR_P,
-  COND_E_AND_NP,
-
-  COND_INVALID
-};
-
-// Turn condition code into conditional branch opcode.
-unsigned GetCondBranchFromCond(CondCode CC);
-
 /// Return a pair of condition code for the given predicate and whether
 /// the instruction operands should be swapped to match the condition code.
 std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
 
-/// Return a set opcode for the given condition and whether it has
-/// a memory operand.
-unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+/// Return a setcc opcode based on whether it has a memory operand.
+unsigned getSETOpc(bool HasMemoryOperand = false);
 
-/// Return a cmov opcode for the given condition, register size in
-/// bytes, and operand type.
-unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
-                         bool HasMemoryOperand = false);
+/// Return a cmov opcode for the given register size in bytes, and operand type.
+unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false);
 
-// Turn jCC opcode into condition code.
-CondCode getCondFromBranchOpc(unsigned Opc);
+// Turn jCC instruction into condition code.
+CondCode getCondFromBranch(const MachineInstr &MI);
 
-// Turn setCC opcode into condition code.
-CondCode getCondFromSETOpc(unsigned Opc);
+// Turn setCC instruction into condition code.
+CondCode getCondFromSETCC(const MachineInstr &MI);
 
-// Turn CMov opcode into condition code.
-CondCode getCondFromCMovOpc(unsigned Opc);
+// Turn CMov instruction into condition code.
+CondCode getCondFromCMov(const MachineInstr &MI);
 
 /// GetOppositeBranchCondition - Return the inverse of the specified cond,
 /// e.g. turning COND_E to COND_NE.
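// [Editor's sketch, not part of the upstream patch] Illustrating the helper
// renames above: the condition is no longer baked into per-condition opcodes,
// so getCondFromBranch/getCondFromSETCC/getCondFromCMov inspect a ccode
// immediate operand on the instruction rather than switching on the opcode.
// A hedged example of inverting a conditional branch under the new API; the
// operand position is an assumption made for illustration only:
void invertJcc(MachineInstr &MI) {
  X86::CondCode CC = X86::getCondFromBranch(MI);
  if (CC == X86::COND_INVALID)
    return; // not a conditional branch
  X86::CondCode Inverse = X86::GetOppositeBranchCondition(CC);
  // The condition immediate is assumed here to be the last explicit operand.
  MI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Inverse);
}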
@@ -327,7 +288,8 @@ public:
                      SmallVectorImpl<MachineOperand> &Cond,
                      bool AllowModify) const override;
 
-  bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+  bool getMemOperandWithOffset(const MachineInstr &LdSt,
+                               const MachineOperand *&BaseOp,
                                int64_t &Offset,
                                const TargetRegisterInfo *TRI) const override;
   bool analyzeBranchPredicate(MachineBasicBlock &MBB,
@@ -388,7 +350,8 @@ public:
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr) const override;
+                        LiveIntervals *LIS = nullptr,
+                        VirtRegMap *VRM = nullptr) const override;
 
   /// foldMemoryOperand - Same as the previous version except it allows folding
   /// of any load and store from / to any address, not just from a specific
@@ -453,7 +416,10 @@ public:
   /// conservative. If it cannot definitely determine the safety after visiting
   /// a few instructions in each direction it assumes it's not safe.
   bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator I) const;
+                             MachineBasicBlock::iterator I) const {
+    return MBB.computeRegisterLiveness(&RI, X86::EFLAGS, I, 4) ==
+           MachineBasicBlock::LQR_Dead;
+  }
 
   /// True if MI has a condition code def, e.g. EFLAGS, that is
   /// not marked dead.
@@ -590,7 +556,8 @@ private:
   MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
                                              MachineFunction::iterator &MFI,
                                              MachineInstr &MI,
-                                             LiveVariables *LV) const;
+                                             LiveVariables *LV,
+                                             bool Is8BitOp) const;
 
   /// Handles memory folding for special case instructions, for instance those
   /// requiring custom manipulation of the address.
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index e53f83baa3c6..8e05dd8ec5c1 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -1,9 +1,8 @@
 //===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -64,6 +63,10 @@ def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; +def SDTX86rdpkru : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; +def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>]>; def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; @@ -124,6 +127,9 @@ def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; +def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>; + def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, [SDNPHasChain,SDNPSideEffect]>; def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, @@ -152,6 +158,11 @@ def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand, [SDNPHasChain, SDNPSideEffect]>; +def X86rdpkru : SDNode<"X86ISD::RDPKRU", SDTX86rdpkru, + [SDNPHasChain, SDNPSideEffect]>; +def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru, + [SDNPHasChain, SDNPSideEffect]>; + def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; @@ -206,13 +217,6 @@ def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad]>; -def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; - def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; @@ -306,6 +310,11 @@ def X86tpause : SDNode<"X86ISD::TPAUSE", SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, [SDNPHasChain, SDNPSideEffect]>; +def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD, + [SDNPHasChain, SDNPSideEffect]>; +def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD, + [SDNPHasChain, SDNPSideEffect]>; + //===----------------------------------------------------------------------===// // X86 Operand Definitions. // @@ -371,37 +380,35 @@ def anymem : X86MemOperand<"printanymem">; // restrict to only unsized memory. 
def opaquemem : X86MemOperand<"printopaquemem">; -def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>; -def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>; -def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>; -def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>; -def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>; -def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>; -def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>; -def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>; -def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>; -def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>; -def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>; -def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>; -def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>; - -def v512mem : X86VMemOperand; +def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>; +def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; +def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; +def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; +def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; +def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; +def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; +def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; +def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; +def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>; +def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; +def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; +def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; // Gather mem operands -def vx64mem : X86VMemOperand; -def vx128mem : X86VMemOperand; -def vx256mem : X86VMemOperand; -def vy128mem : X86VMemOperand; -def vy256mem : X86VMemOperand; - -def vx64xmem : X86VMemOperand; -def vx128xmem : X86VMemOperand; -def vx256xmem : X86VMemOperand; -def vy128xmem : X86VMemOperand; -def vy256xmem : X86VMemOperand; -def vy512xmem : X86VMemOperand; -def vz256mem : X86VMemOperand; -def vz512mem : X86VMemOperand; +def vx64mem : X86VMemOperand; +def vx128mem : X86VMemOperand; +def vx256mem : X86VMemOperand; +def vy128mem : X86VMemOperand; +def vy256mem : X86VMemOperand; + +def vx64xmem : X86VMemOperand; +def vx128xmem : X86VMemOperand; +def vx256xmem : X86VMemOperand; +def vy128xmem : X86VMemOperand; +def vy256xmem : X86VMemOperand; +def vy512xmem : X86VMemOperand; +def vz256mem : X86VMemOperand; +def vz512mem : X86VMemOperand; // A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead // of a plain GPR, so that it doesn't potentially require a REX prefix. @@ -409,7 +416,7 @@ def ptr_rc_norex : PointerLikeRegClass<2>; def ptr_rc_norex_nosp : PointerLikeRegClass<3>; def i8mem_NOREX : Operand { - let PrintMethod = "printi8mem"; + let PrintMethod = "printbytemem"; let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, SEGMENT_REG); let ParserMatchClass = X86Mem8AsmOperand; @@ -424,7 +431,7 @@ def ptr_rc_tailcall : PointerLikeRegClass<4>; // allowed to use callee-saved registers since they must be scheduled // after callee-saved register are popped. 
def i32mem_TC : Operand<i32> {
-  let PrintMethod = "printi32mem";
+  let PrintMethod = "printdwordmem";
   let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, i32imm,
                        SEGMENT_REG);
   let ParserMatchClass = X86Mem32AsmOperand;
@@ -435,7 +442,7 @@ def i32mem_TC : Operand<i32> {
 // allowed to use callee-saved registers since they must be scheduled
 // after callee-saved registers are popped.
 def i64mem_TC : Operand<i64> {
-  let PrintMethod = "printi64mem";
+  let PrintMethod = "printqwordmem";
   let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, i32imm,
                        SEGMENT_REG);
   let ParserMatchClass = X86Mem64AsmOperand;
@@ -603,24 +610,10 @@ def offset64_32 : X86MemOffsOperand;
 
-def SSECC : Operand<i8> {
-  let PrintMethod = "printSSEAVXCC";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def AVXCC : Operand<i8> {
-  let PrintMethod = "printSSEAVXCC";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def AVX512ICC : Operand<i8> {
-  let PrintMethod = "printSSEAVXCC";
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def XOPCC : Operand<i8> {
-  let PrintMethod = "printXOPCC";
-  let OperandType = "OPERAND_IMMEDIATE";
+def ccode : Operand<i8> {
+  let PrintMethod = "printCondCode";
+  let OperandNamespace = "X86";
+  let OperandType = "OPERAND_COND_CODE";
 }
 
 class ImmSExtAsmOperandClass : AsmOperandClass {
@@ -640,7 +633,8 @@ def AVX512RCOperand : AsmOperandClass {
 }
 def AVX512RC : Operand<i32> {
   let PrintMethod = "printRoundingControl";
-  let OperandType = "OPERAND_IMMEDIATE";
+  let OperandNamespace = "X86";
+  let OperandType = "OPERAND_ROUNDING_CONTROL";
   let ParserMatchClass = AVX512RCOperand;
 }
 
@@ -718,6 +712,14 @@ def u8imm : Operand<i8> {
   let OperandType = "OPERAND_IMMEDIATE";
 }
 
+// 16-bit immediate but only 8-bits are significant and they are unsigned.
+// Used by BT instructions.
+def i16u8imm : Operand<i16> {
+  let PrintMethod = "printU8Imm";
+  let ParserMatchClass = ImmUnsignedi8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
 // 32-bit immediate but only 8-bits are significant and they are unsigned.
 // Used by some SSE/AVX instructions that use intrinsics.
 def i32u8imm : Operand<i32> {
@@ -726,6 +728,14 @@ def i32u8imm : Operand<i32> {
   let OperandType = "OPERAND_IMMEDIATE";
 }
 
+// 64-bit immediate but only 8-bits are significant and they are unsigned.
+// Used by BT instructions.
+def i64u8imm : Operand<i64> {
+  let PrintMethod = "printU8Imm";
+  let ParserMatchClass = ImmUnsignedi8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
 // 64-bits but only 32 bits are significant, and those bits are treated as being
 // pc relative.
def i64i32imm_pcrel : Operand {
@@ -747,6 +757,33 @@ def lea64mem : Operand {
   let ParserMatchClass = X86MemAsmOperand;
 }
 
+let RenderMethod = "addMaskPairOperands" in {
+  def VK1PairAsmOperand : AsmOperandClass { let Name = "VK1Pair"; }
+  def VK2PairAsmOperand : AsmOperandClass { let Name = "VK2Pair"; }
+  def VK4PairAsmOperand : AsmOperandClass { let Name = "VK4Pair"; }
+  def VK8PairAsmOperand : AsmOperandClass { let Name = "VK8Pair"; }
+  def VK16PairAsmOperand : AsmOperandClass { let Name = "VK16Pair"; }
+}
+
+def VK1Pair : RegisterOperand {
+  let ParserMatchClass = VK1PairAsmOperand;
+}
+
+def VK2Pair : RegisterOperand {
+  let ParserMatchClass = VK2PairAsmOperand;
+}
+
+def VK4Pair : RegisterOperand {
+  let ParserMatchClass = VK4PairAsmOperand;
+}
+
+def VK8Pair : RegisterOperand {
+  let ParserMatchClass = VK8PairAsmOperand;
+}
+
+def VK16Pair : RegisterOperand {
+  let ParserMatchClass = VK16PairAsmOperand;
+}
 
 //===----------------------------------------------------------------------===//
 // X86 Complex Pattern Definitions.
 //
@@ -833,6 +870,8 @@ def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
 def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
 def PKU : Predicate<"Subtarget->hasPKU()">;
 def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
+def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
+def HasBF16 : Predicate<"Subtarget->hasBF16()">;
 def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
 def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
@@ -894,8 +933,10 @@ def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">;
 def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
 def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
 def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">;
+def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
 def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
 def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
+def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
 def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
                    AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
 def In64BitMode : Predicate<"Subtarget->is64Bit()">,
@@ -928,12 +969,12 @@ def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
 // the Function object through the Subtarget and objections were raised
 // to that (see post-commit review comments for r301750).
 let RecomputePerFunction = 1 in {
-  def OptForSize : Predicate<"MF->getFunction().optForSize()">;
-  def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">;
-  def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">;
+  def OptForSize : Predicate<"MF->getFunction().hasOptSize()">;
+  def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">;
+  def OptForSpeed : Predicate<"!MF->getFunction().hasOptSize()">;
   def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
-                            "MF->getFunction().optForSize()">;
-  def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().optForSize() || "
+                            "MF->getFunction().hasOptSize()">;
+  def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().hasOptSize() || "
                             "!Subtarget->hasSSE41()">;
 }
 
@@ -959,22 +1000,22 @@ include "X86InstrFormats.td"
 
 // X86 specific condition code. These correspond to CondCode in
 // X86InstrInfo.h. They must be kept in sync.
-def X86_COND_A   : PatLeaf<(i8 0)>;  // alt. COND_NBE
-def X86_COND_AE  : PatLeaf<(i8 1)>;  // alt. COND_NC
+def X86_COND_O   : PatLeaf<(i8 0)>;
+def X86_COND_NO  : PatLeaf<(i8 1)>;
 def X86_COND_B   : PatLeaf<(i8 2)>;  // alt. COND_C
-def X86_COND_BE  : PatLeaf<(i8 3)>;  // alt. COND_NA
+def X86_COND_AE  : PatLeaf<(i8 3)>;  // alt. COND_NC
 def X86_COND_E   : PatLeaf<(i8 4)>;  // alt. COND_Z
-def X86_COND_G   : PatLeaf<(i8 5)>;  // alt. COND_NLE
-def X86_COND_GE  : PatLeaf<(i8 6)>;  // alt. COND_NL
-def X86_COND_L   : PatLeaf<(i8 7)>;  // alt. COND_NGE
-def X86_COND_LE  : PatLeaf<(i8 8)>;  // alt. COND_NG
-def X86_COND_NE  : PatLeaf<(i8 9)>;  // alt. COND_NZ
-def X86_COND_NO  : PatLeaf<(i8 10)>;
+def X86_COND_NE  : PatLeaf<(i8 5)>;  // alt. COND_NZ
+def X86_COND_BE  : PatLeaf<(i8 6)>;  // alt. COND_NA
+def X86_COND_A   : PatLeaf<(i8 7)>;  // alt. COND_NBE
+def X86_COND_S   : PatLeaf<(i8 8)>;
+def X86_COND_NS  : PatLeaf<(i8 9)>;
+def X86_COND_P   : PatLeaf<(i8 10)>; // alt. COND_PE
 def X86_COND_NP  : PatLeaf<(i8 11)>; // alt. COND_PO
-def X86_COND_NS  : PatLeaf<(i8 12)>;
-def X86_COND_O   : PatLeaf<(i8 13)>;
-def X86_COND_P   : PatLeaf<(i8 14)>; // alt. COND_PE
-def X86_COND_S   : PatLeaf<(i8 15)>;
+def X86_COND_L   : PatLeaf<(i8 12)>; // alt. COND_NGE
+def X86_COND_GE  : PatLeaf<(i8 13)>; // alt. COND_NL
+def X86_COND_LE  : PatLeaf<(i8 14)>; // alt. COND_NG
+def X86_COND_G   : PatLeaf<(i8 15)>; // alt. COND_NLE
 
 def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
 def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
@@ -1007,16 +1048,13 @@ def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
 // Eventually, it would be nice to allow ConstantHoisting to merge constants
 // globally for potentially added savings.
 //
-def imm8_su : PatLeaf<(i8 relocImm), [{
+def relocImm8_su : PatLeaf<(i8 relocImm), [{
   return !shouldAvoidImmediateInstFormsForSize(N);
 }]>;
-def imm16_su : PatLeaf<(i16 relocImm), [{
+def relocImm16_su : PatLeaf<(i16 relocImm), [{
   return !shouldAvoidImmediateInstFormsForSize(N);
 }]>;
-def imm32_su : PatLeaf<(i32 relocImm), [{
-  return !shouldAvoidImmediateInstFormsForSize(N);
-}]>;
-def i64immSExt32_su : PatLeaf<(i64immSExt32), [{
+def relocImm32_su : PatLeaf<(i32 relocImm), [{
   return !shouldAvoidImmediateInstFormsForSize(N);
 }]>;
 
@@ -1121,7 +1159,19 @@ def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
 def extloadi64i1  : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
 def extloadi64i8  : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
 def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
-def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+// We can treat an i8/i16 extending load to i64 as a 32 bit load if it's known
+// to be 4 byte aligned or better.
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType != ISD::EXTLOAD)
+    return false;
+  if (LD->getMemoryVT() == MVT::i32)
+    return true;
+
+  return LD->getAlignment() >= 4 && !LD->isVolatile();
+}]>;
 
 // An 'and' node with a single use.
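// [Editor's note, not part of the upstream patch] The renumbering above makes
// the X86_COND_* values match the hardware condition-code encoding (O=0,
// NO=1, B=2, AE=3, E=4, NE=5, BE=6, A=7, S=8, NS=9, P=10, NP=11, L=12, GE=13,
// LE=14, G=15). A small sketch of why that layout is convenient: conditional
// opcodes can then be derived arithmetically instead of via lookup tables.
static unsigned shortJccOpcode(unsigned CC) {
  // Short-form Jcc is a single opcode byte: 0x70 + cc (e.g. 0x74 = JE rel8,
  // 0x75 = JNE rel8), and SETcc/CMOVcc follow the same cc numbering.
  return 0x70 + (CC & 0xf);
}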
@@ -1517,16 +1567,16 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), let SchedRW = [WriteStore] in { def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store (i8 imm8_su:$src), addr:$dst)]>; + [(store (i8 relocImm8_su:$src), addr:$dst)]>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store (i16 imm16_su:$src), addr:$dst)]>, OpSize16; + [(store (i16 relocImm16_su:$src), addr:$dst)]>, OpSize16; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(store (i32 imm32_su:$src), addr:$dst)]>, OpSize32; + [(store (i32 relocImm32_su:$src), addr:$dst)]>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(store i64immSExt32_su:$src, addr:$dst)]>, + [(store i64relocImmSExt32_su:$src, addr:$dst)]>, Requires<[In64BitMode]>; } // SchedRW @@ -1773,36 +1823,36 @@ let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteBitTestRegLd] in { } let SchedRW = [WriteBitTest] in { -def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), +def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16u8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>, + [(set EFLAGS, (X86bt GR16:$src1, imm:$src2))]>, OpSize16, TB; -def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), +def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32u8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, + [(set EFLAGS, (X86bt GR32:$src1, imm:$src2))]>, OpSize32, TB; -def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), +def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64u8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB; + [(set EFLAGS, (X86bt GR64:$src1, imm:$src2))]>, TB; } // SchedRW // Note that these instructions aren't slow because that only applies when the // other operand is in a register. When it's an immediate, bt is still fast. 
let SchedRW = [WriteBitTestImmLd] in { -def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), +def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16u8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi16 addr:$src1), - i16immSExt8:$src2))]>, + imm:$src2))]>, OpSize16, TB; -def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), +def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32u8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi32 addr:$src1), - i32immSExt8:$src2))]>, + imm:$src2))]>, OpSize32, TB; -def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), +def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64u8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi64 addr:$src1), - i64immSExt8:$src2))]>, TB, + imm:$src2))]>, TB, Requires<[In64BitMode]>; } // SchedRW @@ -1832,20 +1882,20 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), } let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in { -def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), +def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), +def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), +def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in { -def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), +def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16u8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), +def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32u8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), +def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64u8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB, Requires<[In64BitMode]>; } @@ -1875,24 +1925,24 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), } let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in { -def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), +def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), +def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), +def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in { -def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), +def BTR16mi8 : Ii8<0xBA, 
MRM6m, (outs), (ins i16mem:$src1, i16u8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), +def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32u8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), +def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64u8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, Requires<[In64BitMode]>; } @@ -1922,20 +1972,20 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), } let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in { -def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), +def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), +def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), +def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in { -def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), +def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16u8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), +def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32u8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), +def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64u8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB, Requires<[In64BitMode]>; } @@ -2090,12 +2140,13 @@ def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), - "cmpxchg8b\t$dst", []>, TB; + "cmpxchg8b\t$dst", []>, TB, Requires<[HasCmpxchg8b]>; let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in +// NOTE: In64BitMode check needed for the AssemblerPredicate. def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), "cmpxchg16b\t$dst", []>, - TB, Requires<[HasCmpxchg16b, In64BitMode]>; + TB, Requires<[HasCmpxchg16b,In64BitMode]>; } // SchedRW, mayLoad, mayStore, hasSideEffects @@ -2388,6 +2439,11 @@ def xor_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs), return hasNoCarryFlagUses(SDValue(N, 1)); }]>; +def and_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs), + (X86and_flag node:$lhs, node:$rhs), [{ + return hasNoCarryFlagUses(SDValue(N, 1)); +}]>; + let Predicates = [HasBMI] in { // FIXME: patterns for the load versions are not implemented def : Pat<(and GR32:$src, (add GR32:$src, -1)), @@ -2406,12 +2462,20 @@ let Predicates = [HasBMI] in { (BLSI64rr GR64:$src)>; // Versions to match flag producing ops. - // X86and_flag nodes are rarely created. Those should use CMP+AND. We do - // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed. 
+ def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, -1)), + (BLSR32rr GR32:$src)>; + def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, -1)), + (BLSR64rr GR64:$src)>; + def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, -1)), (BLSMSK32rr GR32:$src)>; def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, -1)), (BLSMSK64rr GR64:$src)>; + + def : Pat<(and_flag_nocf GR32:$src, (ineg GR32:$src)), + (BLSI32rr GR32:$src)>; + def : Pat<(and_flag_nocf GR64:$src, (ineg GR64:$src)), + (BLSI64rr GR64:$src)>; } multiclass bmi_bextr opc, string mnemonic, RegisterClass RC, @@ -2653,16 +2717,12 @@ defm LWPVAL64 : lwpval_intr, VEX_W; // MONITORX/MWAITX Instructions // let SchedRW = [ WriteSystem ] in { - let usesCustomInserter = 1 in { - def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), - [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>, - Requires<[ HasMWAITX ]>; - } - - let Uses = [ EAX, ECX, EDX ] in { - def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>, - TB, Requires<[ HasMWAITX ]>; - } + let Uses = [ EAX, ECX, EDX ] in + def MONITORX32rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>, + TB, Requires<[ HasMWAITX, Not64BitMode ]>; + let Uses = [ RAX, ECX, EDX ] in + def MONITORX64rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>, + TB, Requires<[ HasMWAITX, In64BitMode ]>; let Uses = [ ECX, EAX, EBX ] in { def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", @@ -2676,9 +2736,9 @@ def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>, def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>, Requires<[ In64BitMode ]>; -def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, +def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORX32rrr)>, Requires<[ Not64BitMode ]>; -def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, +def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORX64rrr)>, Requires<[ In64BitMode ]>; //===----------------------------------------------------------------------===// @@ -2737,22 +2797,51 @@ def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), T8PD, AdSize64, Requires<[HasMOVDIR64B, In64BitMode]>; } // SchedRW +//===----------------------------------------------------------------------===// +// ENQCMD/S - Enqueue 64-byte command as user with 64-byte write atomicity +// +let SchedRW = [WriteStore], Defs = [EFLAGS] in { + def ENQCMD16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src), + "enqcmd\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmd GR16:$dst, addr:$src))]>, + T8XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>; + def ENQCMD32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src), + "enqcmd\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmd GR32:$dst, addr:$src))]>, + T8XD, AdSize32, Requires<[HasENQCMD]>; + def ENQCMD64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), + "enqcmd\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmd GR64:$dst, addr:$src))]>, + T8XD, AdSize64, Requires<[HasENQCMD, In64BitMode]>; + + def ENQCMDS16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src), + "enqcmds\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmds GR16:$dst, addr:$src))]>, + T8XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>; + def ENQCMDS32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src), + "enqcmds\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmds GR32:$dst, addr:$src))]>, + T8XS, AdSize32, Requires<[HasENQCMD]>; + def 
ENQCMDS64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), + "enqcmds\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmds GR64:$dst, addr:$src))]>, + T8XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>; +} + //===----------------------------------------------------------------------===// // CLZERO Instruction // let SchedRW = [WriteSystem] in { let Uses = [EAX] in - def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, - TB, Requires<[HasCLZERO]>; - - let usesCustomInserter = 1 in { - def CLZERO : PseudoI<(outs), (ins i32mem:$src1), - [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>; - } + def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, + TB, Requires<[HasCLZERO, Not64BitMode]>; + let Uses = [RAX] in + def CLZERO64r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, + TB, Requires<[HasCLZERO, In64BitMode]>; } // SchedRW -def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>; -def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>; +def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>; +def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Pattern fragments to auto generate TBM instructions. @@ -2812,8 +2901,6 @@ let Predicates = [HasTBM] in { (TZMSK64rr GR64:$src)>; // Patterns to match flag producing ops. - // X86and_flag nodes are rarely created. Those should use CMP+AND. We do - // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed. def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))), (BLCI32rr GR32:$src)>; def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))), @@ -2825,6 +2912,11 @@ let Predicates = [HasTBM] in { def : Pat<(or_flag_nocf GR64:$src, (sub -2, GR64:$src)), (BLCI64rr GR64:$src)>; + def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, 1)), + (BLCIC32rr GR32:$src)>; + def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, 1)), + (BLCIC64rr GR64:$src)>; + def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, 1)), (BLCMSK32rr GR32:$src)>; def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, 1)), @@ -2849,6 +2941,11 @@ let Predicates = [HasTBM] in { (T1MSKC32rr GR32:$src)>; def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, 1)), (T1MSKC64rr GR64:$src)>; + + def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, -1)), + (TZMSK32rr GR32:$src)>; + def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, -1)), + (TZMSK64rr GR64:$src)>; } // HasTBM //===----------------------------------------------------------------------===// @@ -3231,39 +3328,39 @@ def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with // gas. 
multiclass FpUnaryAlias { - def : InstAlias; - def : InstAlias; + def : InstAlias; } -defm : FpUnaryAlias<"fadd", ADD_FST0r>; +defm : FpUnaryAlias<"fadd", ADD_FST0r, 0>; defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; -defm : FpUnaryAlias<"fsub", SUB_FST0r>; -defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; -defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; -defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; -defm : FpUnaryAlias<"fmul", MUL_FST0r>; -defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; -defm : FpUnaryAlias<"fdiv", DIV_FST0r>; -defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; -defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; -defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; +defm : FpUnaryAlias<"fsub", SUB_FST0r, 0>; +defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0, 0>; +defm : FpUnaryAlias<"fsubr", SUBR_FST0r, 0>; +defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0, 0>; +defm : FpUnaryAlias<"fmul", MUL_FST0r, 0>; +defm : FpUnaryAlias<"fmulp", MUL_FPrST0, 0>; +defm : FpUnaryAlias<"fdiv", DIV_FST0r, 0>; +defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0, 0>; +defm : FpUnaryAlias<"fdivr", DIVR_FST0r, 0>; +defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0, 0>; defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; -defm : FpUnaryAlias<"fcompi", COM_FIPr>; -defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; +defm : FpUnaryAlias<"fcompi", COM_FIPr, 0>; +defm : FpUnaryAlias<"fucompi", UCOM_FIPr, 0>; -// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they +// Handle "f{mulp,addp} $op, %st(0)" the same as "f{mulp,addp} $op", since they // commute. We also allow fdiv[r]p/fsubrp even though they don't commute, // solely because gas supports it. -def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; -def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; -def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; -def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; -def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; -def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; +def : InstAlias<"faddp\t{$op, %st|st, $op}", (ADD_FPrST0 RSTi:$op), 0>; +def : InstAlias<"fmulp\t{$op, %st|st, $op}", (MUL_FPrST0 RSTi:$op), 0>; +def : InstAlias<"fsub{|r}p\t{$op, %st|st, $op}", (SUBR_FPrST0 RSTi:$op), 0>; +def : InstAlias<"fsub{r|}p\t{$op, %st|st, $op}", (SUB_FPrST0 RSTi:$op), 0>; +def : InstAlias<"fdiv{|r}p\t{$op, %st|st, $op}", (DIVR_FPrST0 RSTi:$op), 0>; +def : InstAlias<"fdiv{r|}p\t{$op, %st|st, $op}", (DIV_FPrST0 RSTi:$op), 0>; def : InstAlias<"fnstsw" , (FNSTSW16r), 0>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 8f3357170576..57835b1a256a 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -1,9 +1,8 @@ //===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -153,7 +152,9 @@ multiclass sse12_cvt_pint_3addr opc, RegisterClass SrcRC, // MMX EMMS Instruction //===----------------------------------------------------------------------===// -let SchedRW = [WriteEMMS] in +let SchedRW = [WriteEMMS], + Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>; //===----------------------------------------------------------------------===// @@ -544,7 +545,7 @@ let Predicates = [HasMMX, HasSSE1] in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td index c1a8cc7c5fbf..f7d931510fe2 100644 --- a/lib/Target/X86/X86InstrMPX.td +++ b/lib/Target/X86/X86InstrMPX.td @@ -1,9 +1,8 @@ //===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td index 488cc4438076..747f5aa86653 100644 --- a/lib/Target/X86/X86InstrSGX.td +++ b/lib/Target/X86/X86InstrSGX.td @@ -1,9 +1,8 @@ //===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index e2bcd18ce660..7d0a5b87baf4 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1,9 +1,8 @@ //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -22,6 +21,7 @@ multiclass sse12_fp_scalar opc, string OpcodeStr, SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, Domain d, X86FoldableSchedWrite sched, bit Is2Addr = 1> { +let isCodeGenOnly = 1 in { let isCommutable = 1 in { def rr : SI opc, string OpcodeStr, SDNode OpNode, [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; } +} /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class multiclass sse12_fp_scalar_int opc, string OpcodeStr, @@ -44,7 +45,7 @@ multiclass sse12_fp_scalar_int opc, string OpcodeStr, ValueType VT, string asm, Operand memopr, ComplexPattern mem_cpat, Domain d, X86FoldableSchedWrite sched, bit Is2Addr = 1> { -let isCodeGenOnly = 1, hasSideEffects = 0 in { +let hasSideEffects = 0 in { def rr_Int : SI_Int { - def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), +multiclass sse12_move_rm { + def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], d>, + [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG; - def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], d>, + [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, Sched<[WriteFLoad]>; + + // _alt version uses FR32/FR64 register class. + let isCodeGenOnly = 1 in { + def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], d>, + VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG; + def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], d>, + Sched<[WriteFLoad]>; + } } defm MOVSS : sse12_move, XD; let canFoldAsLoad = 1, isReMaterializable = 1 in { - defm MOVSS : sse12_move_rm, XS; - defm MOVSD : sse12_move_rm, XD; } // Patterns let Predicates = [UseAVX] in { - // MOVSSrm zeros the high parts of the register; represent this - // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; - def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), - (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; - def : Pat<(v4f32 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; - - // MOVSDrm zeros the high parts of the register; represent this - // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), - (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), - (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (VMOVSSrm addr:$src)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (VMOVSDrm addr:$src)>; // Represent the same patterns above but in the form they appear for // 256-bit types - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; - def : Pat<(v8f32 (X86vzload addr:$src)), + def : Pat<(v8f32 (X86vzload32 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + def : Pat<(v4f64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; - def : Pat<(v4f64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; - - // Extract and store. - def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), - addr:$dst), - (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; } let Predicates = [UseAVX, OptForSize] in { @@ -304,59 +294,24 @@ let Predicates = [UseAVX, OptForSize] in { (SUBREG_TO_REG (i32 0), (v4i32 (VMOVSSrr (v4i32 (V_SET0)), (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>; - - def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v2f64 (VMOVSDrr (v2f64 (V_SET0)), - (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))), - sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v2i64 (VMOVSDrr (v2i64 (V_SET0)), - (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))), - sub_xmm)>; } -let Predicates = [UseSSE1] in { - let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in { - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVSS to the lower bits. - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>; - } - - // MOVSSrm already zeros the high parts of the register. - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; - def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), - (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; - def : Pat<(v4f32 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; - - // Extract and store. - def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), - addr:$dst), - (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; +let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in { +// Move scalar to XMM zero-extended, zeroing a VR128 then do a +// MOVSS to the lower bits. +def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>; +def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>; } -let Predicates = [UseSSE2] in { - // MOVSDrm already zeros the high parts of the register. 
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), - (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), - (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; -} - -// Aliases to help the assembler pick two byte VEX encodings by swapping the -// operands relative to the normal instructions to use VEX.R instead of VEX.B. -def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>; -def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>; +let Predicates = [UseSSE2] in +def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (MOVSDrm addr:$src)>; + +let Predicates = [UseSSE1] in +def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (MOVSSrm addr:$src)>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions @@ -504,25 +459,6 @@ let SchedRW = [SchedWriteFMoveLS.YMM.RR] in { } // SchedRW } // Predicate -// Aliases to help the assembler pick two byte VEX encodings by swapping the -// operands relative to the normal instructions to use VEX.R instead of VEX.B. -def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}", - (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}", - (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}", - (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}", - (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}", - (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>; -def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}", - (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>; -def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}", - (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>; -def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}", - (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>; - // Reversed version with ".s" suffix for GAS compatibility. 
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}", (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>; @@ -700,10 +636,10 @@ defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">; let SchedRW = [WriteFStore] in { let Predicates = [UseAVX] in { +let mayStore = 1, hasSideEffects = 0 in def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), - (iPTR 0))), addr:$dst)]>, + []>, VEX, VEX_WIG; def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", @@ -711,10 +647,10 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; }// UseAVX +let mayStore = 1, hasSideEffects = 0 in def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), - (iPTR 0))), addr:$dst)]>; + []>; def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt (v2f64 VR128:$src), @@ -722,16 +658,19 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), } // SchedRW let Predicates = [UseSSE1] in { - // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS - def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), - (iPTR 0))), addr:$src1), - (MOVLPSmr addr:$src1, VR128:$src2)>; - // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll // end up with a movsd or blend instead of shufp. // No need for aligned load, we're only loading 64-bits. - def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)), + def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1, + (i8 -28)), (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)), + (MOVLPSrm VR128:$src1, addr:$src2)>; + + def : Pat<(v4f32 (X86vzload64 addr:$src)), + (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>; + def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst), + (MOVLPSmr addr:$dst, VR128:$src)>; } //===----------------------------------------------------------------------===// @@ -744,24 +683,20 @@ let SchedRW = [WriteFStore] in { // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. 
let Predicates = [UseAVX] in { +let mayStore = 1, hasSideEffects = 0 in def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt - (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), - (bc_v2f64 (v4f32 VR128:$src))), - (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; + []>, VEX, VEX_WIG; def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; } // UseAVX +let mayStore = 1, hasSideEffects = 0 in def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt - (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), - (bc_v2f64 (v4f32 VR128:$src))), - (iPTR 0))), addr:$dst)]>; + []>; def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt @@ -775,19 +710,31 @@ let Predicates = [UseAVX] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (VMOVHPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; def : Pat<(store (f64 (extractelt (v2f64 (X86VPermilpi VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (VMOVHPDmr addr:$dst, VR128:$src)>; + + // MOVLPD patterns + def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; } let Predicates = [UseSSE1] in { // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll // end up with a movsd or blend instead of shufp. // No need for aligned load, we're only loading 64-bits. - def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)), + def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))), + (MOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))), (MOVHPSrm VR128:$src1, addr:$src2)>; + + def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)), + addr:$dst), + (MOVHPSmr addr:$dst, VR128:$src)>; } let Predicates = [UseSSE2] in { @@ -798,11 +745,24 @@ let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (MOVHPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))), + (MOVHPDrm VR128:$src1, addr:$src2)>; def : Pat<(store (f64 (extractelt (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; + + // MOVLPD patterns + def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in { + // Use MOVLPD to load into the low bits from a full vector unless we can use + // BLENDPD. 
+ def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; } //===----------------------------------------------------------------------===// @@ -847,13 +807,16 @@ let Constraints = "$src1 = $dst" in { multiclass sse12_cvt_s opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, - string asm, X86FoldableSchedWrite sched> { - def rr : SI, - Sched<[sched]>; - def rm : SI, - Sched<[sched.Folded]>; + string asm, string mem, X86FoldableSchedWrite sched, + SchedRead Int2Fpu = ReadDefault> { + def rr : SI, + Sched<[sched, Int2Fpu]>; + def rm : SI, + Sched<[sched.Folded]>; } multiclass sse12_cvt_p opc, RegisterClass RC, X86MemOperand x86memop, @@ -872,74 +835,55 @@ let hasSideEffects = 0 in { } multiclass sse12_vcvt_avx opc, RegisterClass SrcRC, RegisterClass DstRC, - X86MemOperand x86memop, string asm, + X86MemOperand x86memop, string asm, string mem, X86FoldableSchedWrite sched> { let hasSideEffects = 0, Predicates = [UseAVX] in { def rr : SI, - Sched<[sched]>; + Sched<[sched, ReadDefault, ReadInt2Fpu]>; let mayLoad = 1 in def rm : SI, + asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // hasSideEffects = 0 } -let Predicates = [UseAVX] in { +let isCodeGenOnly = 1, Predicates = [UseAVX] in { defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}", + "cvttss2si", "cvttss2si", WriteCvtSS2I>, XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}", + "cvttss2si", "cvttss2si", WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG; defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}", + "cvttsd2si", "cvttsd2si", WriteCvtSD2I>, XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}", + "cvttsd2si", "cvttsd2si", WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG; - -def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", - (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">; -def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", - (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">; -def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", - (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">; -def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", - (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">; -def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", - (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">; -def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", - (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">; -def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", - (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">; -def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", - (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">; } + // The assembler can recognize rr 64-bit instructions by seeing a rxx // register, but the same isn't true when only using memory operands, // provide other assembly "l" and "q" forms to address this explicitly // where appropriate to do so. 
-defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
+let isCodeGenOnly = 1 in {
+defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                   WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
-defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
+defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                   WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
-defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
+defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                   WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
-defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
+defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                   WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
+} // isCodeGenOnly = 1

 let Predicates = [UseAVX] in {
-  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
-                  (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
-  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
-                  (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
-
   def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
             (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
   def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -959,52 +903,32 @@ let Predicates = [UseAVX] in {
             (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
 }

+let isCodeGenOnly = 1 in {
 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
-                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      "cvttss2si", "cvttss2si",
                       WriteCvtSS2I>, XS;
 defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
-                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      "cvttss2si", "cvttss2si",
                       WriteCvtSS2I>, XS, REX_W;
 defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
-                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      "cvttsd2si", "cvttsd2si",
                       WriteCvtSD2I>, XD;
 defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
-                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      "cvttsd2si", "cvttsd2si",
                       WriteCvtSD2I>, XD, REX_W;
 defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
-                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
-                      WriteCvtI2SS>, XS;
+                      "cvtsi2ss", "cvtsi2ss{l}",
+                      WriteCvtI2SS, ReadInt2Fpu>, XS;
 defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
-                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
-                      WriteCvtI2SS>, XS, REX_W;
+                      "cvtsi2ss", "cvtsi2ss{q}",
+                      WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W;
 defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
-                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
-                      WriteCvtI2SD>, XD;
+                      "cvtsi2sd", "cvtsi2sd{l}",
+                      WriteCvtI2SD, ReadInt2Fpu>, XD;
 defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
-                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
-                      WriteCvtI2SD>, XD, REX_W;
-
-def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
-                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
-                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
-                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
-                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
-def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
-                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
-                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
-                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
-                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
-
-def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
-                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
-def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
-                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;
+                      "cvtsi2sd", "cvtsi2sd{q}",
+                      WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W;
+} // isCodeGenOnly = 1

 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
 // and/or XMM operand(s).
@@ -1025,20 +949,20 @@ multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,

 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                                 RegisterClass DstRC, X86MemOperand x86memop,
-                                string asm, X86FoldableSchedWrite sched,
+                                string asm, string mem, X86FoldableSchedWrite sched,
                                 bit Is2Addr = 1> {
 let hasSideEffects = 0 in {
   def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst),
                   (ins DstRC:$src1, SrcRC:$src2),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                       !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-                  []>, Sched<[sched]>;
+                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
   let mayLoad = 1 in
   def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                   (ins DstRC:$src1, x86memop:$src2),
                   !if(Is2Addr,
-                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
-                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
+                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 }
@@ -1057,48 +981,73 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                  sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;

-let isCodeGenOnly = 1 in {
-  let Predicates = [UseAVX] in {
-  defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
-            i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
-  defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
-            i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
-  defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
-            i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
-  defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
-            i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
-  }
-  let Constraints = "$src1 = $dst" in {
-    defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
-                      i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
-    defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
-                      i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
-    defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
-                      i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
-    defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
-                      i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
-  }
-} // isCodeGenOnly = 1
+let Predicates = [UseAVX] in {
+defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+          i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG;
+defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+          i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W;
+defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+          i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG;
+defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+          i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W;
+}
+let Constraints = "$src1 = $dst" in {
+  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                    i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS;
+  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                    i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W;
+  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                    i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD;
+  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                    i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W;
+}
+
+def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
+
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+                (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+                (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
+
+def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;

 /// SSE 1 Only

 // Aliases for intrinsics
-let isCodeGenOnly = 1 in {
 let Predicates = [UseAVX] in {
 defm VCVTTSS2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                    ssmem, sse_load_f32, "cvttss2si",
-                                   WriteCvtSS2I>, XS, VEX;
+                                   WriteCvtSS2I>, XS, VEX, VEX_LIG;
 defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                    X86cvtts2Int, ssmem, sse_load_f32,
                                    "cvttss2si", WriteCvtSS2I>,
-                                   XS, VEX, VEX_W;
+                                   XS, VEX, VEX_LIG, VEX_W;
 defm VCVTTSD2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                    sdmem, sse_load_f64, "cvttsd2si",
-                                   WriteCvtSS2I>, XD, VEX;
+                                   WriteCvtSS2I>, XD, VEX, VEX_LIG;
 defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                    X86cvtts2Int, sdmem, sse_load_f64,
                                    "cvttsd2si", WriteCvtSS2I>,
-                                   XD, VEX, VEX_W;
+                                   XD, VEX, VEX_LIG, VEX_W;
 }
 defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                 ssmem, sse_load_f32, "cvttss2si",
@@ -1112,7 +1061,40 @@ defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
 defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int,
                                   sdmem, sse_load_f64, "cvttsd2si",
                                   WriteCvtSD2I>, XD, REX_W;
-} // isCodeGenOnly = 1
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

 let Predicates = [UseAVX] in {
 defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
@@ -1143,7 +1125,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, Requires<[UseSSE2]>;

-let Predicates = [UseAVX] in {
+// AVX aliases
 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                 (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1160,8 +1142,8 @@ def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
-}

+// SSE aliases
 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                 (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1182,7 +1164,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",

 /// SSE 2 Only

 // Convert scalar double to scalar single
-let hasSideEffects = 0, Predicates = [UseAVX] in {
+let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
 def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                         (ins FR32:$src1, FR64:$src2),
                         "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -1200,6 +1182,7 @@ def : Pat<(f32 (fpround FR64:$src)),
           (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
           Requires<[UseAVX]>;

+let isCodeGenOnly = 1 in {
 def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                       "cvtsd2ss\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (fpround FR64:$src))]>,
@@ -1209,42 +1192,41 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                    [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
                    XD, Requires<[UseSSE2, OptForSize]>,
                    Sched<[WriteCvtSD2SS.Folded]>;
+}

-let isCodeGenOnly = 1 in {
 def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
-                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
+                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                        Sched<[WriteCvtSD2SS]>;
 def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
-                                          VR128:$src1, sse_load_f64:$src2))]>,
-                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+                       [(set VR128:$dst,
+                         (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
+                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                        Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
 let Constraints = "$src1 = $dst" in {
 def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>, + (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>; def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_cvtsd2ss - VR128:$src1, sse_load_f64:$src2))]>, + [(set VR128:$dst, + (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; } -} // isCodeGenOnly = 1 // Convert scalar single to scalar double // SSE2 instructions with XS prefix -let hasSideEffects = 0 in { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -1257,51 +1239,36 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), XS, VEX_4V, VEX_LIG, VEX_WIG, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, Requires<[UseAVX, OptForSize]>; -} +} // isCodeGenOnly = 1, hasSideEffects = 0 def : Pat<(f64 (fpextend FR32:$src)), (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>; def : Pat<(fpextend (loadf32 addr:$src)), (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[UseAVX, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, - Requires<[UseAVX, OptForSpeed]>; - +let isCodeGenOnly = 1 in { def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (fpextend FR32:$src))]>, XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>; def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (extloadf32 addr:$src))]>, + [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>, XS, Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtSS2SD.Folded]>; +} // isCodeGenOnly = 1 -// extload f32 -> f64. This matches load+fpextend because we have a hack in -// the isel (PreprocessForFPConvert) that can introduce loads after dag -// combine. -// Since these loads aren't folded into the fpextend, we have to match it -// explicitly here. 
-def : Pat<(fpextend (loadf32 addr:$src)),
-          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
-def : Pat<(extloadf32 addr:$src),
-          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
-
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let hasSideEffects = 0 in {
 def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                      []>, XS, VEX_4V, VEX_WIG,
+                      []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
                       Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
 let mayLoad = 1 in
 def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                      []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+                      []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
 def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
@@ -1316,7 +1283,7 @@ def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                       []>, XS, Requires<[UseSSE2]>,
                       Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
 }
-} // isCodeGenOnly = 1
+} // hasSideEffects = 0

 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
@@ -1476,15 +1443,11 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

 // XMM only
-def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
-                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                       Sched<[WriteCvtPD2ILd]>, VEX_WIG;
-def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
-                (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;

 // YMM only
 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
@@ -1497,12 +1460,13 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        [(set VR128:$dst,
                          (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
-def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
-                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
-def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
-                (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
 }

+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
+
 def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -1540,17 +1504,6 @@ def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                           Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
 }

-let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
-            (VCVTTPS2DQrr VR128:$src)>;
-  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
-            (VCVTTPS2DQrm addr:$src)>;
-  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
-            (VCVTTPS2DQYrr VR256:$src)>;
-  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
-            (VCVTTPS2DQYrm addr:$src)>;
-}
-
 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
@@ -1562,39 +1515,23 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        [(set VR128:$dst,
                         (v4i32
                          (X86cvttp2si (memopv4f32 addr:$src))))]>,
                        Sched<[WriteCvtPS2ILd]>;

-let Predicates = [UseSSE2] in {
-  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
-            (CVTTPS2DQrr VR128:$src)>;
-  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
-            (CVTTPS2DQrm addr:$src)>;
-}
-
-let Predicates = [HasAVX, NoVLX] in
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+let Predicates = [HasAVX, NoVLX] in {
+// XMM only
 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
-
-// The assembler can recognize rr 256-bit instructions by seeing a ymm
-// register, but the same isn't true when using memory operands instead.
-// Provide other assembly rr and rm forms to address this explicitly.
-
-// XMM only
-def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
-                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
-
-let Predicates = [HasAVX, NoVLX] in
 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
-def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
-                (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;

 // YMM only
-let Predicates = [HasAVX, NoVLX] in {
 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                          "cvttpd2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
@@ -1605,11 +1542,12 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                          [(set VR128:$dst,
                            (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
-}
-def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
-                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+} // Predicates = [HasAVX, NoVLX]
+
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
-                (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
+                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

 let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
@@ -1618,21 +1556,6 @@ let Predicates = [HasAVX, NoVLX] in {
             (VCVTTPD2DQYrm addr:$src)>;
 }

-let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(X86vzmovl (v2i64 (bitconvert
-                               (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
-            (VCVTPD2DQrr VR128:$src)>;
-  def : Pat<(X86vzmovl (v2i64 (bitconvert
-                               (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
-            (VCVTPD2DQrm addr:$src)>;
-  def : Pat<(X86vzmovl (v2i64 (bitconvert
-                               (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
-            (VCVTTPD2DQrr VR128:$src)>;
-  def : Pat<(X86vzmovl (v2i64 (bitconvert
-                               (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
-            (VCVTTPD2DQrm addr:$src)>;
-} // Predicates = [HasAVX, NoVLX]
-
 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -1644,21 +1567,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                       (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
                       Sched<[WriteCvtPD2ILd]>;

-let Predicates = [UseSSE2] in {
-  def : Pat<(X86vzmovl (v2i64
-                             (bitconvert
-                              (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
-            (CVTPD2DQrm addr:$src)>;
-  def : Pat<(X86vzmovl (v2i64 (bitconvert
-                               (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
-            (CVTTPD2DQrr VR128:$src)>;
-  def : Pat<(X86vzmovl (v2i64 (bitconvert
-                               (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
-            (CVTTPD2DQrm addr:$src)>;
-} // Predicates = [UseSSE2]
-
 // Convert packed single to packed double
 let Predicates = [HasAVX, NoVLX] in {
 // SSE2 instructions without OpSize prefix
@@ -1697,7 +1605,10 @@ let hasSideEffects = 0, mayLoad = 1 in
 def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
+                          (v2f64 (X86VSintToFP
+                                  (bc_v4i32
+                                   (v2i64 (scalar_to_vector
+                                           (loadi64 addr:$src)))))))]>,
                         VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
 def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1721,7 +1632,10 @@ let hasSideEffects = 0, mayLoad = 1 in
 def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
+                         (v2f64 (X86VSintToFP
+                                 (bc_v4i32
+                                  (v2i64 (scalar_to_vector
+                                          (loadi64 addr:$src)))))))]>,
                        Sched<[WriteCvtI2PDLd]>;
 def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1731,17 +1645,13 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),

 // AVX register conversion intrinsics
 let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
-            (VCVTDQ2PDrm addr:$src)>;
-  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
             (VCVTDQ2PDrm addr:$src)>;
 } // Predicates = [HasAVX, NoVLX]

 // SSE2 register conversion intrinsics
 let Predicates = [UseSSE2] in {
-  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
-            (CVTDQ2PDrm addr:$src)>;
-  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
             (CVTDQ2PDrm addr:$src)>;
 } // Predicates = [UseSSE2]

@@ -1749,38 +1659,31 @@ let Predicates = [UseSSE2] in {
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
 // register, but the same isn't true when using memory operands instead.
 // Provide other assembly rr and rm forms to address this explicitly.
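As a side note on the semantics involved here (my sketch, not part of the change): the X86cvttp2si patterns select the C-style truncating conversion, while X86cvtp2Int rounds under the current MXCSR mode. The difference is easy to observe from the intrinsics:

    #include <emmintrin.h>
    #include <cstdio>

    int main() {
      __m128d v = _mm_set_pd(0.0, 1.7);   // element 0 = 1.7
      __m128i t = _mm_cvttpd_epi32(v);    // cvttpd2dq: truncate toward zero -> 1
      __m128i r = _mm_cvtpd_epi32(v);     // cvtpd2dq: round-to-nearest     -> 2
      std::printf("%d %d\n", _mm_cvtsi128_si32(t), _mm_cvtsi128_si32(r));
    }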
-let Predicates = [HasAVX, NoVLX] in
+let Predicates = [HasAVX, NoVLX] in {
+// XMM only
 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
                        VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
-
-// XMM only
-def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
-                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
-let Predicates = [HasAVX, NoVLX] in
 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
                        VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
-def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
-                (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">;

-// YMM only
-let Predicates = [HasAVX, NoVLX] in {
 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst, (fpround VR256:$src))]>,
+                        [(set VR128:$dst, (X86vfpround VR256:$src))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>,
+                        [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
-}
-def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
-                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
+} // Predicates = [HasAVX, NoVLX]
+
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
-                (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">;
+                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;

 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2ps\t{$src, $dst|$dst, $src}",
@@ -1791,28 +1694,11 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
                      Sched<[WriteCvtPD2PS.Folded]>;

-// AVX 256-bit register conversion intrinsics
-// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
-// whenever possible to avoid declaring two versions of each one.
-let Predicates = [HasAVX, NoVLX] in {
-  // Match fpround and fpextend for 128/256-bit conversions
-  def : Pat<(X86vzmovl (v2f64 (bitconvert
-                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
-            (VCVTPD2PSrr VR128:$src)>;
-  def : Pat<(X86vzmovl (v2f64 (bitconvert
-                               (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
-            (VCVTPD2PSrm addr:$src)>;
-}
-
-let Predicates = [UseSSE2] in {
-  // Match fpround and fpextend for 128 conversions
-  def : Pat<(X86vzmovl (v2f64 (bitconvert
-                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
-            (CVTPD2PSrr VR128:$src)>;
-  def : Pat<(X86vzmovl (v2f64 (bitconvert
-                               (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
-            (CVTPD2PSrm addr:$src)>;
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v4f32 (fpround (v4f64 VR256:$src))),
+            (VCVTPD2PSYrr VR256:$src)>;
+  def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
+            (VCVTPD2PSYrm addr:$src)>;
 }

 //===----------------------------------------------------------------------===//
@@ -1821,94 +1707,80 @@ let Predicates = [UseSSE2] in {

 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                             SDNode OpNode, ValueType VT, PatFrag ld_frag,
                             string asm, X86FoldableSchedWrite sched> {
   let isCommutable = 1 in
   def rr : SIi8<0xC2, MRMSrcReg,
-                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
                 Sched<[sched]>;
   def rm : SIi8<0xC2, MRMSrcMem,
-                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1),
                                        (ld_frag addr:$src2), imm:$cc))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
-
-  // Accept explicit immediate argument form instead of comparison code.
-  let isAsmParserOnly = 1, hasSideEffects = 0 in {
-    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
-                      (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>,
-                      Sched<[sched]>, NotMemoryFoldable;
-    let mayLoad = 1 in
-    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
-                      (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>,
-                      Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
-  }
 }

-let ExeDomain = SSEPackedSingle in
-defm VCMPSS : sse12_cmp_scalar,
-              XS, VEX_4V, VEX_LIG, VEX_WIG;
-let ExeDomain = SSEPackedDouble in
-defm VCMPSD : sse12_cmp_scalar,
-              XD, VEX_4V, VEX_LIG, VEX_WIG;
-
-let Constraints = "$src1 = $dst" in {
+let isCodeGenOnly = 1 in {
   let ExeDomain = SSEPackedSingle in
-  defm CMPSS : sse12_cmp_scalar, XS;
+  defm VCMPSS : sse12_cmp_scalar, XS, VEX_4V, VEX_LIG, VEX_WIG;
   let ExeDomain = SSEPackedDouble in
-  defm CMPSD : sse12_cmp_scalar, XD;
+  defm VCMPSD : sse12_cmp_scalar,
+                XD, VEX_4V, VEX_LIG, VEX_WIG;
+
+  let Constraints = "$src1 = $dst" in {
+    let ExeDomain = SSEPackedSingle in
+    defm CMPSS : sse12_cmp_scalar, XS;
+    let ExeDomain = SSEPackedDouble in
+    defm CMPSD : sse12_cmp_scalar, XD;
+  }
 }

 multiclass sse12_cmp_scalar_int<Operand memop, ComplexPattern mem_cpat,
                                 Intrinsic Int, string asm,
                                 X86FoldableSchedWrite sched> {
   def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
-                    (ins VR128:$src1, VR128:$src, CC:$cc), asm,
+                    (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
                     [(set VR128:$dst, (Int VR128:$src1,
                                            VR128:$src, imm:$cc))]>,
                     Sched<[sched]>;
   let mayLoad = 1 in
   def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
-                    (ins VR128:$src1, memop:$src, CC:$cc), asm,
+                    (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
                     [(set VR128:$dst, (Int VR128:$src1,
                                            mem_cpat:$src, imm:$cc))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
 }

-let isCodeGenOnly = 1 in {
-  // Aliases to match intrinsics which expect XMM operand(s).
+// Aliases to match intrinsics which expect XMM operand(s).
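For reference, a minimal sketch of the kind of user code the _Int variants below are selected from (my illustration, not from the patch): the SSE scalar-compare intrinsics take whole XMM operands and encode the predicate in the imm8 that this change now exposes directly as u8imm instead of a parser-only CC operand.

    #include <emmintrin.h>
    #include <cstdio>

    int main() {
      __m128d a = _mm_set_sd(1.0), b = _mm_set_sd(2.0);
      // cmplt_sd encodes predicate 1 (LT) as the imm8 of cmpsd xmm, xmm, imm8;
      // the result is an all-ones/all-zeros mask in element 0.
      __m128d m = _mm_cmplt_sd(a, b);
      std::printf("%d\n", _mm_movemask_pd(m) & 1);  // prints 1, since 1.0 < 2.0
    }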
+let ExeDomain = SSEPackedSingle in
+defm VCMPSS : sse12_cmp_scalar_int,
+              XS, VEX_4V, VEX_LIG, VEX_WIG;
+let ExeDomain = SSEPackedDouble in
+defm VCMPSD : sse12_cmp_scalar_int,
+              XD, VEX_4V, VEX_LIG, VEX_WIG;
+let Constraints = "$src1 = $dst" in {
   let ExeDomain = SSEPackedSingle in
-  defm VCMPSS : sse12_cmp_scalar_int, XS, VEX_4V;
+  defm CMPSS : sse12_cmp_scalar_int, XS;
   let ExeDomain = SSEPackedDouble in
-  defm VCMPSD : sse12_cmp_scalar_int,
-                XD, VEX_4V;
-  let Constraints = "$src1 = $dst" in {
-    let ExeDomain = SSEPackedSingle in
-    defm CMPSS : sse12_cmp_scalar_int, XS;
-    let ExeDomain = SSEPackedDouble in
-    defm CMPSD : sse12_cmp_scalar_int, XD;
-}
+  defm CMPSD : sse12_cmp_scalar_int, XD;
 }

@@ -1962,14 +1834,14 @@ let Defs = [EFLAGS] in {
 let isCodeGenOnly = 1 in {
   defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
-                       sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG;
+                       sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
   defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
-                       sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG;
+                       sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
   defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
-                       sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG;
+                       sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
   defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
-                       sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG;
+                       sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
 }
 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                              "ucomiss", WriteFCom>, PS;
@@ -1998,56 +1870,38 @@ let Defs = [EFLAGS] in {

 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                             ValueType VT, string asm,
                             X86FoldableSchedWrite sched,
                             Domain d, PatFrag ld_frag> {
   let isCommutable = 1 in
   def rri : PIi8<0xC2, MRMSrcReg,
-                 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+                 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
                  Sched<[sched]>;
   def rmi : PIi8<0xC2, MRMSrcMem,
-                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                  [(set RC:$dst,
                    (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
-
-  // Accept explicit immediate argument form instead of comparison code.
-  let isAsmParserOnly = 1, hasSideEffects = 0 in {
-    def rri_alt : PIi8<0xC2, MRMSrcReg,
-               (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
-               asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable;
-    let mayLoad = 1 in
-    def rmi_alt : PIi8<0xC2, MRMSrcMem,
-               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
-               asm_alt, [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>,
-               NotMemoryFoldable;
-  }
 }

-defm VCMPPS : sse12_cmp_packed, PS, VEX_4V, VEX_WIG;
-defm VCMPPD : sse12_cmp_packed, PD, VEX_4V, VEX_WIG;
-defm VCMPPSY : sse12_cmp_packed, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VCMPPDY : sse12_cmp_packed, PD, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in {
-  defm CMPPS : sse12_cmp_packed, PS;
-  defm CMPPD : sse12_cmp_packed, PD;
 }

@@ -2111,12 +1965,14 @@ let Predicates = [UseSSE1] in {

 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt,
                          string asm, PatFrag mem_frag,
-                         X86FoldableSchedWrite sched, Domain d> {
+                         X86FoldableSchedWrite sched, Domain d,
+                         bit IsCommutable = 0> {
   def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                  [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                     (i8 imm:$src3))))], d>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
+  let isCommutable = IsCommutable in
   def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                  [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
@@ -2148,7 +2004,7 @@ let Constraints = "$src1 = $dst" in {
                     memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
   defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                     "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
+                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
 }

//===----------------------------------------------------------------------===//
@@ -2238,6 +2094,13 @@ let Predicates = [HasAVX1Only] in {
             (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
 }

+let Predicates = [UseSSE2] in {
+  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                    (v2f64 (nonvolatile_load addr:$src2)))),
+            (MOVHPDrm VR128:$src1, addr:$src2)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Extract Floating-Point Sign mask
 //===----------------------------------------------------------------------===//
@@ -2523,99 +2386,6 @@ let Predicates = [HasAVX1Only] in {
             (VANDNPSYrm VR256:$src1, addr:$src2)>;
 }

-let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
-  // Use packed logical operations for scalar ops.
-  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                               (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-
-  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                               (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-}
-
-let Predicates = [UseSSE1] in {
-  // Use packed logical operations for scalar ops.
-  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                            (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-}
-
-let Predicates = [UseSSE2] in {
-  // Use packed logical operations for scalar ops.
-  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                            (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-}
-
 let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
             (VPANDrr VR128:$src1, VR128:$src2)>;
@@ -2908,7 +2678,8 @@ let isCodeGenOnly = 1 in {
 // patterns we have to try to match.
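A quick illustration of what the scalar-math patterns below are matching (mine, not from the patch): a scalar SSE op such as ADDSD writes only element 0 and passes the destination's upper lane through, which is why the DAG shape is a scalar op on an extracted element 0 reinserted via movss/movsd.

    #include <emmintrin.h>
    #include <cstdio>

    int main() {
      __m128d a = _mm_set_pd(9.0, 1.0);   // lanes { 1.0, 9.0 }
      __m128d b = _mm_set_pd(5.0, 2.0);   // lanes { 2.0, 5.0 }
      __m128d r = _mm_add_sd(a, b);       // { 1.0 + 2.0, 9.0 }: upper lane kept
      double lo, hi;
      _mm_storel_pd(&lo, r);
      _mm_storeh_pd(&hi, r);
      std::printf("%g %g\n", lo, hi);     // prints: 3 9
    }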
 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                 ValueType VT, ValueType EltTy,
-                                RegisterClass RC, Predicate BasePredicate> {
+                                RegisterClass RC, PatFrag ld_frag,
+                                Predicate BasePredicate> {
   let Predicates = [BasePredicate] in {
     // extracted scalar math op with insert via movss/movsd
     def : Pat<(VT (Move (VT VR128:$dst),
@@ -2917,6 +2688,11 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
               (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+    def : Pat<(VT (Move (VT VR128:$dst),
+                   (VT (scalar_to_vector
+                        (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                            (ld_frag addr:$src)))))),
+              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
   }

   // Repeat for AVX versions of the instructions.
@@ -2928,18 +2704,23 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
               (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+    def : Pat<(VT (Move (VT VR128:$dst),
+                   (VT (scalar_to_vector
+                        (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                            (ld_frag addr:$src)))))),
+              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
   }
 }

-defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;

-defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;

 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to
@@ -2956,7 +2737,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
-  let hasSideEffects = 0 in {
+  let isCodeGenOnly = 1, hasSideEffects = 0 in {
   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
             !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
             [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
@@ -2967,8 +2748,9 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
            Sched<[sched.Folded]>,
            Requires<[target, OptForSize]>;
+  }

-  let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
+  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
   def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
             Sched<[sched]>;
@@ -2977,7 +2759,6 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
-  }
 }

@@ -3022,7 +2803,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
-  let hasSideEffects = 0 in {
+  let isCodeGenOnly = 1, hasSideEffects = 0 in {
   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
             Sched<[sched]>;
@@ -3030,7 +2811,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
-  let isCodeGenOnly = 1, ExeDomain = d in {
+  }
+  let hasSideEffects = 0, ExeDomain = d in {
   def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
-  }

 // We don't want to fold scalar loads into these instructions unless
 // optimizing for size.
// This is because the folded instruction will have a
@@ -3197,23 +2978,6 @@ multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
-multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix,
-                                          SDNode Move, ValueType VT,
-                                          bits<8> ImmV,
-                                          Predicate BasePredicate> {
-  let Predicates = [BasePredicate] in {
-    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
-                                  (OpNode (extractelt VT:$src, 0))))),
-              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
-  }
-
-  // Repeat for AVX versions of the instructions.
-  let Predicates = [UseAVX] in {
-    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
-                                  (OpNode (extractelt VT:$src, 0))))),
-              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
-  }
-}
-
 defm : scalar_unary_math_patterns;
 defm : scalar_unary_math_patterns;

@@ -3388,16 +3152,20 @@ def : Pat<(X86MFence), (MFENCE)>;
 // SSE 1 & 2 - Load/Store XCSR register
 //===----------------------------------------------------------------------===//

+let mayLoad=1, hasSideEffects=1 in
 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                     "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
                     VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
+let mayStore=1, hasSideEffects=1 in
 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                     "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
                     VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;

+let mayLoad=1, hasSideEffects=1 in
 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
                 TB, Sched<[WriteLDMXCSR]>;
+let mayStore=1, hasSideEffects=1 in
 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
                 TB, Sched<[WriteSTMXCSR]>;
@@ -3529,17 +3297,6 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
 } // ExeDomain = SSEPackedInt

-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
-                (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
-                (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
-def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
-                (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
-                (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
-
 // Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; @@ -4118,7 +3875,7 @@ multiclass sse2_pinsrw { "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, u8imm:$src3), @@ -4138,7 +3895,7 @@ def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), imm:$src2))]>, - PD, VEX, Sched<[WriteVecExtract]>; + PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>; def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4148,7 +3905,7 @@ def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, // Insert let Predicates = [HasAVX, NoBWI] in -defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; +defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG; let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in defm PINSRW : sse2_pinsrw, PD; @@ -4279,19 +4036,11 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX, Sched<[WriteVecMoveFromGpr]>; - def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, - VEX, Sched<[WriteVecLoad]>; def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>, Sched<[WriteVecMoveFromGpr]>; - def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, - Sched<[WriteVecLoad]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 //===---------------------------------------------------------------------===// @@ -4353,32 +4102,15 @@ def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), // Bitcast FR64 <-> GR64 // let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { - let Predicates = [UseAVX] in - def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), - "movq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, - VEX, Sched<[WriteVecLoad]>; def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64:$src))]>, VEX, Sched<[WriteVecMoveToGpr]>; - def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), - "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, - VEX, Sched<[WriteVecStore]>; - def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), - "movq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, - Sched<[WriteVecLoad]>; def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64:$src))]>, Sched<[WriteVecMoveToGpr]>; - def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), - "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, - Sched<[WriteVecStore]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 //===---------------------------------------------------------------------===// @@ -4389,18 +4121,10 
 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (bitconvert FR32:$src))]>,
                         VEX, Sched<[WriteVecMoveToGpr]>;
-  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
-                          "movd\t{$src, $dst|$dst, $src}",
-                          [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
-                          VEX, Sched<[WriteVecStore]>;
   def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (bitconvert FR32:$src))]>,
                         Sched<[WriteVecMoveToGpr]>;
-  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
-                        "movd\t{$src, $dst|$dst, $src}",
-                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
-                        Sched<[WriteVecStore]>;
 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

 let Predicates = [UseAVX] in {
@@ -4410,28 +4134,14 @@ let Predicates = [UseAVX] in {
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
             (VMOV64toPQIrr GR64:$src)>;
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
   // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
   // These instructions also write zeros in the high part of a 256-bit register.
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
             (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+  def : Pat<(v4i32 (X86vzload32 addr:$src)),
             (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
-            (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzload addr:$src)),
-            (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                               (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
-  def : Pat<(v8i32 (X86vzload addr:$src)),
+  def : Pat<(v8i32 (X86vzload32 addr:$src)),
             (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
-  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
 }

 let Predicates = [UseSSE2] in {
@@ -4442,11 +4152,7 @@ let Predicates = [UseSSE2] in {
             (MOV64toPQIrr GR64:$src)>;
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
             (MOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
-            (MOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
-            (MOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzload addr:$src)),
+  def : Pat<(v4i32 (X86vzload32 addr:$src)),
             (MOVDI2PDIrm addr:$src)>;
 }

@@ -4508,32 +4214,26 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}", []>;
 }

-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}", - (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>; - def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>; def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>; let Predicates = [UseAVX] in { - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + def : Pat<(v2i64 (X86vzload64 addr:$src)), (VMOVQI2PQIrm addr:$src)>; - def : Pat<(v2i64 (X86vzload addr:$src)), - (VMOVQI2PQIrm addr:$src)>; - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; - def : Pat<(v4i64 (X86vzload addr:$src)), + def : Pat<(v4i64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; + + def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst), + (VMOVPQI2QImr addr:$dst, VR128:$src)>; } let Predicates = [UseSSE2] in { - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (MOVQI2PQIrm addr:$src)>; - def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>; + + def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst), + (MOVPQI2QImr addr:$dst, VR128:$src)>; } //===---------------------------------------------------------------------===// @@ -4560,6 +4260,19 @@ let Predicates = [UseSSE2] in { (MOVZPQILo2PQIrr VR128:$src)>; } +let Predicates = [UseAVX] in { + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v2f64 (VMOVZPQILo2PQIrr + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))), + sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v2i64 (VMOVZPQILo2PQIrr + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))), + sub_xmm)>; +} + //===---------------------------------------------------------------------===// // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// @@ -4667,17 +4380,17 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(X86Movddup (loadv2f64 addr:$src)), + def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))), + def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; } let Predicates = [UseSSE3] in { // No need for aligned memory as this only loads 64-bits. 
-  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+  def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
             (MOVDDUPrm addr:$src)>;
-  def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
+  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
             (MOVDDUPrm addr:$src)>;
 }

@@ -5130,15 +4843,12 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
//===---------------------------------------------------------------------===//

 let SchedRW = [WriteSystem] in {
-let usesCustomInserter = 1 in {
-def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
-                      [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
-                      Requires<[HasSSE3]>;
-}
-
 let Uses = [EAX, ECX, EDX] in
-def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
-                   TB, Requires<[HasSSE3]>;
+def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+                   TB, Requires<[HasSSE3, Not64BitMode]>;
+let Uses = [RAX, ECX, EDX] in
+def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+                   TB, Requires<[HasSSE3, In64BitMode]>;

 let Uses = [ECX, EAX] in
 def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
@@ -5148,13 +4858,14 @@ def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",

 def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
 def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

-def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
      Requires<[Not64BitMode]>;
-def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
+// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
//===----------------------------------------------------------------------===//

 multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
@@ -5202,71 +4913,38 @@ defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;

-// Patterns that we also need for any_extend.
-// Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg.
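The zero-extension behavior that makes this promotion sound is easy to see with the PMOVZX intrinsics. A minimal sketch (mine, not from the patch; compile with -msse4.1): when the widened value's upper bits are never consumed as sign bits, any_extend and zero_extend can legitimately select the same instruction.

    #include <smmintrin.h>
    #include <cstdio>

    int main() {
      __m128i bytes = _mm_set1_epi8(static_cast<char>(0x90)); // high bit set
      __m128i words = _mm_cvtepu8_epi16(bytes);               // pmovzxbw
      // Zero-extended to 0x0090, not the 0xFF90 that pmovsxbw would produce.
      std::printf("0x%04x\n", _mm_extract_epi16(words, 0) & 0xFFFF);
    }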
-multiclass SS41I_pmovx_avx2_patterns_base { - // Register-Register patterns - let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), - (!cast(OpcPrefix#BWYrr) VR128:$src)>; - } - - let Predicates = [HasAVX2, NoVLX] in { - def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), - (!cast(OpcPrefix#WDYrr) VR128:$src)>; - - def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), - (!cast(OpcPrefix#DQYrr) VR128:$src)>; - } - - // AVX2 Register-Memory patterns - let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), - (!cast(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWYrm) addr:$src)>; - } - - let Predicates = [HasAVX2, NoVLX] in { - def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), - (!cast(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDYrm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), - (!cast(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQYrm) addr:$src)>; - } -} - // AVX2 Patterns multiclass SS41I_pmovx_avx2_patterns : - SS41I_pmovx_avx2_patterns_base { - + SDNode ExtOp, SDNode InVecOp> { // Register-Register patterns + let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast(OpcPrefix#BWYrr) VR128:$src)>; + } let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))), (!cast(OpcPrefix#BDYrr) VR128:$src)>; def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))), (!cast(OpcPrefix#BQYrr) VR128:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast(OpcPrefix#WDYrr) VR128:$src)>; def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))), (!cast(OpcPrefix#WQYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast(OpcPrefix#DQYrr) VR128:$src)>; } // Simple Register-Memory patterns let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i16 (!cast(ExtTy#"extloadvi8") addr:$src)), (!cast(OpcPrefix#BWYrm) addr:$src)>; + + def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; } + let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v8i32 (!cast(ExtTy#"extloadvi8") addr:$src)), (!cast(OpcPrefix#BDYrm) addr:$src)>; @@ -5284,38 +4962,31 @@ multiclass SS41I_pmovx_avx2_patterns(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))), (!cast(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast(OpcPrefix#BQYrm) addr:$src)>; - def 
: Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))), (!cast(OpcPrefix#BQYrm) addr:$src)>; def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))), + def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))), (!cast(OpcPrefix#WQYrm) addr:$src)>; } } defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>; defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>; -defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>; // SSE4.1/AVX patterns. multiclass SS41I_pmovx_patterns(OpcPrefix#BWrm) addr:$src)>; def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast(OpcPrefix#BWrm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWrm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), (!cast(OpcPrefix#BWrm) addr:$src)>; def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BWrm) addr:$src)>; @@ -5371,19 +5040,13 @@ multiclass SS41I_pmovx_patterns(OpcPrefix#BDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast(OpcPrefix#BDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))), (!cast(OpcPrefix#BDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), (!cast(OpcPrefix#BQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast(OpcPrefix#BQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQrm) addr:$src)>; @@ -5391,18 +5054,14 @@ multiclass SS41I_pmovx_patterns(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast(OpcPrefix#WDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (!cast(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#WQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), - (!cast(OpcPrefix#WQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))), (!cast(OpcPrefix#WQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQrm) addr:$src)>; @@ -5411,9 +5070,7 @@ multiclass SS41I_pmovx_patterns(OpcPrefix#DQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 
(!cast(OpcPrefix#DQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (!cast(OpcPrefix#DQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))), (!cast(OpcPrefix#DQrm) addr:$src)>; @@ -5451,7 +5108,7 @@ multiclass SS41I_extract8 opc, string OpcodeStr> { } let Predicates = [HasAVX, NoBWI] in - defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; + defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG; defm PEXTRB : SS41I_extract8<0x14, "pextrb">; @@ -5475,7 +5132,7 @@ multiclass SS41I_extract16 opc, string OpcodeStr> { } let Predicates = [HasAVX, NoBWI] in - defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; + defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -5548,18 +5205,6 @@ let ExeDomain = SSEPackedSingle in { defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; } -// Also match an EXTRACTPS store when the store is done as f32 instead of i32. -def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), - imm:$src2))), - addr:$dst), - (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, - Requires<[HasAVX]>; -def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), - imm:$src2))), - addr:$dst), - (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, - Requires<[UseSSE41]>; - //===----------------------------------------------------------------------===// // SSE4.1 - Insert Instructions //===----------------------------------------------------------------------===// @@ -5573,7 +5218,7 @@ multiclass SS41I_insert8 opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8 opc, string asm, bit Is2Addr = 1> { } let Predicates = [HasAVX, NoBWI] in - defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; + defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -5599,7 +5244,7 @@ multiclass SS41I_insert32 opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8 opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8 opc, string asm, bit Is2Addr = 1> { + let isCommutable = 1 in def rr : SS4AIi8, VEX_4V, VEX_LIG, VEX_WIG; @@ -5862,141 +5508,17 @@ let Predicates = [HasAVX, NoAVX512] in { } let Predicates = [UseAVX] in { - def : Pat<(ffloor FR32:$src), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; - def : Pat<(f32 (fnearbyint FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; - def : Pat<(f32 (fceil FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; - def : Pat<(f32 (frint FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; - def : Pat<(f32 (ftrunc FR32:$src)), - (VROUNDSSr (f32 
(IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; - - def : Pat<(f64 (ffloor FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; - def : Pat<(f64 (fnearbyint FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; - def : Pat<(f64 (fceil FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; - def : Pat<(f64 (frint FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; - def : Pat<(f64 (ftrunc FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; + def : Pat<(X86VRndScale FR32:$src1, imm:$src2), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>; + def : Pat<(X86VRndScale FR64:$src1, imm:$src2), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>; } let Predicates = [UseAVX, OptForSize] in { - def : Pat<(ffloor (loadf32 addr:$src)), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; - def : Pat<(f32 (fnearbyint (loadf32 addr:$src))), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; - def : Pat<(f32 (fceil (loadf32 addr:$src))), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; - def : Pat<(f32 (frint (loadf32 addr:$src))), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; - def : Pat<(f32 (ftrunc (loadf32 addr:$src))), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; - - def : Pat<(f64 (ffloor (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; - def : Pat<(f64 (fnearbyint (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; - def : Pat<(f64 (fceil (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; - def : Pat<(f64 (frint (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; - def : Pat<(f64 (ftrunc (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; -} - -let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4f32 (ffloor VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x9))>; - def : Pat<(v4f32 (fnearbyint VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0xC))>; - def : Pat<(v4f32 (fceil VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0xA))>; - def : Pat<(v4f32 (frint VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x4))>; - def : Pat<(v4f32 (ftrunc VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0xB))>; - - def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0x9))>; - def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0xC))>; - def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0xA))>; - def : Pat<(v4f32 (frint (loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0x4))>; - def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0xB))>; - - def : Pat<(v2f64 (ffloor VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x9))>; - def : Pat<(v2f64 (fnearbyint VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0xC))>; - def : Pat<(v2f64 (fceil VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0xA))>; - def : Pat<(v2f64 (frint VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x4))>; - def : Pat<(v2f64 (ftrunc VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0xB))>; - - def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))), - (VROUNDPDm addr:$src, (i32 0x9))>; - def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))), - (VROUNDPDm addr:$src, (i32 0xC))>; - def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))), - (VROUNDPDm addr:$src, (i32 0xA))>; - def : Pat<(v2f64 (frint (loadv2f64 addr:$src))), - (VROUNDPDm addr:$src, (i32 0x4))>; - def : Pat<(v2f64 (ftrunc (loadv2f64 
addr:$src))), - (VROUNDPDm addr:$src, (i32 0xB))>; - - def : Pat<(v8f32 (ffloor VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0x9))>; - def : Pat<(v8f32 (fnearbyint VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0xC))>; - def : Pat<(v8f32 (fceil VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0xA))>; - def : Pat<(v8f32 (frint VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0x4))>; - def : Pat<(v8f32 (ftrunc VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0xB))>; - - def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0x9))>; - def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0xC))>; - def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0xA))>; - def : Pat<(v8f32 (frint (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0x4))>; - def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0xB))>; - - def : Pat<(v4f64 (ffloor VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0x9))>; - def : Pat<(v4f64 (fnearbyint VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0xC))>; - def : Pat<(v4f64 (fceil VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0xA))>; - def : Pat<(v4f64 (frint VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0x4))>; - def : Pat<(v4f64 (ftrunc VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0xB))>; - - def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0x9))>; - def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0xC))>; - def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0xA))>; - def : Pat<(v4f64 (frint (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0x4))>; - def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0xB))>; + def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2), + (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; + def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2), + (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; } let ExeDomain = SSEPackedSingle in @@ -6013,108 +5535,19 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, v4f32, v2f64, X86RndScales>; let Predicates = [UseSSE41] in { - def : Pat<(ffloor FR32:$src), - (ROUNDSSr FR32:$src, (i32 0x9))>; - def : Pat<(f32 (fnearbyint FR32:$src)), - (ROUNDSSr FR32:$src, (i32 0xC))>; - def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr FR32:$src, (i32 0xA))>; - def : Pat<(f32 (frint FR32:$src)), - (ROUNDSSr FR32:$src, (i32 0x4))>; - def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr FR32:$src, (i32 0xB))>; - - def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0x9))>; - def : Pat<(f64 (fnearbyint FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0xC))>; - def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0xA))>; - def : Pat<(f64 (frint FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0x4))>; - def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0xB))>; + def : Pat<(X86VRndScale FR32:$src1, imm:$src2), + (ROUNDSSr FR32:$src1, imm:$src2)>; + def : Pat<(X86VRndScale FR64:$src1, imm:$src2), + (ROUNDSDr FR64:$src1, imm:$src2)>; } let Predicates = [UseSSE41, OptForSize] in { - def : Pat<(ffloor (loadf32 addr:$src)), - (ROUNDSSm addr:$src, (i32 0x9))>; - def : Pat<(f32 (fnearbyint (loadf32 addr:$src))), - (ROUNDSSm addr:$src, (i32 0xC))>; - def : Pat<(f32 (fceil (loadf32 addr:$src))), - (ROUNDSSm addr:$src, (i32 0xA))>; - def : Pat<(f32 (frint (loadf32 addr:$src))), - (ROUNDSSm addr:$src, (i32 0x4))>; - def : Pat<(f32 (ftrunc (loadf32 addr:$src))), - (ROUNDSSm 
addr:$src, (i32 0xB))>;
-
-  def : Pat<(f64 (ffloor (loadf64 addr:$src))),
-            (ROUNDSDm addr:$src, (i32 0x9))>;
-  def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
-            (ROUNDSDm addr:$src, (i32 0xC))>;
-  def : Pat<(f64 (fceil (loadf64 addr:$src))),
-            (ROUNDSDm addr:$src, (i32 0xA))>;
-  def : Pat<(f64 (frint (loadf64 addr:$src))),
-            (ROUNDSDm addr:$src, (i32 0x4))>;
-  def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
-            (ROUNDSDm addr:$src, (i32 0xB))>;
+  def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
+            (ROUNDSSm addr:$src1, imm:$src2)>;
+  def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
+            (ROUNDSDm addr:$src1, imm:$src2)>;
}

-let Predicates = [UseSSE41] in {
-  def : Pat<(v4f32 (ffloor VR128:$src)),
-            (ROUNDPSr VR128:$src, (i32 0x9))>;
-  def : Pat<(v4f32 (fnearbyint VR128:$src)),
-            (ROUNDPSr VR128:$src, (i32 0xC))>;
-  def : Pat<(v4f32 (fceil VR128:$src)),
-            (ROUNDPSr VR128:$src, (i32 0xA))>;
-  def : Pat<(v4f32 (frint VR128:$src)),
-            (ROUNDPSr VR128:$src, (i32 0x4))>;
-  def : Pat<(v4f32 (ftrunc VR128:$src)),
-            (ROUNDPSr VR128:$src, (i32 0xB))>;
-
-  def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))),
-            (ROUNDPSm addr:$src, (i32 0x9))>;
-  def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))),
-            (ROUNDPSm addr:$src, (i32 0xC))>;
-  def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))),
-            (ROUNDPSm addr:$src, (i32 0xA))>;
-  def : Pat<(v4f32 (frint (memopv4f32 addr:$src))),
-            (ROUNDPSm addr:$src, (i32 0x4))>;
-  def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))),
-            (ROUNDPSm addr:$src, (i32 0xB))>;
-
-  def : Pat<(v2f64 (ffloor VR128:$src)),
-            (ROUNDPDr VR128:$src, (i32 0x9))>;
-  def : Pat<(v2f64 (fnearbyint VR128:$src)),
-            (ROUNDPDr VR128:$src, (i32 0xC))>;
-  def : Pat<(v2f64 (fceil VR128:$src)),
-            (ROUNDPDr VR128:$src, (i32 0xA))>;
-  def : Pat<(v2f64 (frint VR128:$src)),
-            (ROUNDPDr VR128:$src, (i32 0x4))>;
-  def : Pat<(v2f64 (ftrunc VR128:$src)),
-            (ROUNDPDr VR128:$src, (i32 0xB))>;
-
-  def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))),
-            (ROUNDPDm addr:$src, (i32 0x9))>;
-  def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))),
-            (ROUNDPDm addr:$src, (i32 0xC))>;
-  def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))),
-            (ROUNDPDm addr:$src, (i32 0xA))>;
-  def : Pat<(v2f64 (frint (memopv2f64 addr:$src))),
-            (ROUNDPDm addr:$src, (i32 0x4))>;
-  def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))),
-            (ROUNDPDm addr:$src, (i32 0xB))>;
-}
-
-defm : scalar_unary_math_imm_patterns;
-defm : scalar_unary_math_imm_patterns;
-defm : scalar_unary_math_imm_patterns;
-defm : scalar_unary_math_imm_patterns;
-
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
@@ -6449,6 +5882,72 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm4 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0x3 << (i * 2);
+  }
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm2 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 2; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0xf << (i * 4);
+  }
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
+def BlendScaleImm2to4 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 2; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0x3 << (i * 2);
+  }
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0x3 << (i * 2);
+  }
+  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 2; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0xf << (i * 4);
+  }
+  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
+def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 2; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0x3 << (i * 2);
+  }
+  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
+}]>;
+
let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
@@ -6559,6 +6058,42 @@ let Predicates = [HasAVX2] in {
                                  VEX_4V, VEX_L, VEX_WIG;
}

+// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
+// ExecutionDomainFixPass will clean up domains later on.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
+          (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
+          (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
+          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movsd via commuting under optsize.
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
+
+def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
+          (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
+          (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
+          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
+
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movss via commuting under optsize.
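The immediate-scaling transforms above are easy to model standalone. A C++ sketch mirroring BlendScaleImm4 and its commuted variant (the checks in main are mine); the integer-domain pblendw patterns that consume these transforms continue below.

#include <cassert>
#include <cstdint>

// Widen a 4-bit per-dword blend mask into the 8-bit per-word mask pblendw
// expects: each dword lane covers two word lanes, so each mask bit is
// duplicated into a pair of adjacent bits.
uint8_t blendScaleImm4(uint8_t Imm) {
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i)
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  return NewImm;
}

// Commuted variant: swapping the two blend sources complements the mask.
uint8_t blendScaleCommuteImm4(uint8_t Imm) {
  return blendScaleImm4(Imm) ^ 0xff;
}

int main() {
  assert(blendScaleImm4(0x5) == 0x33);        // 0101 -> 00110011
  assert(blendScaleCommuteImm4(0x5) == 0xcc); // inverted pair mask
}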
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), + (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +} + defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, VR128, memop, f128mem, 1, SSEPackedSingle, SchedWriteFBlend.XMM, BlendCommuteImm4>; @@ -6569,6 +6104,24 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, VR128, memop, i128mem, 1, SSEPackedInt, SchedWriteBlend.XMM, BlendCommuteImm8>; +let Predicates = [UseSSE41] in { +// Use pblendw for 128-bit integer to keep it in the integer domain and prevent +// it from becoming movss via commuting under optsize. +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), + (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; +def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; + +def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), + (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +} + // For insertion into the zero index (low half) of a 256-bit vector, it is // more efficient to generate a blend with immediate instead of an insert*128. 
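To see why a blend with immediate can implement the low-half insertion the comment above describes, recall the immediate-blend semantics: mask bit i selects lane i of the second source, and a clear bit keeps lane i of the first. A minimal C++ model (lane type and names are illustrative):

#include <array>
#include <cstdint>

// 8-lane immediate blend as performed by vblendps ymm.
std::array<float, 8> blend8(const std::array<float, 8> &A,
                            const std::array<float, 8> &B, uint8_t Imm) {
  std::array<float, 8> R{};
  for (unsigned i = 0; i != 8; ++i)
    R[i] = (Imm & (1u << i)) ? B[i] : A[i];
  return R;
}

Inserting a 128-bit value into the low half is then a blend with mask 0xf; the memory forms in the patterns below use 0xf0 instead, since there the register operand already holds the low half and the loaded value supplies the upper lanes.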
let Predicates = [HasAVX] in { @@ -6580,18 +6133,25 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)), (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm), 0xf)>; + +def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)), + (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xc)>; +def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; } -/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators -multiclass SS41I_quaternary_int_avx opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, Intrinsic IntId, - X86FoldableSchedWrite sched> { +/// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators +multiclass SS41I_quaternary_avx opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + PatFrag mem_frag, SDNode OpNode, + X86FoldableSchedWrite sched> { def rr : Ii8Reg, TAPD, VEX_4V, Sched<[sched]>; @@ -6600,8 +6160,8 @@ multiclass SS41I_quaternary_int_avx opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (IntId RC:$src1, (mem_frag addr:$src2), - RC:$src3))], SSEPackedInt>, TAPD, VEX_4V, + (OpNode RC:$src3, (mem_frag addr:$src2), + RC:$src1))], SSEPackedInt>, TAPD, VEX_4V, Sched<[sched.Folded, sched.ReadAfterFold, // x86memop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -6612,68 +6172,47 @@ multiclass SS41I_quaternary_int_avx opc, string OpcodeStr, let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { -defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, - load, int_x86_sse41_blendvpd, - SchedWriteFVarBlend.XMM>; -defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, - loadv4f64, int_x86_avx_blendv_pd_256, - SchedWriteFVarBlend.YMM>, VEX_L; +defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem, + v2f64, loadv2f64, X86Blendv, + SchedWriteFVarBlend.XMM>; +defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem, + v4f64, loadv4f64, X86Blendv, + SchedWriteFVarBlend.YMM>, VEX_L; } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { -defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, - load, int_x86_sse41_blendvps, - SchedWriteFVarBlend.XMM>; -defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, - loadv8f32, int_x86_avx_blendv_ps_256, - SchedWriteFVarBlend.YMM>, VEX_L; +defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem, + v4f32, loadv4f32, X86Blendv, + SchedWriteFVarBlend.XMM>; +defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem, + v8f32, loadv8f32, X86Blendv, + SchedWriteFVarBlend.YMM>, VEX_L; } // ExeDomain = SSEPackedSingle -defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - load, int_x86_sse41_pblendvb, - SchedWriteVarBlend.XMM>; +defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem, + v16i8, loadv16i8, X86Blendv, + SchedWriteVarBlend.XMM>; } let Predicates = [HasAVX2] in { -defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, - load, int_x86_avx2_pblendvb, - SchedWriteVarBlend.YMM>, VEX_L; +defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem, + v32i8, loadv32i8, X86Blendv, + SchedWriteVarBlend.YMM>, 
VEX_L; } let Predicates = [HasAVX] in { - def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1), - (v4i32 VR128:$src2))), + def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1), - (v4f32 VR128:$src2))), - (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1), - (v2i64 VR128:$src2))), - (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), - (v2f64 VR128:$src2))), + def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), - (v8i32 VR256:$src2))), + def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1), + (v8i32 VR256:$src2))), (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), - (v8f32 VR256:$src2))), - (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), - (v4i64 VR256:$src2))), - (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), - (v4f64 VR256:$src2))), + def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1), + (v4i64 VR256:$src2))), (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; } -let Predicates = [HasAVX2] in { - def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), - (v32i8 VR256:$src2))), - (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; -} - // Prefer a movss or movsd over a blendps when optimizing for size. these were // changed to use blends because blends have better throughput on sandybridge // and haswell, but movs[s/d] are 1-2 byte shorter instructions. @@ -6708,17 +6247,6 @@ let Predicates = [HasAVX, OptForSpeed] in { (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), (i8 3))), sub_xmm)>; - - def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v2f64 (VBLENDPDrri (v2f64 (V_SET0)), - (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), - (i8 1))), sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v2i64 (VPBLENDWrri (v2i64 (V_SET0)), - (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), - (i8 0xf))), sub_xmm)>; } // Prefer a movss or movsd over a blendps when optimizing for size. 
these were @@ -6747,16 +6275,17 @@ let Predicates = [UseSSE41, OptForSpeed] in { } -/// SS41I_ternary_int - SSE 4.1 ternary operator +/// SS41I_ternary - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { - multiclass SS41I_ternary_int opc, string OpcodeStr, PatFrag mem_frag, - X86MemOperand x86memop, Intrinsic IntId, - X86FoldableSchedWrite sched> { + multiclass SS41I_ternary opc, string OpcodeStr, ValueType VT, + PatFrag mem_frag, X86MemOperand x86memop, + SDNode OpNode, X86FoldableSchedWrite sched> { def rr0 : SS48I, + [(set VR128:$dst, + (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>, Sched<[sched]>; def rm0 : SS48I, + (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } let ExeDomain = SSEPackedDouble in -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem, - int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>; +defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem, + X86Blendv, SchedWriteFVarBlend.XMM>; let ExeDomain = SSEPackedSingle in -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem, - int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>; -defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem, - int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>; +defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem, + X86Blendv, SchedWriteFVarBlend.XMM>; +defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem, + X86Blendv, SchedWriteVarBlend.XMM>; // Aliases with the implicit xmm0 argument def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", @@ -6794,20 +6322,11 @@ def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>; let Predicates = [UseSSE41] in { - def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), - (v4i32 VR128:$src2))), + def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), - (v4f32 VR128:$src2))), - (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1), - (v2i64 VR128:$src2))), - (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), - (v2f64 VR128:$src2))), + def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; } @@ -7451,17 +6970,6 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; -let Predicates = [HasAVX2, NoVLX] in { -def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), - (VBROADCASTI128 addr:$src)>; -def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), - (VBROADCASTI128 addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), - (VBROADCASTI128 addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), - (VBROADCASTI128 addr:$src)>; -} - let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), (VBROADCASTF128 addr:$src)>; @@ -7469,7 +6977,9 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), (VBROADCASTF128 addr:$src)>; } -let Predicates = [HasAVX1Only] in { +// NOTE: We're using FP instructions here, but 
execution domain fixing can
+// convert to integer when profitable.
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
@@ -7765,12 +7275,10 @@ let Predicates = [HasF16C, NoVLX] in {
                                    WriteCvtPS2PHYSt>, VEX_L;

  // Pattern match vcvtph2ps of a scalar i64 load.
-  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
-            (VCVTPH2PSrm addr:$src)>;
-  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSrm addr:$src)>;
-  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
-                              (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+  def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
+                              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (VCVTPH2PSrm addr:$src)>;

  def : Pat<(store (f64 (extractelt
@@ -7835,6 +7343,7 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                       (commuteXForm imm:$src3))>;
}

+let Predicates = [HasAVX2] in {
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               SchedWriteBlend.XMM, VR128, i128mem,
                               BlendCommuteImm4>;
@@ -7842,28 +7351,26 @@ defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, i256mem,
                                BlendCommuteImm8>, VEX_L;

-// For insertion into the zero index (low half) of a 256-bit vector, it is
-// more efficient to generate a blend with immediate instead of an insert*128.
-let Predicates = [HasAVX2] in {
-def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
-          (VPBLENDDYrri VR256:$src1,
-                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src2, sub_xmm), 0xf)>;
-def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
-          (VPBLENDDYrri VR256:$src1,
-                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src2, sub_xmm), 0xf)>;
-def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
-          (VPBLENDDYrri VR256:$src1,
-                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src2, sub_xmm), 0xf)>;
-def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
-          (VPBLENDDYrri VR256:$src1,
-                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
+          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
+          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
+          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
+          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
+          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
}

-let Predicates = [HasAVX1Only] in {
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+// NOTE: We're using FP instructions here, but execution domain fixing should
+// take care of using integer instructions when profitable.
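Several NOTEs in this hunk lean on execution-domain fixing, so a loose C++ sketch of the idea may help; the opcode names below are stand-ins, not LLVM's actual replacement tables. The FP-domain insert_subvector patterns the NOTE introduces continue after this aside.

// Many SSE/AVX operations exist in float, double, and integer flavors with
// identical semantics. A late machine pass may rewrite one flavor into
// another so data stays in its execution domain and avoids bypass delays.
enum Opcode { VBLENDPSYrri, VBLENDPDYrri, VPBLENDDYrri };

const Opcode BlendForms[3] = {VBLENDPSYrri, VBLENDPDYrri, VPBLENDDYrri};

Opcode fixDomain(Opcode Op, int WantedDomain /* 0=f32, 1=f64, 2=int */) {
  for (Opcode Form : BlendForms)
    if (Form == Op)
      return BlendForms[WantedDomain]; // equivalent flavor, better domain
  return Op;                           // not in a known equivalence class
}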
+let Predicates = [HasAVX] in { def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), @@ -7880,6 +7387,19 @@ def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm), 0xf)>; + +def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; +def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; +def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; +def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; } //===----------------------------------------------------------------------===// @@ -7930,9 +7450,9 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, let Predicates = [HasAVX2, NoVLX] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. - def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), + def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), + def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQYrm addr:$src)>; def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), @@ -7951,9 +7471,15 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { (VPBROADCASTWrm addr:$src)>; def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; @@ -8038,7 +7564,7 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVDDUPrr VR128:$src)>; def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), (VMOVDDUPrm addr:$src)>; - def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))), + def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), (VMOVDDUPrm addr:$src)>; } @@ -8236,19 +7762,14 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", multiclass maskmov_lowering { // masked store - def: Pat<(X86mstore (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), + def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), (!cast(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; // masked load - def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)), + def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)), (!cast(InstrStr#"rm") RC:$mask, addr:$ptr)>; - def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), - (VT (bitconvert (ZeroVT immAllZerosV))))), + def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), + (VT immAllZerosV))), (!cast(InstrStr#"rm") RC:$mask, addr:$ptr)>; - def: Pat<(VT 
(X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))), - (!cast(BlendStr#"rr") - RC:$src0, - (VT (!cast(InstrStr#"rm") RC:$mask, addr:$ptr)), - RC:$mask)>; } let Predicates = [HasAVX] in { defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>; @@ -8275,21 +7796,6 @@ let Predicates = [HasAVX2] in { // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. -let Predicates = [HasAVX2, NoVLX] in { -def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), - (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v2i64 VR128:$src), 1)>; -def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), - (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v4i32 VR128:$src), 1)>; -def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), - (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v8i16 VR128:$src), 1)>; -def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), - (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v16i8 VR128:$src), 1)>; -} - let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))), (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), @@ -8299,7 +7805,9 @@ def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))), (v4f32 VR128:$src), 1)>; } -let Predicates = [HasAVX1Only] in { +// NOTE: We're using FP instructions here, but execution domain fixing can +// convert to integer when profitable. +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), (v2i64 VR128:$src), 1)>; @@ -8350,20 +7858,11 @@ multiclass avx2_var_shift opc, string OpcodeStr, SDNode OpNode, } let Predicates = [HasAVX2, NoVLX] in { - defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; - defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; - defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; - defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; - defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; - - def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)), - (VPSRAVDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))), - (VPSRAVDrm VR128:$src1, addr:$src2)>; - def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)), - (VPSRAVDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))), - (VPSRAVDYrm VR256:$src1, addr:$src2)>; + defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>; + defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W; + defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>; + defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W; + defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>; } //===----------------------------------------------------------------------===// @@ -8393,7 +7892,7 @@ multiclass avx2_gather opc, string OpcodeStr, ValueType VTx, VEX, VEX_L, Sched<[WriteLoad]>; } -let Predicates = [UseAVX2] in { +let Predicates = [HasAVX2] in { let mayLoad = 1, hasSideEffects = 0, Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td 
index 2dc6e8b43667..82c8e74156b2 100644 --- a/lib/Target/X86/X86InstrSVM.td +++ b/lib/Target/X86/X86InstrSVM.td @@ -1,9 +1,8 @@ //===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td index 7cd63a6dd820..9d974b716dda 100644 --- a/lib/Target/X86/X86InstrShiftRotate.td +++ b/lib/Target/X86/X86InstrShiftRotate.td @@ -1,9 +1,8 @@ //===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -31,11 +30,11 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (shl GR64:$src1, CL))]>; } // Uses = [CL], SchedRW +let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "shl{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>; -let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
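As a brief aside on the "Can transform into LEA" remark attached to the flag being moved here, a hedged C++ sketch of the underlying arithmetic (the helper is mine):

#include <cassert>
#include <cstdint>

// A left shift by 1..3 is a multiply by 2/4/8, which lea can express with
// a scaled index while writing a destination distinct from the source,
// giving the three-address form that plain shl lacks.
uint32_t shiftAsLea(uint32_t Src, unsigned ShAmt) {
  assert(ShAmt >= 1 && ShAmt <= 3 && "lea scales only by 2, 4, or 8");
  const uint32_t Scale[4] = {1, 2, 4, 8};
  return Src * Scale[ShAmt]; // e.g. lea (,%src,4), %dst for ShAmt == 2
}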
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
                  "shl{w}\t{$src2, $dst|$dst, $src2}",
                  [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>,
@@ -473,17 +472,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
def ROL8ri  : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
                  "rol{b}\t{$src2, $dst|$dst, $src2}",
-                 [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+                 [(set GR8:$dst, (rotl GR8:$src1, (i8 relocImm:$src2)))]>;
def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
                  "rol{w}\t{$src2, $dst|$dst, $src2}",
-                 [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize16;
+                 [(set GR16:$dst, (rotl GR16:$src1, (i8 relocImm:$src2)))]>,
+                 OpSize16;
def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
                  "rol{l}\t{$src2, $dst|$dst, $src2}",
-                 [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>, OpSize32;
+                 [(set GR32:$dst, (rotl GR32:$src1, (i8 relocImm:$src2)))]>,
+                 OpSize32;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2),
                   "rol{q}\t{$src2, $dst|$dst, $src2}",
-                  [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+                  [(set GR64:$dst, (rotl GR64:$src1, (i8 relocImm:$src2)))]>;

// Rotate by 1
def ROL8r1  : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
@@ -586,16 +587,16 @@ def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
// Rotate by 1
def ROR8r1  : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
               "ror{b}\t$dst",
-              [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))]>;
+              [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
               "ror{w}\t$dst",
-              [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))]>, OpSize16;
+              [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize16;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
               "ror{l}\t$dst",
-              [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))]>, OpSize32;
+              [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>, OpSize32;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
                "ror{q}\t$dst",
-               [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))]>;
+               [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW

let Uses = [CL], SchedRW = [WriteRotateCLLd, WriteRMW] in {
@@ -634,18 +635,18 @@ def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
// Rotate by 1
def ROR8m1  : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), "ror{b}\t$dst",
-               [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)]>;
+               [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst), "ror{w}\t$dst",
-               [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)]>,
+               [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
                OpSize16;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), "ror{l}\t$dst",
-               [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)]>,
+               [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
                OpSize32;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), "ror{q}\t$dst",
-                [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)]>,
+                [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
                 Requires<[In64BitMode]>;
} // SchedRW

@@ -807,13 +808,54 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
} // Defs = [EFLAGS]

+// Use the opposite rotate if it allows us to use the rotate by 1 instruction.
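The comment above rests on two rotation identities, checked here in a self-contained C++ snippet (the UB-safe rotate helpers are mine). The first identity drives the rotate-by-1 patterns immediately below; the second reappears later when immediate rotate-lefts are rewritten onto RORX, which only rotates right.

#include <cassert>
#include <cstdint>

// rotl(x, W-1) == rotr(x, 1) and, more generally, rotl(x, s) == rotr(x, W-s).
uint32_t rotl32(uint32_t X, unsigned S) {
  return (X << (S & 31)) | (X >> ((32 - S) & 31));
}
uint32_t rotr32(uint32_t X, unsigned S) {
  return (X >> (S & 31)) | (X << ((32 - S) & 31));
}

int main() {
  uint32_t X = 0x80000001u;
  assert(rotl32(X, 31) == rotr32(X, 1));
  for (unsigned S = 1; S != 32; ++S)
    assert(rotl32(X, S) == rotr32(X, 32 - S));
}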
+def : Pat<(rotl GR8:$src1, (i8 7)), (ROR8r1 GR8:$src1)>; +def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1 GR16:$src1)>; +def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1 GR32:$src1)>; +def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1 GR64:$src1)>; +def : Pat<(rotr GR8:$src1, (i8 7)), (ROL8r1 GR8:$src1)>; +def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1 GR16:$src1)>; +def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1 GR32:$src1)>; +def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1 GR64:$src1)>; + +def : Pat<(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst), + (ROR8m1 addr:$dst)>; +def : Pat<(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst), + (ROR16m1 addr:$dst)>; +def : Pat<(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst), + (ROR32m1 addr:$dst)>; +def : Pat<(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst), + (ROR64m1 addr:$dst)>, Requires<[In64BitMode]>; + +def : Pat<(store (rotr (loadi8 addr:$dst), (i8 7)), addr:$dst), + (ROL8m1 addr:$dst)>; +def : Pat<(store (rotr (loadi16 addr:$dst), (i8 15)), addr:$dst), + (ROL16m1 addr:$dst)>; +def : Pat<(store (rotr (loadi32 addr:$dst), (i8 31)), addr:$dst), + (ROL32m1 addr:$dst)>; +def : Pat<(store (rotr (loadi64 addr:$dst), (i8 63)), addr:$dst), + (ROL64m1 addr:$dst)>, Requires<[In64BitMode]>; + // Sandy Bridge and newer Intel processors support faster rotates using // SHLD to avoid a partial flag update on the normal rotate instructions. -let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in { - def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), - (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>; - def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), - (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>; +// Use a pseudo so that TwoInstructionPass and register allocation will see +// this as unary instruction. +let Predicates = [HasFastSHLDRotate], AddedComplexity = 5, + Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteSHDrri], + Constraints = "$src1 = $dst" in { + def SHLDROT32ri : I<0, Pseudo, (outs GR32:$dst), + (ins GR32:$src1, u8imm:$shamt), "", + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$shamt)))]>; + def SHLDROT64ri : I<0, Pseudo, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$shamt), "", + [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$shamt)))]>; + + def SHRDROT32ri : I<0, Pseudo, (outs GR32:$dst), + (ins GR32:$src1, u8imm:$shamt), "", + [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$shamt)))]>; + def SHRDROT64ri : I<0, Pseudo, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$shamt), "", + [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$shamt)))]>; } def ROT32L2R_imm8 : SDNodeXForm; + def : Pat<(rotr GR64:$src, (i8 imm:$shamt)), + (RORX64ri GR64:$src, imm:$shamt)>; + def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>; def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>; } + def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)), + (RORX32mi addr:$src, imm:$shamt)>; + def : Pat<(rotr (loadi64 addr:$src), (i8 imm:$shamt)), + (RORX64mi addr:$src, imm:$shamt)>; + def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)), (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>; def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)), (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>; // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not - // immedidate shift, i.e. the following code is considered better + // immediate shift, i.e. 
the following code is considered better // // mov %edi, %esi // shl $imm, %esi diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 35ee00b9e016..7050e1917494 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -1,9 +1,8 @@ //===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,10 +14,10 @@ let SchedRW = [WriteSystem] in { let Defs = [RAX, RDX] in - def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB; +def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", []>, TB; let Defs = [RAX, RCX, RDX] in - def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB; +def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB; // CPU flow control instructions @@ -411,7 +410,7 @@ let Defs = [EAX, EDX], Uses = [ECX] in def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB; let Defs = [RAX, RDX], Uses = [ECX] in - def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)]>, TB; +def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB; def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), "smsw{w}\t$dst", []>, OpSize16, TB; @@ -588,18 +587,13 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in //==-----------------------------------------------------------------------===// // PKU - enable protection key -let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - def WRPKRU : PseudoI<(outs), (ins GR32:$src), - [(int_x86_wrpkru GR32:$src)]>; - def RDPKRU : PseudoI<(outs GR32:$dst), (ins), - [(set GR32:$dst, (int_x86_rdpkru))]>; -} - let SchedRW = [WriteSystem] in { let Defs = [EAX, EDX], Uses = [ECX] in - def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; + def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", + [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB; let Uses = [EAX, ECX, EDX] in - def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; + def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", + [(X86wrpkru EAX, EDX, ECX)]>, TB; } // SchedRW //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td index 10c6eef78639..fc0da845299f 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -1,9 +1,8 @@ //===-- X86InstrVMX.td - TSX Instruction Set Extension -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 06a438ebfcad..37bc4ce2e053 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -1,9 +1,8 @@
 //===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
index c417dc99b84d..e98843bd3ae3 100644
--- a/lib/Target/X86/X86InstrVecCompiler.td
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -1,9 +1,8 @@
 //===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -99,76 +98,6 @@ defm : subvector_subreg_lowering;
 defm : subvector_subreg_lowering;

-multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr,
-                                    RegisterClass RC, ValueType DstTy,
-                                    ValueType SrcTy, SubRegIndex SubIdx> {
-  def : Pat<(alignedstore (DstTy (extract_subvector
-                                  (SrcTy RC:$src), (iPTR 0))), addr:$dst),
-            (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst,
-             (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
-
-  def : Pat<(store (DstTy (extract_subvector
-                           (SrcTy RC:$src), (iPTR 0))), addr:$dst),
-            (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst,
-             (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
-  defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>;
-  defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>;
-  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>;
-  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>;
-  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>;
-  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>;
-}
-
-let Predicates = [HasVLX] in {
-  // Special patterns for storing subvector extracts of lower 128-bits
-  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
-  defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64,
-                                  sub_xmm>;
-  defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
-                                  sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64,
-                                  v4i64, sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32,
-                                  v8i32, sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16,
-                                  v16i16, sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8,
-                                  v32i8, sub_xmm>;
-
-  // Special patterns for storing subvector extracts of lower 128-bits of 512.
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64, - sub_xmm>; - defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32, - sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64, - v8i64, sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32, - v16i32, sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16, - v32i16, sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8, - v64i8, sub_xmm>; - - // Special patterns for storing subvector extracts of lower 256-bits of 512. - // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64, - sub_ymm>; - defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32, - sub_ymm>; - defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64, - v8i64, sub_ymm>; - defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32, - v16i32, sub_ymm>; - defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16, - v32i16, sub_ymm>; - defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8, - v64i8, sub_ymm>; -} - // If we're inserting into an all zeros vector, just use a plain move which // will zero the upper bits. A post-isel hook will take care of removing // any moves that we can prove are unnecessary. @@ -176,7 +105,7 @@ multiclass subvec_zero_lowering { - def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)), + def : Pat<(DstTy (insert_subvector immAllZerosV, (SrcTy RC:$src), (iPTR 0))), (SUBREG_TO_REG (i64 0), (SrcTy (!cast("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>; @@ -398,7 +327,7 @@ let Predicates = [HasBWI, HasDQI] in { (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>; } -let Predicates = [HasBWI, HasVLX] in { +let Predicates = [HasBWI] in { def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), (v1i1 VK1:$mask), (iPTR 0))), (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32), @@ -487,7 +416,7 @@ def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)), (XORPSrr VR128:$src1, VR128:$src2)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2 def : Pat<(f128 (X86fand VR128:$src1, (loadf128 addr:$src2))), (VANDPSrm VR128:$src1, f128mem:$src2)>; @@ -507,3 +436,24 @@ def : Pat<(f128 (X86fxor VR128:$src1, (loadf128 addr:$src2))), def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)), (VXORPSrr VR128:$src1, VR128:$src2)>; } + +let Predicates = [HasVLX] in { +// andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 +def : Pat<(f128 (X86fand VR128X:$src1, (loadf128 addr:$src2))), + (VANDPSZ128rm VR128X:$src1, f128mem:$src2)>; + +def : Pat<(f128 (X86fand VR128X:$src1, VR128X:$src2)), + (VANDPSZ128rr VR128X:$src1, VR128X:$src2)>; + +def : Pat<(f128 (X86for VR128X:$src1, (loadf128 addr:$src2))), + (VORPSZ128rm VR128X:$src1, f128mem:$src2)>; + +def : Pat<(f128 (X86for VR128X:$src1, VR128X:$src2)), + (VORPSZ128rr VR128X:$src1, VR128X:$src2)>; + +def : Pat<(f128 (X86fxor VR128X:$src1, (loadf128 addr:$src2))), + (VXORPSZ128rm VR128X:$src1, f128mem:$src2)>; + +def : Pat<(f128 (X86fxor VR128X:$src1, VR128X:$src2)), + (VXORPSZ128rr VR128X:$src1, VR128X:$src2)>; +} diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 9d810a675e3b..66ca78556b82 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -1,9 +1,8 @@ //===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -247,36 +246,22 @@ multiclass xopvpcom opc, string Suffix, SDNode OpNode, ValueType vt128, let ExeDomain = SSEPackedInt in { // SSE integer instructions let isCommutable = 1 in def ri : IXOPi8, XOP_4V, Sched<[sched]>; def mi : IXOPi8, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; - let isAsmParserOnly = 1, hasSideEffects = 0 in { - def ri_alt : IXOPi8, XOP_4V, Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - def mi_alt : IXOPi8, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - } } def : Pat<(OpNode (load addr:$src2), diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index c20336387b2d..892a083f4d1a 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -1,9 +1,8 @@ //===- X86InstructionSelector.cpp -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -419,18 +418,22 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty, if (X86::GPRRegBankID == RB.getID()) return Isload ? X86::MOV32rm : X86::MOV32mr; if (X86::VECRRegBankID == RB.getID()) - return Isload ? (HasAVX512 ? X86::VMOVSSZrm - : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) - : (HasAVX512 ? X86::VMOVSSZmr - : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + return Isload ? (HasAVX512 ? X86::VMOVSSZrm_alt : + HasAVX ? X86::VMOVSSrm_alt : + X86::MOVSSrm_alt) + : (HasAVX512 ? X86::VMOVSSZmr : + HasAVX ? X86::VMOVSSmr : + X86::MOVSSmr); } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) { if (X86::GPRRegBankID == RB.getID()) return Isload ? X86::MOV64rm : X86::MOV64mr; if (X86::VECRRegBankID == RB.getID()) - return Isload ? (HasAVX512 ? 
X86::VMOVSDZrm - : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) - : (HasAVX512 ? X86::VMOVSDZmr - : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + return Isload ? (HasAVX512 ? X86::VMOVSDZrm_alt : + HasAVX ? X86::VMOVSDrm_alt : + X86::MOVSDrm_alt) + : (HasAVX512 ? X86::VMOVSDZmr : + HasAVX ? X86::VMOVSDmr : + X86::MOVSDmr); } else if (Ty.isVector() && Ty.getSizeInBits() == 128) { if (Alignment >= 16) return Isload ? (HasVLX ? X86::VMOVAPSZ128rm @@ -513,10 +516,22 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, LLT Ty = MRI.getType(DefReg); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + assert(I.hasOneMemOperand()); auto &MemOp = **I.memoperands_begin(); - if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { - LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n"); - return false; + if (MemOp.isAtomic()) { + // Note: for unordered operations, we rely on the fact the appropriate MMO + // is already on the instruction we're mutating, and thus we don't need to + // make any changes. So long as we select an opcode which is capable of + // loading or storing the appropriate size atomically, the rest of the + // backend is required to respect the MMO state. + if (!MemOp.isUnordered()) { + LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n"); + return false; + } + if (MemOp.getAlignment() < Ty.getSizeInBits()/8) { + LLVM_DEBUG(dbgs() << "Unaligned atomics not supported yet\n"); + return false; + } } unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment()); @@ -936,7 +951,6 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, bool SwapArgs; std::tie(CC, SwapArgs) = X86::getX86ConditionCode( (CmpInst::Predicate)I.getOperand(1).getPredicate()); - unsigned OpSet = X86::getSETFromCond(CC); unsigned LHS = I.getOperand(2).getReg(); unsigned RHS = I.getOperand(3).getReg(); @@ -970,7 +984,7 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, .addReg(RHS); MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(OpSet), I.getOperand(0).getReg()); + TII.get(X86::SETCCr), I.getOperand(0).getReg()).addImm(CC); constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI); constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI); @@ -991,8 +1005,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
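// A minimal free-standing C++ sketch (hypothetical helpers, not part of this
// patch) of why the SETFOpcTable below pairs two condition codes per
// predicate: after ucomiss/ucomisd an unordered result sets both ZF and PF,
// so one SETcc cannot express "ordered and equal". FCMP_OEQ is therefore
// SETE ANDed with SETNP, and FCMP_UNE is SETNE ORed with SETP, exactly the
// two rows the table encodes.
static inline bool fcmpOEQ(bool ZF, bool PF) {
  return ZF && !PF; // SETCCr(COND_E) combined via AND8rr with SETCCr(COND_NP)
}
static inline bool fcmpUNE(bool ZF, bool PF) {
  return !ZF || PF; // SETCCr(COND_NE) combined via OR8rr with SETCCr(COND_P)
}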
static const uint16_t SETFOpcTable[2][3] = { - {X86::SETEr, X86::SETNPr, X86::AND8rr}, - {X86::SETNEr, X86::SETPr, X86::OR8rr}}; + {X86::COND_E, X86::COND_NP, X86::AND8rr}, + {X86::COND_NE, X86::COND_P, X86::OR8rr}}; const uint16_t *SETFOpc = nullptr; switch (Predicate) { default: @@ -1032,9 +1046,9 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, unsigned FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass); unsigned FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass); MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(SETFOpc[0]), FlagReg1); + TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]); MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(SETFOpc[1]), FlagReg2); + TII.get(X86::SETCCr), FlagReg2).addImm(SETFOpc[1]); MachineInstr &Set3 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SETFOpc[2]), ResultReg) .addReg(FlagReg1) @@ -1052,7 +1066,6 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, bool SwapArgs; std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); - unsigned Opc = X86::getSETFromCond(CC); if (SwapArgs) std::swap(LhsReg, RhsReg); @@ -1064,7 +1077,7 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, .addReg(RhsReg); MachineInstr &Set = - *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc), ResultReg); + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), ResultReg).addImm(CC); constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI); constrainSelectedInstRegOperands(Set, TII, TRI, RBI); I.eraseFromParent(); @@ -1409,8 +1422,8 @@ bool X86InstructionSelector::selectCondBranch(MachineInstr &I, *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::TEST8ri)) .addReg(CondReg) .addImm(1); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JNE_1)) - .addMBB(DestMBB); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JCC_1)) + .addMBB(DestMBB).addImm(X86::COND_NE); constrainSelectedInstRegOperands(TestInst, TII, TRI, RBI); @@ -1530,15 +1543,14 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, const static struct ShiftEntry { unsigned SizeInBits; - unsigned CReg; unsigned OpLSHR; unsigned OpASHR; unsigned OpSHL; } OpTable[] = { - {8, X86::CL, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8 - {16, X86::CX, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16 - {32, X86::ECX, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32 - {64, X86::RCX, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64 + {8, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8 + {16, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16 + {32, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32 + {64, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64 }; if (DstRB.getID() != X86::GPRRegBankID) @@ -1551,7 +1563,6 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, if (ShiftEntryIt == std::end(OpTable)) return false; - unsigned CReg = ShiftEntryIt->CReg; unsigned Opcode = 0; switch (I.getOpcode()) { case TargetOpcode::G_SHL: @@ -1570,16 +1581,11 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, unsigned Op0Reg = I.getOperand(1).getReg(); unsigned Op1Reg = I.getOperand(2).getReg(); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), - ShiftEntryIt->CReg) - .addReg(Op1Reg); + assert(MRI.getType(Op1Reg).getSizeInBits() == 8); - // The shift instruction uses X86::CL. 
If we defined a super-register
-  // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
-  if (CReg != X86::CL)
-    BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::KILL),
-            X86::CL)
-        .addReg(CReg, RegState::Kill);
+  BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
+          X86::CL)
+      .addReg(Op1Reg);

   MachineInstr &ShiftInst =
       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg)
@@ -1608,8 +1614,8 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
   assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) &&
          "Arguments and return value types must match");

-  const RegisterBank &RegRB = *RBI.getRegBank(DstReg, MRI, TRI);
-  if (RegRB.getID() != X86::GPRRegBankID)
+  const RegisterBank *RegRB = RBI.getRegBank(DstReg, MRI, TRI);
+  if (!RegRB || RegRB->getID() != X86::GPRRegBankID)
     return false;

   const static unsigned NumTypes = 4; // i8, i16, i32, i64
@@ -1707,7 +1713,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
   const DivRemEntry &TypeEntry = *OpEntryIt;
   const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];

-  const TargetRegisterClass *RegRC = getRegClass(RegTy, RegRB);
+  const TargetRegisterClass *RegRC = getRegClass(RegTy, *RegRB);
   if (!RBI.constrainGenericRegister(Op1Reg, *RegRC, MRI) ||
       !RBI.constrainGenericRegister(Op2Reg, *RegRC, MRI) ||
       !RBI.constrainGenericRegister(DstReg, *RegRC, MRI)) {
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index 28940754a203..8f74a8fe041d 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -1,9 +1,8 @@
 //===- X86InterleavedAccess.cpp -------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -194,7 +193,7 @@ void X86InterleavedAccessGroup::decompose(
   // Decompose the load instruction.
   LoadInst *LI = cast<LoadInst>(VecInst);
-  Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
+  Type *VecBaseTy, *VecBasePtrTy;
   Value *VecBasePtr;
   unsigned int NumLoads = NumSubVectors;
   // In the case of stride 3 with a vector of 32 elements load the information
   // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
   unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
   if (VecLength == 768 || VecLength == 1536) {
-    Type *VecTran =
-        VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
-    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
+    VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
+    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
+    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
     NumLoads = NumSubVectors * (VecLength / 384);
-  } else
+  } else {
+    VecBaseTy = SubVecTy;
+    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
     VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
+  }
   // Generate N loads of T type.
   for (unsigned i = 0; i < NumLoads; i++) {
     // TODO: Support inbounds GEP.
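// The hunk below moves to the explicitly-typed IRBuilder overloads: the
// pointee type (VecBaseTy) is passed alongside the pointer instead of being
// recovered from the pointer's element type. A minimal sketch of the same
// call pattern against the LLVM-9-era IRBuilder (loadSubVector is a
// hypothetical helper, not code from this patch):
#include "llvm/IR/IRBuilder.h"
static llvm::Value *loadSubVector(llvm::IRBuilder<> &B, llvm::Type *VecTy,
                                  llvm::Value *BasePtr, unsigned Idx,
                                  unsigned Alignment) {
  // GEP and load both name the element type explicitly.
  llvm::Value *Ptr = B.CreateGEP(VecTy, BasePtr, B.getInt32(Idx));
  return B.CreateAlignedLoad(VecTy, Ptr, Alignment);
}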
- Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i)); + Value *NewBasePtr = + Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i)); Instruction *NewLoad = - Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment()); + Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment()); DecomposedVectors.push_back(NewLoad); } } @@ -416,7 +419,7 @@ void X86InterleavedAccessGroup::interleave8bitStride4( } reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16), - NumOfElm, 4, Builder); + NumOfElm, 4, Builder); } // createShuffleStride returns shuffle mask of size N. diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 151e1b9136c4..40141d894629 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -1,9 +1,8 @@ //===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,21 +19,22 @@ namespace llvm { enum IntrinsicType : uint16_t { + CVTNEPS2BF16_MASK, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, INTR_TYPE_3OP_IMM8, - CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, - CVTPD2PS, CVTPD2PS_MASK, CVTPD2PS_RND_MASK, - INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, - INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, - INTR_TYPE_3OP_MASK, - IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK, - INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK, + CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, + CVTPD2PS_MASK, + INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE, + INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE, + INTR_TYPE_1OP_MASK, INTR_TYPE_2OP_MASK, + IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK, INTR_TYPE_SCALAR_MASK_SAE, + INTR_TYPE_SCALAR_MASK_RND, + INTR_TYPE_3OP_SCALAR_MASK_SAE, COMPRESS_EXPAND_IN_REG, - TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2I_MASK, + TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2DQ_MASK, CVTQQ2PS_MASK, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, - FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, - FIXUPIMMS_MASKZ, GATHER_AVX2, + FIXUPIMM, FIXUPIMM_MASKZ, GATHER_AVX2, ROUNDP, ROUNDS }; @@ -64,47 +64,47 @@ struct IntrinsicData { * the alphabetical order. 
*/ static const IntrinsicData IntrinsicsWithChain[] = { - X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, 0, 0), - X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), - X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, 
X86::VGATHERDPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0), X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH, X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm), @@ -115,30 +115,30 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), - X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, 
GATHER, X86::VPGATHERQDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, 0, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), @@ -249,47 +249,47 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNCUS, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0), - 
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, 0, 0), - X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0), 
- X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm, X86::VSCATTERPF1DPDm), X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm, @@ -298,24 +298,24 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86::VSCATTERPF1QPDm), X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm, X86::VSCATTERPF1QPSm), - X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0), - X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0), + X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0), X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0), X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0), X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0), - X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0), - X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0), - X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0), + X86_INTRINSIC_DATA(rdtsc, RDTSC, X86::RDTSC, 0), + X86_INTRINSIC_DATA(rdtscp, RDTSC, X86::RDTSCP, 0), + 
X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0), X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0), }; @@ -340,9 +340,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD), X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), + X86_INTRINSIC_DATA(avx_blendv_pd_256, BLENDV, X86ISD::BLENDV, 0), + X86_INTRINSIC_DATA(avx_blendv_ps_256, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0), - X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0), + X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,INTR_TYPE_1OP, X86ISD::VFPROUND, 0), X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx_cvt_ps2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0), @@ -369,6 +371,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), @@ -389,10 +394,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0), X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), @@ -405,39 +410,45 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), X86_INTRINSIC_DATA(avx512_add_pd_512, 
INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND), + X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE), X86_INTRINSIC_DATA(avx512_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND), - X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND), - X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND), - X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND), - X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND), - X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND), - X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND), - X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND), - X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND), - X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE), + X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_d_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_d_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_q_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_q_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_q_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_1OP_SAE, 
X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_dbpsadbw_128, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_dbpsadbw_256, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), - X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), - X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE), + X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE), X86_INTRINSIC_DATA(avx512_fpclass_pd_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), X86_INTRINSIC_DATA(avx512_fpclass_pd_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), X86_INTRINSIC_DATA(avx512_fpclass_pd_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), @@ -448,80 +459,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_kadd_d, INTR_TYPE_2OP, X86ISD::KADD, 0), X86_INTRINSIC_DATA(avx512_kadd_q, INTR_TYPE_2OP, X86ISD::KADD, 0), X86_INTRINSIC_DATA(avx512_kadd_w, INTR_TYPE_2OP, X86ISD::KADD, 0), - X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FADDS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FADDS_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FADDS, X86ISD::FADDS_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FADDS, X86ISD::FADDS_RND), X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, - X86ISD::FSETCCM, X86ISD::FSETCCM_RND), + X86ISD::FSETCCM, X86ISD::FSETCCM_SAE), X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, - X86ISD::FSETCCM, X86ISD::FSETCCM_RND), + X86ISD::FSETCCM, X86ISD::FSETCCM_SAE), - X86_INTRINSIC_DATA(avx512_mask_compress_b_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_b_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_b_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG, + X86_INTRINSIC_DATA(avx512_mask_compress, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, 
COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_w_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_w_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_w_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er - X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, CVTPD2I_MASK, + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, CVTPD2DQ_MASK, X86ISD::CVTP2SI, X86ISD::MCVTP2SI), X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, CVTPD2PS_MASK, X86ISD::VFPROUND, X86ISD::VMFPROUND), - X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_RND_MASK, - ISD::FP_ROUND, X86ISD::VFPROUND_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, X86ISD::VFPROUND_RND), X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, CVTPD2I_MASK, + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, CVTPD2DQ_MASK, X86ISD::CVTP2UI, X86ISD::MCVTP2UI), X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2UI, 0), @@ -539,8 +502,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK, - ISD::FP_EXTEND, X86ISD::VFPEXT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK_SAE, + ISD::FP_EXTEND, X86ISD::VFPEXT_SAE), X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK, @@ -559,164 +522,116 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CVTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK, X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK, - X86ISD::CVTSI2P, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK, - 
ISD::SINT_TO_FP, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VFPROUNDS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VFPEXTS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, CVTPD2I_MASK, + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, CVTQQ2PS_MASK, + X86ISD::CVTSI2P, X86ISD::MCVTSI2P), + X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RND, + X86ISD::VFPROUNDS, X86ISD::VFPROUNDS_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::VFPEXTS, X86ISD::VFPEXTS_SAE), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, CVTPD2DQ_MASK, X86ISD::CVTTP2SI, X86ISD::MCVTTP2SI), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, CVTPD2I_MASK, + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, CVTPD2DQ_MASK, X86ISD::CVTTP2UI, X86ISD::MCVTTP2UI), X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK, 
X86ISD::CVTTP2UI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK, - X86ISD::CVTUI2P, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FDIVS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FDIVS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_b_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_b_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_b_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_w_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_w_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_w_512, COMPRESS_EXPAND_IN_REG, + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, CVTQQ2PS_MASK, + X86ISD::CVTUI2P, X86ISD::MCVTUI2P), + X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FDIVS, X86ISD::FDIVS_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FDIVS, X86ISD::FDIVS_RND), + X86_INTRINSIC_DATA(avx512_mask_expand, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0), - X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0), + X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE), X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0), - 
X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0), - X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0), - X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0), + X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE), + X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE), + X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE), X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0), X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FGETEXPS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FGETEXPS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::FGETEXP, X86ISD::FGETEXP_SAE), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::FGETEXP, X86ISD::FGETEXP_SAE), + X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE), + X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK, - X86ISD::VGETMANT, X86ISD::VGETMANT_RND), - X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_SAE, + X86ISD::VGETMANT, X86ISD::VGETMANT_SAE), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK, - X86ISD::VGETMANT, X86ISD::VGETMANT_RND), - X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK, - X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND), - X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK, - X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND), - 
X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK, - X86ISD::FMAXS, X86ISD::FMAXS_RND), - X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK, - X86ISD::FMAXS, X86ISD::FMAXS_RND), - X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK, - X86ISD::FMINS, X86ISD::FMINS_RND), - X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK, - X86ISD::FMINS, X86ISD::FMINS_RND), - X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMULS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMULS_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_SAE, + X86ISD::VGETMANT, X86ISD::VGETMANT_SAE), + X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_SAE, + X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE), + X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_SAE, + X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMAXS, X86ISD::FMAXS_SAE), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMAXS, X86ISD::FMAXS_SAE), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMINS, X86ISD::FMINS_SAE), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMINS, X86ISD::FMINS_SAE), + X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMULS, X86ISD::FMULS_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMULS, X86ISD::FMULS_RND), X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, TRUNCATE_TO_REG, X86ISD::VTRUNC, X86ISD::VMTRUNC), X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, TRUNCATE_TO_REG, @@ -737,10 +652,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNC, X86ISD::VMTRUNC), X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, TRUNCATE_TO_REG, X86ISD::VTRUNC, X86ISD::VMTRUNC), - X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, - ISD::TRUNCATE, 0), - X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, - ISD::TRUNCATE, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, TRUNCATE_TO_REG, X86ISD::VTRUNC, X86ISD::VMTRUNC), X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, TRUNCATE_TO_REG, @@ -749,10 +660,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { ISD::TRUNCATE, X86ISD::VMTRUNC), X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, TRUNCATE_TO_REG, X86ISD::VTRUNC, X86ISD::VMTRUNC), - X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, - ISD::TRUNCATE, 0), - X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, - ISD::TRUNCATE, 0), X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, TRUNCATE_TO_REG, X86ISD::VTRUNCS, X86ISD::VMTRUNCS), X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, TRUNCATE_TO_REG, @@ -825,62 +732,62 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCUS, 0), - X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND), - X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), - 
X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND), - X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND), - X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND), - X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND), - X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND), - X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND), - X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND), - X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND), + X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE), + X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE), + X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE), + X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, 
INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE), X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK, - X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND), + X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE), X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK, - X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND), - X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, X86ISD::SCALEF_RND), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM, - X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM, - X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::SCALEFS, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::SCALEFS, 0), - X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSQRTS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSQRTS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSUBS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSUBS_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, X86ISD::SCALEF_RND), + X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::SCALEFS, X86ISD::SCALEFS_RND), + X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::SCALEFS, X86ISD::SCALEFS_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::FSQRTS, X86ISD::FSQRTS_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::FSQRTS, X86ISD::FSQRTS_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FSUBS, X86ISD::FSUBS_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FSUBS, X86ISD::FSUBS_RND), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK, X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK, X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, CVTPS2PH_MASK, @@ -893,28 +800,30 @@ static const IntrinsicData 
IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_512, FIXUPIMM_MASKZ, - X86ISD::VFIXUPIMM, 0), + X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_128, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_256, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_512, FIXUPIMM_MASKZ, - X86ISD::VFIXUPIMM, 0), - X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMMS_MASKZ, - X86ISD::VFIXUPIMMS, 0), - X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ, - X86ISD::VFIXUPIMMS, 0), + X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE), + X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMM_MASKZ, + X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE), + X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMM_MASKZ, + X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE), - X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), + X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), + X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE), + X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE), X86_INTRINSIC_DATA(avx512_mul_pd_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_mul_ps_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0), @@ -943,11 +852,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0), X86_INTRINSIC_DATA(avx512_psra_d_512, 
INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0), @@ -971,11 +880,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0), X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0), @@ -990,10 +899,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), - X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), @@ -1002,14 +911,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE), + X86_INTRINSIC_DATA(avx512_rsqrt28_ps, 
INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE), + X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE), + X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE), + X86_INTRINSIC_DATA(avx512_sitofp_round, INTR_TYPE_1OP, ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND), X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND), X86_INTRINSIC_DATA(avx512_sub_pd_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512_sub_ps_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_uitofp_round, INTR_TYPE_1OP, ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND), @@ -1071,6 +982,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_vpshufbitqmb_128, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0), X86_INTRINSIC_DATA(avx512_vpshufbitqmb_256, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0), X86_INTRINSIC_DATA(avx512_vpshufbitqmb_512, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0), + // bfloat16 + X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_128, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_256, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_512, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_256, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_512, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_128, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), + X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), + X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), + X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16), X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), @@ -1111,6 +1032,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_cvtps2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(sse2_cvtsd2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), X86_INTRINSIC_DATA(sse2_cvtsd2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), + X86_INTRINSIC_DATA(sse2_cvtsd2ss, INTR_TYPE_2OP, X86ISD::VFPROUNDS, 0), X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(sse2_cvttsd2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0), @@ -1123,6 +1045,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, 
INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), @@ -1156,8 +1080,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse41_blendvpd, BLENDV, X86ISD::BLENDV, 0), + X86_INTRINSIC_DATA(sse41_blendvps, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse41_pblendvb, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0), X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0), @@ -1200,14 +1127,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP, X86ISD::GF2P8MULB, 0), - X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), - X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), - X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), - X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), - X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), - X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), - X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), - X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 4a49fa68dd06..00fb1b573858 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -1,9 +1,8 @@ //===- X86LegalizerInfo.cpp --------------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
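A note on the intrinsics-table hunks above (the IntrinsicsWithoutChain array in X86IntrinsicsInfo.h): each X86_INTRINSIC_DATA(name, Type, Opc0, Opc1) entry maps an IR intrinsic to SelectionDAG nodes, where Opc0 covers the default FP environment and a non-zero Opc1 is selected when the intrinsic carries a non-default rounding/SAE operand. The recurring change in this patch splits the old catch-all *_RND opcodes: operations that can only suppress exceptions move to *_SAE nodes, while *_RND is reserved for true embedded rounding. One before/after pair, lifted from the getexp hunk above:

    // Before: SAE-only op modeled as if it had embedded rounding.
    X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM,
                       X86ISD::FGETEXP_RND, 0),
    // After: plain node for the default environment, with the explicit
    // suppress-all-exceptions variant in the Opc1 slot.
    X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_SAE,
                       X86ISD::FGETEXP, X86ISD::FGETEXP_SAE),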
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -134,9 +133,15 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { // Shifts and SDIV getActionDefinitionsBuilder( - {G_SHL, G_LSHR, G_ASHR, G_SDIV, G_SREM, G_UDIV, G_UREM}) - .legalFor({s8, s16, s32}) - .clampScalar(0, s8, s32); + {G_SDIV, G_SREM, G_UDIV, G_UREM}) + .legalFor({s8, s16, s32}) + .clampScalar(0, s8, s32); + + getActionDefinitionsBuilder( + {G_SHL, G_LSHR, G_ASHR}) + .legalFor({{s8, s8}, {s16, s8}, {s32, s8}}) + .clampScalar(0, s8, s32) + .clampScalar(1, s8, s8); } // Control-flow @@ -236,12 +241,19 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { .clampScalar(1, s32, s64) .widenScalarToNextPow2(1); - // Shifts and SDIV + // Divisions getActionDefinitionsBuilder( - {G_SHL, G_LSHR, G_ASHR, G_SDIV, G_SREM, G_UDIV, G_UREM}) + {G_SDIV, G_SREM, G_UDIV, G_UREM}) .legalFor({s8, s16, s32, s64}) .clampScalar(0, s8, s64); + // Shifts + getActionDefinitionsBuilder( + {G_SHL, G_LSHR, G_ASHR}) + .legalFor({{s8, s8}, {s16, s8}, {s32, s8}, {s64, s8}}) + .clampScalar(0, s8, s64) + .clampScalar(1, s8, s8); + // Merge/Unmerge setAction({G_MERGE_VALUES, s128}, Legal); setAction({G_UNMERGE_VALUES, 1, s128}, Legal); diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h index 135950a95f84..d21707b9ab9b 100644 --- a/lib/Target/X86/X86LegalizerInfo.h +++ b/lib/Target/X86/X86LegalizerInfo.h @@ -1,10 +1,9 @@ //===- X86LegalizerInfo.h ------------------------------------------*- C++ //-*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 2816f8c62bfb..b1fefaa84be4 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -1,9 +1,8 @@ //===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
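For the X86LegalizerInfo hunks above: shifts are split out of the division group because the shift amount now carries its own type index, and x86 variable shifts take the amount in CL. An annotated restatement of the 64-bit rule, assuming the LLT shorthands (s8 through s64) defined earlier in the same function:

    // Type index 0 is the shifted value; type index 1 is the shift amount.
    getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
        .legalFor({{s8, s8}, {s16, s8}, {s32, s8}, {s64, s8}})
        .clampScalar(0, s8, s64)  // legalize the value to a GPR width
        .clampScalar(1, s8, s8);  // narrow any wider amount to s8 (CL)

The practical effect is that a G_SHL whose amount is, say, s32 is no longer declared legal outright; the legalizer first truncates the amount to s8 before instruction selection sees it.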
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,9 +11,9 @@ // //===----------------------------------------------------------------------===// -#include "InstPrinter/X86ATTInstPrinter.h" -#include "InstPrinter/X86InstComments.h" +#include "MCTargetDesc/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86InstComments.h" #include "MCTargetDesc/X86TargetStreamer.h" #include "Utils/X86ShuffleDecode.h" #include "X86AsmPrinter.h" @@ -101,9 +100,7 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( } void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { - OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), - EnablePrintSchedInfo && - !(Inst.getFlags() & X86::NO_SCHED_INFO)); + OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get()); } @@ -438,7 +435,6 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(MaybeMCOp.getValue()); // Handle a few special cases to eliminate operand modifiers. -ReSimplify: switch (OutMI.getOpcode()) { case X86::LEA64_32r: case X86::LEA64r: @@ -554,11 +550,6 @@ ReSimplify: case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode; - case X86::TAILJMPd_CC: - case X86::TAILJMPd64_CC: - Opcode = X86::GetCondBranchFromCond( - static_cast(MI->getOperand(1).getImm())); - goto SetTailJmpOpcode; SetTailJmpOpcode: MCOperand Saved = OutMI.getOperand(0); @@ -568,6 +559,17 @@ ReSimplify: break; } + case X86::TAILJMPd_CC: + case X86::TAILJMPd64_CC: { + MCOperand Saved = OutMI.getOperand(0); + MCOperand Saved2 = OutMI.getOperand(1); + OutMI = MCInst(); + OutMI.setOpcode(X86::JCC_1); + OutMI.addOperand(Saved); + OutMI.addOperand(Saved2); + break; + } + case X86::DEC16r: case X86::DEC32r: case X86::INC16r: @@ -586,19 +588,6 @@ ReSimplify: } break; - // These are pseudo-ops for OR to help with the OR->ADD transformation. We do - // this with an ugly goto in case the resultant OR uses EAX and needs the - // short form. - case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify; - case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify; - case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify; - case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify; - case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify; - case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify; - case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify; - case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify; - case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify; - // We don't currently select the correct instruction form for instructions // which have a short %eax, etc. form. Handle this by custom lowering, for // now. 
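On the TAILJMPd_CC change above: the old path picked a per-condition opcode via GetCondBranchFromCond and looped back through ReSimplify; the new lowering targets the unified JCC_1 opcode, which keeps the condition code as an immediate operand. A minimal restatement with illustrative local names (Target/CC rather than the patch's Saved/Saved2):

    case X86::TAILJMPd_CC:
    case X86::TAILJMPd64_CC: {
      MCOperand Target = OutMI.getOperand(0); // branch destination
      MCOperand CC = OutMI.getOperand(1);     // X86::CondCode, as an imm
      OutMI = MCInst();
      OutMI.setOpcode(X86::JCC_1);            // one jcc with an explicit cond
      OutMI.addOperand(Target);
      OutMI.addOperand(CC);
      break;
    }

The same condition-code unification surfaces elsewhere in this import, e.g. X86::getCondFromBranch in the macro-fusion rewrite further down.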
@@ -694,16 +683,9 @@ ReSimplify: void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI) { - - bool is64Bits = MI.getOpcode() == X86::TLS_addr64 || + bool Is64Bits = MI.getOpcode() == X86::TLS_addr64 || MI.getOpcode() == X86::TLS_base_addr64; - - bool needsPadding = MI.getOpcode() == X86::TLS_addr64; - - MCContext &context = OutStreamer->getContext(); - - if (needsPadding) - EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + MCContext &Ctx = OutStreamer->getContext(); MCSymbolRefExpr::VariantKind SRVK; switch (MI.getOpcode()) { @@ -721,51 +703,86 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, llvm_unreachable("unexpected opcode"); } - MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)); - const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context); - - MCInst LEA; - if (is64Bits) { - LEA.setOpcode(X86::LEA64r); - LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest - LEA.addOperand(MCOperand::createReg(X86::RIP)); // base - LEA.addOperand(MCOperand::createImm(1)); // scale - LEA.addOperand(MCOperand::createReg(0)); // index - LEA.addOperand(MCOperand::createExpr(symRef)); // disp - LEA.addOperand(MCOperand::createReg(0)); // seg - } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) { - LEA.setOpcode(X86::LEA32r); - LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest - LEA.addOperand(MCOperand::createReg(X86::EBX)); // base - LEA.addOperand(MCOperand::createImm(1)); // scale - LEA.addOperand(MCOperand::createReg(0)); // index - LEA.addOperand(MCOperand::createExpr(symRef)); // disp - LEA.addOperand(MCOperand::createReg(0)); // seg + const MCSymbolRefExpr *Sym = MCSymbolRefExpr::create( + MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)), SRVK, Ctx); + + // As of binutils 2.32, ld has a bogus TLS relaxation error when the GD/LD + // code sequence using R_X86_64_GOTPCREL (instead of R_X86_64_GOTPCRELX) is + // attempted to be relaxed to IE/LE (binutils PR24784). Work around the bug by + // only using GOT when GOTPCRELX is enabled. + // TODO Delete the workaround when GOTPCRELX becomes commonplace. 
+ bool UseGot = MMI->getModule()->getRtLibUseGOT() && + Ctx.getAsmInfo()->canRelaxRelocations(); + + if (Is64Bits) { + bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD; + if (NeedsPadding) + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::LEA64r) + .addReg(X86::RDI) + .addReg(X86::RIP) + .addImm(1) + .addReg(0) + .addExpr(Sym) + .addReg(0)); + const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("__tls_get_addr"); + if (NeedsPadding) { + if (!UseGot) + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); + } + if (UseGot) { + const MCExpr *Expr = MCSymbolRefExpr::create( + TlsGetAddr, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + EmitAndCountInstruction(MCInstBuilder(X86::CALL64m) + .addReg(X86::RIP) + .addImm(1) + .addReg(0) + .addExpr(Expr) + .addReg(0)); + } else { + EmitAndCountInstruction( + MCInstBuilder(X86::CALL64pcrel32) + .addExpr(MCSymbolRefExpr::create(TlsGetAddr, + MCSymbolRefExpr::VK_PLT, Ctx))); + } } else { - LEA.setOpcode(X86::LEA32r); - LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest - LEA.addOperand(MCOperand::createReg(0)); // base - LEA.addOperand(MCOperand::createImm(1)); // scale - LEA.addOperand(MCOperand::createReg(X86::EBX)); // index - LEA.addOperand(MCOperand::createExpr(symRef)); // disp - LEA.addOperand(MCOperand::createReg(0)); // seg - } - EmitAndCountInstruction(LEA); + if (SRVK == MCSymbolRefExpr::VK_TLSGD && !UseGot) { + EmitAndCountInstruction(MCInstBuilder(X86::LEA32r) + .addReg(X86::EAX) + .addReg(0) + .addImm(1) + .addReg(X86::EBX) + .addExpr(Sym) + .addReg(0)); + } else { + EmitAndCountInstruction(MCInstBuilder(X86::LEA32r) + .addReg(X86::EAX) + .addReg(X86::EBX) + .addImm(1) + .addReg(0) + .addExpr(Sym) + .addReg(0)); + } - if (needsPadding) { - EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); - EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); - EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); + const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("___tls_get_addr"); + if (UseGot) { + const MCExpr *Expr = + MCSymbolRefExpr::create(TlsGetAddr, MCSymbolRefExpr::VK_GOT, Ctx); + EmitAndCountInstruction(MCInstBuilder(X86::CALL32m) + .addReg(X86::EBX) + .addImm(1) + .addReg(0) + .addExpr(Expr) + .addReg(0)); + } else { + EmitAndCountInstruction( + MCInstBuilder(X86::CALLpcrel32) + .addExpr(MCSymbolRefExpr::create(TlsGetAddr, + MCSymbolRefExpr::VK_PLT, Ctx))); + } } - - StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; - MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name); - const MCSymbolRefExpr *tlsRef = - MCSymbolRefExpr::create(tlsGetAddr, MCSymbolRefExpr::VK_PLT, context); - - EmitAndCountInstruction( - MCInstBuilder(is64Bits ? 
X86::CALL64pcrel32 : X86::CALLpcrel32) - .addExpr(tlsRef)); } /// Emit the largest nop instruction smaller than or equal to \p NumBytes @@ -778,7 +795,7 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, unsigned NopSize; unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; - Opc = IndexReg = Displacement = SegmentReg = 0; + IndexReg = Displacement = SegmentReg = 0; BaseReg = X86::RAX; ScaleVal = 1; switch (NumBytes) { @@ -963,6 +980,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I)) MI.addOperand(MaybeOperand.getValue()); + OutStreamer->AddComment("on-fault: " + HandlerLabel->getName()); OutStreamer->EmitInstruction(MI, getSubtargetInfo()); } @@ -1374,7 +1392,8 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { MBB = MBB->getPrevNode(); MBBI = MBB->end(); } - return --MBBI; + --MBBI; + return MBBI; } static const Constant *getConstantFromPool(const MachineInstr &MI, @@ -1668,6 +1687,77 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::TLS_base_addr64: return LowerTlsAddr(MCInstLowering, *MI); + // Loading/storing mask pairs requires two kmov operations. The second one of these + // needs a 2 byte displacement relative to the specified address (with 32 bit spill + // size). The pairs of 1bit masks up to 16 bit masks all use the same spill size, + // they all are stored using MASKPAIR16STORE, loaded using MASKPAIR16LOAD. + // + // The displacement value might wrap around in theory, thus the asserts in both + // cases. + case X86::MASKPAIR16LOAD: { + int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm(); + assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); + const X86RegisterInfo *RI = + MF->getSubtarget().getRegisterInfo(); + unsigned Reg = MI->getOperand(0).getReg(); + unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); + unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); + + // Load the first mask register + MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm); + MIB.addReg(Reg0); + for (int i = 0; i < X86::AddrNumOperands; ++i) { + auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i)); + MIB.addOperand(Op.getValue()); + } + EmitAndCountInstruction(MIB); + + // Load the second mask register of the pair + MIB = MCInstBuilder(X86::KMOVWkm); + MIB.addReg(Reg1); + for (int i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) { + MIB.addImm(Disp + 2); + } else { + auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i)); + MIB.addOperand(Op.getValue()); + } + } + EmitAndCountInstruction(MIB); + return; + } + + case X86::MASKPAIR16STORE: { + int64_t Disp = MI->getOperand(X86::AddrDisp).getImm(); + assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); + const X86RegisterInfo *RI = + MF->getSubtarget().getRegisterInfo(); + unsigned Reg = MI->getOperand(X86::AddrNumOperands).getReg(); + unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); + unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); + + // Store the first mask register + MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MCInstLowering.LowerMachineOperand(MI, MI->getOperand(i)).getValue()); + MIB.addReg(Reg0); + EmitAndCountInstruction(MIB); + + // Store the second mask register of the pair + MIB = MCInstBuilder(X86::KMOVWmk); + for (int i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) { + MIB.addImm(Disp + 
2); + } else { + auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(0 + i)); + MIB.addOperand(Op.getValue()); + } + } + MIB.addReg(Reg1); + EmitAndCountInstruction(MIB); + return; + } + case X86::MOVPC32r: { // This is a pseudo op for a two instruction sequence with a label, which // looks like: @@ -1861,8 +1951,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector Mask; DecodePSHUFBMask(C, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), - !EnablePrintSchedInfo); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); } break; } @@ -1934,8 +2023,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector Mask; DecodeVPERMILPMask(C, ElSize, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), - !EnablePrintSchedInfo); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); } break; } @@ -1966,8 +2054,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), - !EnablePrintSchedInfo); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); } break; } @@ -1984,8 +2071,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector Mask; DecodeVPPERMMask(C, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), - !EnablePrintSchedInfo); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); } break; } @@ -2002,7 +2088,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; if (auto *CF = dyn_cast(C)) { CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false); - OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + OutStreamer->AddComment(CS.str()); } } break; @@ -2099,7 +2185,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } CS << "]"; - OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + OutStreamer->AddComment(CS.str()); } else if (auto *CV = dyn_cast(C)) { CS << "<"; for (int l = 0; l != NumLanes; ++l) { @@ -2111,7 +2197,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } CS << ">"; - OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + OutStreamer->AddComment(CS.str()); } } break; @@ -2198,14 +2284,12 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { printConstant(C, CS); } CS << "]"; - OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + OutStreamer->AddComment(CS.str()); } } MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); - if (MI->getAsmPrinterFlag(MachineInstr::NoSchedComment)) - TmpInst.setFlags(TmpInst.getFlags() | X86::NO_SCHED_INFO); // Stackmap shadows cannot include branch targets, so we can count the bytes // in a call towards the shadow, but must ensure that the no thread returns diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp index 5433033671f3..05f846bfb219 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
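Stepping back to the MASKPAIR16LOAD/MASKPAIR16STORE lowering earlier in this hunk: a mask pair spills as one 32-bit slot, so the two 16-bit halves sit 2 bytes apart and only the AddrDisp operand differs between the two KMOVW instructions. A sketch of the second-kmov operand loop from the load case (the store case is identical except its address operands start at index 0):

    // Re-emit the address operands, bumping only the displacement so the
    // second KMOVWkm touches bytes 2-3 of the 32-bit spill slot.
    for (int i = 0; i < X86::AddrNumOperands; ++i) {
      if (i == X86::AddrDisp)
        MIB.addImm(Disp + 2);
      else
        MIB.addOperand(
            MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i))
                .getValue());
    }

The Disp <= INT32_MAX - 2 asserts exist precisely so this +2 cannot overflow the 32-bit displacement.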
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index e1183bd14796..d7e535598d81 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -1,9 +1,8 @@ //===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp index 5c09597d0442..c6da4b09dd60 100644 --- a/lib/Target/X86/X86MacroFusion.cpp +++ b/lib/Target/X86/X86MacroFusion.cpp @@ -1,9 +1,8 @@ //===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -19,59 +18,29 @@ using namespace llvm; -/// Check if the instr pair, FirstMI and SecondMI, should be fused -/// together. Given SecondMI, when FirstMI is unspecified, then check if -/// SecondMI may be part of a fused pair at all. -static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, - const TargetSubtargetInfo &TSI, - const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - const X86Subtarget &ST = static_cast(TSI); - // Check if this processor supports macro-fusion. - if (!ST.hasMacroFusion()) - return false; +namespace { - enum { - FuseTest, - FuseCmp, - FuseInc - } FuseKind; +// The classification for the first instruction. +enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid }; - unsigned FirstOpcode = FirstMI - ? FirstMI->getOpcode() - : static_cast(X86::INSTRUCTION_LIST_END); - unsigned SecondOpcode = SecondMI.getOpcode(); +// The classification for the second instruction (jump). +enum class JumpKind { + // JE, JL, JG and variants. + ELG, + // JA, JB and variants. + AB, + // JS, JP, JO and variants. + SPO, + // Not a fusable jump. 
+ Invalid, +}; - switch (SecondOpcode) { - default: - return false; - case X86::JE_1: - case X86::JNE_1: - case X86::JL_1: - case X86::JLE_1: - case X86::JG_1: - case X86::JGE_1: - FuseKind = FuseInc; - break; - case X86::JB_1: - case X86::JBE_1: - case X86::JA_1: - case X86::JAE_1: - FuseKind = FuseCmp; - break; - case X86::JS_1: - case X86::JNS_1: - case X86::JP_1: - case X86::JNP_1: - case X86::JO_1: - case X86::JNO_1: - FuseKind = FuseTest; - break; - } +} // namespace - switch (FirstOpcode) { +static FirstInstrKind classifyFirst(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: - return false; + return FirstInstrKind::Invalid; case X86::TEST8rr: case X86::TEST16rr: case X86::TEST32rr: @@ -84,6 +53,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::TEST16mr: case X86::TEST32mr: case X86::TEST64mr: + return FirstInstrKind::Test; case X86::AND16ri: case X86::AND16ri8: case X86::AND16rm: @@ -99,7 +69,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::AND8ri: case X86::AND8rm: case X86::AND8rr: - return true; + return FirstInstrKind::And; case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP16rm: @@ -119,6 +89,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::CMP8rm: case X86::CMP8rr: case X86::CMP8mr: + return FirstInstrKind::Cmp; case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri8_DB: @@ -141,8 +112,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD8ri: + case X86::ADD8ri_DB: case X86::ADD8rm: case X86::ADD8rr: + case X86::ADD8rr_DB: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB16rm: @@ -158,7 +131,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::SUB8ri: case X86::SUB8rm: case X86::SUB8rr: - return FuseKind == FuseCmp || FuseKind == FuseInc; + return FirstInstrKind::ALU; case X86::INC16r: case X86::INC32r: case X86::INC64r: @@ -167,10 +140,87 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::DEC32r: case X86::DEC64r: case X86::DEC8r: - return FuseKind == FuseInc; - case X86::INSTRUCTION_LIST_END: - return true; + return FirstInstrKind::IncDec; + } +} + +static JumpKind classifySecond(const MachineInstr &MI) { + X86::CondCode CC = X86::getCondFromBranch(MI); + if (CC == X86::COND_INVALID) + return JumpKind::Invalid; + + switch (CC) { + default: + return JumpKind::Invalid; + case X86::COND_E: + case X86::COND_NE: + case X86::COND_L: + case X86::COND_LE: + case X86::COND_G: + case X86::COND_GE: + return JumpKind::ELG; + case X86::COND_B: + case X86::COND_BE: + case X86::COND_A: + case X86::COND_AE: + return JumpKind::AB; + case X86::COND_S: + case X86::COND_NS: + case X86::COND_P: + case X86::COND_NP: + case X86::COND_O: + case X86::COND_NO: + return JumpKind::SPO; + } +} + +/// Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const X86Subtarget &ST = static_cast(TSI); + + // Check if this processor supports any kind of fusion. + if (!(ST.hasBranchFusion() || ST.hasMacroFusion())) + return false; + + const JumpKind BranchKind = classifySecond(SecondMI); + + if (BranchKind == JumpKind::Invalid) + return false; // Second cannot be fused with anything. 
+ + if (FirstMI == nullptr) + return true; // We're only checking whether Second can be fused at all. + + const FirstInstrKind TestKind = classifyFirst(*FirstMI); + + if (ST.hasBranchFusion()) { + // Branch fusion can merge CMP and TEST with all conditional jumps. + return (TestKind == FirstInstrKind::Cmp || + TestKind == FirstInstrKind::Test); + } + + if (ST.hasMacroFusion()) { + // Macro Fusion rules are a bit more complex. See Agner Fog's + // Microarchitecture table 9.2 "Instruction Fusion". + switch (TestKind) { + case FirstInstrKind::Test: + case FirstInstrKind::And: + return true; + case FirstInstrKind::Cmp: + case FirstInstrKind::ALU: + return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB; + case FirstInstrKind::IncDec: + return BranchKind == JumpKind::ELG; + case FirstInstrKind::Invalid: + return false; + } } + + llvm_unreachable("unknown branch fusion type"); } namespace llvm { diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h index 97ef1d6d3b61..d4ae54f657a5 100644 --- a/lib/Target/X86/X86MacroFusion.h +++ b/lib/Target/X86/X86MacroFusion.h @@ -1,9 +1,8 @@ //===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp index b56d02b6bfb6..7f75598b0655 100644 --- a/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/lib/Target/X86/X86OptimizeLEAs.cpp @@ -1,9 +1,8 @@ //===- X86OptimizeLEAs.cpp - optimize usage of LEA instructions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -569,11 +568,8 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, unsigned VReg, int64_t AddrDispShift) { DIExpression *Expr = const_cast(MI.getDebugExpression()); - if (AddrDispShift != 0) - Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, AddrDispShift, - DIExpression::NoDeref, - DIExpression::WithStackValue); + Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift); // Replace DBG_VALUE instruction with modified version. MachineBasicBlock *MBB = MI.getParent(); @@ -701,7 +697,7 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { // Remove redundant address calculations. Do it only for -Os/-Oz since only // a code size gain is expected from this part of the pass. 
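Condensing the X86MacroFusion rewrite above: the branch-fusion path (ST.hasBranchFusion) only pairs CMP/TEST with any fusable jump, while the macro-fusion path follows the per-kind matrix from Agner Fog's table 9.2. A hypothetical helper, not part of the patch, that collapses the tail of shouldScheduleAdjacent into one predicate and doubles as a truth table:

    // First instruction      Fusable jump kinds
    // TEST, AND              ELG, AB, SPO (all)
    // CMP, ADD/SUB (ALU)     ELG, AB
    // INC/DEC                ELG
    static bool macroFuses(FirstInstrKind First, JumpKind Branch) {
      switch (First) {
      case FirstInstrKind::Test:
      case FirstInstrKind::And:
        return true;
      case FirstInstrKind::Cmp:
      case FirstInstrKind::ALU:
        return Branch == JumpKind::ELG || Branch == JumpKind::AB;
      case FirstInstrKind::IncDec:
        return Branch == JumpKind::ELG;
      case FirstInstrKind::Invalid:
        return false;
      }
      llvm_unreachable("covered switch");
    }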
-    if (MF.getFunction().optForSize())
+    if (MF.getFunction().hasOptSize())
       Changed |= removeRedundantAddrCalc(LEAs);
   }
 
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 85b9aecc2106..af974c805c36 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -1,9 +1,8 @@
 //===-------- X86PadShortFunction.cpp - pad short functions -----------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -98,7 +97,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
-  if (MF.getFunction().optForSize())
+  if (MF.getFunction().hasOptSize())
     return false;
 
   if (!MF.getSubtarget<X86Subtarget>().padShortFunctions())
@@ -113,14 +112,11 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
 
   bool MadeChange = false;
 
-  MachineBasicBlock *MBB;
-  unsigned int Cycles = 0;
-
   // Pad the identified basic blocks with NOOPs
   for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
        I != ReturnBBs.end(); ++I) {
-    MBB = I->first;
-    Cycles = I->second;
+    MachineBasicBlock *MBB = I->first;
+    unsigned Cycles = I->second;
 
     if (Cycles < Threshold) {
       // BB ends in a return. Skip over any DBG_VALUE instructions
diff --git a/lib/Target/X86/X86PfmCounters.td b/lib/Target/X86/X86PfmCounters.td
index a1a4210b5ebf..5610f4bc8873 100644
--- a/lib/Target/X86/X86PfmCounters.td
+++ b/lib/Target/X86/X86PfmCounters.td
@@ -1,9 +1,8 @@
 //===-- X86PfmCounters.td - X86 Hardware Counters ----------*- tablegen -*-===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
index 355291916ee8..78fede3dcde2 100644
--- a/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -1,9 +1,8 @@
 //===- X86RegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
@@ -160,7 +159,7 @@ const RegisterBankInfo::InstructionMapping &
 X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   const MachineFunction &MF = *MI.getParent()->getParent();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  auto Opc = MI.getOpcode();
+  unsigned Opc = MI.getOpcode();
 
   // Try the default logic for non-generic instructions that are either copies
   // or already have some operands assigned to banks.
@@ -174,17 +173,22 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case TargetOpcode::G_ADD:
   case TargetOpcode::G_SUB:
   case TargetOpcode::G_MUL:
-  case TargetOpcode::G_SHL:
-  case TargetOpcode::G_LSHR:
-  case TargetOpcode::G_ASHR:
     return getSameOperandsMapping(MI, false);
-    break;
   case TargetOpcode::G_FADD:
   case TargetOpcode::G_FSUB:
   case TargetOpcode::G_FMUL:
   case TargetOpcode::G_FDIV:
     return getSameOperandsMapping(MI, true);
-    break;
+  case TargetOpcode::G_SHL:
+  case TargetOpcode::G_LSHR:
+  case TargetOpcode::G_ASHR: {
+    unsigned NumOperands = MI.getNumOperands();
+    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+    auto Mapping = getValueMapping(getPartialMappingIdx(Ty, false), 3);
+    return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
+
+  }
   default:
     break;
   }
diff --git a/lib/Target/X86/X86RegisterBankInfo.h b/lib/Target/X86/X86RegisterBankInfo.h
index e227880427f3..c1f3001c6180 100644
--- a/lib/Target/X86/X86RegisterBankInfo.h
+++ b/lib/Target/X86/X86RegisterBankInfo.h
@@ -1,9 +1,8 @@
 //===- X86RegisterBankInfo ---------------------------------------*- C++ -*-==//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
diff --git a/lib/Target/X86/X86RegisterBanks.td b/lib/Target/X86/X86RegisterBanks.td
index 6d17cd53a0c1..74c515850ab1 100644
--- a/lib/Target/X86/X86RegisterBanks.td
+++ b/lib/Target/X86/X86RegisterBanks.td
@@ -1,9 +1,8 @@
 //=- X86RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 55842a4a2091..2e2f1f9e438a 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -1,9 +1,8 @@
 //===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -164,6 +163,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
   case X86::RFP32RegClassID:
   case X86::RFP64RegClassID:
   case X86::RFP80RegClassID:
+  case X86::VR512_0_15RegClassID:
   case X86::VR512RegClassID:
     // Don't return a super-class that would shrink the spill size.
     // That can happen with the vector and float classes.
@@ -216,6 +216,21 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
   }
 }
 
+bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                                           unsigned DefSubReg,
+                                           const TargetRegisterClass *SrcRC,
+                                           unsigned SrcSubReg) const {
+  // Prevent rewriting a copy where the destination size is larger than the
+  // input size. See PR41619.
+  // FIXME: Should this be factored into the base implementation somehow.
+  if (DefRC->hasSuperClassEq(&X86::GR64RegClass) && DefSubReg == 0 &&
+      SrcRC->hasSuperClassEq(&X86::GR64RegClass) && SrcSubReg == X86::sub_32bit)
+    return false;
+
+  return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
+                                                  SrcRC, SrcSubReg);
+}
+
 const TargetRegisterClass *
 X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
   const Function &F = MF.getFunction();
@@ -497,6 +512,9 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   const X86FrameLowering *TFI = getFrameLowering(MF);
 
+  // Set the floating point control register as reserved.
+  Reserved.set(X86::FPCW);
+
   // Set the stack-pointer register and its aliases as reserved.
   for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
        ++I)
@@ -747,7 +765,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 }
 
-unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const X86FrameLowering *TFI = getFrameLowering(MF);
   return TFI->hasFP(MF) ? FramePtr : StackPtr;
 }
@@ -760,3 +778,12 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
     FrameReg = getX86SubSuperRegister(FrameReg, 32);
   return FrameReg;
 }
+
+unsigned
+X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
+  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+  unsigned StackReg = getStackRegister();
+  if (Subtarget.isTarget64BitILP32())
+    StackReg = getX86SubSuperRegister(StackReg, 32);
+  return StackReg;
+}
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 29401dadead0..b82920898069 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -1,9 +1,8 @@
 //===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -50,7 +49,7 @@ private:
   unsigned BasePtr;
 
 public:
-  X86RegisterInfo(const Triple &TT);
+  explicit X86RegisterInfo(const Triple &TT);
 
   // FIXME: This should be tablegen'd like getDwarfRegNum is
   int getSEHRegNum(unsigned i) const;
@@ -75,6 +74,11 @@ public:
   getLargestLegalSuperClass(const TargetRegisterClass *RC,
                             const MachineFunction &MF) const override;
 
+  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                            unsigned DefSubReg,
+                            const TargetRegisterClass *SrcRC,
+                            unsigned SrcSubReg) const override;
+
   /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
   /// values.
   const TargetRegisterClass *
@@ -129,15 +133,16 @@ public:
                            RegScavenger *RS = nullptr) const override;
 
   // Debug information queries.
-  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  Register getFrameRegister(const MachineFunction &MF) const override;
   unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
-  unsigned getStackRegister() const { return StackPtr; }
-  unsigned getBaseRegister() const { return BasePtr; }
+  unsigned getPtrSizedStackRegister(const MachineFunction &MF) const;
+  Register getStackRegister() const { return StackPtr; }
+  Register getBaseRegister() const { return BasePtr; }
+
   /// Returns physical register used as frame pointer.
   /// This will always returns the frame pointer register, contrary to
   /// getFrameRegister() which returns the "base pointer" in situations
   /// involving a stack, frame and base pointer.
-  unsigned getFramePtr() const { return FramePtr; }
+  Register getFramePtr() const { return FramePtr; }
   // FIXME: Move to FrameInfok
   unsigned getSlotSize() const { return SlotSize; }
 };
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index aa20273f89ab..0528b90c1fd5 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -1,9 +1,8 @@
 //===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -29,6 +28,8 @@ let Namespace = "X86" in {
   def sub_32bit    : SubRegIndex<32>;
   def sub_xmm      : SubRegIndex<128>;
   def sub_ymm      : SubRegIndex<256>;
+  def sub_mask_0   : SubRegIndex<-1>;
+  def sub_mask_1   : SubRegIndex<-1, -1>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -278,7 +279,7 @@ def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, 100, 100]>;
 // pseudo registers, but we still mark them as aliasing FP registers. That
 // way both kinds can be live without exceeding the stack depth. ST registers
 // are only live around inline assembly.
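 // (Editorial sketch: after the rename below the bare name is "st"; operands
 // that encode an explicit stack index print through RSTi as "st(i)", so e.g.
 //   fadd %st, %st(1)
 // shows ST0 bare and ST1 with its index. See the RSTi helper further down.)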
-def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST0 : X86Reg<"st", 0>, DwarfRegNum<[33, 12, 11]>;
 def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
 def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
 def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
@@ -288,7 +289,10 @@ def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
 def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
 
 // Floating-point status word
-def FPSW : X86Reg<"fpsw", 0>;
+def FPSW : X86Reg<"fpsr", 0>;
+
+// Floating-point control word
+def FPCW : X86Reg<"fpcr", 0>;
 
 // Status flags register.
 //
@@ -539,6 +543,9 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
   let isAllocatable = 0;
 }
 
+// Helper to allow %st to print as %st(0) when it's encoded in the instruction.
+def RSTi : RegisterOperand<RST, "printSTiRegOperand">;
+
 // Generic vector registers: VR64 and VR128.
 // Ensure that float types are declared first - only float is legal on SSE1.
 def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
@@ -547,17 +554,6 @@ def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128
                           128, (add FR32)>;
 def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
                           256, (sequence "YMM%u", 0, 15)>;
 
-// Special classes that help the assembly parser choose some alternate
-// instructions to favor 2-byte VEX encodings.
-def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
-                           128, (sequence "XMM%u", 0, 7)>;
-def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
-                           128, (sequence "XMM%u", 8, 15)>;
-def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
-                           256, (sequence "YMM%u", 0, 7)>;
-def VR256H : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
-                           256, (sequence "YMM%u", 8, 15)>;
-
 // Status flags registers.
 def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
   let CopyCost = -1;  // Don't allow copying of status registers.
@@ -576,6 +572,10 @@ def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> {
 def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
                           512, (sequence "ZMM%u", 0, 31)>;
 
+// Represents the lower 16 registers that have VEX/legacy encodable subregs.
+def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+                               512, (sequence "ZMM%u", 0, 15)>;
+
 // Scalar AVX-512 floating point registers.
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; @@ -596,6 +596,16 @@ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} +// Mask register pairs +def KPAIRS : RegisterTuples<[sub_mask_0, sub_mask_1], + [(add K0, K2, K4, K6), (add K1, K3, K5, K7)]>; + +def VK1PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} +def VK2PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} +def VK4PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} +def VK8PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} +def VK16PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} + def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;} def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp index 08994cccb21e..b435b22e8ac7 100644 --- a/lib/Target/X86/X86RetpolineThunks.cpp +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -1,9 +1,8 @@ //======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td index 971a50196e45..7574e4b8f896 100755 --- a/lib/Target/X86/X86SchedBroadwell.td +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -1,9 +1,8 @@ //=- X86SchedBroadwell.td - X86 Broadwell Scheduling ---------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -82,6 +81,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -159,7 +160,6 @@ defm : BWWriteResPair; def : WriteRes; // LEA instructions can't fold loads. defm : BWWriteResPair; // Conditional move. -defm : BWWriteResPair; // // Conditional (CF + ZF flag) move. defm : X86WriteRes; // x87 conditional move. def : WriteRes; // Setcc. @@ -186,7 +186,7 @@ defm : BWWriteResPair; // Integer shifts and rotates. defm : BWWriteResPair; defm : BWWriteResPair; -defm : BWWriteResPair; +defm : BWWriteResPair; defm : BWWriteResPair; // SHLD/SHRD. 
@@ -732,10 +732,10 @@ def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> { } def: InstRW<[BWWriteResGroup20], (instrs CWD, JCXZ, JECXZ, JRCXZ, - ADC8i8, SBB8i8)>; -def: InstRW<[BWWriteResGroup20], (instregex "ADC8ri", - "SBB8ri", - "SET(A|BE)r")>; + ADC8i8, SBB8i8, + ADC16i16, SBB16i16, + ADC32i32, SBB32i32, + ADC64i32, SBB64i32)>; def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> { let Latency = 2; @@ -814,7 +814,6 @@ def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> { let ResourceCycles = [1,1,1,1]; } def: InstRW<[BWWriteResGroup38], (instrs CALL64pcrel32)>; -def: InstRW<[BWWriteResGroup38], (instregex "SET(A|BE)m")>; def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> { let Latency = 4; @@ -890,8 +889,7 @@ def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[BWWriteResGroup47], (instregex "(V?)PCMPGTQ(Y?)rr", - "MUL_(FPrST0|FST0r|FrST0)")>; +def: InstRW<[BWWriteResGroup47], (instregex "MUL_(FPrST0|FST0r|FrST0)")>; def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> { let Latency = 5; @@ -965,6 +963,7 @@ def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> { } def: InstRW<[BWWriteResGroup59], (instrs CVTPS2PDrm, VCVTPS2PDrm, CVTSS2SDrm, VCVTSS2SDrm, + CVTSS2SDrm_Int, VCVTSS2SDrm_Int, VPSLLVQrm, VPSRLVQrm)>; @@ -1103,6 +1102,14 @@ def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> { def: InstRW<[BWWriteResGroup87], (instregex "ROL(8|16|32|64)m(1|i)", "ROR(8|16|32|64)m(1|i)")>; +def BWWriteResGroup87_1 : SchedWriteRes<[BWPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup87_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1, + ROR8r1, ROR16r1, ROR32r1, ROR64r1)>; + def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> { let Latency = 7; let NumMicroOps = 5; @@ -1592,4 +1599,140 @@ def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>; def: InstRW<[WriteZero], (instrs CLC)>; + +// Intruction variants handled by the renamer. These might not need execution +// ports in certain conditions. +// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// section "Haswell and Broadwell Pipeline" > "Register allocation and +// renaming". +// These can be investigated with llvm-exegesis, e.g. 
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=- +// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=- + +def BWWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +def BWWriteZeroIdiom : SchedWriteVariant<[ + SchedVar, [BWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[BWWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def BWWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar, [BWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[BWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, + VXORPDrr)>; + +def BWWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar, [BWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[BWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>; + +def BWWriteVZeroIdiomLogicX : SchedWriteVariant<[ + SchedVar, [BWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[BWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>; + +def BWWriteVZeroIdiomLogicY : SchedWriteVariant<[ + SchedVar, [BWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[BWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>; + +def BWWriteVZeroIdiomALUX : SchedWriteVariant<[ + SchedVar, [BWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[BWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr, + PSUBDrr, VPSUBDrr, + PSUBQrr, VPSUBQrr, + PSUBWrr, VPSUBWrr, + PCMPGTBrr, VPCMPGTBrr, + PCMPGTDrr, VPCMPGTDrr, + PCMPGTWrr, VPCMPGTWrr)>; + +def BWWriteVZeroIdiomALUY : SchedWriteVariant<[ + SchedVar, [BWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[BWWriteVZeroIdiomALUY], (instrs VPSUBBYrr, + VPSUBDYrr, + VPSUBQYrr, + VPSUBWYrr, + VPCMPGTBYrr, + VPCMPGTDYrr, + VPCMPGTWYrr)>; + +def BWWritePCMPGTQ : SchedWriteRes<[BWPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def BWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ + SchedVar, [BWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[BWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr, + VPCMPGTQYrr)>; + + +// CMOVs that use both Z and C flag require an extra uop. +def BWWriteCMOVA_CMOVBErr : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 2; + let ResourceCycles = [1,1]; + let NumMicroOps = 2; +} + +def BWWriteCMOVA_CMOVBErm : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { + let Latency = 7; + let ResourceCycles = [1,1,1]; + let NumMicroOps = 3; +} + +def BWCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar, [BWWriteCMOVA_CMOVBErr]>, + SchedVar +]>; + +def BWCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar, [BWWriteCMOVA_CMOVBErm]>, + SchedVar +]>; + +def : InstRW<[BWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[BWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. 
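+// (Editorial note: COND_A (CF==0 && ZF==0) and COND_BE (CF==1 || ZF==1) are
+// the only conditions that read both CF and the separately-renamed SPAZO
+// flag group, hence the extra uop: e.g. "seta %al" is modeled as 2 uops
+// below where "sete %al" stays at 1. Counts follow Agner Fog's tables;
+// re-measure with llvm-exegesis if in doubt.)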
+def BWWriteSETA_SETBEr : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 2; + let ResourceCycles = [1,1]; + let NumMicroOps = 2; +} + +def BWWriteSETA_SETBEm : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> { + let Latency = 3; + let ResourceCycles = [1,1,1,1]; + let NumMicroOps = 4; +} + +def BWSETA_SETBErr : SchedWriteVariant<[ + SchedVar, [BWWriteSETA_SETBEr]>, + SchedVar +]>; + +def BWSETA_SETBErm : SchedWriteVariant<[ + SchedVar, [BWWriteSETA_SETBEm]>, + SchedVar +]>; + +def : InstRW<[BWSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[BWSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 06a32fb0b1cd..284d1567c5c6 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -1,9 +1,8 @@ //=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -87,6 +86,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -151,7 +152,7 @@ defm : X86WriteRes; // Integer shifts and rotates. defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; // SHLD/SHRD. @@ -164,7 +165,6 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; // Conditional move. -defm : HWWriteResPair; // Conditional (CF + ZF flag) move. defm : X86WriteRes; // x87 conditional move. def : WriteRes; // Setcc. 
def : WriteRes { @@ -1126,7 +1126,6 @@ def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> { let ResourceCycles = [1,1]; } def: InstRW<[HWWriteResGroup35], (instrs CWD, JCXZ, JECXZ, JRCXZ)>; -def: InstRW<[HWWriteResGroup35], (instregex "SET(A|BE)r")>; def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> { let Latency = 7; @@ -1172,7 +1171,6 @@ def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> { let ResourceCycles = [1,1,1,1]; } def: InstRW<[HWWriteResGroup45], (instrs CALL64pcrel32)>; -def: InstRW<[HWWriteResGroup45], (instregex "SET(A|BE)m")>; def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> { let Latency = 8; @@ -1182,6 +1180,14 @@ def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> { def: InstRW<[HWWriteResGroup46], (instregex "ROL(8|16|32|64)m(1|i)", "ROR(8|16|32|64)m(1|i)")>; +def HWWriteResGroup46_1 : SchedWriteRes<[HWPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[HWWriteResGroup46_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1, + ROR8r1, ROR16r1, ROR32r1, ROR64r1)>; + def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> { let Latency = 8; let NumMicroOps = 5; @@ -1391,8 +1397,8 @@ def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { let ResourceCycles = [1,1,1]; } def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDirm, - CVTSD2SSrm, - VCVTSD2SSrm)>; + CVTSD2SSrm, CVTSD2SSrm_Int, + VCVTSD2SSrm, VCVTSD2SSrm_Int)>; def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> { let Latency = 9; @@ -1442,8 +1448,7 @@ def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr", - "MUL_(FPrST0|FST0r|FrST0)")>; +def: InstRW<[HWWriteResGroup89], (instregex "MUL_(FPrST0|FST0r|FrST0)")>; def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> { let Latency = 11; @@ -1847,4 +1852,170 @@ def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm, def: InstRW<[WriteZero], (instrs CLC)>; + +// Intruction variants handled by the renamer. These might not need execution +// ports in certain conditions. +// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// section "Haswell and Broadwell Pipeline" > "Register allocation and +// renaming". +// These can be investigated with llvm-exegesis, e.g. 
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=- +// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=- + +def HWWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +def HWWriteZeroIdiom : SchedWriteVariant<[ + SchedVar, [HWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[HWWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def HWWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar, [HWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[HWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, + VXORPDrr)>; + +def HWWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar, [HWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[HWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>; + +def HWWriteVZeroIdiomLogicX : SchedWriteVariant<[ + SchedVar, [HWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[HWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>; + +def HWWriteVZeroIdiomLogicY : SchedWriteVariant<[ + SchedVar, [HWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[HWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>; + +def HWWriteVZeroIdiomALUX : SchedWriteVariant<[ + SchedVar, [HWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[HWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr, + PSUBDrr, VPSUBDrr, + PSUBQrr, VPSUBQrr, + PSUBWrr, VPSUBWrr, + PCMPGTBrr, VPCMPGTBrr, + PCMPGTDrr, VPCMPGTDrr, + PCMPGTWrr, VPCMPGTWrr)>; + +def HWWriteVZeroIdiomALUY : SchedWriteVariant<[ + SchedVar, [HWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[HWWriteVZeroIdiomALUY], (instrs VPSUBBYrr, + VPSUBDYrr, + VPSUBQYrr, + VPSUBWYrr, + VPCMPGTBYrr, + VPCMPGTDYrr, + VPCMPGTWYrr)>; + +def HWWritePCMPGTQ : SchedWriteRes<[HWPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def HWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ + SchedVar, [HWWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[HWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr, + VPCMPGTQYrr)>; + + +// The 0x83 ADC/SBB opcodes have special support for immediate 0 to only require +// a single uop. It does not apply to the GR8 encoding. And only applies to the +// 8-bit immediate since using larger immediate for 0 would be silly. +// Unfortunately, this optimization does not apply to the AX/EAX/RAX short +// encodings we convert to in MCInstLowering so we exclude AX/EAX/RAX here since +// we schedule before that point. +// TODO: Should we disable using the short encodings on these CPUs? +def HWFastADC0 : MCSchedPredicate< + CheckAll<[ + CheckImmOperand<2, 0>, // Second MCOperand is Imm and has value 0. + CheckNot>, // First MCOperand is not register AX + CheckNot>, // First MCOperand is not register EAX + CheckNot> // First MCOperand is not register RAX + ]> +>; + +def HWWriteADC0 : SchedWriteRes<[HWPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def HWWriteADC : SchedWriteVariant<[ + SchedVar, + SchedVar +]>; + +def : InstRW<[HWWriteADC], (instrs ADC16ri8, ADC32ri8, ADC64ri8, + SBB16ri8, SBB32ri8, SBB64ri8)>; + +// CMOVs that use both Z and C flag require an extra uop. 
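+// (Editorial note: e.g. "cmova %ecx, %eax" must merge CF with the SPAZO
+// group before the select, so it is modeled as 3 uops below, versus the
+// 2-uop WriteCMOV used for "cmove %ecx, %eax". Illustrative; derived from
+// Agner Fog's Haswell tables.)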
+def HWWriteCMOVA_CMOVBErr : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 3; + let ResourceCycles = [1,2]; + let NumMicroOps = 3; +} + +def HWWriteCMOVA_CMOVBErm : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> { + let Latency = 8; + let ResourceCycles = [1,1,2]; + let NumMicroOps = 4; +} + +def HWCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar, [HWWriteCMOVA_CMOVBErr]>, + SchedVar +]>; + +def HWCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar, [HWWriteCMOVA_CMOVBErm]>, + SchedVar +]>; + +def : InstRW<[HWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[HWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. +def HWWriteSETA_SETBEr : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 2; + let ResourceCycles = [1,1]; + let NumMicroOps = 2; +} + +def HWWriteSETA_SETBEm : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> { + let Latency = 3; + let ResourceCycles = [1,1,1,1]; + let NumMicroOps = 4; +} + +def HWSETA_SETBErr : SchedWriteVariant<[ + SchedVar, [HWWriteSETA_SETBEr]>, + SchedVar +]>; + +def HWSETA_SETBErm : SchedWriteVariant<[ + SchedVar, [HWWriteSETA_SETBEm]>, + SchedVar +]>; + +def : InstRW<[HWSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[HWSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td index 1c7f24375f61..41bd776648f7 100644 --- a/lib/Target/X86/X86SchedPredicates.td +++ b/lib/Target/X86/X86SchedPredicates.td @@ -1,9 +1,8 @@ //===-- X86SchedPredicates.td - X86 Scheduling Predicates --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -61,3 +60,27 @@ def IsThreeOperandsLEABody : // X86GenInstrInfo. def IsThreeOperandsLEAFn : TIIPredicate<"isThreeOperandsLEA", IsThreeOperandsLEABody>; + +// A predicate to check for COND_A and COND_BE CMOVs which have an extra uop +// on recent Intel CPUs. +def IsCMOVArr_Or_CMOVBErr : CheckAny<[ + CheckImmOperand_s<3, "X86::COND_A">, + CheckImmOperand_s<3, "X86::COND_BE"> +]>; + +def IsCMOVArm_Or_CMOVBErm : CheckAny<[ + CheckImmOperand_s<7, "X86::COND_A">, + CheckImmOperand_s<7, "X86::COND_BE"> +]>; + +// A predicate to check for COND_A and COND_BE SETCCs which have an extra uop +// on recent Intel CPUs. +def IsSETAr_Or_SETBEr : CheckAny<[ + CheckImmOperand_s<1, "X86::COND_A">, + CheckImmOperand_s<1, "X86::COND_BE"> +]>; + +def IsSETAm_Or_SETBEm : CheckAny<[ + CheckImmOperand_s<5, "X86::COND_A">, + CheckImmOperand_s<5, "X86::COND_BE"> +]>; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index 9dbf0976989f..d40bdf728a48 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -1,9 +1,8 @@ //=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -77,6 +76,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -159,7 +160,6 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; // Conditional move. -defm : SBWriteResPair; // Conditional (CF + ZF flag) move. defm : X86WriteRes; // x87 conditional move. def : WriteRes; // Setcc. def : WriteRes { @@ -615,13 +615,6 @@ def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr, MMX_PSIGNDrr, MMX_PSIGNWrr)>; -def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SBWriteResGroup9], (instregex "SET(A|BE)r")>; - def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> { let Latency = 2; let NumMicroOps = 2; @@ -705,12 +698,6 @@ def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> { } def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>; -def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> { - let Latency = 5; - let NumMicroOps = 1; - let ResourceCycles = [1]; -} - def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> { let Latency = 5; let NumMicroOps = 1; @@ -772,13 +759,6 @@ def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> { } def: InstRW<[SBWriteResGroup41], (instrs FNINIT)>; -def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { - let Latency = 3; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[SBWriteResGroup43], (instregex "SET(A|BE)m")>; - def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> { let Latency = 5; let NumMicroOps = 4; @@ -1148,6 +1128,12 @@ def SBWriteFZeroIdiom : SchedWriteVariant<[ def : InstRW<[SBWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr)>; +def SBWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar, [SBWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SBWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>; + def SBWriteVZeroIdiomLogicX : SchedWriteVariant<[ SchedVar, [SBWriteZeroLatency]>, SchedVar @@ -1166,10 +1152,68 @@ def : InstRW<[SBWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr, PCMPGTDrr, VPCMPGTDrr, PCMPGTWrr, VPCMPGTWrr)>; +def SBWritePCMPGTQ : SchedWriteRes<[SBPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + def SBWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ SchedVar, [SBWriteZeroLatency]>, - SchedVar + SchedVar ]>; def : InstRW<[SBWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr)>; +// CMOVs that use both Z and C flag require an extra uop. 
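+// (Editorial note: same CF-vs-SPAZO split as on later cores; e.g.
+// "cmovbe %ecx, %eax" is modeled as 3 uops on SBPort05/SBPort015 below,
+// one more than a plain register CMOV. Hedged from Agner Fog's tables.)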
+def SBWriteCMOVA_CMOVBErr : SchedWriteRes<[SBPort05,SBPort015]> { + let Latency = 3; + let ResourceCycles = [2,1]; + let NumMicroOps = 3; +} + +def SBWriteCMOVA_CMOVBErm : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> { + let Latency = 8; + let ResourceCycles = [1,2,1]; + let NumMicroOps = 4; +} + +def SBCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar, [SBWriteCMOVA_CMOVBErr]>, + SchedVar +]>; + +def SBCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar, [SBWriteCMOVA_CMOVBErm]>, + SchedVar +]>; + +def : InstRW<[SBCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[SBCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. +def SBWriteSETA_SETBEr : SchedWriteRes<[SBPort05]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SBWriteSETA_SETBEm : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 3; + let ResourceCycles = [1,1,2]; + let NumMicroOps = 4; +} + +def SBSETA_SETBErr : SchedWriteVariant<[ + SchedVar, [SBWriteSETA_SETBEr]>, + SchedVar +]>; + +def SBSETA_SETBErm : SchedWriteVariant<[ + SchedVar, [SBWriteSETA_SETBEm]>, + SchedVar +]>; + +def : InstRW<[SBSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[SBSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index 2c9eb7516085..8f3e4ae62d53 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -1,9 +1,8 @@ //=- X86SchedSkylake.td - X86 Skylake Client Scheduling ------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -81,6 +80,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -157,7 +158,6 @@ defm : SKLWriteResPair; def : WriteRes; // LEA instructions can't fold loads. defm : SKLWriteResPair; // Conditional move. -defm : SKLWriteResPair; // Conditional (CF + ZF flag) move. defm : X86WriteRes; // x87 conditional move. def : WriteRes; // Setcc. def : WriteRes { @@ -183,7 +183,7 @@ defm : SKLWriteResPair; // Integer shifts and rotates. defm : SKLWriteResPair; defm : SKLWriteResPair; -defm : SKLWriteResPair; +defm : SKLWriteResPair; defm : SKLWriteResPair; // SHLD/SHRD. 
@@ -659,8 +659,7 @@ def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup9], (instregex "(V?)PADD(B|D|Q|W)(Y?)rr", - "VPBLENDD(Y?)rri", - "(V?)PSUB(B|D|Q|W)(Y?)rr")>; + "VPBLENDD(Y?)rri")>; def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> { let Latency = 1; @@ -698,13 +697,6 @@ def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> { def: InstRW<[SKLWriteResGroup14], (instrs FDECSTP, MMX_MOVDQ2Qrr)>; -def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SKLWriteResGroup15], (instregex "SET(A|BE)r")>; - def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -735,9 +727,10 @@ def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> { } def: InstRW<[SKLWriteResGroup23], (instrs CWD, JCXZ, JECXZ, JRCXZ, - ADC8i8, SBB8i8)>; -def: InstRW<[SKLWriteResGroup23], (instregex "ADC8ri", - "SBB8ri")>; + ADC8i8, SBB8i8, + ADC16i16, SBB16i16, + ADC32i32, SBB32i32, + ADC64i32, SBB64i32)>; def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> { let Latency = 2; @@ -776,8 +769,7 @@ def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup30], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)", - "VPBROADCAST(B|W)rr", - "(V?)PCMPGTQ(Y?)rr")>; + "VPBROADCAST(B|W)rr")>; def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> { let Latency = 3; @@ -839,13 +831,6 @@ def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> { } def: InstRW<[SKLWriteResGroup43], (instrs FNSTSWm)>; -def SKLWriteResGroup44 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> { - let Latency = 3; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[SKLWriteResGroup44], (instregex "SET(A|BE)m")>; - def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> { let Latency = 3; let NumMicroOps = 4; @@ -1183,6 +1168,14 @@ def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06 def: InstRW<[SKLWriteResGroup100], (instregex "ROL(8|16|32|64)m(1|i)", "ROR(8|16|32|64)m(1|i)")>; +def SKLWriteResGroup100_1 : SchedWriteRes<[SKLPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup100_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1, + ROR8r1, ROR16r1, ROR32r1, ROR64r1)>; + def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> { let Latency = 7; let NumMicroOps = 5; @@ -1747,4 +1740,150 @@ def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>; def: InstRW<[WriteZero], (instrs CLC)>; + +// Intruction variants handled by the renamer. These might not need execution +// ports in certain conditions. +// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// section "Skylake Pipeline" > "Register allocation and renaming". +// These can be investigated with llvm-exegesis, e.g. 
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=- +// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=- + +def SKLWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +def SKLWriteZeroIdiom : SchedWriteVariant<[ + SchedVar, [SKLWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKLWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def SKLWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar, [SKLWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKLWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, + VXORPDrr)>; + +def SKLWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar, [SKLWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKLWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>; + +def SKLWriteVZeroIdiomLogicX : SchedWriteVariant<[ + SchedVar, [SKLWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKLWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>; + +def SKLWriteVZeroIdiomLogicY : SchedWriteVariant<[ + SchedVar, [SKLWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKLWriteVZeroIdiomLogicY], (instrs VPXORYrr)>; + +def SKLWriteVZeroIdiomALUX : SchedWriteVariant<[ + SchedVar, [SKLWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKLWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr, + PCMPGTDrr, VPCMPGTDrr, + PCMPGTWrr, VPCMPGTWrr)>; + +def SKLWriteVZeroIdiomALUY : SchedWriteVariant<[ + SchedVar, [SKLWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKLWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr, + VPCMPGTDYrr, + VPCMPGTWYrr)>; + +def SKLWritePSUB : SchedWriteRes<[SKLPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def SKLWriteVZeroIdiomPSUB : SchedWriteVariant<[ + SchedVar, [SKLWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKLWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr, + PSUBDrr, VPSUBDrr, + PSUBQrr, VPSUBQrr, + PSUBWrr, VPSUBWrr, + VPSUBBYrr, + VPSUBDYrr, + VPSUBQYrr, + VPSUBWYrr)>; + +def SKLWritePCMPGTQ : SchedWriteRes<[SKLPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def SKLWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ + SchedVar, [SKLWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKLWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr, + VPCMPGTQYrr)>; + + +// CMOVs that use both Z and C flag require an extra uop. +def SKLWriteCMOVA_CMOVBErr : SchedWriteRes<[SKLPort06]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SKLWriteCMOVA_CMOVBErm : SchedWriteRes<[SKLPort23,SKLPort06]> { + let Latency = 7; + let ResourceCycles = [1,2]; + let NumMicroOps = 3; +} + +def SKLCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar, [SKLWriteCMOVA_CMOVBErr]>, + SchedVar +]>; + +def SKLCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar, [SKLWriteCMOVA_CMOVBErm]>, + SchedVar +]>; + +def : InstRW<[SKLCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[SKLCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. 
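+// (Editorial note: e.g. "setbe %al" reads CF and ZF and is modeled as
+// 2 uops on SKLPort06 below, while single-flag-group forms like "sete %al"
+// remain 1 uop. Counts follow Agner Fog / llvm-exegesis measurements.)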
+def SKLWriteSETA_SETBEr : SchedWriteRes<[SKLPort06]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SKLWriteSETA_SETBEm : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> { + let Latency = 3; + let ResourceCycles = [1,1,2]; + let NumMicroOps = 4; +} + +def SKLSETA_SETBErr : SchedWriteVariant<[ + SchedVar, [SKLWriteSETA_SETBEr]>, + SchedVar +]>; + +def SKLSETA_SETBErm : SchedWriteVariant<[ + SchedVar, [SKLWriteSETA_SETBEm]>, + SchedVar +]>; + +def : InstRW<[SKLSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[SKLSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index ec8e4db02d8a..58caf1dacfcb 100755 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -1,9 +1,8 @@ //=- X86SchedSkylake.td - X86 Skylake Server Scheduling ------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -81,6 +80,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -158,7 +159,6 @@ defm : SKXWriteResPair; def : WriteRes; // LEA instructions can't fold loads. defm : SKXWriteResPair; // Conditional move. -defm : SKXWriteResPair; // Conditional (CF + ZF flag) move. defm : X86WriteRes; // x87 conditional move. def : WriteRes; // Setcc. def : WriteRes { @@ -176,7 +176,7 @@ defm : X86WriteRes; // Integer shifts and rotates. defm : SKXWriteResPair; defm : SKXWriteResPair; -defm : SKXWriteResPair; +defm : SKXWriteResPair; defm : SKXWriteResPair; // SHLD/SHRD. 
@@ -680,8 +680,7 @@ def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr", "VPBLENDMD(Z128|Z256)rr", "VPBLENDMQ(Z128|Z256)rr", "VPBLENDMW(Z128|Z256)rr", - "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rr", - "(V?)PSUB(B|D|Q|W)rr", + "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rrk", "VPTERNLOGD(Z|Z128|Z256)rri", "VPTERNLOGQ(Z|Z128|Z256)rri")>; @@ -722,13 +721,6 @@ def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> { def: InstRW<[SKXWriteResGroup14], (instrs FDECSTP, MMX_MOVDQ2Qrr)>; -def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SKXWriteResGroup15], (instregex "SET(A|BE)r")>; - def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -759,9 +751,10 @@ def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> { } def: InstRW<[SKXWriteResGroup23], (instrs CWD, JCXZ, JECXZ, JRCXZ, - ADC8i8, SBB8i8)>; -def: InstRW<[SKXWriteResGroup23], (instregex "ADC8ri", - "SBB8ri")>; + ADC8i8, SBB8i8, + ADC16i16, SBB16i16, + ADC32i32, SBB32i32, + ADC64i32, SBB64i32)>; def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> { let Latency = 2; @@ -834,7 +827,6 @@ def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0 "VPCMPD(Z|Z128|Z256)rri", "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr", "VPCMPGT(B|D|Q|W)(Z|Z128|Z256)rr", - "(V?)PCMPGTQ(Y?)rr", "VPCMPQ(Z|Z128|Z256)rri", "VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri", "VPCMPW(Z|Z128|Z256)rri", @@ -900,13 +892,6 @@ def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> { } def: InstRW<[SKXWriteResGroup45], (instrs FNSTSWm)>; -def SKXWriteResGroup46 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> { - let Latency = 3; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[SKXWriteResGroup46], (instregex "SET(A|BE)m")>; - def SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> { let Latency = 3; let NumMicroOps = 4; @@ -1446,6 +1431,14 @@ def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06 def: InstRW<[SKXWriteResGroup107], (instregex "ROL(8|16|32|64)m(1|i)", "ROR(8|16|32|64)m(1|i)")>; +def SKXWriteResGroup107_1 : SchedWriteRes<[SKXPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup107_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1, + ROR8r1, ROR16r1, ROR32r1, ROR64r1)>; + def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> { let Latency = 7; let NumMicroOps = 5; @@ -2463,4 +2456,171 @@ def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>; def: InstRW<[WriteZero], (instrs CLC)>; + +// Intruction variants handled by the renamer. These might not need execution +// ports in certain conditions. +// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// section "Skylake Pipeline" > "Register allocation and renaming". +// These can be investigated with llvm-exegesis, e.g. 
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=- +// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=- + +def SKXWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +def SKXWriteZeroIdiom : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def SKXWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, + XORPDrr, VXORPDrr, + VXORPSZ128rr, + VXORPDZ128rr)>; + +def SKXWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr, + VXORPSZ256rr, VXORPDZ256rr)>; + +def SKXWriteFZeroIdiomZ : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteFZeroIdiomZ], (instrs VXORPSZrr, VXORPDZrr)>; + +def SKXWriteVZeroIdiomLogicX : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr, + VPXORDZ128rr, VPXORQZ128rr)>; + +def SKXWriteVZeroIdiomLogicY : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteVZeroIdiomLogicY], (instrs VPXORYrr, + VPXORDZ256rr, VPXORQZ256rr)>; + +def SKXWriteVZeroIdiomLogicZ : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteVZeroIdiomLogicZ], (instrs VPXORDZrr, VPXORQZrr)>; + +def SKXWriteVZeroIdiomALUX : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr, + PCMPGTDrr, VPCMPGTDrr, + PCMPGTWrr, VPCMPGTWrr)>; + +def SKXWriteVZeroIdiomALUY : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr, + VPCMPGTDYrr, + VPCMPGTWYrr)>; + +def SKXWritePSUB : SchedWriteRes<[SKXPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def SKXWriteVZeroIdiomPSUB : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; + +def : InstRW<[SKXWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr, VPSUBBZ128rr, + PSUBDrr, VPSUBDrr, VPSUBDZ128rr, + PSUBQrr, VPSUBQrr, VPSUBQZ128rr, + PSUBWrr, VPSUBWrr, VPSUBWZ128rr, + VPSUBBYrr, VPSUBBZ256rr, + VPSUBDYrr, VPSUBDZ256rr, + VPSUBQYrr, VPSUBQZ256rr, + VPSUBWYrr, VPSUBWZ256rr, + VPSUBBZrr, + VPSUBDZrr, + VPSUBQZrr, + VPSUBWZrr)>; +def SKXWritePCMPGTQ : SchedWriteRes<[SKXPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def SKXWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ + SchedVar, [SKXWriteZeroLatency]>, + SchedVar +]>; +def : InstRW<[SKXWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr, + VPCMPGTQYrr)>; + + +// CMOVs that use both Z and C flag require an extra uop. 
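+// (Editorial note: mirrors the client-Skylake model above; e.g.
+// "cmova %rcx, %rax" takes 2 uops on SKXPort06 versus 1 for "cmovne".
+// Illustrative; re-check with llvm-exegesis on actual hardware.)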
+def SKXWriteCMOVA_CMOVBErr : SchedWriteRes<[SKXPort06]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SKXWriteCMOVA_CMOVBErm : SchedWriteRes<[SKXPort23,SKXPort06]> { + let Latency = 7; + let ResourceCycles = [1,2]; + let NumMicroOps = 3; +} + +def SKXCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar, [SKXWriteCMOVA_CMOVBErr]>, + SchedVar +]>; + +def SKXCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar, [SKXWriteCMOVA_CMOVBErm]>, + SchedVar +]>; + +def : InstRW<[SKXCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[SKXCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. +def SKXWriteSETA_SETBEr : SchedWriteRes<[SKXPort06]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SKXWriteSETA_SETBEm : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> { + let Latency = 3; + let ResourceCycles = [1,1,2]; + let NumMicroOps = 4; +} + +def SKXSETA_SETBErr : SchedWriteVariant<[ + SchedVar, [SKXWriteSETA_SETBEr]>, + SchedVar +]>; + +def SKXSETA_SETBErm : SchedWriteVariant<[ + SchedVar, [SKXWriteSETA_SETBEm]>, + SchedVar +]>; + +def : InstRW<[SKXSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[SKXSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 25aa83f96d3a..55ca85ec1e3d 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -1,9 +1,8 @@ //===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -18,6 +17,12 @@ def ReadAfterVecLd : SchedRead; def ReadAfterVecXLd : SchedRead; def ReadAfterVecYLd : SchedRead; +// Instructions that move data between general purpose registers and vector +// registers may be subject to extra latency due to data bypass delays. +// This SchedRead describes a bypass delay caused by data being moved from the +// integer unit to the floating point unit. +def ReadInt2Fpu : SchedRead; + // Instructions with both a load and a store folded are modeled as a folded // load + WriteRMW. def WriteRMW : SchedWrite; @@ -158,7 +163,6 @@ defm WritePOPCNT : X86SchedWritePair; // Bit population count. defm WriteLZCNT : X86SchedWritePair; // Leading zero count. defm WriteTZCNT : X86SchedWritePair; // Trailing zero count. defm WriteCMOV : X86SchedWritePair; // Conditional move. -defm WriteCMOV2 : X86SchedWritePair; // Conditional (CF + ZF flag) move. def WriteFCMOV : SchedWrite; // X87 conditional move. def WriteSETCC : SchedWrite; // Set register based on condition code. def WriteSETCCStore : SchedWrite; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 1589ff2ef402..b0334655de7e 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -1,9 +1,8 @@ //===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -47,6 +46,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when dispatched by the schedulers. @@ -112,7 +113,6 @@ defm : AtomWriteResPair; defm : AtomWriteResPair; -defm : AtomWriteResPair; defm : X86WriteRes; // x87 conditional move. def : WriteRes; @@ -740,7 +740,7 @@ def AtomWrite01_45 : SchedWriteRes<[AtomPort01]> { let Latency = 45; let ResourceCycles = [45]; } -def : InstRW<[AtomWrite01_45], (instrs MONITORrrr)>; +def : InstRW<[AtomWrite01_45], (instrs MONITOR32rrr, MONITOR64rrr)>; def AtomWrite01_46 : SchedWriteRes<[AtomPort01]> { let Latency = 46; diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td index 5798e1b2671b..8cc01c3acece 100644 --- a/lib/Target/X86/X86ScheduleBdVer2.td +++ b/lib/Target/X86/X86ScheduleBdVer2.td @@ -1,9 +1,8 @@ //=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -209,7 +208,10 @@ multiclass __pdWriteResPair; } @@ -218,7 +220,7 @@ multiclass PdWriteResExPair Res = [], int UOps = 1, int LoadUOps = 0> { defm : __pdWriteResPair; + /*LoadLat*/4, /*LoadRes*/3, LoadUOps>; } multiclass PdWriteResXMMPair Res = [], int UOps = 1, int LoadUOps = 0> { defm : __pdWriteResPair; + /*LoadLat*/5, /*LoadRes*/3, LoadUOps>; } multiclass PdWriteResYMMPair ExePorts, int Lat, - list Res, int UOps = 2, + list Res = [], int UOps = 2, int LoadUOps = 0> { defm : __pdWriteResPair; + /*LoadLat*/5, /*LoadRes*/3, LoadUOps>; } //===----------------------------------------------------------------------===// @@ -251,6 +253,11 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +// Transfer from int domain to ivec domain incurs additional latency of 8..10cy +// Reference: Agner, Microarchitecture, "AMD Bulldozer, Piledriver, Steamroller +// and Excavator pipeline", "Data delay between different execution domains" +def : ReadAdvance; + // A folded store needs a cycle on the PdStore for the store data. def : WriteRes; @@ -258,15 +265,15 @@ def : WriteRes; // Loads, stores, and moves, not folded with other operations. //////////////////////////////////////////////////////////////////////////////// -def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 5; let ResourceCycles = [2]; } def : WriteRes; def : WriteRes; -def : WriteRes; +def : WriteRes { let ResourceCycles = [2]; } // Load/store MXCSR. // FIXME: These are copy and pasted from WriteLoad/Store. def : WriteRes { let Latency = 5; } -def : WriteRes { let NumMicroOps = 2; } +def : WriteRes { let NumMicroOps = 2; let ResourceCycles = [18]; } // Treat misc copies as a move. 
def : InstRW<[WriteMove], (instrs COPY)>; @@ -300,6 +307,7 @@ def : InstRW<[PdWriteXLAT], (instrs XLAT)>; def PdWriteLARrr : SchedWriteRes<[PdEX01]> { let Latency = 184; + let ResourceCycles = [375]; let NumMicroOps = 45; } def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr", @@ -307,22 +315,31 @@ def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr", // Nops don't have dependencies, so there's no actual latency, but we set this // to '1' to tell the scheduler that the nop uses an ALU slot for a cycle. -def : WriteRes; +def : WriteRes { let ResourceCycles = [2]; } //////////////////////////////////////////////////////////////////////////////// // Arithmetic. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResExPair; +defm : PdWriteResExPair; + +def PdWriteALURMW : SchedWriteRes<[PdLoad, PdEX01, PdStore]> { + let Latency = 6; + let ResourceCycles = [3, 2, 1]; + let NumMicroOps = 1; +} +def : SchedAlias; def PdWriteLXADD : SchedWriteRes<[PdEX01]> { let Latency = 6; + let ResourceCycles = [88]; let NumMicroOps = 4; } def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>; def PdWriteBMI1 : SchedWriteRes<[PdEX01]> { let Latency = 2; + let ResourceCycles = [2]; let NumMicroOps = 2; } def : InstRW<[PdWriteBMI1], @@ -332,8 +349,9 @@ def : InstRW<[PdWriteBMI1], BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr, TZMSK32rr, TZMSK64rr)>; -def PdWriteBMI1m : SchedWriteRes<[PdEX01]> { +def PdWriteBMI1m : SchedWriteRes<[PdLoad, PdEX01]> { let Latency = 6; + let ResourceCycles = [3, 3]; let NumMicroOps = 2; } def : InstRW<[PdWriteBMI1m], @@ -345,26 +363,34 @@ def : InstRW<[PdWriteBMI1m], defm : PdWriteResExPair; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +def PdWriteADCSBB64ri32 : SchedWriteRes<[PdEX01]> { + let ResourceCycles = [3]; +} +def : InstRW<[PdWriteADCSBB64ri32], (instrs ADC64ri32, SBB64ri32)>; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [3]; let NumMicroOps = 3; } def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>; def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [23]; let NumMicroOps = 5; } def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>; def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [21]; let NumMicroOps = 6; } def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm], @@ -372,42 +398,40 @@ def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm], def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [26]; let NumMicroOps = 18; } def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>; def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [69]; let NumMicroOps = 22; } def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>; -def PdWriteXCHG16rr : SchedWriteRes<[PdEX1]> { - let Latency = 2; - let NumMicroOps = 2; -} -def : InstRW<[PdWriteXCHG16rr], (instrs XCHG16rr)>; - def PdWriteXADD : SchedWriteRes<[PdEX1]> { - let Latency = 2; - let NumMicroOps = 4; + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 2; } def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>; def PdWriteXADDm : SchedWriteRes<[PdEX1]> { -let Latency = 6; -let NumMicroOps = 4; + let Latency = 6; + let ResourceCycles = [20]; + let 
NumMicroOps = 4; } def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>; -defm : PdWriteResExPair; -defm : PdWriteResExPair; -defm : PdWriteResExPair; -defm : PdWriteResExPair; -defm : PdWriteResExPair; -defm : PdWriteResExPair; -defm : PdWriteResExPair; -defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; defm : PdWriteResExPair; defm : PdWriteResExPair; defm : X86WriteResUnsupported; // BMI2 MULX @@ -422,36 +446,48 @@ defm : PdWriteResExPair; defm : PdWriteResExPair; -defm : PdWriteResExPair; +defm : PdWriteResExPair; def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> { let Latency = 5; - let ResourceCycles = [4]; + let ResourceCycles = [10]; let NumMicroOps = 5; } def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>; def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> { let Latency = 6; - let ResourceCycles = [4]; + let ResourceCycles = [12]; let NumMicroOps = 7; } def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>; def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> { let Latency = 10; - let ResourceCycles = [4]; + let ResourceCycles = [17]; let NumMicroOps = 11; } def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>; defm : PdWriteResExPair; // Conditional move. -defm : PdWriteResExPair; // Conditional (CF + ZF flag) move. -def : InstRW<[WriteCMOV2.Folded], (instrs CMOVG16rm, CMOVG32rm, CMOVG64rm, - CMOVGE16rm, CMOVGE32rm, CMOVGE64rm, - CMOVL16rm, CMOVL32rm, CMOVL64rm, - CMOVLE16rm, CMOVLE32rm, CMOVLE64rm)>; +def PdWriteCMOVm : SchedWriteRes<[PdLoad, PdEX01]> { + let Latency = 5; + let ResourceCycles = [3, 3]; + let NumMicroOps = 2; +} + +def PdWriteCMOVmVar : SchedWriteVariant<[ + SchedVar>, [PdWriteCMOVm]>, + SchedVar>, [PdWriteCMOVm]>, + SchedVar>, [PdWriteCMOVm]>, + SchedVar>, [PdWriteCMOVm]>, + SchedVar>, [PdWriteCMOVm]>, + SchedVar>, [PdWriteCMOVm]>, + SchedVar +]>; + +def : InstRW<[PdWriteCMOVmVar], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; defm : PdWriteRes; // x87 conditional move. 
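// (Editor's note: PdWriteCMOVmVar above selects on the CMOV's condition-code
// immediate. Upstream sched models express such checks with CheckImmOperand_s;
// one arm looks roughly like the sketch below. The operand index is an
// assumption for the rm form, where the memory reference occupies five
// operand slots:
//
//   SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_BE">>,
//            [PdWriteCMOVm]>,
//
// with NoSchedPred falling through to WriteCMOV.Folded for the cheap
// condition codes.)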
@@ -462,107 +498,143 @@ def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> { let ResourceCycles = [2]; let NumMicroOps = 2; } -def : InstRW<[PdWriteSETGEmSETGmSETLEmSETLm], (instrs SETGEm, SETGm, - SETLEm, SETLm)>; -defm : PdWriteRes; +def PdSETGEmSETGmSETLEmSETLm : SchedWriteVariant<[ + SchedVar>, [PdWriteSETGEmSETGmSETLEmSETLm]>, + SchedVar>, [PdWriteSETGEmSETGmSETLEmSETLm]>, + SchedVar>, [PdWriteSETGEmSETGmSETLEmSETLm]>, + SchedVar>, [PdWriteSETGEmSETGmSETLEmSETLm]>, + SchedVar +]>; +def : InstRW<[PdSETGEmSETGmSETLEmSETLm], (instrs SETCCm)>; + +defm : PdWriteRes; -def WriteLAHF : SchedWriteRes<[PdEX01]> { +def PdWriteLAHF : SchedWriteRes<[PdEX01]> { let Latency = 2; + let ResourceCycles = [4]; let NumMicroOps = 4; } -def : InstRW<[WriteLAHF], (instrs LAHF)>; +def : InstRW<[PdWriteLAHF], (instrs LAHF)>; -def WriteSAHF : SchedWriteRes<[PdEX01]> { +def PdWriteSAHF : SchedWriteRes<[PdEX01]> { let Latency = 2; + let ResourceCycles = [2]; let NumMicroOps = 2; } -def : InstRW<[WriteSAHF], (instrs SAHF)>; +def : InstRW<[PdWriteSAHF], (instrs SAHF)>; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +def PdWriteBTSIm : SchedWriteRes<[PdEX01, PdLoad]> { + let Latency = 7; + let ResourceCycles = [42, 1]; + let NumMicroOps = 4; +} +def : SchedAlias; +def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> { + let Latency = 7; + let ResourceCycles = [44, 1]; + let NumMicroOps = 10; +} +def : SchedAlias; // This is for simple LEAs with one or two input operands. // FIXME: SAGU 3-operand LEA def : WriteRes { let NumMicroOps = 2; } // Bit counts. -defm : PdWriteResExPair; -defm : PdWriteResExPair; -defm : PdWriteResExPair; -defm : PdWriteResExPair; -defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; // BMI1 BEXTR, BMI2 BZHI -defm : PdWriteResExPair; -defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; defm : PdWriteResExPair; +def PdWriteBEXTRI : SchedWriteRes<[PdEX01]> { + let Latency = 2; + let ResourceCycles = [4]; + let NumMicroOps = 2; +} +def : InstRW<[PdWriteBEXTRI], (instrs BEXTRI32ri, BEXTRI64ri)>; + +def PdWriteBEXTRIm : SchedWriteRes<[PdEX01]> { + let Latency = 2; + let ResourceCycles = [5]; + let NumMicroOps = 2; +} +def : InstRW<[PdWriteBEXTRIm], (instrs BEXTRI32mi, BEXTRI64mi)>; + //////////////////////////////////////////////////////////////////////////////// // Integer shifts and rotates. 
//////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResExPair; +defm : PdWriteResExPair; defm : PdWriteResExPair; -defm : PdWriteResExPair; +defm : PdWriteResExPair; defm : PdWriteResExPair; def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> { let Latency = 12; + let ResourceCycles = [24]; let NumMicroOps = 26; } def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>; def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> { let Latency = 12; + let ResourceCycles = [23]; let NumMicroOps = 23; } def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>; def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> { let Latency = 11; + let ResourceCycles = [22]; let NumMicroOps = 24; } def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>; def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> { let Latency = 10; + let ResourceCycles = [20]; let NumMicroOps = 22; } def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>; def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> { let Latency = 10; + let ResourceCycles = [19]; let NumMicroOps = 19; } def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>; -def PdWriteRCL32rCLRCL64rCL : SchedWriteRes<[PdEX01]> { +def PdWriteRCL3264rCL : SchedWriteRes<[PdEX01]> { let Latency = 7; + let ResourceCycles = [14]; let NumMicroOps = 17; } -def : InstRW<[PdWriteRCL32rCLRCL64rCL], (instrs RCL32rCL, RCL64rCL)>; +def : InstRW<[PdWriteRCL3264rCL], (instrs RCL32rCL, RCL64rCL)>; -def PdWriteRCR64rCL : SchedWriteRes<[PdEX01]> { +def PdWriteRCR3264rCL : SchedWriteRes<[PdEX01]> { let Latency = 7; + let ResourceCycles = [13]; let NumMicroOps = 16; } -def : InstRW<[PdWriteRCR64rCL], (instrs RCR64rCL)>; - -def PdWriteRCR32rCL : SchedWriteRes<[PdEX01]> { - let Latency = 7; - let NumMicroOps = 16; -} -def : InstRW<[PdWriteRCR32rCL ], (instrs RCR32rCL)>; +def : InstRW<[PdWriteRCR3264rCL], (instrs RCR32rCL, RCR64rCL)>; def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> { let Latency = 7; + let ResourceCycles = [14]; let NumMicroOps = 15; } def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>; @@ -570,31 +642,35 @@ def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>; def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> { let Latency = 9; + let ResourceCycles = [18]; let NumMicroOps = 20; } def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>; def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> { let Latency = 11; + let ResourceCycles = [21]; let NumMicroOps = 21; } def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>; def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> { let Latency = 8; + let ResourceCycles = [15]; let NumMicroOps = 16; } def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>; def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> { let Latency = 13; + let ResourceCycles = [25]; let NumMicroOps = 25; } def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>; // SHLD/SHRD. 
-defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> { let Latency = 3; @@ -604,8 +680,8 @@ def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> { def : InstRW<[PdWriteSHLD32rri8SHRD16rri8 ], (instrs SHLD32rri8, SHRD16rri8)>; def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> { - let Latency = 4; - let ResourceCycles = [8]; + let Latency = 3; + let ResourceCycles = [6]; let NumMicroOps = 7; } def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL, @@ -623,19 +699,20 @@ defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; -def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> { +def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU23, PdFPSTO]> { let Latency = 2; + let ResourceCycles = [1, 3, 1]; let NumMicroOps = 2; } def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>; @@ -649,33 +726,41 @@ defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteResXMMPair; defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; +def PdWriteX87Add: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> { + let Latency = 5; + let ResourceCycles = [3, 1, 10]; +} +def : InstRW<[PdWriteX87Add], (instrs ADD_FI16m, ADD_FI32m, ADD_F32m, ADD_F64m, + SUB_FI16m, SUB_FI32m, SUB_F32m, SUB_F64m, + SUBR_FI16m, SUBR_FI32m, SUBR_F32m, SUBR_F64m)>; + defm : PdWriteResXMMPair; defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; defm : PdWriteResXMMPair; defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; defm : PdWriteResXMMPair; defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; defm : PdWriteResXMMPair; @@ -690,29 +775,35 @@ def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>; defm : PdWriteResXMMPair; defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; +def PdWriteX87Mul: SchedWriteRes<[PdLoad, PdFPU1, PdFPFMA]> { + let Latency = 5; + let ResourceCycles = [3, 1, 10]; +} +def : InstRW<[PdWriteX87Mul], (instrs MUL_FI16m, MUL_FI32m, MUL_F32m, MUL_F64m)>; + defm : PdWriteResXMMPair; defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> { - let Latency = 25; - let ResourceCycles = [1, 3]; + let Latency = 27; + let ResourceCycles = [1, 14]; let NumMicroOps = 
17; } def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>; @@ -722,118 +813,140 @@ defm : PdWriteResXMMPair; defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +def PdWriteX87Div: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> { + let Latency = 9; + let ResourceCycles = [3, 1, 18]; +} +def : InstRW<[PdWriteX87Div], (instrs DIV_FI16m, DIV_FI32m, + DIVR_FI16m, DIVR_FI32m, + DIV_F32m, DIV_F64m, + DIVR_F32m, DIVR_F64m)>; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -def PdWriteVFRCZ : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteVFRCZP : SchedWriteRes<[PdFPU1, PdFPSTO]> { + let Latency = 10; + let ResourceCycles = [2, 1]; + let NumMicroOps = 2; +} +def : InstRW<[PdWriteVFRCZP], (instrs VFRCZPDrr, VFRCZPSrr)>; + +def PdWriteVFRCZS : SchedWriteRes<[PdFPU1, PdFPSTO]> { let Latency = 10; + let ResourceCycles = [10, 1]; let NumMicroOps = 2; } -def : InstRW<[PdWriteVFRCZ], (instrs VFRCZPDrr, VFRCZPSrr, - VFRCZSDrr, VFRCZSSrr)>; +def : InstRW<[PdWriteVFRCZS], (instrs VFRCZSDrr, VFRCZSSrr)>; def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> { let Latency = 15; - let NumMicroOps = 2; + let ResourceCycles = [2, 1]; + let NumMicroOps = 3; } def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm, VFRCZSDrm, VFRCZSSrm)>; def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> { let Latency = 10; - let ResourceCycles = [2, 1]; + let ResourceCycles = [3, 1]; let NumMicroOps = 4; } def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>; def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> { let Latency = 15; - let ResourceCycles = [2, 1]; + let ResourceCycles = [4, 1]; let NumMicroOps = 8; } def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 7; + let ResourceCycles = [1, 3]; let NumMicroOps = 2; } def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : 
PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 2; + let ResourceCycles = [1, 2]; } def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>; def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 7; + let ResourceCycles = [1, 4]; let NumMicroOps = 2; } def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>; def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 4; + let ResourceCycles = [1, 6]; let NumMicroOps = 8; } def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>; def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 8; // 4 + 4 + let ResourceCycles = [1, 8]; let NumMicroOps = 10; } def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>; @@ -842,99 +955,100 @@ def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>; // Conversions. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 6; let NumMicroOps = 2; } def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>; // FIXME: f+3 ST, LD+STC latency -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; // FIXME: .Folded version is one NumMicroOp *less*.. -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; // FIXME: .Folded version is one NumMicroOp *less*.. 
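+// (Editor's note: every Pd*Pair multiclass in this file expands to two
+// WriteRes defs -- the register form and the folded-load form -- which is
+// why they take explicit /*LoadLat*/ and /*LoadRes*/ parameters. Condensed
+// sketch with hypothetical names; the real __pdWriteResPair also threads
+// ResourceCycles and uop counts through:
+//
+//   multiclass ToyWriteResPair<X86FoldableSchedWrite W,
+//                              list<ProcResourceKind> Ports,
+//                              int Lat, int LoadLat> {
+//     def : WriteRes<W, Ports> { let Latency = Lat; }
+//     def : WriteRes<W.Folded, !listconcat([PdLoad], Ports)> {
+//       let Latency = !add(Lat, LoadLat);
+//     }
+//   }
+// )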
-def WriteCVTSI642SDrr : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 13; + let ResourceCycles = [1, 3, 1]; let NumMicroOps = 2; } -def : InstRW<[WriteCVTSI642SDrr], (instrs CVTSI642SDrr, CVTSI642SSrr)>; +def : InstRW<[PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr], (instrs CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr, CVTSI2SSrr)>; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -def WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 6; let NumMicroOps = 2; } -def : InstRW<[WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr, +def : InstRW<[PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr, MMX_CVTPI2PDirr)>; -def WriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 4; let NumMicroOps = 2; } -def : InstRW<[WriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>; +def : InstRW<[PdWriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; defm : X86WriteResUnsupported; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; defm : X86WriteResUnsupported; //////////////////////////////////////////////////////////////////////////////// // Vector integer operations. 
//////////////////////////////////////////////////////////////////////////////// -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> { let NumMicroOps = 8; @@ -948,24 +1062,33 @@ defm : PdWriteRes; defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> { +} +def : InstRW<[PdWriteMOVDQArr], (instrs MOVDQArr)>; + +def PdWriteMOVQ2DQrr : SchedWriteRes<[PdFPU01, PdFPMAL]> { + let Latency = 4; +} +def : InstRW<[PdWriteMOVQ2DQrr], (instrs MMX_MOVQ2DQrr)>; + +defm : PdWriteRes; +defm : PdWriteRes; defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; @@ -978,55 +1101,67 @@ defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; -def JWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]> { +def PdWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPMMA, PdFPMAL]> { let Latency = 4; - let ResourceCycles = [2, 1, 2, 1]; } -def : InstRW<[JWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr, - VPMACSSDQLrr)>; +def : InstRW<[PdWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr, + VPMACSSDQLrr)>; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +def PdWriteVMPSADBW : SchedWriteRes<[PdFPU0, PdFPMMA]> { + let Latency = 8; + let ResourceCycles = [1, 4]; + let NumMicroOps = 10; +} +def : InstRW<[PdWriteVMPSADBW], (instrs VMPSADBWrri)>; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; +def PdWriteVPPERM : SchedWriteRes<[PdFPU01, PdFPMAL]> { + let Latency = 2; + let ResourceCycles = [1, 3]; +} +def : InstRW<[PdWriteVPPERM], (instrs VPPERMrrr, VPPERMrrr_REV)>; + defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResYMMPair; defm : 
X86WriteResPairUnsupported; defm : PdWriteResXMMPair; defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; @@ -1034,14 +1169,15 @@ defm : X86WriteResPairUnsupported; // Vector insert/extract operations. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> { let Latency = 3; + let ResourceCycles = [1, 3]; } def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>; @@ -1049,19 +1185,19 @@ def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>; // SSE42 String instructions. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; //////////////////////////////////////////////////////////////////////////////// // MOVMSK Instructions. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteRes; +defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; defm : X86WriteResUnsupported; // defm : X86WriteResUnsupported; @@ -1079,12 +1215,12 @@ defm : PdWriteResXMMPair; // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResXMMPair; -defm : PdWriteResYMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : PdWriteResXMMPair; -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; @@ -1106,10 +1242,11 @@ def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm, // Carry-less multiplication instructions. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> { - let Latency = 13; + let Latency = 12; + let ResourceCycles = [1, 7]; let NumMicroOps = 6; } def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>; @@ -1120,9 +1257,15 @@ def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>; def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> { let Latency = 3; - let ResourceCycles = [1, 4]; + let ResourceCycles = [1, 2]; +} +def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ)>; + +def PdWriteINSERTQI : SchedWriteRes<[PdFPU01, PdFPMAL]> { + let Latency = 3; + let ResourceCycles = [1, 3]; } -def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ, INSERTQI)>; +def : InstRW<[PdWriteINSERTQI], (instrs INSERTQI)>; //////////////////////////////////////////////////////////////////////////////// // AVX instructions. diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 33a6b01546d7..2d26232b4132 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -1,9 +1,8 @@ //=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -109,6 +108,11 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +/// "Additional 6 cycle transfer operation which moves a floating point +/// operation input value from the integer unit to the floating point unit. +/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2). +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when dispatched by the schedulers. @@ -174,6 +178,8 @@ multiclass JWriteResYMMPair; @@ -215,7 +221,6 @@ defm : JWriteResIntPair; defm : JWriteResIntPair; defm : JWriteResIntPair; // Conditional move. -defm : JWriteResIntPair; // Conditional (CF + ZF flag) move. defm : X86WriteRes; // x87 conditional move. def : WriteRes; // Setcc. def : WriteRes; @@ -262,14 +267,13 @@ defm : X86WriteRes; // Loads, stores, and moves, not folded with other operations. //////////////////////////////////////////////////////////////////////////////// -def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 3; } def : WriteRes; def : WriteRes; def : WriteRes; // Load/store MXCSR. -// FIXME: These are copy and pasted from WriteLoad/Store. -def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 3; } def : WriteRes; // Treat misc copies as a move. @@ -400,8 +404,8 @@ defm : X86WriteResPairUnsupported; defm : JWriteResFpuPair; defm : JWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : JWriteResFpuPair; -defm : JWriteResYMMPair; +defm : JWriteResFpuPair; // +1cy latency. +defm : JWriteResYMMPair; // +1cy latency. defm : X86WriteResPairUnsupported; defm : JWriteResFpuPair; defm : JWriteResYMMPair; @@ -425,12 +429,13 @@ defm : JWriteResFpuPair; defm : JWriteResYMMPair; defm : X86WriteResPairUnsupported; -// FIXME: f+3 ST, LD+STC latency -defm : JWriteResFpuPair; +defm : X86WriteRes; +defm : X86WriteRes; defm : JWriteResFpuPair; defm : JWriteResYMMPair; defm : X86WriteResPairUnsupported; -defm : JWriteResFpuPair; +defm : X86WriteRes; +defm : X86WriteRes; defm : JWriteResFpuPair; defm : JWriteResYMMPair; defm : X86WriteResPairUnsupported; @@ -487,11 +492,11 @@ defm : JWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; +defm : JWriteResFpuPair; // +1cy latency. defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; +defm : JWriteResFpuPair; // +1cy latency. defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; @@ -540,7 +545,7 @@ defm : X86WriteResPairUnsupported; // Vector insert/extract operations. //////////////////////////////////////////////////////////////////////////////// -defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -575,10 +580,10 @@ defm : JWriteResFpuPair; -defm : JWriteResYMMPair; -defm : JWriteResFpuPair; -defm : JWriteResFpuPair; +defm : JWriteResFpuPair; // +1cy latency. +defm : JWriteResYMMPair; // +1cy latency. +defm : JWriteResFpuPair; +defm : JWriteResFpuPair; // +1cy latency. 
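+// (Editor's note: bypass penalties like the int-to-FPU transfer described
+// above are modeled with a *negative* ReadAdvance -- the reader treats the
+// producing write as finishing that many cycles later. Sketch, with -6
+// matching the 6-cycle transfer quoted from the AMDfam16h SOG:
+//
+//   def : ReadAdvance<ReadInt2Fpu, -6>;
+//
+// Models with no measurable penalty would instead use
+// ReadAdvance<ReadInt2Fpu, 0>.)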
defm : X86WriteResPairUnsupported; //////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index fcaff7cf810f..34c251a5c5bb 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -1,9 +1,8 @@ //=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -53,6 +52,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -130,7 +131,6 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; defm : X86WriteRes; // x87 conditional move. def : WriteRes; def : WriteRes { diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index a866f843106b..65f6d89df610 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -1,9 +1,8 @@ //=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -95,6 +94,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // The Integer PRF for Zen is 168 entries, and it holds the architectural and // speculative version of the 64-bit integer registers. // Reference: "Software Optimization Guide for AMD Family 17h Processors" @@ -214,7 +215,6 @@ defm : ZnWriteResPair; defm : ZnWriteResFpuPair; defm : ZnWriteResPair; -defm : ZnWriteResPair; def : WriteRes; def : WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 008a9ec2ba3c..50690953eef5 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -1,9 +1,8 @@ //===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -44,24 +43,6 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( return false; } -namespace { - -// Represents a cover of a buffer of Size bytes with Count() blocks of type AVT -// (of size UBytes() bytes), as well as how many bytes remain (BytesLeft() is -// always smaller than the block size). -struct RepMovsRepeats { - RepMovsRepeats(uint64_t Size) : Size(Size) {} - - uint64_t Count() const { return Size / UBytes(); } - uint64_t BytesLeft() const { return Size % UBytes(); } - uint64_t UBytes() const { return AVT.getSizeInBits() / 8; } - - const uint64_t Size; - MVT AVT = MVT::i8; -}; - -} // namespace - SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val, SDValue Size, unsigned Align, bool isVolatile, @@ -201,98 +182,137 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( return Chain; } -SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( - SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, - MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - // This requires the copy size to be a constant, preferably - // within a subtarget-specific limit. - ConstantSDNode *ConstantSize = dyn_cast(Size); - const X86Subtarget &Subtarget = - DAG.getMachineFunction().getSubtarget(); - if (!ConstantSize) - return SDValue(); - RepMovsRepeats Repeats(ConstantSize->getZExtValue()); - if (!AlwaysInline && Repeats.Size > Subtarget.getMaxInlineSizeThreshold()) +/// Emit a single REP MOVS{B,W,D,Q} instruction. +static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl, SDValue Chain, SDValue Dst, + SDValue Src, SDValue Size, MVT AVT) { + const bool Use64BitRegs = Subtarget.isTarget64BitLP64(); + const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX; + const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI; + const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI; + + SDValue InFlag; + Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = {Chain, DAG.getValueType(AVT), InFlag}; + return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops); +} + +/// Emit a single REP MOVSB instruction for a particular constant size. +static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl, SDValue Chain, SDValue Dst, + SDValue Src, uint64_t Size) { + return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, + DAG.getIntPtrConstant(Size, dl), MVT::i8); +} + +/// Returns the best type to use with repmovs depending on alignment. +static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget, + uint64_t Align) { + assert((Align != 0) && "Align is normalized"); + assert(isPowerOf2_64(Align) && "Align is a power of 2"); + switch (Align) { + case 1: + return MVT::i8; + case 2: + return MVT::i16; + case 4: + return MVT::i32; + default: + return Subtarget.is64Bit() ? MVT::i64 : MVT::i32; + } +} + +/// Returns a REP MOVS instruction, possibly with a few load/stores to implement +/// a constant size memory copy. 
In some cases where we know REP MOVS is +/// inefficient we return an empty SDValue so the calling code can either +/// generate a load/store sequence or call the runtime memcpy function. +static SDValue emitConstantSizeRepmov( + SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, + SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT, + unsigned Align, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { + + /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very + /// efficient. + if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold()) return SDValue(); - /// If not DWORD aligned, it is more efficient to call the library. However - /// if calling the library is not allowed (AlwaysInline), then soldier on as - /// the code generated here is better than the long load-store sequence we - /// would otherwise get. + /// If we have enhanced repmovs we use it. + if (Subtarget.hasERMSB()) + return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size); + + assert(!Subtarget.hasERMSB() && "No efficient RepMovs"); + /// We assume runtime memcpy will do a better job for unaligned copies when + /// ERMS is not present. if (!AlwaysInline && (Align & 3) != 0) return SDValue(); + const MVT BlockType = getOptimalRepmovsType(Subtarget, Align); + const uint64_t BlockBytes = BlockType.getSizeInBits() / 8; + const uint64_t BlockCount = Size / BlockBytes; + const uint64_t BytesLeft = Size % BlockBytes; + SDValue RepMovs = + emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, + DAG.getIntPtrConstant(BlockCount, dl), BlockType); + + /// RepMov can process the whole length. + if (BytesLeft == 0) + return RepMovs; + + assert(BytesLeft && "We have leftover at this point"); + + /// In case we optimize for size we use repmovsb even if it's less efficient + /// so we can save the loads/stores of the leftover. + if (DAG.getMachineFunction().getFunction().hasMinSize()) + return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size); + + // Handle the last 1 - 7 bytes. + SmallVector Results; + Results.push_back(RepMovs); + unsigned Offset = Size - BytesLeft; + EVT DstVT = Dst.getValueType(); + EVT SrcVT = Src.getValueType(); + Results.push_back(DAG.getMemcpy( + Chain, dl, + DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)), + DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)), + DAG.getConstant(BytesLeft, dl, SizeVT), Align, isVolatile, + /*AlwaysInline*/ true, /*isTailCall*/ false, + DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset))); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); +} + +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { // If to a segment-relative address space, use the default lowering. - if (DstPtrInfo.getAddrSpace() >= 256 || - SrcPtrInfo.getAddrSpace() >= 256) + if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256) return SDValue(); - // If the base register might conflict with our physical registers, bail out. + // If the base registers conflict with our physical registers, use the default + // lowering. 
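+  // (Editor's note: REP MOVS implicitly addresses through RDI/RSI and counts
+  // down RCX, so those registers -- and their 32-bit halves -- must not be
+  // live here; that is what the ClobberSet below checks. The C equivalent of
+  // the emitted sequence, illustrative only:
+  //
+  //   void rep_movsb(void *d, const void *s, size_t n) {
+  //     asm volatile("rep movsb" : "+D"(d), "+S"(s), "+c"(n) :: "memory");
+  //   }
+  //
+  // Worked example for emitConstantSizeRepmov above: a constant 23-byte copy
+  // at 4-byte alignment picks MVT::i32, so RCX = 23 / 4 = 5 and the trailing
+  // getMemcpy covers the remaining 3 bytes.)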
const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, X86::ECX, X86::ESI, X86::EDI}; if (isBaseRegConflictPossible(DAG, ClobberSet)) return SDValue(); - // If the target has enhanced REPMOVSB, then it's at least as fast to use - // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle - // BytesLeft. - if (!Subtarget.hasERMSB() && !(Align & 1)) { - if (Align & 2) - // WORD aligned - Repeats.AVT = MVT::i16; - else if (Align & 4) - // DWORD aligned - Repeats.AVT = MVT::i32; - else - // QWORD aligned - Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; - - if (Repeats.BytesLeft() > 0 && - DAG.getMachineFunction().getFunction().optForMinSize()) { - // When aggressively optimizing for size, avoid generating the code to - // handle BytesLeft. - Repeats.AVT = MVT::i8; - } - } - - bool Use64BitRegs = Subtarget.isTarget64BitLP64(); - SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, - DAG.getIntPtrConstant(Repeats.Count(), dl), InFlag); - InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, - Dst, InFlag); - InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RSI : X86::ESI, - Src, InFlag); - InFlag = Chain.getValue(1); - - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue Ops[] = { Chain, DAG.getValueType(Repeats.AVT), InFlag }; - SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget(); - SmallVector Results; - Results.push_back(RepMovs); - if (Repeats.BytesLeft()) { - // Handle the last 1 - 7 bytes. - unsigned Offset = Repeats.Size - Repeats.BytesLeft(); - EVT DstVT = Dst.getValueType(); - EVT SrcVT = Src.getValueType(); - EVT SizeVT = Size.getValueType(); - Results.push_back(DAG.getMemcpy(Chain, dl, - DAG.getNode(ISD::ADD, dl, DstVT, Dst, - DAG.getConstant(Offset, dl, - DstVT)), - DAG.getNode(ISD::ADD, dl, SrcVT, Src, - DAG.getConstant(Offset, dl, - SrcVT)), - DAG.getConstant(Repeats.BytesLeft(), dl, - SizeVT), - Align, isVolatile, AlwaysInline, false, - DstPtrInfo.getWithOffset(Offset), - SrcPtrInfo.getWithOffset(Offset))); - } + /// Handle constant sizes, + if (ConstantSDNode *ConstantSize = dyn_cast(Size)) + return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src, + ConstantSize->getZExtValue(), + Size.getValueType(), Align, isVolatile, + AlwaysInline, DstPtrInfo, SrcPtrInfo); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); + return SDValue(); } diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h index f4a285a5f916..0f2d979f91e3 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.h +++ b/lib/Target/X86/X86SelectionDAGInfo.h @@ -1,9 +1,8 @@ //===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp index 720be8afa62c..a202fc63637b 100644 --- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -1,9 +1,8 @@ //===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h index b08c31935d28..296341517579 100644 --- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h +++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -1,9 +1,8 @@ //===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp index a729161a1beb..40f5dbe57e4b 100644 --- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -1,9 +1,8 @@ //====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -123,10 +122,7 @@ namespace { class X86SpeculativeLoadHardeningPass : public MachineFunctionPass { public: - X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { - initializeX86SpeculativeLoadHardeningPassPass( - *PassRegistry::getPassRegistry()); - } + X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 speculative load hardening"; @@ -661,7 +657,7 @@ X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) { // jmpq *%rax // ``` // We still want to harden the edge to `L1`. 
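  // (Editor's note: the rewrites below follow the upstream refactor that
  // merged the per-condition JCC/SETcc/CMOVcc opcodes into single opcodes
  // carrying the condition code as a trailing immediate operand. The new
  // emission shape, illustrative rather than verbatim:
  //
  //   unsigned Op = X86::getCMovOpcode(RegSizeInBytes); // size picks opcode
  //   BuildMI(MBB, I, DL, TII->get(Op), Dst)
  //       .addReg(A)
  //       .addReg(B)
  //       .addImm(X86::COND_NE);                        // cond is an operand
  //
  // and X86::getCondFromBranch(MI) reads that immediate back off a branch.)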
- if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) { + if (X86::getCondFromBranch(MI) == X86::COND_INVALID) { Info.CondBrs.clear(); Info.UncondBr = &MI; continue; @@ -752,7 +748,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( for (X86::CondCode Cond : Conds) { int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; - auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes); + auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); // Note that we intentionally use an empty debug location so that @@ -760,7 +756,8 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg) .addReg(CurStateReg) - .addReg(PS->PoisonReg); + .addReg(PS->PoisonReg) + .addImm(Cond); // If this is the last cmov and the EFLAGS weren't originally // live-in, mark them as killed. if (!LiveEFLAGS && Cond == Conds.back()) @@ -789,7 +786,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB(); int &SuccCount = SuccCounts[&Succ]; - X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode()); + X86::CondCode Cond = X86::getCondFromBranch(*CondBr); X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond); UncondCodeSeq.push_back(Cond); @@ -1177,12 +1174,13 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( // Now cmov over the predicate if the comparison wasn't equal. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; - auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes); + auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg) .addReg(PS->InitialReg) - .addReg(PS->PoisonReg); + .addReg(PS->PoisonReg) + .addImm(X86::COND_NE); CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true); ++NumInstsInserted; LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n"); @@ -1963,6 +1961,14 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( LLVM_DEBUG( dbgs() << " Skipping hardening base of explicit stack frame load: "; MI.dump(); dbgs() << "\n"); + } else if (BaseMO.getReg() == X86::RSP) { + // Some idempotent atomic operations are lowered directly to a locked + // OR with 0 to the top of stack(or slightly offset from top) which uses an + // explicit RSP register as the base. + assert(IndexMO.getReg() == X86::NoRegister && + "Explicit RSP access with dynamic index!"); + LLVM_DEBUG( + dbgs() << " Cannot harden base of explicit RSP offset in a load!"); } else if (BaseMO.getReg() == X86::RIP || BaseMO.getReg() == X86::NoRegister) { // For both RIP-relative addressed loads or absolute loads, we cannot @@ -2464,7 +2470,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( // If we have no red zones or if the function returns twice (possibly without // using the `ret` instruction) like setjmp, we need to save the expected // return address prior to the call. 
- if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone) || + if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) || MF.exposesReturnsTwice()) { // If we don't have red zones, we need to compute the expected return // address prior to the call and store it in a register that lives across @@ -2546,12 +2552,13 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( // Now conditionally update the predicate state we just extracted if we ended // up at a different return address than expected. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; - auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes); + auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg) .addReg(NewStateReg, RegState::Kill) - .addReg(PS->PoisonReg); + .addReg(PS->PoisonReg) + .addImm(X86::COND_NE); CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true); ++NumInstsInserted; LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n"); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 0c9ce8802e1b..d5bb56603df9 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -1,9 +1,8 @@ //===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,6 +14,7 @@ #include "X86CallLowering.h" #include "X86LegalizerInfo.h" +#include "X86MacroFusion.h" #include "X86RegisterBankInfo.h" #include "X86Subtarget.h" #include "MCTargetDesc/X86BaseInfo.h" @@ -176,10 +176,13 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, if (TM.shouldAssumeDSOLocal(M, GV)) return X86II::MO_NO_FLAG; + // Functions on COFF can be non-DSO local for two reasons: + // - They are marked dllimport + // - They are extern_weak, and a stub is needed if (isTargetCOFF()) { - assert(GV->hasDLLImportStorageClass() && - "shouldAssumeDSOLocal gave inconsistent answer"); - return X86II::MO_DLLIMPORT; + if (GV->hasDLLImportStorageClass()) + return X86II::MO_DLLIMPORT; + return X86II::MO_COFFSTUB; } const Function *F = dyn_cast_or_null(GV); @@ -367,3 +370,8 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const { bool X86Subtarget::enableEarlyIfConversion() const { return hasCMov() && X86EarlyIfConv; } + +void X86Subtarget::getPostRAMutations( + std::vector> &Mutations) const { + Mutations.push_back(createX86MacroFusionDAGMutation()); +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index b1103f823e7f..24ccc9cb7843 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -1,9 +1,8 @@ //===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -89,6 +88,9 @@ protected: /// True if the processor supports X87 instructions. bool HasX87 = false; + /// True if the processor supports CMPXCHG8B. + bool HasCmpxchg8b = false; + /// True if this processor has NOPL instruction /// (generally pentium pro+). bool HasNOPL = false; @@ -295,6 +297,9 @@ protected: /// True if the processor supports macrofusion. bool HasMacroFusion = false; + /// True if the processor supports branch fusion. + bool HasBranchFusion = false; + /// True if the processor has enhanced REP MOVSB/STOSB. bool HasERMSB = false; @@ -348,9 +353,18 @@ protected: /// Processor has AVX-512 Vector Neural Network Instructions bool HasVNNI = false; + /// Processor has AVX-512 bfloat16 floating-point extensions + bool HasBF16 = false; + + /// Processor supports ENQCMD instructions + bool HasENQCMD = false; + /// Processor has AVX-512 Bit Algorithms instructions bool HasBITALG = false; + /// Processor has AVX-512 vp2intersect instructions + bool HasVP2INTERSECT = false; + /// Processor supports MPX - Memory Protection Extensions bool HasMPX = false; @@ -388,6 +402,12 @@ protected: /// Try harder to combine to horizontal vector ops if they are fast. bool HasFastHorizontalOps = false; + /// Prefer a left/right scalar logical shifts pair over a shift+and pair. + bool HasFastScalarShiftMasks = false; + + /// Prefer a left/right vector logical shifts pair over a shift+and pair. + bool HasFastVectorShiftMasks = false; + /// Use a retpoline thunk rather than indirect calls to block speculative /// execution. bool UseRetpolineIndirectCalls = false; @@ -547,6 +567,7 @@ public: void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } bool hasX87() const { return HasX87; } + bool hasCmpxchg8b() const { return HasCmpxchg8b; } bool hasNOPL() const { return HasNOPL; } // SSE codegen depends on cmovs, and all SSE1+ processors support them. // All 64-bit processors support cmov. 
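The X86Subtarget additions above all follow one pattern: a raw feature bit defaulted to false plus a trivial public accessor, with any mode qualification folded into the accessor rather than the bit. A minimal sketch of that pattern (invented class name, not the real X86Subtarget; the hasCmpxchg16b() change in the next hunk is the in-tree instance of folding in a mode check):

    #include <cstdio>

    class SubtargetSketch {
      // Raw bits, normally initialized from the subtarget feature string.
      bool HasCmpxchg8b = false;
      bool HasCmpxchg16b = false;
      bool In64BitMode = false;

    public:
      SubtargetSketch(bool C8b, bool C16b, bool Is64)
          : HasCmpxchg8b(C8b), HasCmpxchg16b(C16b), In64BitMode(Is64) {}
      bool is64Bit() const { return In64BitMode; }
      bool hasCmpxchg8b() const { return HasCmpxchg8b; }
      // The accessor, not the bit, carries the mode restriction:
      // CMPXCHG16B is only encodable in 64-bit mode.
      bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); }
    };

    int main() {
      SubtargetSketch ST(/*C8b=*/true, /*C16b=*/true, /*Is64=*/false);
      printf("cmpxchg16b usable: %d\n", ST.hasCmpxchg16b()); // 0 in 32-bit mode
    }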
@@ -621,7 +642,7 @@ public: int getGatherOverhead() const { return GatherOverhead; } int getScatterOverhead() const { return ScatterOverhead; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } - bool hasCmpxchg16b() const { return HasCmpxchg16b; } + bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); } bool useLeaForSP() const { return UseLeaForSP; } bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; } bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; } @@ -638,7 +659,10 @@ public: bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } bool hasFastBEXTR() const { return HasFastBEXTR; } bool hasFastHorizontalOps() const { return HasFastHorizontalOps; } + bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; } + bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; } bool hasMacroFusion() const { return HasMacroFusion; } + bool hasBranchFusion() const { return HasBranchFusion; } bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } @@ -657,6 +681,8 @@ public: bool hasVLX() const { return HasVLX; } bool hasPKU() const { return HasPKU; } bool hasVNNI() const { return HasVNNI; } + bool hasBF16() const { return HasBF16; } + bool hasVP2INTERSECT() const { return HasVP2INTERSECT; } bool hasBITALG() const { return HasBITALG; } bool hasMPX() const { return HasMPX; } bool hasSHSTK() const { return HasSHSTK; } @@ -669,6 +695,7 @@ public: bool hasSGX() const { return HasSGX; } bool threewayBranchProfitable() const { return ThreewayBranchProfitable; } bool hasINVPCID() const { return HasINVPCID; } + bool hasENQCMD() const { return HasENQCMD; } bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; } bool useRetpolineIndirectBranches() const { return UseRetpolineIndirectBranches; @@ -744,10 +771,6 @@ public: return TargetTriple.isWindowsMSVCEnvironment(); } - bool isTargetKnownWindowsMSVC() const { - return TargetTriple.isKnownWindowsMSVCEnvironment(); - } - bool isTargetWindowsCoreCLR() const { return TargetTriple.isWindowsCoreCLREnvironment(); } @@ -834,11 +857,11 @@ public: /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } - // TODO: Update the regression tests and return true. - bool supportPrintSchedInfo() const override { return false; } - bool enableEarlyIfConversion() const override; + void getPostRAMutations(std::vector> + &Mutations) const override; + AntiDepBreakMode getAntiDepBreakMode() const override { return TargetSubtargetInfo::ANTIDEP_CRITICAL; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index afcb49dc2263..0cbf13899a29 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -1,9 +1,8 @@ //===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,6 +12,7 @@ #include "X86TargetMachine.h" #include "MCTargetDesc/X86MCTargetDesc.h" +#include "TargetInfo/X86TargetInfo.h" #include "X86.h" #include "X86CallLowering.h" #include "X86LegalizerInfo.h" @@ -38,6 +38,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -70,9 +71,10 @@ extern "C" void LLVMInitializeX86Target() { initializeFixupBWInstPassPass(PR); initializeEvexToVexInstPassPass(PR); initializeFixupLEAPassPass(PR); - initializeShadowCallStackPass(PR); + initializeFPSPass(PR); initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); + initializeX86ExpandPseudoPass(PR); initializeX86ExecutionDomainFixPass(PR); initializeX86DomainReassignmentPass(PR); initializeX86AvoidSFBPassPass(PR); @@ -194,7 +196,7 @@ static CodeModel::Model getEffectiveX86CodeModel(Optional CM, bool JIT, bool Is64Bit) { if (CM) { if (*CM == CodeModel::Tiny) - report_fatal_error("Target does not support the tiny CodeModel"); + report_fatal_error("Target does not support the tiny CodeModel", false); return *CM; } if (JIT) @@ -357,6 +359,13 @@ public: return DAG; } + ScheduleDAGInstrs * + createPostMachineScheduler(MachineSchedContext *C) const override { + ScheduleDAGMI *DAG = createGenericSchedPostRA(C); + DAG->addMutation(createX86MacroFusionDAGMutation()); + return DAG; + } + void addIRPasses() override; bool addInstSelector() override; bool addIRTranslator() override; @@ -371,6 +380,8 @@ public: void addPreEmitPass() override; void addPreEmitPass2() override; void addPreSched2() override; + + std::unique_ptr getCSEConfig() const override; }; class X86ExecutionDomainFix : public ExecutionDomainFix { @@ -490,7 +501,6 @@ void X86PassConfig::addPreEmitPass() { addPass(createBreakFalseDeps()); } - addPass(createShadowCallStackPass()); addPass(createX86IndirectBranchTrackingPass()); if (UseVZeroUpper) @@ -512,6 +522,13 @@ void X86PassConfig::addPreEmitPass2() { // correct CFA calculation rule where needed by inserting appropriate CFI // instructions. const Triple &TT = TM->getTargetTriple(); - if (!TT.isOSDarwin() && !TT.isOSWindows()) + const MCAsmInfo *MAI = TM->getMCAsmInfo(); + if (!TT.isOSDarwin() && + (!TT.isOSWindows() || + MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI)) addPass(createCFIInstrInserter()); } + +std::unique_ptr X86PassConfig::getCSEConfig() const { + return getStandardCSEConfigForOpt(TM->getOptLevel()); +} diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index f5b45da0c3dc..b999e2e86af6 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -1,9 +1,8 @@ //===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 505c4fa07b77..92e0779c2e74 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index d045094edb1e..13d7b4ad70d6 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -1,9 +1,8 @@ //===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 36929a4f5439..3dc59aeb263e 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1,9 +1,8 @@ //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -1651,17 +1650,77 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const CostTblEntry SSE2CostTbl[] = { - { ISD::SETCC, MVT::v2i64, 8 }, - { ISD::SETCC, MVT::v4i32, 1 }, - { ISD::SETCC, MVT::v8i16, 1 }, - { ISD::SETCC, MVT::v16i8, 1 }, + unsigned ExtraCost = 0; + if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) { + // Some vector comparison predicates cost extra instructions. 
+ if (MTy.isVector() && + !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || + (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || + ST->hasBWI())) { + switch (cast(I)->getPredicate()) { + case CmpInst::Predicate::ICMP_NE: + // xor(cmpeq(x,y),-1) + ExtraCost = 1; + break; + case CmpInst::Predicate::ICMP_SGE: + case CmpInst::Predicate::ICMP_SLE: + // xor(cmpgt(x,y),-1) + ExtraCost = 1; + break; + case CmpInst::Predicate::ICMP_ULT: + case CmpInst::Predicate::ICMP_UGT: + // cmpgt(xor(x,signbit),xor(y,signbit)) + // xor(cmpeq(pmaxu(x,y),x),-1) + ExtraCost = 2; + break; + case CmpInst::Predicate::ICMP_ULE: + case CmpInst::Predicate::ICMP_UGE: + if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || + (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { + // cmpeq(psubus(x,y),0) + // cmpeq(pminu(x,y),x) + ExtraCost = 1; + } else { + // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) + ExtraCost = 3; + } + break; + default: + break; + } + } + } + + static const CostTblEntry AVX512BWCostTbl[] = { + { ISD::SETCC, MVT::v32i16, 1 }, + { ISD::SETCC, MVT::v64i8, 1 }, + + { ISD::SELECT, MVT::v32i16, 1 }, + { ISD::SELECT, MVT::v64i8, 1 }, }; - static const CostTblEntry SSE42CostTbl[] = { - { ISD::SETCC, MVT::v2f64, 1 }, - { ISD::SETCC, MVT::v4f32, 1 }, - { ISD::SETCC, MVT::v2i64, 1 }, + static const CostTblEntry AVX512CostTbl[] = { + { ISD::SETCC, MVT::v8i64, 1 }, + { ISD::SETCC, MVT::v16i32, 1 }, + { ISD::SETCC, MVT::v8f64, 1 }, + { ISD::SETCC, MVT::v16f32, 1 }, + + { ISD::SELECT, MVT::v8i64, 1 }, + { ISD::SELECT, MVT::v16i32, 1 }, + { ISD::SELECT, MVT::v8f64, 1 }, + { ISD::SELECT, MVT::v16f32, 1 }, + }; + + static const CostTblEntry AVX2CostTbl[] = { + { ISD::SETCC, MVT::v4i64, 1 }, + { ISD::SETCC, MVT::v8i32, 1 }, + { ISD::SETCC, MVT::v16i16, 1 }, + { ISD::SETCC, MVT::v32i8, 1 }, + + { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb + { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb + { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb + { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb }; static const CostTblEntry AVX1CostTbl[] = { @@ -1672,50 +1731,83 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, { ISD::SETCC, MVT::v8i32, 4 }, { ISD::SETCC, MVT::v16i16, 4 }, { ISD::SETCC, MVT::v32i8, 4 }, + + { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd + { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps + { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd + { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps + { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps + { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps }; - static const CostTblEntry AVX2CostTbl[] = { - { ISD::SETCC, MVT::v4i64, 1 }, - { ISD::SETCC, MVT::v8i32, 1 }, - { ISD::SETCC, MVT::v16i16, 1 }, - { ISD::SETCC, MVT::v32i8, 1 }, + static const CostTblEntry SSE42CostTbl[] = { + { ISD::SETCC, MVT::v2f64, 1 }, + { ISD::SETCC, MVT::v4f32, 1 }, + { ISD::SETCC, MVT::v2i64, 1 }, }; - static const CostTblEntry AVX512CostTbl[] = { - { ISD::SETCC, MVT::v8i64, 1 }, - { ISD::SETCC, MVT::v16i32, 1 }, - { ISD::SETCC, MVT::v8f64, 1 }, - { ISD::SETCC, MVT::v16f32, 1 }, + static const CostTblEntry SSE41CostTbl[] = { + { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd + { ISD::SELECT, MVT::v4f32, 1 }, // blendvps + { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb + { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb + { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb + { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb }; - static const CostTblEntry AVX512BWCostTbl[] = { - { ISD::SETCC, MVT::v32i16, 1 }, - { ISD::SETCC, MVT::v64i8, 1 }, + static const CostTblEntry 
SSE2CostTbl[] = { + { ISD::SETCC, MVT::v2f64, 2 }, + { ISD::SETCC, MVT::f64, 1 }, + { ISD::SETCC, MVT::v2i64, 8 }, + { ISD::SETCC, MVT::v4i32, 1 }, + { ISD::SETCC, MVT::v8i16, 1 }, + { ISD::SETCC, MVT::v16i8, 1 }, + + { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd + { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por + { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por + { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por + { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por + }; + + static const CostTblEntry SSE1CostTbl[] = { + { ISD::SETCC, MVT::v4f32, 2 }, + { ISD::SETCC, MVT::f32, 1 }, + + { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps }; if (ST->hasBWI()) if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) + return LT.first * (ExtraCost + Entry->Cost); return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } @@ -1784,6 +1876,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq + { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd + { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq + { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq + { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq }; static const CostTblEntry XOPCostTbl[] = { { ISD::BITREVERSE, MVT::v4i64, 4 }, @@ -1825,6 +1921,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::SSUBSAT, MVT::v32i8, 1 }, { ISD::UADDSAT, MVT::v16i16, 1 }, { ISD::UADDSAT, MVT::v32i8, 1 }, + { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd { ISD::USUBSAT, MVT::v16i16, 1 }, { ISD::USUBSAT, MVT::v32i8, 1 }, { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd @@ -1861,6 +1958,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert @@ -1885,6 +1983,7 @@ int 
X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, }; static const CostTblEntry SSE42CostTbl[] = { { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd + { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ }; @@ -1945,14 +2044,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; static const CostTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::BITREVERSE, MVT::i64, 14 } + { ISD::BITREVERSE, MVT::i64, 14 }, + { ISD::SADDO, MVT::i64, 1 }, + { ISD::UADDO, MVT::i64, 1 }, }; static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets { ISD::BITREVERSE, MVT::i32, 14 }, { ISD::BITREVERSE, MVT::i16, 14 }, - { ISD::BITREVERSE, MVT::i8, 11 } + { ISD::BITREVERSE, MVT::i8, 11 }, + { ISD::SADDO, MVT::i32, 1 }, + { ISD::SADDO, MVT::i16, 1 }, + { ISD::SADDO, MVT::i8, 1 }, + { ISD::UADDO, MVT::i32, 1 }, + { ISD::UADDO, MVT::i16, 1 }, + { ISD::UADDO, MVT::i8, 1 }, }; + Type *OpTy = RetTy; unsigned ISD = ISD::DELETED_NODE; switch (IID) { default: @@ -1987,11 +2095,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, case Intrinsic::sqrt: ISD = ISD::FSQRT; break; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + // SSUBO has same costs so don't duplicate. + ISD = ISD::SADDO; + OpTy = RetTy->getContainedType(0); + break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: + // USUBO has same costs so don't duplicate. + ISD = ISD::UADDO; + OpTy = RetTy->getContainedType(0); + break; } if (ISD != ISD::DELETED_NODE) { // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, RetTy); + std::pair LT = TLI->getTypeLegalizationCost(DL, OpTy); MVT MTy = LT.second; // Attempt to lookup cost. 
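The new SADDO/UADDO table entries are cost 1 because on x86 the overflow condition falls out of the ADD/SUB itself (the OF or CF flag), so detecting it needs no extra arithmetic, only a flag read. A short illustration using the GCC/Clang overflow builtin (assumes one of those compilers; a sketch of the lowering being costed, not code from this patch):

    #include <cstdio>

    int main() {
      unsigned Sum;
      // Compiles to a single ADD plus a carry-flag check (e.g. setc/jc).
      if (__builtin_uadd_overflow(0xFFFFFFFFu, 2u, &Sum))
        printf("overflowed, wrapped sum = %u\n", Sum);
      return 0;
    }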
@@ -2226,6 +2346,9 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned Alignment, unsigned AddressSpace) { + bool IsLoad = (Instruction::Load == Opcode); + bool IsStore = (Instruction::Store == Opcode); + VectorType *SrcVTy = dyn_cast(SrcTy); if (!SrcVTy) // To calculate scalar take the regular cost, without mask @@ -2233,10 +2356,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned NumElem = SrcVTy->getVectorNumElements(); VectorType *MaskTy = - VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); - if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || - (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || - !isPowerOf2_32(NumElem)) { + VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); + if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) || + (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) { // Scalarization int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); int ScalarCompareCost = getCmpSelInstrCost( @@ -2244,8 +2366,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, int BranchCost = getCFInstrCost(Instruction::Br); int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); - int ValueSplitCost = getScalarizationOverhead( - SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); + int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore); int MemopCost = NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); @@ -2259,8 +2380,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires expand/truncate for data and a shuffle for mask. - Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) + - getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr); + Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) + + getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr); else if (LT.second.getVectorNumElements() > NumElem) { VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), @@ -2268,11 +2389,13 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, // Expanding requires filling the mask with zeroes Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); } + + // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. if (!ST->hasAVX512()) - return Cost + LT.first*4; // Each maskmov costs 4 + return Cost + LT.first * (IsLoad ? 2 : 8); // AVX-512 masked load/store is cheaper - return Cost+LT.first; + return Cost + LT.first; } int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, @@ -2281,7 +2404,7 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. - unsigned NumVectorInstToHideOverhead = 10; + const unsigned NumVectorInstToHideOverhead = 10; // Cost modeling of Strided Access Computation is hidden by the indexing // modes of X86 regardless of the stride value.
We don't believe that there @@ -2369,6 +2492,48 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, return LT.first * Entry->Cost; } + static const CostTblEntry AVX2BoolReduction[] = { + { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp + { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp + { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp + { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp + }; + + static const CostTblEntry AVX1BoolReduction[] = { + { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp + { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp + { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp + { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp + { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp + { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp + { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp + { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp + }; + + static const CostTblEntry SSE2BoolReduction[] = { + { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp + { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp + { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp + { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp + { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp + { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp + { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp + { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp + }; + + // Handle bool allof/anyof patterns. + if (ValTy->getVectorElementType()->isIntegerTy(1)) { + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) + return LT.first * Entry->Cost; + } + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); } @@ -2390,15 +2555,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput // and use it as the cost.
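The bool-reduction tables above all encode the same movemask idiom: an any-of or all-of reduction over packed compare results is one movmsk-class instruction plus one scalar compare, hence cost 2. A minimal SSE2 intrinsics sketch of the pattern being costed (assumes an SSE2-capable x86 host; the helper names are invented):

    #include <emmintrin.h>

    // CmpMask is the result of a packed compare, e.g. _mm_cmpeq_epi8: each
    // lane is all-ones or all-zeros, so pmovmskb collapses it to 16 bits.
    bool anyLaneSet(__m128i CmpMask) {
      return _mm_movemask_epi8(CmpMask) != 0;      // pmovmskb + test
    }
    bool allLanesSet(__m128i CmpMask) {
      return _mm_movemask_epi8(CmpMask) == 0xFFFF; // pmovmskb + cmp
    }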
- static const CostTblEntry SSE42CostTblPairWise[] = { + static const CostTblEntry SSE1CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 4}, + }; + + static const CostTblEntry SSE2CostTblPairWise[] = { {ISD::FMINNUM, MVT::v2f64, 3}, + {ISD::SMIN, MVT::v2i64, 6}, + {ISD::UMIN, MVT::v2i64, 8}, + {ISD::SMIN, MVT::v4i32, 6}, + {ISD::UMIN, MVT::v4i32, 8}, + {ISD::SMIN, MVT::v8i16, 4}, + {ISD::UMIN, MVT::v8i16, 6}, + {ISD::SMIN, MVT::v16i8, 8}, + {ISD::UMIN, MVT::v16i8, 6}, + }; + + static const CostTblEntry SSE41CostTblPairWise[] = { {ISD::FMINNUM, MVT::v4f32, 2}, - {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" - {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" + {ISD::SMIN, MVT::v2i64, 9}, + {ISD::UMIN, MVT::v2i64,10}, {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" {ISD::SMIN, MVT::v8i16, 2}, {ISD::UMIN, MVT::v8i16, 2}, + {ISD::SMIN, MVT::v16i8, 3}, + {ISD::UMIN, MVT::v16i8, 3}, + }; + + static const CostTblEntry SSE42CostTblPairWise[] = { + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" }; static const CostTblEntry AVX1CostTblPairWise[] = { @@ -2411,8 +2598,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, {ISD::UMIN, MVT::v4i32, 1}, {ISD::SMIN, MVT::v8i16, 1}, {ISD::UMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v16i8, 2}, + {ISD::UMIN, MVT::v16i8, 2}, + {ISD::SMIN, MVT::v4i64, 7}, + {ISD::UMIN, MVT::v4i64, 7}, {ISD::SMIN, MVT::v8i32, 3}, {ISD::UMIN, MVT::v8i32, 3}, + {ISD::SMIN, MVT::v16i16, 3}, + {ISD::UMIN, MVT::v16i16, 3}, + {ISD::SMIN, MVT::v32i8, 3}, + {ISD::UMIN, MVT::v32i8, 3}, }; static const CostTblEntry AVX2CostTblPairWise[] = { @@ -2435,15 +2630,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, {ISD::UMIN, MVT::v16i32, 1}, }; - static const CostTblEntry SSE42CostTblNoPairWise[] = { + static const CostTblEntry SSE1CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 4}, + }; + + static const CostTblEntry SSE2CostTblNoPairWise[] = { {ISD::FMINNUM, MVT::v2f64, 3}, + {ISD::SMIN, MVT::v2i64, 6}, + {ISD::UMIN, MVT::v2i64, 8}, + {ISD::SMIN, MVT::v4i32, 6}, + {ISD::UMIN, MVT::v4i32, 8}, + {ISD::SMIN, MVT::v8i16, 4}, + {ISD::UMIN, MVT::v8i16, 6}, + {ISD::SMIN, MVT::v16i8, 8}, + {ISD::UMIN, MVT::v16i8, 6}, + }; + + static const CostTblEntry SSE41CostTblNoPairWise[] = { {ISD::FMINNUM, MVT::v4f32, 3}, - {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" - {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6" + {ISD::SMIN, MVT::v2i64, 9}, + {ISD::UMIN, MVT::v2i64,11}, {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8" + {ISD::SMIN, MVT::v16i8, 3}, + {ISD::UMIN, MVT::v16i8, 3}, + }; + + static const CostTblEntry SSE42CostTblNoPairWise[] = { + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6" }; static const CostTblEntry AVX1CostTblNoPairWise[] = { @@ -2456,8 +2673,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, {ISD::UMIN, MVT::v4i32, 1}, {ISD::SMIN, MVT::v8i16, 1}, {ISD::UMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v16i8, 2}, + {ISD::UMIN, MVT::v16i8, 2}, + 
{ISD::SMIN, MVT::v4i64, 7}, + {ISD::UMIN, MVT::v4i64, 7}, {ISD::SMIN, MVT::v8i32, 2}, {ISD::UMIN, MVT::v8i32, 2}, + {ISD::SMIN, MVT::v16i16, 2}, + {ISD::UMIN, MVT::v16i16, 2}, + {ISD::SMIN, MVT::v32i8, 2}, + {ISD::UMIN, MVT::v32i8, 2}, }; static const CostTblEntry AVX2CostTblNoPairWise[] = { @@ -2496,6 +2721,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) return LT.first * Entry->Cost; + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } else { if (ST->hasAVX512()) if (const auto *Entry = @@ -2513,6 +2750,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) return LT.first * Entry->Cost; + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned); @@ -2864,26 +3113,106 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, } bool X86TTIImpl::canMacroFuseCmp() { - return ST->hasMacroFusion(); + return ST->hasMacroFusion() || ST->hasBranchFusion(); } bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { + if (!ST->hasAVX()) + return false; + // The backend can't handle a single element vector. if (isa(DataTy) && DataTy->getVectorNumElements() == 1) return false; Type *ScalarTy = DataTy->getScalarType(); - int DataWidth = isa(ScalarTy) ? - DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); - return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) || - ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI()); + if (ScalarTy->isPointerTy()) + return true; + + if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) + return true; + + if (!ScalarTy->isIntegerTy()) + return false; + + unsigned IntWidth = ScalarTy->getIntegerBitWidth(); + return IntWidth == 32 || IntWidth == 64 || + ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); } bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { return isLegalMaskedLoad(DataType); } +bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) { + unsigned DataSize = DL.getTypeStoreSize(DataType); + // The only supported nontemporal loads are for aligned vectors of 16 or 32 + // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 + // (the equivalent stores only require AVX). + if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) + return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); + + return false; +} + +bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) { + unsigned DataSize = DL.getTypeStoreSize(DataType); + + // SSE4A supports nontemporal stores of float and double at arbitrary + // alignment. 
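Both nontemporal hooks below reduce to an ISA constraint: apart from the SSE4A scalar forms, the streaming load/store instructions (movntps, movntdq, movntdqa) fault on unaligned addresses, which is why the hooks demand Alignment >= DataSize. A short sketch of the 16-byte store case (assumes an SSE2 host; Dst must be 16-byte aligned; the function name is invented):

    #include <emmintrin.h>
    #include <cstddef>
    #include <cstdint>

    void ntFill(int32_t *Dst, int32_t Val, size_t N) {
      __m128i V = _mm_set1_epi32(Val);
      for (size_t I = 0; I + 4 <= N; I += 4)
        _mm_stream_si128(reinterpret_cast<__m128i *>(Dst + I), V); // movntdq
      _mm_sfence(); // order the streaming stores before subsequent loads
    }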
+ if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) + return true; + + // Besides the SSE4A subtarget exception above, only aligned stores are + // available nontemporally on any other subtarget. And only stores with a size + // of 4..32 bytes (powers of 2, only) are permitted. + if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || + !isPowerOf2_32(DataSize)) + return false; + + // 32-byte vector nontemporal stores are supported by AVX (the equivalent + // loads require AVX2). + if (DataSize == 32) + return ST->hasAVX(); + else if (DataSize == 16) + return ST->hasSSE1(); + return true; +} + +bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { + if (!isa(DataTy)) + return false; + + if (!ST->hasAVX512()) + return false; + + // The backend can't handle a single element vector. + if (DataTy->getVectorNumElements() == 1) + return false; + + Type *ScalarTy = DataTy->getVectorElementType(); + + if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) + return true; + + if (!ScalarTy->isIntegerTy()) + return false; + + unsigned IntWidth = ScalarTy->getIntegerBitWidth(); + return IntWidth == 32 || IntWidth == 64 || + ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); +} + +bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { + return isLegalMaskedExpandLoad(DataTy); +} + bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { + // Some CPUs have better gather performance than others. + // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only + // enable gather with a -march. + if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()))) + return false; + // This function is called now in two cases: from the Loop Vectorizer // and from the Scalarizer. // When the Loop Vectorizer asks about legality of the feature, @@ -2902,14 +3231,17 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { return false; } Type *ScalarTy = DataTy->getScalarType(); - int DataWidth = isa(ScalarTy) ? - DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + if (ScalarTy->isPointerTy()) + return true; - // Some CPUs have better gather performance than others. - // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only - // enable gather with a -march. - return (DataWidth == 32 || DataWidth == 64) && - (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())); + if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) + return true; + + if (!ScalarTy->isIntegerTy()) + return false; + + unsigned IntWidth = ScalarTy->getIntegerBitWidth(); + return IntWidth == 32 || IntWidth == 64; } bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { @@ -2938,44 +3270,51 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, const FeatureBitset &CalleeBits = TM.getSubtargetImpl(*Callee)->getFeatureBits(); - // FIXME: This is likely too limiting as it will include subtarget features - // that we might not care about for inlining, but it is conservatively - // correct. - return (CallerBits & CalleeBits) == CalleeBits; + FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; + FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; + return (RealCallerBits & RealCalleeBits) == RealCalleeBits; } -const X86TTIImpl::TTI::MemCmpExpansionOptions * -X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { - // Only enable vector loads for equality comparison. - // Right now the vector version is not as fast, see #33329.
- static const auto ThreeWayOptions = [this]() { - TTI::MemCmpExpansionOptions Options; - if (ST->is64Bit()) { - Options.LoadSizes.push_back(8); - } - Options.LoadSizes.push_back(4); - Options.LoadSizes.push_back(2); - Options.LoadSizes.push_back(1); - return Options; - }(); - static const auto EqZeroOptions = [this]() { - TTI::MemCmpExpansionOptions Options; +bool X86TTIImpl::areFunctionArgsABICompatible( + const Function *Caller, const Function *Callee, + SmallPtrSetImpl &Args) const { + if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) + return false; + + // If we get here, we know the target features match. If one function + // considers 512-bit vectors legal and the other does not, consider them + // incompatible. + // FIXME Look at the arguments and only consider 512 bit or larger vectors? + const TargetMachine &TM = getTLI()->getTargetMachine(); + + return TM.getSubtarget(*Caller).useAVX512Regs() == + TM.getSubtarget(*Callee).useAVX512Regs(); +} + +X86TTIImpl::TTI::MemCmpExpansionOptions +X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = 2; + if (IsZeroCmp) { + // Only enable vector loads for equality comparison. Right now the vector + // version is not as fast for three way compare (see #33329). // TODO: enable AVX512 when the DAG is ready. // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); - if (ST->hasAVX2()) Options.LoadSizes.push_back(32); - if (ST->hasSSE2()) Options.LoadSizes.push_back(16); - if (ST->is64Bit()) { - Options.LoadSizes.push_back(8); - } - Options.LoadSizes.push_back(4); - Options.LoadSizes.push_back(2); - Options.LoadSizes.push_back(1); + const unsigned PreferredWidth = ST->getPreferVectorWidth(); + if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32); + if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); // All GPR and vector loads can be unaligned. SIMD compare requires integer // vectors (SSE2/AVX2). Options.AllowOverlappingLoads = true; - return Options; - }(); - return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions; + } + if (ST->is64Bit()) { + Options.LoadSizes.push_back(8); + } + Options.LoadSizes.push_back(4); + Options.LoadSizes.push_back(2); + Options.LoadSizes.push_back(1); + return Options; } bool X86TTIImpl::enableInterleavedAccessVectorization() { diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index 1637592c81f8..25d9c33eb16d 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -1,9 +1,8 @@ //===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -36,6 +35,64 @@ class X86TTIImpl : public BasicTTIImplBase { const X86Subtarget *getST() const { return ST; } const X86TargetLowering *getTLI() const { return TLI; } + const FeatureBitset InlineFeatureIgnoreList = { + // This indicates the CPU is 64 bit capable not that we are in 64-bit + // mode. 
+ X86::Feature64Bit, + + // These features don't have any intrinsics or ABI effect. + X86::FeatureNOPL, + X86::FeatureCMPXCHG16B, + X86::FeatureLAHFSAHF, + + // Codegen control options. + X86::FeatureFast11ByteNOP, + X86::FeatureFast15ByteNOP, + X86::FeatureFastBEXTR, + X86::FeatureFastHorizontalOps, + X86::FeatureFastLZCNT, + X86::FeatureFastPartialYMMorZMMWrite, + X86::FeatureFastScalarFSQRT, + X86::FeatureFastSHLDRotate, + X86::FeatureFastScalarShiftMasks, + X86::FeatureFastVectorShiftMasks, + X86::FeatureFastVariableShuffle, + X86::FeatureFastVectorFSQRT, + X86::FeatureLEAForSP, + X86::FeatureLEAUsesAG, + X86::FeatureLZCNTFalseDeps, + X86::FeatureBranchFusion, + X86::FeatureMacroFusion, + X86::FeatureMergeToThreeWayBranch, + X86::FeaturePadShortFunctions, + X86::FeaturePOPCNTFalseDeps, + X86::FeatureSSEUnalignedMem, + X86::FeatureSlow3OpsLEA, + X86::FeatureSlowDivide32, + X86::FeatureSlowDivide64, + X86::FeatureSlowIncDec, + X86::FeatureSlowLEA, + X86::FeatureSlowPMADDWD, + X86::FeatureSlowPMULLD, + X86::FeatureSlowSHLD, + X86::FeatureSlowTwoMemOps, + X86::FeatureSlowUAMem16, + + // Perf-tuning flags. + X86::FeatureHasFastGather, + X86::FeatureSlowUAMem32, + + // Based on whether user set the -mprefer-vector-width command line. + X86::FeaturePrefer256Bit, + + // CPU name enums. These just follow CPU string. + X86::ProcIntelAtom, + X86::ProcIntelGLM, + X86::ProcIntelGLP, + X86::ProcIntelSLM, + X86::ProcIntelTRM, + }; + public: explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), @@ -129,14 +186,21 @@ public: bool canMacroFuseCmp(); bool isLegalMaskedLoad(Type *DataType); bool isLegalMaskedStore(Type *DataType); + bool isLegalNTLoad(Type *DataType, unsigned Alignment); + bool isLegalNTStore(Type *DataType, unsigned Alignment); bool isLegalMaskedGather(Type *DataType); bool isLegalMaskedScatter(Type *DataType); + bool isLegalMaskedExpandLoad(Type *DataType); + bool isLegalMaskedCompressStore(Type *DataType); bool hasDivRemOp(Type *DataType, bool IsSigned); bool isFCmpOrdCheaperThanFCmpZero(Type *Ty); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( - bool IsZeroCmp) const; + bool areFunctionArgsABICompatible(const Function *Caller, + const Function *Callee, + SmallPtrSetImpl &Args) const; + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index f882b760927c..a07d2f20acab 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -1,9 +1,8 @@ //===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp index d298aaa97ecd..9e499db1d7ee 100644 --- a/lib/Target/X86/X86WinAllocaExpander.cpp +++ b/lib/Target/X86/X86WinAllocaExpander.cpp @@ -1,9 +1,8 @@ //===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -85,10 +84,6 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) { unsigned AmountReg = MI->getOperand(0).getReg(); MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg); - // Look through copies. - while (Def && Def->isCopy() && Def->getOperand(1).isReg()) - Def = MRI->getUniqueVRegDef(Def->getOperand(1).getReg()); - if (!Def || (Def->getOpcode() != X86::MOV32ri && Def->getOpcode() != X86::MOV64ri) || !Def->getOperand(1).isImm()) @@ -210,15 +205,18 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { return; } + // These two variables differ on x32, which is a 64-bit target with a + // 32-bit alloca. bool Is64Bit = STI->is64Bit(); + bool Is64BitAlloca = MI->getOpcode() == X86::WIN_ALLOCA_64; assert(SlotSize == 4 || SlotSize == 8); - unsigned RegA = (SlotSize == 8) ? X86::RAX : X86::EAX; switch (L) { - case TouchAndSub: + case TouchAndSub: { assert(Amount >= SlotSize); // Use a push to touch the top of the stack. + unsigned RegA = Is64Bit ? X86::RAX : X86::EAX; BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) .addReg(RegA, RegState::Undef); Amount -= SlotSize; @@ -227,15 +225,18 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { // Fall through to make any remaining adjustment. LLVM_FALLTHROUGH; + } case Sub: assert(Amount > 0); if (Amount == SlotSize) { // Use push to save size. + unsigned RegA = Is64Bit ? X86::RAX : X86::EAX; BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) .addReg(RegA, RegState::Undef); } else { // Sub. - BuildMI(*MBB, I, DL, TII->get(getSubOpcode(Is64Bit, Amount)), StackPtr) + BuildMI(*MBB, I, DL, + TII->get(getSubOpcode(Is64BitAlloca, Amount)), StackPtr) .addReg(StackPtr) .addImm(Amount); } @@ -243,16 +244,17 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { case Probe: if (!NoStackArgProbe) { // The probe lowering expects the amount in RAX/EAX. + unsigned RegA = Is64BitAlloca ? X86::RAX : X86::EAX; BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA) .addReg(MI->getOperand(0).getReg()); // Do the probe. STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL, - /*InPrologue=*/false); + /*InProlog=*/false); } else { // Sub - BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::SUB64rr : X86::SUB32rr), - StackPtr) + BuildMI(*MBB, I, DL, + TII->get(Is64BitAlloca ? 
X86::SUB64rr : X86::SUB32rr), StackPtr) .addReg(StackPtr) .addReg(MI->getOperand(0).getReg()); } @@ -262,18 +264,10 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { unsigned AmountReg = MI->getOperand(0).getReg(); MI->eraseFromParent(); - // Delete the definition of AmountReg, possibly walking a chain of copies. - for (;;) { - if (!MRI->use_empty(AmountReg)) - break; - MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg); - if (!AmountDef) - break; - if (AmountDef->isCopy() && AmountDef->getOperand(1).isReg()) - AmountReg = AmountDef->getOperand(1).isReg(); - AmountDef->eraseFromParent(); - break; - } + // Delete the definition of AmountReg. + if (MRI->use_empty(AmountReg)) + if (MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg)) + AmountDef->eraseFromParent(); } bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) { diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 185deda97c1f..f68d17d7256d 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -1,9 +1,8 @@ //===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -41,9 +40,7 @@ class WinEHStatePass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - WinEHStatePass() : FunctionPass(ID) { - initializeWinEHStatePassPass(*PassRegistry::getPassRegistry()); - } + WinEHStatePass() : FunctionPass(ID) { } bool runOnFunction(Function &Fn) override; @@ -87,15 +84,15 @@ private: StructType *EHLinkRegistrationTy = nullptr; StructType *CXXEHRegistrationTy = nullptr; StructType *SEHRegistrationTy = nullptr; - Constant *SetJmp3 = nullptr; - Constant *CxxLongjmpUnwind = nullptr; + FunctionCallee SetJmp3 = nullptr; + FunctionCallee CxxLongjmpUnwind = nullptr; // Per-function state EHPersonality Personality = EHPersonality::Unknown; Function *PersonalityFn = nullptr; bool UseStackGuard = false; int ParentBaseState; - Constant *SehLongjmpUnwind = nullptr; + FunctionCallee SehLongjmpUnwind = nullptr; Constant *Cookie = nullptr; /// The stack allocation containing all EH data, including the link in the @@ -304,7 +301,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { CxxLongjmpUnwind = TheModule->getOrInsertFunction( "__CxxLongjmpUnwind", FunctionType::get(VoidTy, Int8PtrType, /*isVarArg=*/false)); - cast(CxxLongjmpUnwind->stripPointerCasts()) + cast(CxxLongjmpUnwind.getCallee()->stripPointerCasts()) ->setCallingConv(CallingConv::X86_StdCall); } else if (Personality == EHPersonality::MSVC_X86SEH) { // If _except_handler4 is in use, some additional guard checks and prologue @@ -357,7 +354,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { UseStackGuard ? 
"_seh_longjmp_unwind4" : "_seh_longjmp_unwind", FunctionType::get(Type::getVoidTy(TheModule->getContext()), Int8PtrType, /*isVarArg=*/false)); - cast(SehLongjmpUnwind->stripPointerCasts()) + cast(SehLongjmpUnwind.getCallee()->stripPointerCasts()) ->setCallingConv(CallingConv::X86_StdCall); } else { llvm_unreachable("unexpected personality function"); @@ -412,7 +409,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo()); auto AI = Trampoline->arg_begin(); Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++}; - CallInst *Call = Builder.CreateCall(CastPersonality, Args); + CallInst *Call = Builder.CreateCall(TargetFuncTy, CastPersonality, Args); // Can't use musttail due to prototype mismatch, but we can use tail. Call->setTailCall(true); // Set inreg so we pass it in EAX. @@ -433,7 +430,7 @@ void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder, // Next = [fs:00] Constant *FSZero = Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257)); - Value *Next = Builder.CreateLoad(FSZero); + Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(), FSZero); Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0)); // [fs:00] = Link Builder.CreateStore(Link, FSZero); @@ -448,8 +445,8 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) { } Type *LinkTy = getEHLinkRegistrationType(); // [fs:00] = Link->Next - Value *Next = - Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0)); + Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(), + Builder.CreateStructGEP(LinkTy, Link, 0)); Constant *FSZero = Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257)); Builder.CreateStore(Next, FSZero); @@ -472,11 +469,11 @@ void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F, SmallVector OptionalArgs; if (Personality == EHPersonality::MSVC_CXX) { - OptionalArgs.push_back(CxxLongjmpUnwind); + OptionalArgs.push_back(CxxLongjmpUnwind.getCallee()); OptionalArgs.push_back(State); OptionalArgs.push_back(emitEHLSDA(Builder, &F)); } else if (Personality == EHPersonality::MSVC_X86SEH) { - OptionalArgs.push_back(SehLongjmpUnwind); + OptionalArgs.push_back(SehLongjmpUnwind.getCallee()); OptionalArgs.push_back(State); if (UseStackGuard) OptionalArgs.push_back(Cookie); @@ -767,7 +764,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { if (!CS) continue; if (CS.getCalledValue()->stripPointerCasts() != - SetJmp3->stripPointerCasts()) + SetJmp3.getCallee()->stripPointerCasts()) continue; SetJmp3CallSites.push_back(CS); @@ -782,9 +779,9 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { IRBuilder<> Builder(CS.getInstruction()); Value *State; if (InCleanup) { - Value *StateField = - Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex); - State = Builder.CreateLoad(StateField); + Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(), + RegNode, StateFieldIndex); + State = Builder.CreateLoad(Builder.getInt32Ty(), StateField); } else { State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS)); } @@ -794,7 +791,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { void WinEHStatePass::insertStateNumberStore(Instruction *IP, int State) { IRBuilder<> Builder(IP); - Value *StateField = - Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex); + Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(), + RegNode, 
StateFieldIndex); Builder.CreateStore(Builder.getInt32(State), StateField); } diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index faf66e5944ab..ff3d41fd5274 100644 --- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -1,9 +1,8 @@ //===- XCoreDisassembler.cpp - Disassembler for XCore -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -12,6 +11,7 @@ /// //===----------------------------------------------------------------------===// +#include "TargetInfo/XCoreTargetInfo.h" #include "XCore.h" #include "XCoreRegisterInfo.h" #include "llvm/MC/MCContext.h" @@ -768,10 +768,6 @@ MCDisassembler::DecodeStatus XCoreDisassembler::getInstruction( return Fail; } -namespace llvm { - Target &getTheXCoreTarget(); -} - static MCDisassembler *createXCoreDisassembler(const Target &T, const MCSubtargetInfo &STI, MCContext &Ctx) { diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp deleted file mode 100644 index b03c1852281d..000000000000 --- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp +++ /dev/null @@ -1,90 +0,0 @@ -//===-- XCoreInstPrinter.cpp - Convert XCore MCInst to assembly syntax ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an XCore MCInst to a .s file. 
-// -//===----------------------------------------------------------------------===// - -#include "XCoreInstPrinter.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#include "XCoreGenAsmWriter.inc" - -void XCoreInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << StringRef(getRegisterName(RegNo)).lower(); -} - -void XCoreInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { - printInstruction(MI, O); - printAnnotation(O, Annot); -} - -void XCoreInstPrinter:: -printInlineJT(const MCInst *MI, int opNum, raw_ostream &O) { - report_fatal_error("can't handle InlineJT"); -} - -void XCoreInstPrinter:: -printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O) { - report_fatal_error("can't handle InlineJT32"); -} - -static void printExpr(const MCExpr *Expr, const MCAsmInfo *MAI, - raw_ostream &OS) { - int Offset = 0; - const MCSymbolRefExpr *SRE; - - if (const MCBinaryExpr *BE = dyn_cast(Expr)) { - SRE = dyn_cast(BE->getLHS()); - const MCConstantExpr *CE = dyn_cast(BE->getRHS()); - assert(SRE && CE && "Binary expression must be sym+const."); - Offset = CE->getValue(); - } else { - SRE = dyn_cast(Expr); - assert(SRE && "Unexpected MCExpr type."); - } - assert(SRE->getKind() == MCSymbolRefExpr::VK_None); - - SRE->getSymbol().print(OS, MAI); - - if (Offset) { - if (Offset > 0) - OS << '+'; - OS << Offset; - } -} - -void XCoreInstPrinter:: -printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - printRegName(O, Op.getReg()); - return; - } - - if (Op.isImm()) { - O << Op.getImm(); - return; - } - - assert(Op.isExpr() && "unknown operand kind in printOperand"); - printExpr(Op.getExpr(), &MAI, O); -} diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h deleted file mode 100644 index a0b480026469..000000000000 --- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h +++ /dev/null @@ -1,47 +0,0 @@ -//== XCoreInstPrinter.h - Convert XCore MCInst to assembly syntax -*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file contains the declaration of the XCoreInstPrinter class, -/// which is used to print XCore MCInst to a .s file. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H -#define LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H - -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" - -namespace llvm { - -class XCoreInstPrinter : public MCInstPrinter { -public: - XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - // Autogenerated by tblgen. 
- void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - -private: - void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O); - void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O); -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H diff --git a/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.cpp b/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.cpp new file mode 100644 index 000000000000..d231e0981324 --- /dev/null +++ b/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.cpp @@ -0,0 +1,89 @@ +//===-- XCoreInstPrinter.cpp - Convert XCore MCInst to assembly syntax ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class prints an XCore MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "XCoreInstPrinter.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#include "XCoreGenAsmWriter.inc" + +void XCoreInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << StringRef(getRegisterName(RegNo)).lower(); +} + +void XCoreInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + printInstruction(MI, O); + printAnnotation(O, Annot); +} + +void XCoreInstPrinter:: +printInlineJT(const MCInst *MI, int opNum, raw_ostream &O) { + report_fatal_error("can't handle InlineJT"); +} + +void XCoreInstPrinter:: +printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O) { + report_fatal_error("can't handle InlineJT32"); +} + +static void printExpr(const MCExpr *Expr, const MCAsmInfo *MAI, + raw_ostream &OS) { + int Offset = 0; + const MCSymbolRefExpr *SRE; + + if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) { + SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS()); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS()); + assert(SRE && CE && "Binary expression must be sym+const."); + Offset = CE->getValue(); + } else { + SRE = dyn_cast<MCSymbolRefExpr>(Expr); + assert(SRE && "Unexpected MCExpr type."); + } + assert(SRE->getKind() == MCSymbolRefExpr::VK_None); + + SRE->getSymbol().print(OS, MAI); + + if (Offset) { + if (Offset > 0) + OS << '+'; + OS << Offset; + } +} + +void XCoreInstPrinter:: +printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + return; + } + + if (Op.isImm()) { + O << Op.getImm(); + return; + } + + assert(Op.isExpr() && "unknown operand kind in printOperand"); + printExpr(Op.getExpr(), &MAI, O); +} diff --git a/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h b/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h new file mode 100644 index
000000000000..4f0940323505 --- /dev/null +++ b/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h @@ -0,0 +1,46 @@ +//== XCoreInstPrinter.h - Convert XCore MCInst to assembly syntax -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the XCoreInstPrinter class, +/// which is used to print XCore MCInst to a .s file. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREINSTPRINTER_H +#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREINSTPRINTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class XCoreInstPrinter : public MCInstPrinter { +public: + XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + +private: + void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O); + void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREINSTPRINTER_H diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp index 3178a4edbb3b..ae19e2a78eec 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- XCoreMCAsmInfo.cpp - XCore asm properties -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h index 39581e424e8c..b1dd247f8468 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h @@ -1,9 +1,8 @@ //===-- XCoreMCAsmInfo.h - XCore asm properties ----------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp index 805f1c18b609..877f38e22f9b 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- XCoreMCTargetDesc.cpp - XCore Target Descriptions -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,8 +11,9 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/XCoreMCTargetDesc.h" -#include "InstPrinter/XCoreInstPrinter.h" +#include "MCTargetDesc/XCoreInstPrinter.h" #include "MCTargetDesc/XCoreMCAsmInfo.h" +#include "TargetInfo/XCoreTargetInfo.h" #include "XCoreTargetStreamer.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCDwarf.h" diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h index 1dc384fadf69..3e56302f4add 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- XCoreMCTargetDesc.h - XCore Target Descriptions ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -18,8 +17,6 @@ namespace llvm { class Target; -Target &getTheXCoreTarget(); - } // end namespace llvm // Defines symbolic names for XCore registers. This defines a mapping from diff --git a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp index 41f4078cc328..5604f29db3e9 100644 --- a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp +++ b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp @@ -1,14 +1,12 @@ //===-- XCoreTargetInfo.cpp - XCore Target Implementation -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "XCore.h" -#include "llvm/IR/Module.h" +#include "TargetInfo/XCoreTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.h b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.h new file mode 100644 index 000000000000..35f05f22e4ce --- /dev/null +++ b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.h @@ -0,0 +1,20 @@ +//===-- XCoreTargetInfo.h - XCore Target Implementation ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_XCORE_TARGETINFO_XCORETARGETINFO_H +#define LLVM_LIB_TARGET_XCORE_TARGETINFO_XCORETARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheXCoreTarget(); + +} + +#endif // LLVM_LIB_TARGET_XCORE_TARGETINFO_XCORETARGETINFO_H diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h index ba6ca843671e..b7b86be9ab51 100644 --- a/lib/Target/XCore/XCore.h +++ b/lib/Target/XCore/XCore.h @@ -1,9 +1,8 @@ //===-- XCore.h - Top-level interface for XCore representation --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCore.td b/lib/Target/XCore/XCore.td index 04a1dd5e95be..a97b3dd1d0a2 100644 --- a/lib/Target/XCore/XCore.td +++ b/lib/Target/XCore/XCore.td @@ -1,9 +1,8 @@ //===-- XCore.td - Describe the XCore Target Machine -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp index 916bca6392de..9f615b9e7741 100644 --- a/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- XCoreAsmPrinter.cpp - XCore LLVM assembly writer ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,7 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "InstPrinter/XCoreInstPrinter.h" +#include "MCTargetDesc/XCoreInstPrinter.h" +#include "TargetInfo/XCoreTargetInfo.h" #include "XCore.h" #include "XCoreInstrInfo.h" #include "XCoreMCInstLower.h" @@ -67,11 +67,9 @@ namespace { } void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV); void EmitGlobalVariable(const GlobalVariable *GV) override; @@ -216,7 +214,7 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, MO.getMBB()->getSymbol()->print(O, MAI); break; case MachineOperand::MO_GlobalAddress: - getSymbol(MO.getGlobal())->print(O, MAI); + PrintSymbolOperand(MO, O); break; case MachineOperand::MO_ConstantPoolIndex: O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' @@ -233,8 +231,7 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, /// PrintAsmOperand - Print out an operand for an inline asm expression. /// bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant,const char *ExtraCode, - raw_ostream &O) { + const char *ExtraCode, raw_ostream &O) { // Print the operand if there is no operand modifier. if (!ExtraCode || !ExtraCode[0]) { printOperand(MI, OpNo, O); @@ -242,13 +239,13 @@ bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } // Otherwise fallback on the default implementation. - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); } -bool XCoreAsmPrinter:: -PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) { +bool XCoreAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNum, + const char *ExtraCode, + raw_ostream &O) { if (ExtraCode && ExtraCode[0]) { return true; // Unknown modifier. } diff --git a/lib/Target/XCore/XCoreCallingConv.td b/lib/Target/XCore/XCoreCallingConv.td index e149e6d9ec20..aec109b83fa2 100644 --- a/lib/Target/XCore/XCoreCallingConv.td +++ b/lib/Target/XCore/XCoreCallingConv.td @@ -1,9 +1,8 @@ //===- XCoreCallingConv.td - Calling Conventions for XCore -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This describes the calling conventions for XCore architecture. 
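The XCoreAsmPrinter hunks above track an LLVM-wide interface change in this import: PrintAsmOperand and PrintAsmMemoryOperand no longer take an AsmVariant parameter, and the direct getSymbol(MO.getGlobal())->print(O, MAI) call for global-address operands is replaced by PrintSymbolOperand so symbol modifiers are handled in one place. A minimal sketch of an override against the new signature, using a hypothetical out-of-tree MyTargetAsmPrinter (not part of this patch), mirroring the XCore code above:

bool MyTargetAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                         const char *ExtraCode,
                                         raw_ostream &O) {
  // With no operand modifier, print the operand directly.
  if (!ExtraCode || !ExtraCode[0]) {
    printOperand(MI, OpNo, O);
    return false;
  }
  // Unknown modifiers fall back to the base class, whose signature also
  // dropped the AsmVariant argument in this release.
  return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
}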
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index fff8a66d0e75..5066407c74aa 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -1,9 +1,8 @@ //===-- XCoreFrameLowering.cpp - Frame info for XCore Target --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h index e98e9cda11db..95c3a2973033 100644 --- a/lib/Target/XCore/XCoreFrameLowering.h +++ b/lib/Target/XCore/XCoreFrameLowering.h @@ -1,9 +1,8 @@ //===-- XCoreFrameLowering.h - Frame info for XCore Target ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp index 4b10e71be03d..e433d21c59b7 100644 --- a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp +++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp @@ -1,9 +1,8 @@ //===-- XCoreFrameToArgsOffsetElim.cpp ----------------------------*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 1688c38efc1d..5fd9e23258b0 100644 --- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- XCoreISelDAGToDAG.cpp - A dag to dag inst selector for XCore ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 75d7ae7048a1..072278d9fc46 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -1,9 +1,8 @@ //===-- XCoreISelLowering.cpp - XCore DAG Lowering Implementation ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -407,23 +406,16 @@ static bool isWordAligned(SDValue Value, SelectionDAG &DAG) return Known.countMinTrailingZeros() >= 2; } -SDValue XCoreTargetLowering:: -LowerLOAD(SDValue Op, SelectionDAG &DAG) const { +SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + LLVMContext &Context = *DAG.getContext(); LoadSDNode *LD = cast<LoadSDNode>(Op); assert(LD->getExtensionType() == ISD::NON_EXTLOAD && "Unexpected extension type"); assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT"); - if (allowsMisalignedMemoryAccesses(LD->getMemoryVT(), - LD->getAddressSpace(), - LD->getAlignment())) - return SDValue(); - auto &TD = DAG.getDataLayout(); - unsigned ABIAlignment = TD.getABITypeAlignment( - LD->getMemoryVT().getTypeForEVT(*DAG.getContext())); - // Leave aligned load alone. - if (LD->getAlignment() >= ABIAlignment) + if (allowsMemoryAccess(Context, DAG.getDataLayout(), LD->getMemoryVT(), + *LD->getMemOperand())) return SDValue(); SDValue Chain = LD->getChain(); @@ -470,7 +462,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } // Lower to a call to __misaligned_load(BasePtr). - Type *IntPtrTy = TD.getIntPtrType(*DAG.getContext()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -490,23 +482,16 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, DL); } -SDValue XCoreTargetLowering:: -LowerSTORE(SDValue Op, SelectionDAG &DAG) const -{ +SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + LLVMContext &Context = *DAG.getContext(); StoreSDNode *ST = cast<StoreSDNode>(Op); assert(!ST->isTruncatingStore() && "Unexpected store type"); assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT"); - if (allowsMisalignedMemoryAccesses(ST->getMemoryVT(), - ST->getAddressSpace(), - ST->getAlignment())) { - return SDValue(); - } - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment( - ST->getMemoryVT().getTypeForEVT(*DAG.getContext())); - // Leave aligned store alone.
- if (ST->getAlignment() >= ABIAlignment) { + + if (allowsMemoryAccess(Context, DAG.getDataLayout(), ST->getMemoryVT(), + *ST->getMemOperand())) return SDValue(); - } + SDValue Chain = ST->getChain(); SDValue BasePtr = ST->getBasePtr(); SDValue Value = ST->getValue(); @@ -515,7 +500,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const if (ST->getAlignment() == 2) { SDValue Low = Value; SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value, - DAG.getConstant(16, dl, MVT::i32)); + DAG.getConstant(16, dl, MVT::i32)); SDValue StoreLow = DAG.getTruncStore( Chain, dl, Low, BasePtr, ST->getPointerInfo(), MVT::i16, /* Alignment = */ 2, ST->getMemOperand()->getFlags()); @@ -528,7 +513,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const } // Lower to a call to __misaligned_store(BasePtr, Value). - Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -541,7 +526,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain).setCallee( - CallingConv::C, Type::getVoidTy(*DAG.getContext()), + CallingConv::C, Type::getVoidTy(Context), DAG.getExternalSymbol("__misaligned_store", getPointerTy(DAG.getDataLayout())), std::move(Args)); @@ -1009,6 +994,27 @@ LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +MachineMemOperand::Flags +XCoreTargetLowering::getMMOFlags(const Instruction &I) const { + // Because of how we convert atomic_load and atomic_store to normal loads and + // stores in the DAG, we need to ensure that the MMOs are marked volatile + // since DAGCombine hasn't been updated to account for atomic, but non + // volatile loads. (See D57601) + if (auto *SI = dyn_cast<StoreInst>(&I)) + if (SI->isAtomic()) + return MachineMemOperand::MOVolatile; + if (auto *LI = dyn_cast<LoadInst>(&I)) + if (LI->isAtomic()) + return MachineMemOperand::MOVolatile; + if (auto *AI = dyn_cast<AtomicRMWInst>(&I)) + if (AI->isAtomic()) + return MachineMemOperand::MOVolatile; + if (auto *AI = dyn_cast<AtomicCmpXchgInst>(&I)) + if (AI->isAtomic()) + return MachineMemOperand::MOVolatile; + return MachineMemOperand::MONone; +} + //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -1772,11 +1778,10 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::STORE: { // Replace unaligned store of unaligned load with memmove.
- StoreSDNode *ST = cast<StoreSDNode>(N); + StoreSDNode *ST = cast<StoreSDNode>(N); if (!DCI.isBeforeLegalize() || - allowsMisalignedMemoryAccesses(ST->getMemoryVT(), - ST->getAddressSpace(), - ST->getAlignment()) || + allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + ST->getMemoryVT(), *ST->getMemOperand()) || ST->isVolatile() || ST->isIndexed()) { break; } @@ -1785,12 +1790,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits(); assert((StoreBits % 8) == 0 && "Store size in bits must be a multiple of 8"); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment( - ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext())); unsigned Alignment = ST->getAlignment(); - if (Alignment >= ABIAlignment) { - break; - } if (LoadSDNode *LD = dyn_cast<LoadSDNode>(ST->getValue())) { if (LD->hasNUsesOfValue(1, 0) && ST->getMemoryVT() == LD->getMemoryVT() && diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index 7a99389e54a7..b4f25feda7fe 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -1,9 +1,8 @@ //===-- XCoreISelLowering.h - XCore DAG Lowering Interface ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -189,6 +188,8 @@ namespace llvm { SDValue LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const; + MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + // Inline asm support std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, diff --git a/lib/Target/XCore/XCoreInstrFormats.td b/lib/Target/XCore/XCoreInstrFormats.td index 379cc39aa617..deb899ddb1af 100644 --- a/lib/Target/XCore/XCoreInstrFormats.td +++ b/lib/Target/XCore/XCoreInstrFormats.td @@ -1,9 +1,8 @@ //===-- XCoreInstrFormats.td - XCore Instruction Formats ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index b0de048672df..bbad8e354586 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- XCoreInstrInfo.cpp - XCore Instruction Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h index 9d9ee33ce222..b9621f136589 100644 --- a/lib/Target/XCore/XCoreInstrInfo.h +++ b/lib/Target/XCore/XCoreInstrInfo.h @@ -1,9 +1,8 @@ //===-- XCoreInstrInfo.h - XCore Instruction Information --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index b87ba6548962..18f02e1d80f0 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -1,9 +1,8 @@ //===-- XCoreInstrInfo.td - Target Description for XCore ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 7455cd997ad6..a18fb28f2fe9 100644 --- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -1,9 +1,8 @@ //===-- XCoreLowerThreadLocal - Lower thread local variables --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/XCore/XCoreMCInstLower.cpp b/lib/Target/XCore/XCoreMCInstLower.cpp index 21270192b234..cd28fa5cd144 100644 --- a/lib/Target/XCore/XCoreMCInstLower.cpp +++ b/lib/Target/XCore/XCoreMCInstLower.cpp @@ -1,9 +1,8 @@ //===-- XCoreMCInstLower.cpp - Convert XCore MachineInstr to MCInst -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/XCore/XCoreMCInstLower.h b/lib/Target/XCore/XCoreMCInstLower.h index abcb80fcf766..0eaa84ef736b 100644 --- a/lib/Target/XCore/XCoreMCInstLower.h +++ b/lib/Target/XCore/XCoreMCInstLower.h @@ -1,9 +1,8 @@ //===-- XCoreMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp index b7b0daab9806..0b4fcffbc655 100644 --- a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp +++ b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- XCoreMachineFunctionInfo.cpp - XCore machine function info --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.h b/lib/Target/XCore/XCoreMachineFunctionInfo.h index 6c05ab3f10df..aebe11b15b54 100644 --- a/lib/Target/XCore/XCoreMachineFunctionInfo.h +++ b/lib/Target/XCore/XCoreMachineFunctionInfo.h @@ -1,9 +1,8 @@ //===- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index e119d9555f9d..3752274e2cdf 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- XCoreRegisterInfo.cpp - XCore Register Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -284,7 +283,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset += StackSize; - unsigned FrameReg = getFrameRegister(MF); + Register FrameReg = getFrameRegister(MF); // Special handling of DBG_VALUE instructions. 
if (MI.isDebugValue()) { @@ -322,7 +321,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } -unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const XCoreFrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? XCore::R10 : XCore::SP; diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h index 2e9fd98ed34f..35a42e1a1457 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.h +++ b/lib/Target/XCore/XCoreRegisterInfo.h @@ -1,9 +1,8 @@ //===-- XCoreRegisterInfo.h - XCore Register Information Impl ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -44,7 +43,7 @@ public: RegScavenger *RS = nullptr) const override; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; //! Return whether to emit frame moves static bool needsFrameMoves(const MachineFunction &MF); diff --git a/lib/Target/XCore/XCoreRegisterInfo.td b/lib/Target/XCore/XCoreRegisterInfo.td index 6694b2882aca..d9502939bae3 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.td +++ b/lib/Target/XCore/XCoreRegisterInfo.td @@ -1,9 +1,8 @@ //===-- XCoreRegisterInfo.td - XCore Register defs ---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp index 646309e02de8..c86756e345a9 100644 --- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp +++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp @@ -1,9 +1,8 @@ //===-- XCoreSelectionDAGInfo.cpp - XCore SelectionDAG Info ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h index 7cd0d8216e91..5dcef08391c9 100644 --- a/lib/Target/XCore/XCoreSelectionDAGInfo.h +++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h @@ -1,9 +1,8 @@ //===-- XCoreSelectionDAGInfo.h - XCore SelectionDAG Info -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp index 99ad2c88504f..ffeb0862c945 100644 --- a/lib/Target/XCore/XCoreSubtarget.cpp +++ b/lib/Target/XCore/XCoreSubtarget.cpp @@ -1,9 +1,8 @@ //===-- XCoreSubtarget.cpp - XCore Subtarget Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h index ed9936ebf2b8..68139da9d1d0 100644 --- a/lib/Target/XCore/XCoreSubtarget.h +++ b/lib/Target/XCore/XCoreSubtarget.h @@ -1,9 +1,8 @@ //===-- XCoreSubtarget.h - Define Subtarget for the XCore -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 2aa9932e2465..2a8cd6b657b7 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- XCoreTargetMachine.cpp - Define TargetMachine for XCore -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,6 +11,7 @@ #include "XCoreTargetMachine.h" #include "MCTargetDesc/XCoreMCTargetDesc.h" +#include "TargetInfo/XCoreTargetInfo.h" #include "XCore.h" #include "XCoreTargetObjectFile.h" #include "XCoreTargetTransformInfo.h" diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h index 965b9b2c4d65..9c3bdcf78f9c 100644 --- a/lib/Target/XCore/XCoreTargetMachine.h +++ b/lib/Target/XCore/XCoreTargetMachine.h @@ -1,9 +1,8 @@ //===-- XCoreTargetMachine.h - Define TargetMachine for XCore ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp index c60a262e719c..fe743b28b4b4 100644 --- a/lib/Target/XCore/XCoreTargetObjectFile.cpp +++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- XCoreTargetObjectFile.cpp - XCore object files --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h index 5eb423a7435e..fd172c55919f 100644 --- a/lib/Target/XCore/XCoreTargetObjectFile.h +++ b/lib/Target/XCore/XCoreTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- XCoreTargetObjectFile.h - XCore Object Info -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/XCore/XCoreTargetStreamer.h b/lib/Target/XCore/XCoreTargetStreamer.h index 3563dbc5cb7b..3543fc52ea7f 100644 --- a/lib/Target/XCore/XCoreTargetStreamer.h +++ b/lib/Target/XCore/XCoreTargetStreamer.h @@ -1,9 +1,8 @@ //===-- XCoreTargetStreamer.h - XCore Target Streamer ----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.h b/lib/Target/XCore/XCoreTargetTransformInfo.h index aa068b333425..3fecaaa59722 100644 --- a/lib/Target/XCore/XCoreTargetTransformInfo.h +++ b/lib/Target/XCore/XCoreTargetTransformInfo.h @@ -1,9 +1,8 @@ //===-- XCoreTargetTransformInfo.h - XCore specific TTI ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file
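The recurring XCoreISelLowering change in this patch replaces hand-rolled alignment checks with the target-independent allowsMemoryAccess() query. The deleted lines in LowerLOAD expressed, roughly, the following predicate (a sketch reconstructed from the removed code; LD is the LoadSDNode being lowered):

// Leave the load alone when the target tolerates the misalignment or the
// load already meets the ABI alignment of its memory type.
bool LeaveAlone =
    allowsMisalignedMemoryAccesses(LD->getMemoryVT(), LD->getAddressSpace(),
                                   LD->getAlignment()) ||
    LD->getAlignment() >=
        DAG.getDataLayout().getABITypeAlignment(
            LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));

The single call allowsMemoryAccess(Context, DAG.getDataLayout(), LD->getMemoryVT(), *LD->getMemOperand()) covers both conditions, driven by the MachineMemOperand, so the three sites touched above (LowerLOAD, LowerSTORE, and the ISD::STORE case of PerformDAGCombine) no longer duplicate the ABI-alignment computation.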